From 195c45911e79c33cf0bb986721365fb06df5a153 Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Tue, 2 Jul 2019 04:33:45 -0400
Subject: Import Upstream version 1.8.7

---
 .gitignore | 75 ++
 .travis.yml | 36 +
 CHANGELOG.md | 630 ++++++++++++
 LICENSE | 339 +++++++
 Makefile | 45 +
 README.rst | 244 +++++
 bin/gallery-dl | 7 +
 docs/configuration.rst | 1615 +++++++++++++++++++++++++++++++
 docs/gallery-dl-example.conf | 172 ++++
 docs/gallery-dl.conf | 172 ++++
 docs/supportedsites.rst | 137 +++
 gallery_dl/__init__.py | 255 +++++
 gallery_dl/__main__.py | 20 +
 gallery_dl/aes.py | 337 +++++++
 gallery_dl/cache.py | 204 ++++
 gallery_dl/cloudflare.py | 176 ++++
 gallery_dl/config.py | 155 +++
 gallery_dl/downloader/__init__.py | 39 +
 gallery_dl/downloader/common.py | 170 ++++
 gallery_dl/downloader/http.py | 128 +++
 gallery_dl/downloader/text.py | 37 +
 gallery_dl/downloader/ytdl.py | 81 ++
 gallery_dl/exception.py | 79 ++
 gallery_dl/extractor/2chan.py | 95 ++
 gallery_dl/extractor/35photo.py | 205 ++++
 gallery_dl/extractor/3dbooru.py | 81 ++
 gallery_dl/extractor/4chan.py | 36 +
 gallery_dl/extractor/500px.py | 238 +++++
 gallery_dl/extractor/8chan.py | 29 +
 gallery_dl/extractor/8muses.py | 129 +++
 gallery_dl/extractor/__init__.py | 189 ++++
 gallery_dl/extractor/artstation.py | 369 +++++++
 gallery_dl/extractor/behance.py | 179 ++++
 gallery_dl/extractor/bobx.py | 112 +++
 gallery_dl/extractor/booru.py | 265 +++++
 gallery_dl/extractor/chan.py | 61 ++
 gallery_dl/extractor/common.py | 432 +++++++
 gallery_dl/extractor/danbooru.py | 86 ++
 gallery_dl/extractor/deviantart.py | 992 +++++++++++++++++++
 gallery_dl/extractor/directlink.py | 56 ++
 gallery_dl/extractor/dynastyscans.py | 145 +++
 gallery_dl/extractor/e621.py | 71 ++
 gallery_dl/extractor/exhentai.py | 382 ++++++++
 gallery_dl/extractor/fallenangels.py | 105 ++
 gallery_dl/extractor/flickr.py | 503 ++++++++++
 gallery_dl/extractor/foolfuuka.py | 157 +++
 gallery_dl/extractor/foolslide.py | 240 +++++
 gallery_dl/extractor/gelbooru.py | 130 +++
 gallery_dl/extractor/gfycat.py | 83 ++
 gallery_dl/extractor/hbrowse.py | 101 ++
 gallery_dl/extractor/hentai2read.py | 101 ++
 gallery_dl/extractor/hentaicafe.py | 88 ++
 gallery_dl/extractor/hentaifoundry.py | 264 +++++
 gallery_dl/extractor/hentaifox.py | 117 +++
 gallery_dl/extractor/hentaihere.py | 101 ++
 gallery_dl/extractor/hentainexus.py | 96 ++
 gallery_dl/extractor/hitomi.py | 103 ++
 gallery_dl/extractor/hypnohub.py | 68 ++
 gallery_dl/extractor/idolcomplex.py | 59 ++
 gallery_dl/extractor/imagebam.py | 128 +++
 gallery_dl/extractor/imagefap.py | 195 ++++
 gallery_dl/extractor/imagehosts.py | 251 +++++
 gallery_dl/extractor/imgbox.py | 134 +++
 gallery_dl/extractor/imgth.py | 61 ++
 gallery_dl/extractor/imgur.py | 183 ++++
 gallery_dl/extractor/instagram.py | 277 ++++++
 gallery_dl/extractor/keenspot.py | 157 +++
 gallery_dl/extractor/khinsider.py | 69 ++
 gallery_dl/extractor/kissmanga.py | 223 +++++
 gallery_dl/extractor/komikcast.py | 117 +++
 gallery_dl/extractor/konachan.py | 85 ++
 gallery_dl/extractor/livedoor.py | 156 +++
 gallery_dl/extractor/luscious.py | 208 ++++
 gallery_dl/extractor/mangadex.py | 180 ++++
 gallery_dl/extractor/mangafox.py | 61 ++
 gallery_dl/extractor/mangahere.py | 138 +++
 gallery_dl/extractor/mangapanda.py | 36 +
 gallery_dl/extractor/mangapark.py | 140 +++
 gallery_dl/extractor/mangareader.py | 119 +++
 gallery_dl/extractor/mangastream.py | 54 ++
 gallery_dl/extractor/mangoxo.py | 176 ++++
 gallery_dl/extractor/mastodon.py | 203 ++++
 gallery_dl/extractor/message.py | 54 ++
 gallery_dl/extractor/myportfolio.py | 95 ++
 gallery_dl/extractor/newgrounds.py | 155 +++
 gallery_dl/extractor/ngomik.py | 51 +
 gallery_dl/extractor/nhentai.py | 135 +++
 gallery_dl/extractor/nijie.py | 205 ++++
 gallery_dl/extractor/nsfwalbum.py | 62 ++
 gallery_dl/extractor/oauth.py | 375 +++++++
 gallery_dl/extractor/paheal.py | 120 +++
 gallery_dl/extractor/patreon.py | 183 ++++
 gallery_dl/extractor/photobucket.py | 178 ++++
 gallery_dl/extractor/piczel.py | 118 +++
 gallery_dl/extractor/pinterest.py | 260 +++++
 gallery_dl/extractor/pixiv.py | 517 ++++++++++
 gallery_dl/extractor/pixnet.py | 179 ++++
 gallery_dl/extractor/plurk.py | 125 +++
 gallery_dl/extractor/pornhub.py | 157 +++
 gallery_dl/extractor/pururin.py | 102 ++
 gallery_dl/extractor/reactor.py | 338 +++++++
 gallery_dl/extractor/readcomiconline.py | 97 ++
 gallery_dl/extractor/recursive.py | 55 ++
 gallery_dl/extractor/reddit.py | 313 ++++++
 gallery_dl/extractor/rule34.py | 63 ++
 gallery_dl/extractor/safebooru.py | 61 ++
 gallery_dl/extractor/sankaku.py | 299 ++++++
 gallery_dl/extractor/sankakucomplex.py | 120 +++
 gallery_dl/extractor/seiga.py | 198 ++++
 gallery_dl/extractor/senmanga.py | 65 ++
 gallery_dl/extractor/sexcom.py | 194 ++++
 gallery_dl/extractor/shopify.py | 136 +++
 gallery_dl/extractor/simplyhentai.py | 187 ++++
 gallery_dl/extractor/slickpic.py | 140 +++
 gallery_dl/extractor/slideshare.py | 86 ++
 gallery_dl/extractor/smugmug.py | 316 ++++++
 gallery_dl/extractor/test.py | 86 ++
 gallery_dl/extractor/tsumino.py | 343 +++++++
 gallery_dl/extractor/tumblr.py | 425 ++++++++
 gallery_dl/extractor/twitter.py | 202 ++++
 gallery_dl/extractor/vanillarock.py | 95 ++
 gallery_dl/extractor/wallhaven.py | 148 +++
 gallery_dl/extractor/warosu.py | 108 +++
 gallery_dl/extractor/weibo.py | 137 +++
 gallery_dl/extractor/wikiart.py | 134 +++
 gallery_dl/extractor/xhamster.py | 171 ++++
 gallery_dl/extractor/xvideos.py | 140 +++
 gallery_dl/extractor/yandere.py | 68 ++
 gallery_dl/extractor/yaplog.py | 109 +++
 gallery_dl/extractor/yuki.py | 125 +++
 gallery_dl/job.py | 492 ++++++++++
 gallery_dl/oauth.py | 132 +++
 gallery_dl/option.py | 304 ++++++
 gallery_dl/output.py | 221 +++++
 gallery_dl/postprocessor/__init__.py | 44 +
 gallery_dl/postprocessor/classify.py | 49 +
 gallery_dl/postprocessor/common.py | 25 +
 gallery_dl/postprocessor/exec.py | 43 +
 gallery_dl/postprocessor/metadata.py | 65 ++
 gallery_dl/postprocessor/ugoira.py | 132 +++
 gallery_dl/postprocessor/zip.py | 65 ++
 gallery_dl/text.py | 278 ++++++
 gallery_dl/util.py | 673 +++++++++++++
 gallery_dl/version.py | 9 +
 requirements.txt | 1 +
 scripts/bash_completion.py | 56 ++
 scripts/build_testresult_db.py | 56 ++
 scripts/create_test_data.py | 69 ++
 scripts/hook-gallery_dl.py | 9 +
 scripts/man.py | 304 ++++++
 scripts/pyinstaller.py | 18 +
 scripts/release.sh | 167 ++++
 scripts/run_tests.sh | 24 +
 scripts/supportedsites.py | 264 +++++
 scripts/util.py | 11 +
 setup.cfg | 3 +
 setup.py | 134 +++
 snap/local/launchers/gallery-dl-launch | 32 +
 snap/snapcraft.yaml | 110 +++
 test/__init__.py | 0
 test/test_config.py | 81 ++
 test/test_cookies.py | 130 +++
 test/test_downloader.py | 235 +++++
 test/test_extractor.py | 186 ++++
 test/test_oauth.py | 104 ++
 test/test_results.py | 344 +++++++
 test/test_text.py | 409 ++++++++
 test/test_util.py | 395 ++++++++
 168 files changed, 28676 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .travis.yml
 create mode 100644 CHANGELOG.md
 create mode 100644 LICENSE
 create mode 100644 Makefile
 create mode 100644 README.rst
 create mode 100755 bin/gallery-dl
 create mode 100644 docs/configuration.rst
 create mode 100644 docs/gallery-dl-example.conf
 create mode 100644 docs/gallery-dl.conf
 create mode 100644 docs/supportedsites.rst
 create mode 100644 gallery_dl/__init__.py
 create mode 100644 gallery_dl/__main__.py
 create mode 100644 gallery_dl/aes.py
 create mode 100644 gallery_dl/cache.py
 create mode 100644 gallery_dl/cloudflare.py
 create mode 100644 gallery_dl/config.py
 create mode 100644 gallery_dl/downloader/__init__.py
 create mode 100644 gallery_dl/downloader/common.py
 create mode 100644 gallery_dl/downloader/http.py
 create mode 100644 gallery_dl/downloader/text.py
 create mode 100644 gallery_dl/downloader/ytdl.py
 create mode 100644 gallery_dl/exception.py
 create mode 100644 gallery_dl/extractor/2chan.py
 create mode 100644 gallery_dl/extractor/35photo.py
 create mode 100644 gallery_dl/extractor/3dbooru.py
 create mode 100644 gallery_dl/extractor/4chan.py
 create mode 100644 gallery_dl/extractor/500px.py
 create mode 100644 gallery_dl/extractor/8chan.py
 create mode 100644 gallery_dl/extractor/8muses.py
 create mode 100644 gallery_dl/extractor/__init__.py
 create mode 100644 gallery_dl/extractor/artstation.py
 create mode 100644 gallery_dl/extractor/behance.py
 create mode 100644 gallery_dl/extractor/bobx.py
 create mode 100644 gallery_dl/extractor/booru.py
 create mode 100644 gallery_dl/extractor/chan.py
 create mode 100644 gallery_dl/extractor/common.py
 create mode 100644 gallery_dl/extractor/danbooru.py
 create mode 100644 gallery_dl/extractor/deviantart.py
 create mode 100644 gallery_dl/extractor/directlink.py
 create mode 100644 gallery_dl/extractor/dynastyscans.py
 create mode 100644 gallery_dl/extractor/e621.py
 create mode 100644 gallery_dl/extractor/exhentai.py
 create mode 100644 gallery_dl/extractor/fallenangels.py
 create mode 100644 gallery_dl/extractor/flickr.py
 create mode 100644 gallery_dl/extractor/foolfuuka.py
 create mode 100644 gallery_dl/extractor/foolslide.py
 create mode 100644 gallery_dl/extractor/gelbooru.py
 create mode 100644 gallery_dl/extractor/gfycat.py
 create mode 100644 gallery_dl/extractor/hbrowse.py
 create mode 100644 gallery_dl/extractor/hentai2read.py
 create mode 100644 gallery_dl/extractor/hentaicafe.py
 create mode 100644 gallery_dl/extractor/hentaifoundry.py
 create mode 100644 gallery_dl/extractor/hentaifox.py
 create mode 100644 gallery_dl/extractor/hentaihere.py
 create mode 100644 gallery_dl/extractor/hentainexus.py
 create mode 100644 gallery_dl/extractor/hitomi.py
 create mode 100644 gallery_dl/extractor/hypnohub.py
 create mode 100644 gallery_dl/extractor/idolcomplex.py
 create mode 100644 gallery_dl/extractor/imagebam.py
 create mode 100644 gallery_dl/extractor/imagefap.py
 create mode 100644 gallery_dl/extractor/imagehosts.py
 create mode 100644 gallery_dl/extractor/imgbox.py
 create mode 100644 gallery_dl/extractor/imgth.py
 create mode 100644 gallery_dl/extractor/imgur.py
 create mode 100644 gallery_dl/extractor/instagram.py
 create mode 100644 gallery_dl/extractor/keenspot.py
 create mode 100644 gallery_dl/extractor/khinsider.py
 create mode 100644 gallery_dl/extractor/kissmanga.py
 create mode 100644 gallery_dl/extractor/komikcast.py
 create mode 100644 gallery_dl/extractor/konachan.py
 create mode 100644 gallery_dl/extractor/livedoor.py
 create mode 100644 gallery_dl/extractor/luscious.py
 create mode 100644 gallery_dl/extractor/mangadex.py
 create mode 100644 gallery_dl/extractor/mangafox.py
 create mode 100644 gallery_dl/extractor/mangahere.py
 create mode 100644 gallery_dl/extractor/mangapanda.py
 create mode 100644 gallery_dl/extractor/mangapark.py
 create mode 100644 gallery_dl/extractor/mangareader.py
 create mode 100644 gallery_dl/extractor/mangastream.py
 create mode 100644 gallery_dl/extractor/mangoxo.py
 create mode 100644 gallery_dl/extractor/mastodon.py
 create mode 100644 gallery_dl/extractor/message.py
 create mode 100644 gallery_dl/extractor/myportfolio.py
 create mode 100644 gallery_dl/extractor/newgrounds.py
 create mode 100644 gallery_dl/extractor/ngomik.py
 create mode 100644 gallery_dl/extractor/nhentai.py
 create mode 100644 gallery_dl/extractor/nijie.py
 create mode 100644 gallery_dl/extractor/nsfwalbum.py
 create mode 100644 gallery_dl/extractor/oauth.py
 create mode 100644 gallery_dl/extractor/paheal.py
 create mode 100644 gallery_dl/extractor/patreon.py
 create mode 100644 gallery_dl/extractor/photobucket.py
 create mode 100644 gallery_dl/extractor/piczel.py
 create mode 100644 gallery_dl/extractor/pinterest.py
 create mode 100644 gallery_dl/extractor/pixiv.py
 create mode 100644 gallery_dl/extractor/pixnet.py
 create mode 100644 gallery_dl/extractor/plurk.py
 create mode 100644 gallery_dl/extractor/pornhub.py
 create mode 100644 gallery_dl/extractor/pururin.py
 create mode 100644 gallery_dl/extractor/reactor.py
 create mode 100644 gallery_dl/extractor/readcomiconline.py
 create mode 100644 gallery_dl/extractor/recursive.py
 create mode 100644 gallery_dl/extractor/reddit.py
 create mode 100644 gallery_dl/extractor/rule34.py
 create mode 100644 gallery_dl/extractor/safebooru.py
 create mode 100644 gallery_dl/extractor/sankaku.py
 create mode 100644 gallery_dl/extractor/sankakucomplex.py
 create mode 100644 gallery_dl/extractor/seiga.py
 create mode 100644 gallery_dl/extractor/senmanga.py
 create mode 100644 gallery_dl/extractor/sexcom.py
 create mode 100644 gallery_dl/extractor/shopify.py
 create mode 100644 gallery_dl/extractor/simplyhentai.py
 create mode 100644 gallery_dl/extractor/slickpic.py
 create mode 100644 gallery_dl/extractor/slideshare.py
 create mode 100644 gallery_dl/extractor/smugmug.py
 create mode 100644 gallery_dl/extractor/test.py
 create mode 100644 gallery_dl/extractor/tsumino.py
 create mode 100644 gallery_dl/extractor/tumblr.py
 create mode 100644 gallery_dl/extractor/twitter.py
 create mode 100644 gallery_dl/extractor/vanillarock.py
 create mode 100644 gallery_dl/extractor/wallhaven.py
 create mode 100644 gallery_dl/extractor/warosu.py
 create mode 100644 gallery_dl/extractor/weibo.py
 create mode 100644 gallery_dl/extractor/wikiart.py
 create mode 100644 gallery_dl/extractor/xhamster.py
 create mode 100644 gallery_dl/extractor/xvideos.py
 create mode 100644 gallery_dl/extractor/yandere.py
 create mode 100644 gallery_dl/extractor/yaplog.py
 create mode 100644 gallery_dl/extractor/yuki.py
 create mode 100644 gallery_dl/job.py
 create mode 100644 gallery_dl/oauth.py
 create mode 100644 gallery_dl/option.py
 create mode 100644 gallery_dl/output.py
 create mode 100644 gallery_dl/postprocessor/__init__.py
 create mode 100644 gallery_dl/postprocessor/classify.py
 create mode 100644 gallery_dl/postprocessor/common.py
 create mode 100644 gallery_dl/postprocessor/exec.py
 create mode 100644 gallery_dl/postprocessor/metadata.py
 create mode 100644 gallery_dl/postprocessor/ugoira.py
 create mode 100644 gallery_dl/postprocessor/zip.py
 create mode 100644 gallery_dl/text.py
 create mode 100644 gallery_dl/util.py
 create mode 100644 gallery_dl/version.py
 create mode 100644 requirements.txt
 create mode 100755 scripts/bash_completion.py
 create mode 100755 scripts/build_testresult_db.py
 create mode 100755 scripts/create_test_data.py
 create mode 100644 scripts/hook-gallery_dl.py
 create mode 100755 scripts/man.py
 create mode 100755 scripts/pyinstaller.py
 create mode 100755 scripts/release.sh
 create mode 100755 scripts/run_tests.sh
 create mode 100755 scripts/supportedsites.py
 create mode 100644 scripts/util.py
 create mode 100644 setup.cfg
 create mode 100644 setup.py
 create mode 100755 snap/local/launchers/gallery-dl-launch
 create mode 100644 snap/snapcraft.yaml
 create mode 100644 test/__init__.py
 create mode 100644 test/test_config.py
 create mode 100644 test/test_cookies.py
 create mode 100644 test/test_downloader.py
 create mode 100644 test/test_extractor.py
 create mode 100644 test/test_oauth.py
 create mode 100644 test/test_results.py
 create mode 100644 test/test_text.py
 create mode 100644 test/test_util.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2e257a8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,75 @@
+archive/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Manpages
+gallery-dl.1
+gallery-dl.conf.5
+
+# Bash completion
+gallery-dl.bash_completion
+
+# Snap packaging specific
+/snap/.snapcraft/
+/parts/
+/stage/
+/prime/
+
+/*.snap
+/*_source.tar.bz2
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..6158941
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,36 @@
+language: python
+python:
+  - "3.4"
+  - "3.5"
+  - "3.6"
+  - "pypy3"
+env:
+  - GALLERYDL_TESTS=core
+matrix:
+  include:
+    - python: "3.7"
+      dist: xenial
+    - python: "3.8-dev"
+      dist: xenial
+    - python: "3.6"
+      env: GALLERYDL_TESTS=results
+    - language: minimal
+      dist: xenial
+      addons:
+        snaps:
+          - name: snapcraft
+            classic: true
+      env: SNAP_TESTS=true
+
+git:
+  depth: 3
+  quiet: true
+branches:
+  only:
+    - master
+    - /^v\d+\.\d+\.\d+(-\S*)?$/
+    - /^test(-\w+)+$/
+
+script:
+  - 'if test "${SNAP_TESTS}" != true; then ./scripts/run_tests.sh; else true; fi'
+  - 'if test "${SNAP_TESTS}" = true; then sudo apt update && snapcraft --destructive-mode && sudo snap try && snap run gallery-dl --verbose https://twitter.com/ubuntu/status/1121001597092364288; else true; fi'
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..cd74a9f
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,630 @@
+# Changelog
+
+## 1.8.7 - 2019-06-28
+### Additions
+- Support for
+  - `vanillarock` - https://vanilla-rock.com/ ([#254](https://github.com/mikf/gallery-dl/issues/254))
+  - `nsfwalbum` - https://nsfwalbum.com/ ([#287](https://github.com/mikf/gallery-dl/issues/287))
+- `artist` and `tags` metadata for `hentaicafe` ([#238](https://github.com/mikf/gallery-dl/issues/238))
+- `description` metadata for `instagram` ([#310](https://github.com/mikf/gallery-dl/issues/310))
+- Format string option to replace a substring with another - `R//` ([#318](https://github.com/mikf/gallery-dl/issues/318); example below)
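+
+A hypothetical filename format string illustrating the new `R//` replacement
+syntax (the `title` field and the space-to-underscore substitution are
+illustrative placeholders, not part of this release's changes):
+
+```
+"{title:R /_/}.{extension}"
+```
+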
+### Changes
+- Delete empty archives created by the `zip` post-processor ([#316](https://github.com/mikf/gallery-dl/issues/316))
+### Fixes
+- Handle `hitomi` Game CG galleries correctly ([#321](https://github.com/mikf/gallery-dl/issues/321))
+- Miscellaneous fixes for `deviantart`, `hitomi`, `pururin`, `kissmanga`, `keenspot`, `mangoxo`, `imagefap`
+
+## 1.8.6 - 2019-06-14
+### Additions
+- Support for
+  - `slickpic` - https://www.slickpic.com/ ([#249](https://github.com/mikf/gallery-dl/issues/249))
+  - `xhamster` - https://xhamster.com/ ([#281](https://github.com/mikf/gallery-dl/issues/281))
+  - `pornhub` - https://www.pornhub.com/ ([#282](https://github.com/mikf/gallery-dl/issues/282))
+  - `8muses` - https://www.8muses.com/ ([#305](https://github.com/mikf/gallery-dl/issues/305))
+- `extra` option for `deviantart` to download Sta.sh content linked in description texts ([#302](https://github.com/mikf/gallery-dl/issues/302))
+### Changes
+- Detect `directlink` URLs with upper case filename extensions ([#296](https://github.com/mikf/gallery-dl/issues/296))
+### Fixes
+- Improved error handling for `tumblr` API calls ([#297](https://github.com/mikf/gallery-dl/issues/297))
+- Fixed extraction of `livedoor` blogs ([#301](https://github.com/mikf/gallery-dl/issues/301))
+- Fixed extraction of special `deviantart` Sta.sh items ([#307](https://github.com/mikf/gallery-dl/issues/307))
+- Fixed pagination for specific `keenspot` comics
+
+## 1.8.5 - 2019-06-01
+### Additions
+- Support for
+  - `keenspot` - http://keenspot.com/ ([#223](https://github.com/mikf/gallery-dl/issues/223))
+  - `sankakucomplex` - https://www.sankakucomplex.com ([#258](https://github.com/mikf/gallery-dl/issues/258))
+- `folders` option for `deviantart` to add a list of containing folders to each file ([#276](https://github.com/mikf/gallery-dl/issues/276))
+- `captcha` option for `kissmanga` and `readcomiconline` to control CAPTCHA handling ([#279](https://github.com/mikf/gallery-dl/issues/279))
+- `filename` metadata for files downloaded with youtube-dl ([#291](https://github.com/mikf/gallery-dl/issues/291))
+### Changes
+- Adjust `wallhaven` extractors to new page layout:
+  - use API and add `api-key` option
+  - removed traditional login support
+- Provide original filenames for `patreon` downloads ([#268](https://github.com/mikf/gallery-dl/issues/268))
+- Use e-hentai.org or exhentai.org depending on input URL ([#278](https://github.com/mikf/gallery-dl/issues/278))
+### Fixes
+- Fix pagination over `sankaku` popular listings ([#265](https://github.com/mikf/gallery-dl/issues/265))
+- Fix folder and collection extraction on `deviantart` ([#271](https://github.com/mikf/gallery-dl/issues/271))
+- Detect "AreYouHuman" redirects on `readcomiconline` ([#279](https://github.com/mikf/gallery-dl/issues/279))
+- Miscellaneous fixes for `hentainexus`, `livedoor`, `ngomik`
+
+## 1.8.4 - 2019-05-17
+### Additions
+- Support for
+  - `patreon` - https://www.patreon.com/ ([#226](https://github.com/mikf/gallery-dl/issues/226))
+  - `hentainexus` - https://hentainexus.com/ ([#256](https://github.com/mikf/gallery-dl/issues/256))
+- `date` metadata fields for `pixiv` ([#248](https://github.com/mikf/gallery-dl/issues/248)), `instagram` ([#250](https://github.com/mikf/gallery-dl/issues/250)), `exhentai`, and `newgrounds`
+### Changes
+- Improved `flickr` metadata and video extraction ([#246](https://github.com/mikf/gallery-dl/issues/246))
+### Fixes
+- Download original GIF animations from `deviantart` ([#242](https://github.com/mikf/gallery-dl/issues/242))
+- Ignore missing `edge_media_to_comment` fields on `instagram` ([#250](https://github.com/mikf/gallery-dl/issues/250))
+- Fix serialization of `datetime` objects for `--write-metadata` ([#251](https://github.com/mikf/gallery-dl/issues/251), [#252](https://github.com/mikf/gallery-dl/issues/252))
+- Allow multiple post-processor command-line options at once ([#253](https://github.com/mikf/gallery-dl/issues/253))
+- Prevent crash on `booru` sites when no tags are available ([#259](https://github.com/mikf/gallery-dl/issues/259))
+- Fix extraction on `instagram` after `rhx_gis` field removal ([#266](https://github.com/mikf/gallery-dl/issues/266))
+- Avoid Cloudflare CAPTCHAs for Python interpreters built against OpenSSL < 1.1.1
+- Miscellaneous fixes for `luscious`
+
+## 1.8.3 - 2019-05-04
+### Additions
+- Support for
+  - `plurk` - https://www.plurk.com/ ([#212](https://github.com/mikf/gallery-dl/issues/212))
+  - `sexcom` - https://www.sex.com/ ([#147](https://github.com/mikf/gallery-dl/issues/147))
+- `--clear-cache`
+- `date` metadata fields for `deviantart`, `twitter`, and `tumblr` ([#224](https://github.com/mikf/gallery-dl/issues/224), [#232](https://github.com/mikf/gallery-dl/issues/232))
+### Changes
+- Standalone executables are now built using PyInstaller:
+  - uses the latest CPython interpreter (Python 3.7.3)
+  - available on several platforms (Windows, Linux, macOS)
+  - includes the `certifi` CA bundle, `youtube-dl`, and `pyOpenSSL` on Windows
+### Fixes
+- Patch `urllib3`'s default list of SSL/TLS ciphers to prevent Cloudflare CAPTCHAs ([#227](https://github.com/mikf/gallery-dl/issues/227))
+  (Windows users need to install `pyOpenSSL` for this to take effect)
+- Provide fallback URLs for `twitter` images ([#237](https://github.com/mikf/gallery-dl/issues/237))
+- Send `Referer` headers when downloading from `hitomi` ([#239](https://github.com/mikf/gallery-dl/issues/239))
+- Updated login procedure on `mangoxo`
+
+## 1.8.2 - 2019-04-12
+### Additions
+- Support for
+  - `pixnet` - https://www.pixnet.net/ ([#177](https://github.com/mikf/gallery-dl/issues/177))
+  - `wikiart` - https://www.wikiart.org/ ([#179](https://github.com/mikf/gallery-dl/issues/179))
+  - `mangoxo` - https://www.mangoxo.com/ ([#184](https://github.com/mikf/gallery-dl/issues/184))
+  - `yaplog` - https://yaplog.jp/ ([#190](https://github.com/mikf/gallery-dl/issues/190))
+  - `livedoor` - http://blog.livedoor.jp/ ([#190](https://github.com/mikf/gallery-dl/issues/190))
+- Login support for `mangoxo` ([#184](https://github.com/mikf/gallery-dl/issues/184)) and `twitter` ([#214](https://github.com/mikf/gallery-dl/issues/214))
+### Changes
+- Increased required `Requests` version to 2.11.0
+### Fixes
+- Improved image quality on `reactor` sites ([#210](https://github.com/mikf/gallery-dl/issues/210))
+- Support `imagebam` galleries with more than 100 images ([#219](https://github.com/mikf/gallery-dl/issues/219))
+- Updated Cloudflare bypass code
+
+## 1.8.1 - 2019-03-29
+### Additions
+- Support for:
+  - `35photo` - https://35photo.pro/ ([#162](https://github.com/mikf/gallery-dl/issues/162))
+  - `500px` - https://500px.com/ ([#185](https://github.com/mikf/gallery-dl/issues/185))
+- `instagram` extractor for hashtags ([#202](https://github.com/mikf/gallery-dl/issues/202))
+- Option to get more metadata on `deviantart` ([#189](https://github.com/mikf/gallery-dl/issues/189))
+- Man pages and bash completion ([#150](https://github.com/mikf/gallery-dl/issues/150))
+- Snap improvements ([#197](https://github.com/mikf/gallery-dl/issues/197), [#199](https://github.com/mikf/gallery-dl/issues/199), [#207](https://github.com/mikf/gallery-dl/issues/207))
+### Changes
+- Better FFmpeg arguments for `--ugoira-conv`
+- Adjusted metadata for `luscious` albums
+### Fixes
+- Proper handling of `instagram` multi-image posts ([#178](https://github.com/mikf/gallery-dl/issues/178), [#201](https://github.com/mikf/gallery-dl/issues/201))
+- Fixed `tumblr` avatar URLs when not using OAuth1.0 ([#193](https://github.com/mikf/gallery-dl/issues/193))
+- Miscellaneous fixes for `exhentai`, `komikcast`
+
+## 1.8.0 - 2019-03-15
+### Additions
+- Support for:
+  - `weibo` - https://www.weibo.com/
+  - `pururin` - https://pururin.io/ ([#174](https://github.com/mikf/gallery-dl/issues/174))
+  - `fashionnova` - https://www.fashionnova.com/ ([#175](https://github.com/mikf/gallery-dl/issues/175))
+  - `shopify` sites in general ([#175](https://github.com/mikf/gallery-dl/issues/175))
+- Snap packaging ([#169](https://github.com/mikf/gallery-dl/issues/169), [#170](https://github.com/mikf/gallery-dl/issues/170), [#187](https://github.com/mikf/gallery-dl/issues/187), [#188](https://github.com/mikf/gallery-dl/issues/188))
+- Automatic Cloudflare DDoS protection bypass
+- Extractor and Job information for logging format strings
+- `dynastyscans` image and search extractors ([#163](https://github.com/mikf/gallery-dl/issues/163))
+- `deviantart` scraps extractor ([#168](https://github.com/mikf/gallery-dl/issues/168))
+- `artstation` extractor for artwork listings ([#172](https://github.com/mikf/gallery-dl/issues/172))
+- `smugmug` video support and improved image format selection ([#183](https://github.com/mikf/gallery-dl/issues/183))
+### Changes
+- More metadata for `nhentai` galleries
+- Combined `myportfolio` extractors into one
+- Renamed `name` metadata field to `filename` and removed the original `filename` field
+- Simplified and improved internal data structures
+- Optimized creation of child extractors
+### Fixes
+- Filter empty `tumblr` URLs ([#165](https://github.com/mikf/gallery-dl/issues/165))
+- Filter ads and improve connection speed on `hentaifoundry`
+- Show proper error messages if `luscious` galleries are unavailable
+- Miscellaneous fixes for `mangahere`, `ngomik`, `simplyhentai`, `imgspice`
+### Removals
+- `seaotterscans`
+
+## 1.7.0 - 2019-02-05
+- Added support for:
+  - `photobucket` - http://photobucket.com/ ([#117](https://github.com/mikf/gallery-dl/issues/117))
+  - `hentaifox` - https://hentaifox.com/ ([#160](https://github.com/mikf/gallery-dl/issues/160))
+  - `tsumino` - https://www.tsumino.com/ ([#161](https://github.com/mikf/gallery-dl/issues/161))
+- Added the ability to dynamically generate extractors based on a user's config file for
+  - [`mastodon`](https://github.com/tootsuite/mastodon) instances ([#144](https://github.com/mikf/gallery-dl/issues/144))
+  - [`foolslide`](https://github.com/FoolCode/FoOlSlide) based sites
+  - [`foolfuuka`](https://github.com/FoolCode/FoolFuuka) based archives
+- Added an extractor for `behance` collections ([#157](https://github.com/mikf/gallery-dl/issues/157))
+- Added login support for `luscious` ([#159](https://github.com/mikf/gallery-dl/issues/159)) and `tsumino` ([#161](https://github.com/mikf/gallery-dl/issues/161))
+- Added an option to stop downloading if the `exhentai` image limit is exceeded ([#141](https://github.com/mikf/gallery-dl/issues/141))
+- Fixed extraction issues for `behance` and `mangapark`
+
+## 1.6.3 - 2019-01-18
+- Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
+- Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))
+- Added authentication support for `danbooru` ([#151](https://github.com/mikf/gallery-dl/issues/151))
+- Added tag metadata for `exhentai` and `hbrowse` galleries
+- Improved `*reactor` extractors ([#148](https://github.com/mikf/gallery-dl/issues/148))
+- Fixed extraction issues for `nhentai` ([#156](https://github.com/mikf/gallery-dl/issues/156)), `pinterest`, `mangapark`
+
+## 1.6.2 - 2019-01-01
+- Added support for:
+  - `instagram` - https://www.instagram.com/ ([#134](https://github.com/mikf/gallery-dl/issues/134))
+- Added support for multiple items on sta.sh pages ([#113](https://github.com/mikf/gallery-dl/issues/113))
+- Added option to download `tumblr` avatars ([#137](https://github.com/mikf/gallery-dl/issues/137))
+- Changed defaults for visited post types and inline media on `tumblr`
+- Improved inline extraction of `tumblr` posts ([#133](https://github.com/mikf/gallery-dl/issues/133), [#137](https://github.com/mikf/gallery-dl/issues/137))
+- Improved error handling and retry behavior of all API calls
+- Improved handling of missing fields in format strings ([#136](https://github.com/mikf/gallery-dl/issues/136))
+- Fixed hash extraction for unusual `tumblr` URLs ([#129](https://github.com/mikf/gallery-dl/issues/129))
+- Fixed image subdomains for `hitomi` galleries ([#142](https://github.com/mikf/gallery-dl/issues/142))
+- Fixed and improved miscellaneous issues for `kissmanga` ([#20](https://github.com/mikf/gallery-dl/issues/20)), `luscious`, `mangapark`, `readcomiconline`
+
+## 1.6.1 - 2018-11-28
+- Added support for:
+  - `joyreactor` - http://joyreactor.cc/ ([#114](https://github.com/mikf/gallery-dl/issues/114))
+  - `pornreactor` - http://pornreactor.cc/ ([#114](https://github.com/mikf/gallery-dl/issues/114))
+  - `newgrounds` - https://www.newgrounds.com/ ([#119](https://github.com/mikf/gallery-dl/issues/119))
+- Added extractor for search results on `luscious` ([#127](https://github.com/mikf/gallery-dl/issues/127))
+- Fixed filenames of ZIP archives ([#126](https://github.com/mikf/gallery-dl/issues/126))
+- Fixed extraction issues for `gfycat`, `hentaifoundry` ([#125](https://github.com/mikf/gallery-dl/issues/125)), `mangafox`
+
+## 1.6.0 - 2018-11-17
+- Added support for:
+  - `wallhaven` - https://alpha.wallhaven.cc/
+  - `yuki` - https://yuki.la/
+- Added youtube-dl integration and video downloads for `twitter` ([#99](https://github.com/mikf/gallery-dl/issues/99)), `behance`, `artstation`
+- Added per-extractor options for network connections (`retries`, `timeout`, `verify`)
+- Added a `--no-check-certificate` command-line option (example below)
+- Added ability to specify the number of skipped downloads before aborting/exiting ([#115](https://github.com/mikf/gallery-dl/issues/115))
+- Added extractors for scraps, favorites, popular and recent images on `hentaifoundry` ([#110](https://github.com/mikf/gallery-dl/issues/110))
+- Improved login procedure for `pixiv` to avoid unwanted emails on each new login
+- Improved album metadata and error handling for `flickr` ([#109](https://github.com/mikf/gallery-dl/issues/109))
+- Updated default User-Agent string to Firefox 62 ([#122](https://github.com/mikf/gallery-dl/issues/122))
+- Fixed `twitter` API response handling when logged in ([#123](https://github.com/mikf/gallery-dl/issues/123))
+- Fixed issue when converting Ugoira using H.264
+- Fixed miscellaneous issues for `2chan`, `deviantart`, `fallenangels`, `flickr`, `imagefap`, `pinterest`, `turboimagehost`, `warosu`, `yuki` ([#112](https://github.com/mikf/gallery-dl/issues/112))
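+
+A sketch of how the connection options above might be combined on the command
+line (the flag names come from this release and from 0.8.0; the URL is a
+placeholder):
+
+```
+$ gallery-dl --retries 5 --http-timeout 30 --no-check-certificate [URL]
+```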
+
+## 1.5.3 - 2018-09-14
+- Added support for:
+  - `hentaicafe` - https://hentai.cafe/ ([#101](https://github.com/mikf/gallery-dl/issues/101))
+  - `bobx` - http://www.bobx.com/dark/
+- Added black-/whitelist options for post-processor modules
+- Added support for `tumblr` inline videos ([#102](https://github.com/mikf/gallery-dl/issues/102))
+- Fixed extraction of `smugmug` albums without owner ([#100](https://github.com/mikf/gallery-dl/issues/100))
+- Fixed issues when using default config values with `reddit` extractors ([#104](https://github.com/mikf/gallery-dl/issues/104))
+- Fixed pagination for user favorites on `sankaku` ([#106](https://github.com/mikf/gallery-dl/issues/106))
+- Fixed a crash when processing `deviantart` journals ([#108](https://github.com/mikf/gallery-dl/issues/108))
+
+## 1.5.2 - 2018-08-31
+- Added support for `twitter` timelines ([#96](https://github.com/mikf/gallery-dl/issues/96))
+- Added option to suppress FFmpeg output during ugoira conversions
+- Improved filename formatter performance
+- Improved inline image quality on `tumblr` ([#98](https://github.com/mikf/gallery-dl/issues/98))
+- Fixed image URLs for newly released `mangadex` chapters
+- Fixed a smaller issue with `deviantart` journals
+- Replaced `subapics` with `ngomik`
+
+## 1.5.1 - 2018-08-17
+- Added support for:
+  - `piczel` - https://piczel.tv/
+- Added support for related pins on `pinterest`
+- Fixed accessing "offensive" galleries on `exhentai` ([#97](https://github.com/mikf/gallery-dl/issues/97))
+- Fixed extraction issues for `mangadex`, `komikcast` and `behance`
+- Removed original-image functionality from `tumblr`, since "raw" images are no longer accessible
+
+## 1.5.0 - 2018-08-03
+- Added support for:
+  - `behance` - https://www.behance.net/
+  - `myportfolio` - https://www.myportfolio.com/ ([#95](https://github.com/mikf/gallery-dl/issues/95))
+- Added custom format string options to handle long strings ([#92](https://github.com/mikf/gallery-dl/issues/92), [#94](https://github.com/mikf/gallery-dl/issues/94))
+  - Slicing: `"{field[10:40]}"`
+  - Replacement: `"{field:L40/too long/}"`
+- Improved frame rate handling for ugoira conversions
+- Improved private access token usage on `deviantart`
+- Fixed metadata extraction for some images on `nijie`
+- Fixed chapter extraction on `mangahere`
+- Removed `whatisthisimnotgoodwithcomputers`
+- Removed support for Python 3.3
+
+## 1.4.2 - 2018-07-06
+- Added image-pool extractors for `safebooru` and `rule34`
+- Added option for extended tag information on `booru` sites ([#92](https://github.com/mikf/gallery-dl/issues/92))
+- Added support for DeviantArt's new URL format
+- Added support for `mangapark` mirrors
+- Changed `imagefap` extractors to use HTTPS
+- Fixed crash when skipping downloads for files without known extension
+
+## 1.4.1 - 2018-06-22
+- Added an `ugoira` post-processor to convert `pixiv` animations to WebM
+- Added `--zip` and `--ugoira-conv` command-line options (examples below)
+- Changed how ugoira frame information is handled
+  - instead of being written to a separate file, it is now made available as a metadata field of the ZIP archive
+- Fixed manga and chapter titles for `mangadex`
+- Fixed file deletion by post-processors
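+
+Hypothetical invocations of the 1.4.1 conversion options (the bracketed URLs
+are placeholders, in the style of the examples elsewhere in this changelog):
+
+```
+$ gallery-dl --ugoira-conv [pixiv ugoira URL]
+$ gallery-dl --zip [gallery URL]
+```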
+
+## 1.4.0 - 2018-06-08
+- Added support for:
+  - `simplyhentai` - https://www.simply-hentai.com/ ([#89](https://github.com/mikf/gallery-dl/issues/89))
+- Added extractors for
+  - `pixiv` search results and followed users
+  - `deviantart` search results and popular listings
+- Added post-processors to perform actions on downloaded files
+- Added options to configure logging behavior
+- Added OAuth support for `smugmug`
+- Changed `pixiv` extractors to use the AppAPI
+  - this breaks `favorite` archive IDs and changes some metadata fields
+- Changed the default filename format for `tumblr` and renamed `offset` to `num`
+- Fixed a possible UnicodeDecodeError during installation ([#86](https://github.com/mikf/gallery-dl/issues/86))
+- Fixed extraction of `mangadex` manga with more than 100 chapters ([#84](https://github.com/mikf/gallery-dl/issues/84))
+- Fixed miscellaneous issues for `imgur`, `reddit`, `komikcast`, `mangafox` and `imagebam`
+
+## 1.3.5 - 2018-05-04
+- Added support for:
+  - `smugmug` - https://www.smugmug.com/
+- Added title information for `mangadex` chapters
+- Improved the `pinterest` API implementation ([#83](https://github.com/mikf/gallery-dl/issues/83))
+- Improved error handling for `deviantart` and `tumblr`
+- Removed `gomanga` and `puremashiro`
+
+## 1.3.4 - 2018-04-20
+- Added support for custom OAuth2 credentials for `pinterest`
+- Improved rate limit handling for `tumblr` extractors
+- Improved `hentaifoundry` extractors
+- Improved `imgur` URL patterns
+- Fixed miscellaneous extraction issues for `luscious` and `komikcast`
+- Removed `loveisover` and `spectrumnexus`
+
+## 1.3.3 - 2018-04-06
+- Added extractors for
+  - `nhentai` search results
+  - `exhentai` search results and favorites
+  - `nijie` doujins and favorites
+- Improved metadata extraction for `exhentai` and `nijie`
+- Improved `tumblr` extractors by avoiding unnecessary API calls
+- Fixed Cloudflare DDoS protection bypass
+- Fixed errors when trying to print unencodable characters
+
+## 1.3.2 - 2018-03-23
+- Added extractors for `artstation` albums, challenges and search results
+- Improved URL and metadata extraction for `hitomi` and `nhentai`
+- Fixed page transitions for `danbooru` API results ([#82](https://github.com/mikf/gallery-dl/issues/82))
+
+## 1.3.1 - 2018-03-16
+- Added support for:
+  - `mangadex` - https://mangadex.org/
+  - `artstation` - https://www.artstation.com/
+- Added Cloudflare DDoS protection bypass to `komikcast` extractors
+- Changed archive ID formats for `deviantart` folders and collections
+- Improved error handling for `deviantart` API calls
+- Removed `imgchili` and various smaller image hosts
+
+## 1.3.0 - 2018-03-02
+- Added `--proxy` to explicitly specify a proxy server ([#76](https://github.com/mikf/gallery-dl/issues/76); example below)
+- Added options to customize [archive ID formats](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorarchive-format) and [undefined replacement fields](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorkeywords-default)
+- Changed various archive ID formats to improve their behavior for favorites / bookmarks / etc.
+  - Affected modules are `deviantart`, `flickr`, `tumblr`, `pixiv` and all …boorus
+- Improved `sankaku` and `idolcomplex` support by
+  - respecting `page` and `next` URL parameters ([#79](https://github.com/mikf/gallery-dl/issues/79))
+  - bypassing the page-limit for unauthenticated users
+- Improved `directlink` metadata by properly unquoting it
+- Fixed `pixiv` ugoira extraction ([#78](https://github.com/mikf/gallery-dl/issues/78))
+- Fixed miscellaneous extraction issues for `mangastream` and `tumblr`
+- Removed `yeet`, `chronos`, `coreimg`, `hosturimage`, `imageontime`, `img4ever`, `imgmaid`, `imgupload`
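+
+A sketch of the new `--proxy` option (the proxy address is a placeholder):
+
+```
+$ gallery-dl --proxy "http://127.0.0.1:8118" [URL]
+```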
+
+## 1.2.0 - 2018-02-16
+- Added support for:
+  - `paheal` - https://rule34.paheal.net/ ([#69](https://github.com/mikf/gallery-dl/issues/69))
+  - `komikcast` - https://komikcast.com/ ([#70](https://github.com/mikf/gallery-dl/issues/70))
+  - `subapics` - http://subapics.com/ ([#70](https://github.com/mikf/gallery-dl/issues/70))
+- Added `--download-archive` to record downloaded files in an archive file (example below)
+- Added `--write-log` to write logging output to a file
+- Added a filetype check on download completion to fix incorrectly assigned filename extensions ([#63](https://github.com/mikf/gallery-dl/issues/63))
+- Added the `tumblr:...` pseudo URI scheme to support custom domains for Tumblr blogs ([#71](https://github.com/mikf/gallery-dl/issues/71); example below)
+- Added fallback URLs for `tumblr` images ([#64](https://github.com/mikf/gallery-dl/issues/64))
+- Added support for `reddit`-hosted images ([#68](https://github.com/mikf/gallery-dl/issues/68))
+- Improved the input file format by allowing comments and per-URL options
+- Fixed OAuth 1.0 signature generation for Python 3.3 and 3.4 ([#75](https://github.com/mikf/gallery-dl/issues/75))
+- Fixed smaller issues for `luscious`, `hentai2read`, `hentaihere` and `imgur`
+- Removed the `batoto` module
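+
+Hypothetical invocations of two of the 1.2.0 additions (the archive file name
+and the blog domain are placeholders):
+
+```
+$ gallery-dl --download-archive archive.db [URL]
+$ gallery-dl "tumblr:https://blog.example.org/"
+```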
+
+## 1.1.2 - 2018-01-12
+- Added support for:
+  - `puremashiro` - http://reader.puremashiro.moe/ ([#66](https://github.com/mikf/gallery-dl/issues/66))
+  - `idolcomplex` - https://idol.sankakucomplex.com/
+- Added an option to filter reblogs on `tumblr` ([#61](https://github.com/mikf/gallery-dl/issues/61))
+- Added OAuth user authentication for `tumblr` ([#65](https://github.com/mikf/gallery-dl/issues/65))
+- Added support for `slideshare` mobile URLs ([#67](https://github.com/mikf/gallery-dl/issues/67))
+- Improved pagination for various …booru sites to work around page limits
+- Fixed chapter information parsing for certain manga on `kissmanga` ([#58](https://github.com/mikf/gallery-dl/issues/58)) and `batoto` ([#60](https://github.com/mikf/gallery-dl/issues/60))
+
+## 1.1.1 - 2017-12-22
+- Added support for:
+  - `slideshare` - https://www.slideshare.net/ ([#54](https://github.com/mikf/gallery-dl/issues/54))
+- Added pool- and post-extractors for `sankaku`
+- Added OAuth user authentication for `deviantart`
+- Updated `luscious` to support `members.luscious.net` URLs ([#55](https://github.com/mikf/gallery-dl/issues/55))
+- Updated `mangahere` to use their new domain name (mangahere.cc) and support mobile URLs
+- Updated `gelbooru` to not be restricted to the first 20,000 images ([#56](https://github.com/mikf/gallery-dl/issues/56))
+- Fixed extraction issues for `nhentai` and `khinsider`
+
+## 1.1.0 - 2017-12-08
+- Added the `-r/--limit-rate` command-line option to set a maximum download rate (example below)
+- Added the `--sleep` command-line option to specify the number of seconds to sleep before each download
+- Updated `gelbooru` to no longer use their now disabled API
+- Fixed SWF extraction for `sankaku` ([#52](https://github.com/mikf/gallery-dl/issues/52))
+- Fixed extraction issues for `hentai2read` and `khinsider`
+- Removed the deprecated `--images` and `--chapters` options
+- Removed the `mangazuki` module
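+
+The rate-limiting options from 1.1.0 in a hypothetical invocation (the values
+are arbitrary):
+
+```
+$ gallery-dl -r 500k --sleep 2 [URL]
+```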
+
+## 1.0.2 - 2017-11-24
+- Added an option to set a [custom user-agent string](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractoruser-agent)
+- Improved retry behavior for failed HTTP requests
+- Improved `seiga` by providing better metadata and getting more than the latest 200 images
+- Improved `tumblr` by adding support for [all post types](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractortumblrposts), scanning for [inline images](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractortumblrinline) and following [external links](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractortumblrexternal) ([#48](https://github.com/mikf/gallery-dl/issues/48))
+- Fixed extraction issues for `hbrowse`, `khinsider` and `senmanga`
+
+## 1.0.1 - 2017-11-10
+- Added support for:
+  - `xvideos` - https://www.xvideos.com/ ([#45](https://github.com/mikf/gallery-dl/issues/45))
+- Fixed exception handling during file downloads which could lead to a premature exit
+- Fixed an issue with `tumblr` where not all images would be downloaded when using tags ([#48](https://github.com/mikf/gallery-dl/issues/48))
+- Fixed extraction issues for `imgbox` ([#47](https://github.com/mikf/gallery-dl/issues/47)), `mangastream` ([#49](https://github.com/mikf/gallery-dl/issues/49)) and `mangahere`
+
+## 1.0.0 - 2017-10-27
+- Added support for:
+  - `warosu` - https://warosu.org/
+  - `b4k` - https://arch.b4k.co/
+- Added support for `pixiv` ranking lists
+- Added support for `booru` popular lists (`danbooru`, `e621`, `konachan`, `yandere`, `3dbooru`)
+- Added the `--cookies` command-line and [`cookies`](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorcookies) config option to load additional cookies
+- Added the `--filter` and `--chapter-filter` command-line options to select individual images or manga-chapters by their metadata using simple Python expressions ([#43](https://github.com/mikf/gallery-dl/issues/43); examples below)
+- Added the [`verify`](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#downloaderhttpverify) config option to control certificate verification during file downloads
+- Added config options to overwrite internally used API credentials ([API Tokens & IDs](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#api-tokens-ids))
+- Added `-K` as a shortcut for `--list-keywords`
+- Changed the `--images` and `--chapters` command-line options to `--range` and `--chapter-range`
+- Changed keyword names for various modules to make them accessible by `--filter`. In general, minus signs have been replaced with underscores (e.g. `gallery-id` -> `gallery_id`).
+- Changed default filename formats for manga extractors to optionally use volume and title information
+- Improved the downloader modules to use [`.part` files](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#downloaderpart) and support resuming incomplete downloads ([#29](https://github.com/mikf/gallery-dl/issues/29))
+- Improved `deviantart` by distinguishing between users and groups ([#26](https://github.com/mikf/gallery-dl/issues/26)), always using HTTPS, and always downloading full-sized original images
+- Improved `sankaku` by adding authentication support and fixing various other issues ([#44](https://github.com/mikf/gallery-dl/issues/44))
+- Improved URL pattern for direct image links ([#30](https://github.com/mikf/gallery-dl/issues/30))
+- Fixed an issue with `luscious` not getting original image URLs ([#33](https://github.com/mikf/gallery-dl/issues/33))
+- Fixed various smaller issues for `batoto`, `hentai2read` ([#38](https://github.com/mikf/gallery-dl/issues/38)), `jaiminisbox`, `khinsider`, `kissmanga` ([#28](https://github.com/mikf/gallery-dl/issues/28), [#46](https://github.com/mikf/gallery-dl/issues/46)), `mangahere`, `pawoo`, `twitter`
+- Removed `kisscomic` and `yonkouprod` modules
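+
+A sketch of the 1.0.0 filter options; the expressions are ordinary Python
+evaluated against each file's metadata, and the field names shown here are
+assumptions that depend on the extractor:
+
+```
+$ gallery-dl --filter "extension == 'png'" [URL]
+$ gallery-dl --chapter-filter "10 <= chapter < 20" [manga URL]
+```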
+
+## 0.9.1 - 2017-07-24
+- Added support for:
+  - `2chan` - https://www.2chan.net/
+  - `4plebs` - https://archive.4plebs.org/
+  - `archivedmoe` - https://archived.moe/
+  - `archiveofsins` - https://archiveofsins.com/
+  - `desuarchive` - https://desuarchive.org/
+  - `fireden` - https://boards.fireden.net/
+  - `loveisover` - https://archive.loveisover.me/
+  - `nyafuu` - https://archive.nyafuu.org/
+  - `rbt` - https://rbt.asia/
+  - `thebarchive` - https://thebarchive.com/
+  - `mangazuki` - https://mangazuki.co/
+- Improved `reddit` to allow submission filtering by ID and human-readable dates
+- Improved `deviantart` to support group galleries and gallery folders ([#26](https://github.com/mikf/gallery-dl/issues/26))
+- Changed `deviantart` to use better default path formats
+- Fixed extraction of larger `imgur` albums
+- Fixed some smaller issues for `pixiv`, `batoto` and `fallenangels`
+
+## 0.9.0 - 2017-06-28
+- Added support for:
+  - `reddit` - https://www.reddit.com/ ([#15](https://github.com/mikf/gallery-dl/issues/15))
+  - `flickr` - https://www.flickr.com/ ([#16](https://github.com/mikf/gallery-dl/issues/16))
+  - `gfycat` - https://gfycat.com/
+- Added support for direct image links
+- Added user authentication via [OAuth](https://github.com/mikf/gallery-dl#52oauth) for `reddit` and `flickr`
+- Added support for user authentication data from [`.netrc`](https://stackoverflow.com/tags/.netrc/info) files ([#22](https://github.com/mikf/gallery-dl/issues/22); example below)
+- Added a simple progress indicator for multiple URLs ([#19](https://github.com/mikf/gallery-dl/issues/19))
+- Added the `--write-unsupported` command-line option to write unsupported URLs to a file
+- Added documentation for all available config options ([configuration.rst](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst))
+- Improved `pixiv` to support tags for user downloads ([#17](https://github.com/mikf/gallery-dl/issues/17))
+- Improved `pixiv` to support shortened and http://pixiv.me/... URLs ([#23](https://github.com/mikf/gallery-dl/issues/23))
+- Improved `imgur` to properly handle `.gifv` images and provide better metadata
+- Fixed an issue with `kissmanga` where metadata parsing for some series failed ([#20](https://github.com/mikf/gallery-dl/issues/20))
+- Fixed an issue with getting filename extensions from `Content-Type` response headers
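+
+A minimal `.netrc` entry as it might look for this authentication support
+(assuming the extractor's lowercase site name, e.g. `flickr`, as the machine
+name; the credentials are placeholders):
+
+```
+$ cat ~/.netrc
+machine flickr
+login my-username
+password my-password
+```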
+
+## 0.8.4 - 2017-05-21
+- Added the `--abort-on-skip` option to stop extraction if a download would be skipped
+- Improved the output format of the `--list-keywords` option
+- Updated `deviantart` to support all media types and journals
+- Updated `fallenangels` to support their [Vietnamese version](https://truyen.fascans.com/)
+- Fixed an issue with multiple tags on ...booru sites
+- Removed the `yomanga` module
+
+## 0.8.3 - 2017-05-01
+- Added support for https://pawoo.net/
+- Added manga extractors for all [FoOlSlide](https://foolcode.github.io/FoOlSlide/)-based modules
+- Added the `-q/--quiet` and `-v/--verbose` options to control output verbosity
+- Added the `-j/--dump-json` option to dump extractor results in JSON format
+- Added the `--ignore-config` option
+- Updated the `exhentai` extractor to fall back to using the e-hentai version if no username is given
+- Updated `deviantart` to support sta.sh URLs
+- Fixed an issue with `kissmanga` which prevented image URLs from being decrypted properly (again)
+- Fixed an issue with `pixhost` where requesting an image inside an album would always download the first image of that album ([#13](https://github.com/mikf/gallery-dl/issues/13))
+- Removed the `mangashare` and `readcomics` modules
+
+## 0.8.2 - 2017-04-10
+- Fixed an issue in `kissmanga` which prevented image URLs from being decrypted properly
+
+## 0.8.1 - 2017-04-09
+- Added new extractors:
+  - `kireicake` - https://reader.kireicake.com/
+  - `seaotterscans` - https://reader.seaotterscans.com/
+- Added a favourites extractor for `deviantart`
+- Re-enabled the `kissmanga` module
+- Updated `nijie` to support multi-page image listings
+- Updated `mangastream` to support readms.net URLs
+- Updated `exhentai` to support e-hentai.org URLs
+- Updated `fallenangels` to support their new domain and site layout
+
+## 0.8.0 - 2017-03-28
+- Added logging support
+- Added the `-R/--retries` option to specify how often a download should be retried before giving up
+- Added the `--http-timeout` option to set a timeout for HTTP connections
+- Improved error handling/tolerance during HTTP file downloads ([#10](https://github.com/mikf/gallery-dl/issues/10))
+- Improved option parsing and the help message from `-h/--help`
+- Changed the way configuration values are used by prioritizing top-level values
+  - This allows cmdline options like `-u/--username` to override values set in configuration files
+- Fixed an issue with `imagefap.com` where incorrectly reported gallery sizes would cause the extractor to fail ([#9](https://github.com/mikf/gallery-dl/issues/9))
+- Fixed an issue with `seiga.nicovideo.jp` where invalid characters in an API response caused the XML parser to fail
+- Fixed an issue with `seiga.nicovideo.jp` where the filename extension for the first image would be used for all others
+- Removed support for old configuration paths on Windows
+- Removed several modules:
+  - `mangamint`: site is down
+  - `whentai`: now requires an account with VIP status for original images
+  - `kissmanga`: encrypted image URLs (will be re-added later)
+
+## 0.7.0 - 2017-03-06
+- Added `--images` and `--chapters` options
+  - Specifies which images (or chapters) to download through a comma-separated list of indices or index-ranges
+  - Example: `--images -2,4,6-8,10-` will select images with index 1, 2, 4, 6, 7, 8 and 10 up to the last one
+- Changed the `-g`/`--get-urls` option
+  - How often the -g option is given now determines the level up to which URLs are resolved.
+  - See 3bca86618505c21628cd9c7179ce933a78d00ca2
+- Changed several option keys:
+  - `directory_fmt` -> `directory`
+  - `filename_fmt` -> `filename`
+  - `download-original` -> `original`
+- Improved [FoOlSlide](https://foolcode.github.io/FoOlSlide/)-based extractors
+- Fixed URL extraction for hentai2read
+- Fixed an issue with deviantart, where the API access token wouldn't get refreshed
+
+## 0.6.4 - 2017-02-13
+- Added new extractors:
+  - fallenangels (famatg.com)
+- Fixed url- and data-extraction for:
+  - nhentai
+  - mangamint
+  - twitter
+  - imagetwist
+- Disabled InsecureConnectionWarning when no certificates are available
+
+## 0.6.3 - 2017-01-25
+- Added new extractors:
+  - gomanga
+  - yomanga
+  - mangafox
+- Fixed deviantart extractor failing - switched to using their API
+- Fixed an issue with SQLite on Python 3.6
+- Automated test builds via Travis CI
+- Standalone executables for Windows
+
+## 0.6.2 - 2017-01-05
+- Added new extractors:
+  - kisscomic
+  - readcomics
+  - yonkouprod
+  - jaiminisbox
+- Added manga extractor to batoto-module
+- Added user extractor to seiga-module
+- Added `-i`/`--input-file` argument to allow local files and stdin as input (like wget)
+- Added basic support for `file://` URLs
+  - this allows for the recursive extractor to be applied to local files:
+  - `$ gallery-dl r:file://[path to file]`
+- Added a utility extractor to run unit test URLs
+- Updated luscious to deal with API changes
+- Fixed twitter to provide the original image URL
+- Minor fixes to hentaifoundry
+- Removed imgclick extractor
+
+## 0.6.1 - 2016-11-30
+- Added new extractors:
+  - whentai
+  - readcomiconline
+  - sensescans, worldthree
+  - imgmaid, imagevenue, img4ever, imgspot, imgtrial, pixhost
+- Added base class for extractors of [FoOlSlide](https://foolcode.github.io/FoOlSlide/)-based sites
+- Changed default paths for configuration files on Windows
+  - old paths are still supported, but that will change in future versions
+- Fixed aborting downloads if a single one failed ([#5](https://github.com/mikf/gallery-dl/issues/5))
+- Fixed cloudflare-bypass cache containing outdated cookies
+- Fixed image URLs for hitomi and 8chan
+- Updated deviantart to always provide the highest quality image
+- Updated README.rst
+- Removed doujinmode extractor
+
+## 0.6.0 - 2016-10-08
+- Added new extractors:
+  - hentaihere
+  - dokireader
+  - twitter
+  - rapidimg, picmaniac
+- Added support to find filename extensions by Content-Type response header
+- Fixed filename/path issues on Windows ([#4](https://github.com/mikf/gallery-dl/issues/4)):
+  - Enable path names with more than 260 characters
+  - Remove trailing spaces in path segments
+- Updated Job class to automatically set category/subcategory keywords
+
+## 0.5.2 - 2016-09-23
+- Added new extractors:
+  - pinterest
+  - rule34
+  - dynastyscans
+  - imagebam, coreimg, imgcandy, imgtrex
+- Added login capabilities for batoto
+- Added `--version` cmdline argument to print the current program version and exit
+- Added `--list-extractors` cmdline argument to print names of all extractor classes together with descriptions and example URLs
+- Added proper error messages if an image/user does not exist
+- Added unittests for every extractor
+
+## 0.5.1 - 2016-08-22
+- Added new extractors:
+  - luscious
+  - doujinmode
+  - hentaibox
+  - seiga
+  - imagefap
+- Changed error output to use stderr instead of stdout
+- Fixed broken pipes causing an exception-dump by catching BrokenPipeErrors
+
+## 0.5.0 - 2016-07-25
+
+## 0.4.1 - 2015-12-03
+- New modules (imagetwist, turboimagehost)
+- Manga-extractors: Download entire manga and not just single chapters
+- Generic extractor (provisional)
+- Better and configurable console output
+- Windows support
+
+## 0.4.0 - 2015-11-26
+
+## 0.3.3 - 2015-11-10
+
+## 0.3.2 - 2015-11-04
+
+## 0.3.1 - 2015-10-30
+
+## 0.3.0 - 2015-10-05
+
+## 0.2.0 - 2015-06-28
+
+## 0.1.0 - 2015-05-27
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. 
These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. 
If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail. 
+ +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..5a98fcd --- /dev/null +++ b/Makefile @@ -0,0 +1,45 @@ + +PREFIX ?= /usr/local +BINDIR ?= $(PREFIX)/bin +MANDIR ?= $(PREFIX)/man +SHAREDIR ?= $(PREFIX)/share +PYTHON ?= /usr/bin/env python3 + +# set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local +SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi) + +all: man completion docs/supportedsites.rst + +clean: + $(RM) gallery-dl.1 gallery-dl.conf.5 gallery-dl.bash_completion + $(RM) -r build/ + +install: man completion + $(PYTHON) setup.py install + +release: man completion docs/supportedsites.rst + scripts/release.sh + +test: + scripts/run_tests.sh + +executable: + scripts/pyinstaller.py + +completion: gallery-dl.bash_completion + +man: gallery-dl.1 gallery-dl.conf.5 + +.PHONY: all clean install release test executable completion man + +docs/supportedsites.rst: gallery_dl/*/*.py scripts/supportedsites.py + $(PYTHON) scripts/supportedsites.py + +gallery-dl.1: gallery_dl/option.py scripts/man.py + $(PYTHON) scripts/man.py + +gallery-dl.conf.5: docs/configuration.rst scripts/man.py + $(PYTHON) scripts/man.py + +gallery-dl.bash_completion: gallery_dl/option.py scripts/bash_completion.py + $(PYTHON) scripts/bash_completion.py diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..873c034 --- /dev/null +++ b/README.rst @@ -0,0 +1,244 @@ +========== +gallery-dl +========== + +*gallery-dl* is a command-line program to download image-galleries and +-collections from several image hosting sites (see `Supported Sites`_). +It is a cross-platform tool with many configuration options +and powerful filenaming capabilities. + + +|pypi| |build| |gitter| + + +Dependencies +============ + +- Python_ 3.4+ +- Requests_ + +Optional +-------- + +- FFmpeg_: Pixiv Ugoira to WebM conversion +- youtube-dl_: Video downloads + + +Installation +============ + +Pip +--- + +The stable releases of *gallery-dl* are distributed on PyPI_ and can be +easily installed or upgraded using pip_: + +.. 
code:: bash
+
+    $ pip install --upgrade gallery-dl
+
+Installing the latest dev version directly from GitHub can be done with
+pip_ as well:
+
+.. code:: bash
+
+    $ pip install --upgrade https://github.com/mikf/gallery-dl/archive/master.zip
+
+Be sure the Python interpreter used for pip_ is version 3.4 or higher.
+You might have to use :code:`pip3` or :code:`python3 -m pip`
+depending on your system's defaults.
+
+
+From Source
+-----------
+
+Get the code by either
+
+* Downloading a stable_ or dev_ archive and unpacking it
+* Or via :code:`git clone https://github.com/mikf/gallery-dl.git`
+
+Navigate into the respective directory and run the :code:`setup.py` file.
+
+.. code:: bash
+
+    $ wget https://github.com/mikf/gallery-dl/archive/master.zip
+    $ unzip master.zip
+    # or
+    $ git clone https://github.com/mikf/gallery-dl.git
+
+    $ cd gallery-dl
+    $ python setup.py install
+
+
+Standalone Executable
+---------------------
+
+Download a standalone executable file,
+put it into your `PATH `__,
+and run it inside a command prompt (like ``cmd.exe``).
+
+- `Windows `__
+- `Linux `__
+
+These executables include a Python 3.7 interpreter
+and all required Python packages.
+
+
+Snap
+----
+
+Linux users on a distribution supported by Snapd_ can install *gallery-dl*
+from the Snap Store:
+
+.. code:: bash
+
+    $ snap install gallery-dl
+
+
+Usage
+=====
+
+To use *gallery-dl*, simply call it with the URLs you wish to download
+images from:
+
+.. code:: bash
+
+    $ gallery-dl [OPTION]... URL...
+
+See also :code:`gallery-dl --help`.
+
+
+Examples
+--------
+
+Download images; in this case from danbooru via tag search for 'bonocho':
+
+.. code:: bash
+
+    $ gallery-dl http://danbooru.donmai.us/posts?tags=bonocho
+
+
+Get the direct URL of an image from a site that requires authentication:
+
+.. code:: bash
+
+    $ gallery-dl -g -u <username> -p <password> http://seiga.nicovideo.jp/seiga/im3211703
+
+
+| Search a remote resource for URLs and download images from them:
+| (URLs for which no extractor can be found will be silently ignored)
+
+.. code:: bash
+
+    $ gallery-dl r:https://pastebin.com/raw/FLwrCYsT
+
+
+Configuration
+=============
+
+Configuration files for *gallery-dl* use a JSON-based file format.
+
+| For a (more or less) complete example with options set to their default values,
+  see gallery-dl.conf_.
+| For a configuration file example with more involved settings and options,
+  see gallery-dl-example.conf_.
+| A list of all available configuration options and their
+  descriptions can be found in configuration.rst_.
+
+*gallery-dl* searches for configuration files in the following places:
+
++--------------------------------------------+------------------------------------------+
+| Linux                                      | Windows                                  |
++--------------------------------------------+------------------------------------------+
+|* ``/etc/gallery-dl.conf``                  |* ``%APPDATA%\gallery-dl\config.json``    |
+|* ``${HOME}/.config/gallery-dl/config.json``|* ``%USERPROFILE%\gallery-dl\config.json``|
+|* ``${HOME}/.gallery-dl.conf``              |* ``%USERPROFILE%\gallery-dl.conf``       |
++--------------------------------------------+------------------------------------------+
+
+(``%USERPROFILE%`` usually refers to the user's home directory,
+i.e. ``C:\Users\<username>\``)
+
+Values in later configuration files will override previous ones.
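+
+For instance, a minimal configuration file could look like this
+(a sketch: every key shown is a real option described in
+configuration.rst_, while the values are placeholders to replace
+with your own):
+
+.. code::
+
+    {
+        "extractor": {
+            "base-directory": "~/Downloads/gallery-dl/",
+            "pixiv": {
+                "username": "<username>",
+                "password": "<password>"
+            }
+        }
+    }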
+
+
+Authentication
+==============
+
+Username & Password
+-------------------
+
+Some extractors require you to provide valid login credentials in the form
+of a username & password pair.
+This is necessary for ``pixiv``, ``nijie`` and ``seiga``
+and optional (but strongly recommended) for ``exhentai``, ``luscious``,
+``sankaku``, ``idolcomplex``, ``tsumino`` and ``wallhaven``.
+
+You can set the necessary information in your configuration file
+(cf. gallery-dl.conf_)
+
+.. code::
+
+    {
+        "extractor": {
+            ...
+            "pixiv": {
+                "username": "<username>",
+                "password": "<password>"
+            }
+            ...
+        }
+    }
+
+or you can provide them directly via the
+:code:`-u/--username` and :code:`-p/--password` options, or via the
+:code:`-o/--option` command-line option
+
+.. code:: bash
+
+    $ gallery-dl -u <username> -p <password> URL
+    $ gallery-dl -o username=<username> -o password=<password> URL
+
+OAuth
+-----
+
+*gallery-dl* supports user authentication via OAuth_ for
+``deviantart``, ``flickr``, ``reddit``, ``smugmug`` and ``tumblr``.
+This is entirely optional, but grants *gallery-dl* the ability
+to issue requests on your account's behalf and enables it to access resources
+which would otherwise be unavailable to a public user.
+
+To link your account to *gallery-dl*, start by invoking it with
+``oauth:<sitename>`` as an argument. For example:
+
+.. code:: bash
+
+    $ gallery-dl oauth:flickr
+
+You will be sent to the site's authorization page and asked to grant read
+access to *gallery-dl*. Authorize it and you will be shown one or more
+"tokens", which should be added to your configuration file.
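+
+These tokens are plain configuration values. As a sketch (the
+``deviantart.refresh-token`` option is documented in configuration.rst_;
+the token string itself is a placeholder):
+
+.. code::
+
+    {
+        "extractor": {
+            "deviantart": {
+                "refresh-token": "<refresh_token from the authorization step>"
+            }
+        }
+    }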
+
+
+.. _gallery-dl.conf:         https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
+.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
+.. _configuration.rst:       https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
+.. _Supported Sites:         https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
+.. _stable:                  https://github.com/mikf/gallery-dl/archive/v1.8.7.zip
+.. _dev:                     https://github.com/mikf/gallery-dl/archive/master.zip
+
+.. _Python:     https://www.python.org/downloads/
+.. _PyPI:       https://pypi.org/
+.. _pip:        https://pip.pypa.io/en/stable/
+.. _Requests:   http://docs.python-requests.org/en/master/
+.. _FFmpeg:     https://www.ffmpeg.org/
+.. _youtube-dl: https://ytdl-org.github.io/youtube-dl/
+.. _Snapd:      https://docs.snapcraft.io/installing-snapd
+.. _OAuth:      https://en.wikipedia.org/wiki/OAuth
+
+.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
+    :target: https://pypi.org/project/gallery-dl/
+
+.. |build| image:: https://travis-ci.org/mikf/gallery-dl.svg?branch=master
+    :target: https://travis-ci.org/mikf/gallery-dl
+
+.. |gitter| image:: https://badges.gitter.im/gallery-dl/main.svg
+    :target: https://gitter.im/gallery-dl/main
diff --git a/bin/gallery-dl b/bin/gallery-dl
new file mode 100755
index 0000000..12da2fd
--- /dev/null
+++ b/bin/gallery-dl
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+
+import gallery_dl
+
+if __name__ == '__main__':
+    gallery_dl.main()
diff --git a/docs/configuration.rst b/docs/configuration.rst
new file mode 100644
index 0000000..c606c6c
--- /dev/null
+++ b/docs/configuration.rst
@@ -0,0 +1,1615 @@
+Configuration
+#############
+
+Contents
+========
+
+1) `Extractor Options`_
+2) `Extractor-specific Options`_
+3) `Downloader Options`_
+4) `Output Options`_
+5) `Postprocessor Options`_
+6) `Miscellaneous Options`_
+7) `API Tokens & IDs`_
+
+
+
+Extractor Options
+=================
+
+
+Each extractor is identified by its ``category`` and ``subcategory``.
+The ``category`` is the lowercase site name without any spaces or special
+characters, which is usually just the module name
+(``pixiv``, ``danbooru``, ...).
+The ``subcategory`` is a lowercase word describing the general functionality
+of that extractor (``user``, ``favorite``, ``manga``, ...).
+
+Each one of the following options can be specified on multiple levels of the
+configuration tree:
+
+================== =====
+Base level:        ``extractor.<option-name>``
+Category level:    ``extractor.<category>.<option-name>``
+Subcategory level: ``extractor.<category>.<subcategory>.<option-name>``
+================== =====
+
+A value on a "deeper" level overrides a value of the same name on a
+"lower" level. Setting the ``extractor.pixiv.filename`` value, for example,
+lets you specify a general filename pattern for all the different pixiv
+extractors. Using the ``extractor.pixiv.user.filename`` value lets you
+override this general pattern specifically for ``PixivUserExtractor``
+instances.
+
+The ``category`` and ``subcategory`` of all extractors are included in the
+output of ``gallery-dl --list-extractors``. For a specific URL these values
+can also be determined by using the ``-K``/``--list-keywords`` command-line
+option (see the example below).
+
+extractor.*.filename
+--------------------
+=========== =====
+Type        ``string``
+Example     ``"{manga}_c{chapter}_{page:>03}.{extension}"``
+Description A `format string`_ to build the resulting filename
+            for a downloaded file.
+
+            The available replacement keys depend on the extractor used. A list
+            of keys for a specific one can be acquired by calling *gallery-dl*
+            with the ``-K``/``--list-keywords`` command-line option.
+            For example:
+
+            .. code::
+
+                $ gallery-dl -K http://seiga.nicovideo.jp/seiga/im5977527
+                Keywords for directory names:
+                -----------------------------
+                category
+                  seiga
+                subcategory
+                  image
+
+                Keywords for filenames:
+                -----------------------
+                category
+                  seiga
+                extension
+                  None
+                image-id
+                  5977527
+                subcategory
+                  image
+
+            Note: Even if the value of the ``extension`` key is missing or
+            ``None``, it will be filled in later when the file download is
+            starting. This key is therefore always available to provide
+            a valid filename extension.
+=========== =====
+
+
+extractor.*.directory
+---------------------
+=========== =====
+Type        ``list`` of ``strings``
+Example     ``["{category}", "{manga}", "c{chapter} - {title}"]``
+Description A list of `format strings`_ for the resulting target directory.
+
+            Each individual string in such a list represents a single path
+            segment, which will be joined together and appended to the
+            base-directory_ to form the complete target directory path.
+=========== =====
+
+
+extractor.*.base-directory
+--------------------------
+=========== =====
+Type        |Path|_
+Default     ``"./gallery-dl/"``
+Description Directory path used as the base for all download destinations.
+=========== =====
+
+
+extractor.*.skip
+----------------
+=========== =====
+Type        ``bool`` or ``string``
+Default     ``true``
+Description Controls the behavior when downloading files whose filename
+            already exists.
+
+            * ``true``: Skip downloads
+            * ``false``: Overwrite already existing files
+
+            * ``"abort"``: Abort the current extractor run
+            * ``"abort:N"``: Skip downloads and abort extractor run
+              after ``N`` consecutive skips
+
+            * ``"exit"``: Exit the program altogether
+            * ``"exit:N"``: Skip downloads and exit the program
+              after ``N`` consecutive skips
+=========== =====
+
+
+extractor.*.sleep
+-----------------
+=========== =====
+Type        ``float``
+Default     ``0``
+Description Number of seconds to sleep before each download.
+=========== =====
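+
+Put together, the options above combine in a configuration file as in the
+following sketch (``mangadex`` stands in for any category here, and the
+format strings reuse the example values from above):
+
+.. code::
+
+    {
+        "extractor": {
+            "base-directory": "./gallery-dl/",
+            "skip": "abort:10",
+            "sleep": 2.0,
+            "mangadex": {
+                "directory": ["{category}", "{manga}", "c{chapter} - {title}"],
+                "filename": "{manga}_c{chapter}_{page:>03}.{extension}"
+            }
+        }
+    }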
+
+
+extractor.*.username & .password
+--------------------------------
+=========== =====
+Type        ``string``
+Default     ``null``
+Description The username and password to use when attempting to log in to
+            another site.
+
+            Specifying username and password is
+            required for the ``pixiv``, ``nijie`` and ``seiga`` modules and
+            optional (but strongly recommended) for ``danbooru``, ``exhentai``,
+            ``sankaku`` and ``idolcomplex``.
+
+            These values can also be set via the ``-u/--username`` and
+            ``-p/--password`` command-line options or by using a |.netrc|_ file.
+            (see Authentication_)
+
+            Note: The password for ``danbooru`` is the API key found in your
+            user profile, not the password for your account.
+=========== =====
+
+
+extractor.*.netrc
+-----------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Enable the use of |.netrc|_ authentication data.
+=========== =====
+
+
+extractor.*.cookies
+-------------------
+=========== =====
+Type        |Path|_ or ``object``
+Default     ``null``
+Description Source to read additional cookies from.
+
+            * If this is a |Path|_, it specifies a
+              Mozilla/Netscape format cookies.txt file.
+            * If this is an ``object``, its key-value pairs, which should both
+              be ``strings``, will be used as cookie names and values.
+=========== =====
+
+
+extractor.*.proxy
+-----------------
+=========== =====
+Type        ``string`` or ``object``
+Default     ``null``
+Description Proxy (or proxies) to be used for remote connections.
+
+            * If this is a ``string``, it is the proxy URL for all
+              outgoing requests.
+            * If this is an ``object``, it is a scheme-to-proxy mapping to
+              specify different proxy URLs for each scheme.
+              It is also possible to set a proxy for a specific host by using
+              ``scheme://host`` as key.
+              See `Requests' proxy documentation`_ for more details.
+
+              Example:
+
+              .. code::
+
+                {
+                    "http": "http://10.10.1.10:3128",
+                    "https": "http://10.10.1.10:1080",
+                    "http://10.20.1.128": "http://10.10.1.10:5323"
+                }
+
+            Note: All proxy URLs should include a scheme,
+            otherwise ``http://`` is assumed.
+=========== =====
+
+
+extractor.*.user-agent
+----------------------
+=========== =====
+Type        ``string``
+Default     ``"Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0"``
+Description User-Agent header value to be used for HTTP requests.
+
+            Note: This option has no effect on ``pixiv`` and
+            ``readcomiconline`` extractors, as these need specific values to
+            function correctly.
+=========== =====
+
+
+extractor.*.keywords
+--------------------
+=========== =====
+Type        ``object``
+Example     ``{"type": "Pixel Art", "type_id": 123}``
+Description Additional key-value pairs to be added to each metadata dictionary.
+=========== =====
+
+
+extractor.*.keywords-default
+----------------------------
+=========== =====
+Type        any
+Default     ``"None"``
+Description Default value used for missing or undefined keyword names in
+            format strings.
+=========== =====
+
+
+extractor.*.category-transfer
+-----------------------------
+=========== =====
+Type        ``bool``
+Default     Extractor-specific
+Description Transfer an extractor's (sub)category values to all child
+            extractors spawned by it, to let them inherit their parent's
+            config options.
+=========== =====
+
+
+extractor.*.archive
+-------------------
+=========== =====
+Type        |Path|_
+Default     ``null``
+Description File to store IDs of downloaded files in. Downloads of files
+            already recorded in this archive file will be skipped_.
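+
+            For example, with an illustrative path (any writable
+            location works):
+
+            .. code::
+
+                {
+                    "extractor": {
+                        "archive": "~/gallery-dl/archive.db"
+                    }
+                }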
+ + The resulting archive file is not a plain text file but an SQLite3 + database, as either lookup operations are significantly faster or + memory requirements are significantly lower when the + amount of stored IDs gets reasonably large. +=========== ===== + + +extractor.*.archive-format +-------------------------- +=========== ===== +Type ``string`` +Example ``"{id}_{offset}"`` +Description An alternative `format string`_ to build archive IDs with. +=========== ===== + + +extractor.*.postprocessors +-------------------------- +=========== ===== +Type ``list`` of |Postprocessor Configuration|_ objects +Example .. code:: + + [ + {"name": "zip", "compression": "zip"}, + {"name": "exec", "command": ["/home/foobar/script", "{category}", "{image_id}"]} + ] + +Description A list of post-processors to be applied to each downloaded file + in the same order as they are specified. +=========== ===== + + +extractor.*.retries +------------------- +=========== ===== +Type ``integer`` +Default ``5`` +Description Number of times a failed HTTP request is retried before giving up. +=========== ===== + + +extractor.*.timeout +------------------- +=========== ===== +Type ``float`` or ``null`` +Default ``30`` +Description Amount of time (in seconds) to wait for a successful connection + and response from a remote server. + + This value gets internally used as the |timeout|_ parameter for the + |requests.request()|_ method. +=========== ===== + + +extractor.*.verify +------------------ +=========== ===== +Type ``bool`` or ``string`` +Default ``true`` +Description Controls whether to verify SSL/TLS certificates for HTTPS requests. + + If this is a ``string``, it must be the path to a CA bundle to use + instead of the default certificates. + + This value gets internally used as the |verify|_ parameter for the + |requests.request()|_ method. +=========== ===== + + +extractor.*.image-range +----------------------- +=========== ===== +Type ``string`` +Example | ``"10-20"``, + | ``"-5, 10, 30-50, 100-"`` +Description Index-range(s) specifying which images to download. + + Note: The index of the first image is ``1``. +=========== ===== + + +extractor.*.chapter-range +------------------------- +=========== ===== +Type ``string`` +Description Like `image-range`__, but applies to delegated URLs + like manga-chapters, etc. +=========== ===== + +__ `extractor.*.image-range`_ + + +extractor.*.image-filter +------------------------ +=========== ===== +Type ``string`` +Example | ``"width >= 1200 and width/height > 1.2"``, + | ``"re.search(r'foo(bar)+', description)"`` +Description | Python expression controlling which images to download. + | Files for which the expression evaluates to ``False`` + are ignored. + | Available keys are the filename-specific ones listed + by ``-K`` or ``-j``. +=========== ===== + + +extractor.*.chapter-filter +-------------------------- +=========== ===== +Type ``string`` +Description Like `image-filter`__, but applies to delegated URLs + like manga-chapters, etc. +=========== ===== + +__ `extractor.*.image-filter`_ + + + +Extractor-specific Options +========================== + + +extractor.artstation.external +----------------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Try to follow external URLs of embedded players. +=========== ===== + + +extractor.deviantart.extra +-------------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Download extra Sta.sh resources from description texts. 
+ + Note: Enabling this option also enables deviantart.metadata_. +=========== ===== + + +extractor.deviantart.flat +------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Select the directory structure created by the Gallery- and + Favorite-Extractors. + + * ``true``: Use a flat directory structure. + * ``false``: Collect a list of all gallery-folders or + favorites-collections and transfer any further work to other + extractors (``folder`` or ``collection``), which will then + create individual subdirectories for each of them. +=========== ===== + + +extractor.deviantart.folders +---------------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Provide a ``folders`` metadata field that contains the names of all + folders a deviation is present in. + + Note: Gathering this information requires a lot of API calls. + Use with caution. +=========== ===== + + +extractor.deviantart.journals +----------------------------- +=========== ===== +Type ``string`` +Default ``"html"`` +Description Selects the output format of journal entries. + + * ``"html"``: HTML with (roughly) the same layout as on DeviantArt. + * ``"text"``: Plain text with image references and HTML tags removed. + * ``"none"``: Don't download journals. +=========== ===== + + +extractor.deviantart.mature +--------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Enable mature content. + + This option simply sets the |mature_content|_ parameter for API + calls to either ``"true"`` or ``"false"`` and does not do any other + form of content filtering. +=========== ===== + + +extractor.deviantart.metadata +----------------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Request extended metadata for deviation objects to additionally + provide ``description``, ``tags``, ``license`` and ``is_watching`` + fields. +=========== ===== + + +extractor.deviantart.original +----------------------------- +=========== ===== +Type ``bool`` or ``string`` +Default ``true`` +Description Download original files if available. + + Setting this option to ``"images"`` only downloads original + files if they are images and falls back to preview versions for + everything else (archives, etc.). +=========== ===== + + +extractor.deviantart.refresh-token +---------------------------------- +=========== ===== +Type ``string`` +Default ``null`` +Description The ``refresh_token`` value you get from linking your + DeviantArt account to *gallery-dl*. + + Using a ``refresh_token`` allows you to access private or otherwise + not publicly available deviations. +=========== ===== + + +extractor.deviantart.wait-min +----------------------------- +=========== ===== +Type ``integer`` +Default ``0`` +Description Minimum wait time in seconds before API requests. + + Note: This value will internally be rounded up + to the next power of 2. +=========== ===== + + +extractor.exhentai.limits +------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Check image download limits + and stop extraction when they are exceeded. +=========== ===== + + +extractor.exhentai.original +--------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Download full-sized original images if available. 
+=========== ===== + + +extractor.exhentai.wait-min & .wait-max +--------------------------------------- +=========== ===== +Type ``float`` +Default ``3.0`` and ``6.0`` +Description Minimum and maximum wait time in seconds between each image + + ExHentai detects and blocks automated downloaders. + *gallery-dl* waits a randomly selected number of + seconds between ``wait-min`` and ``wait-max`` after + each image to prevent getting blocked. +=========== ===== + + +extractor.flickr.access-token & .access-token-secret +---------------------------------------------------- +=========== ===== +Type ``string`` +Default ``null`` +Description The ``access_token`` and ``access_token_secret`` values you get + from linking your Flickr account to *gallery-dl*. +=========== ===== + + +extractor.flickr.videos +----------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Extract and download videos. +=========== ===== + + +extractor.flickr.size-max +-------------------------- +=========== ===== +Type ``integer`` or ``string`` +Default ``null`` +Description Sets the maximum allowed size for downloaded images. + + * If this is an ``integer``, it specifies the maximum image dimension + (width and height) in pixels. + * If this is a ``string``, it should be one of Flickr's format specifiers + (``"Original"``, ``"Large"``, ... or ``"o"``, ``"k"``, ``"h"``, + ``"l"``, ...) to use as an upper limit. +=========== ===== + + +extractor.gelbooru.api +---------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Enable use of Gelbooru's API. + + Set this value to `false` if the API has been disabled to switch + to manual information extraction. +=========== ===== + + +extractor.gfycat.format +----------------------- +=========== ===== +Type ``string`` +Default ``"mp4"`` +Description The name of the preferred animation format, which can be one of + ``"mp4"``, ``"webm"``, ``"gif"``, ``"webp"`` or ``"mjpg"``. + + If the selected format is not available, ``"mp4"``, ``"webm"`` + and ``"gif"`` (in that order) will be tried instead, until an + available format is found. +=========== ===== + + +extractor.imgur.mp4 +------------------- +=========== ===== +Type ``bool`` or ``string`` +Default ``true`` +Description Controls whether to choose the GIF or MP4 version of an animation. + + * ``true``: Follow Imgur's advice and choose MP4 if the + ``prefer_video`` flag in an image's metadata is set. + * ``false``: Always choose GIF. + * ``"always"``: Always choose MP4. +=========== ===== + + +extractor.kissmanga.captcha +--------------------------- +=========== ===== +Type ``string`` +Default ``"stop"`` +Description Controls how to handle redirects to CAPTCHA pages. + + * ``"stop``: Stop the current extractor run. + * ``"wait``: Ask the user to solve the CAPTCHA and wait. +=========== ===== + + +extractor.oauth.browser +----------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Controls how a user is directed to an OAuth authorization site. + + * ``true``: Use Python's |webbrowser.open()|_ method to automatically + open the URL in the user's browser. + * ``false``: Ask the user to copy & paste an URL from the terminal. +=========== ===== + + +extractor.photobucket.subalbums +------------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Download subalbums. 
+=========== ===== + + +extractor.pixiv.ugoira +---------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Download Pixiv's Ugoira animations or ignore them. + + These animations come as a ``.zip`` file containing all the single + animation frames in JPEG format. +=========== ===== + + +extractor.plurk.comments +------------------------ +=========== ===== +Type ``bool`` +Default ``false`` +Description Also search Plurk comments for URLs. +=========== ===== + + +extractor.reactor.wait-min & .wait-max +-------------------------------------- +=========== ===== +Type ``float`` +Default ``3.0`` and ``6.0`` +Description Minimum and maximum wait time in seconds between HTTP requests + during the extraction process. +=========== ===== + + +extractor.readcomiconline.captcha +--------------------------------- +=========== ===== +Type ``string`` +Default ``"stop"`` +Description Controls how to handle redirects to CAPTCHA pages. + + * ``"stop``: Stop the current extractor run. + * ``"wait``: Ask the user to solve the CAPTCHA and wait. +=========== ===== + + +extractor.recursive.blacklist +----------------------------- +=========== ===== +Type ``list`` of ``strings`` +Default ``["directlink", "oauth", "recursive", "test"]`` +Description A list of extractor categories which should be ignored when using + the ``recursive`` extractor. +=========== ===== + + +extractor.reddit.comments +------------------------- +=========== ===== +Type ``integer`` or ``string`` +Default ``500`` +Description The value of the ``limit`` parameter when loading + a submission and its comments. + This number (roughly) specifies the total amount of comments + being retrieved with the first API call. + + Reddit's internal default and maximum values for this parameter + appear to be 200 and 500 respectively. + + The value `0` ignores all comments and significantly reduces the + time required when scanning a subreddit. +=========== ===== + + +extractor.reddit.morecomments +----------------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Retrieve additional comments by resolving the ``more`` comment + stubs in the base comment tree. + + This requires 1 additional API call for every 100 extra comments. +=========== ===== + + +extractor.reddit.date-min & .date-max +------------------------------------- +=========== ===== +Type ``integer`` or ``string`` +Default ``0`` and ``253402210800`` (timestamp of |datetime.max|_) +Description Ignore all submissions posted before/after this date. + + * If this is an ``integer``, it represents the date as UTC timestamp. + * If this is a ``string``, it will get parsed according to date-format_. +=========== ===== + + +extractor.reddit.date-format +---------------------------- +=========== ===== +Type ``string`` +Default ``"%Y-%m-%dT%H:%M:%S"`` +Description An explicit format string used to parse the ``string`` values of + `date-min and date-max`_. + + See |strptime|_ for a list of formatting directives. +=========== ===== + + +extractor.reddit.id-min & .id-max +--------------------------------- +=========== ===== +Type ``string`` +Example ``"6kmzv2"`` +Description Ignore all submissions posted before/after the submission with + this ID. +=========== ===== + + +extractor.reddit.recursion +-------------------------- +=========== ===== +Type ``integer`` +Default ``0`` +Description Reddit extractors can recursively visit other submissions + linked to in the initial set of submissions. + This value sets the maximum recursion depth. 
+
+            Special values:
+
+            * ``0``: Recursion is disabled
+            * ``-1``: Infinite recursion (don't do this)
+=========== =====
+
+
+extractor.reddit.refresh-token
+------------------------------
+=========== =====
+Type        ``string``
+Default     ``null``
+Description The ``refresh_token`` value you get from linking your
+            Reddit account to *gallery-dl*.
+
+            Using a ``refresh_token`` allows you to access private or otherwise
+            not publicly available subreddits, given that your account is
+            authorized to do so,
+            but requests to the reddit API are going to be rate-limited
+            at 600 requests every 10 minutes (600 seconds).
+=========== =====
+
+
+extractor.sankaku.wait-min & .wait-max
+--------------------------------------
+=========== =====
+Type        ``float``
+Default     ``3.0`` and ``6.0``
+Description Minimum and maximum wait time in seconds between each image.
+
+            Sankaku Channel responds with ``429 Too Many Requests`` if it
+            receives too many HTTP requests in a certain amount of time.
+            Waiting a few seconds between each request tries to prevent that.
+=========== =====
+
+
+extractor.smugmug.videos
+------------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Download video files.
+=========== =====
+
+
+extractor.tumblr.avatar
+-----------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Download blog avatars.
+=========== =====
+
+
+extractor.tumblr.external
+-------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Follow external URLs (e.g. from "Link" posts) and try to extract
+            images from them.
+=========== =====
+
+
+extractor.tumblr.inline
+-----------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Search posts for inline images and videos.
+=========== =====
+
+
+extractor.tumblr.reblogs
+------------------------
+=========== =====
+Type        ``bool`` or ``string``
+Default     ``true``
+Description * ``true``: Extract media from reblogged posts
+            * ``false``: Skip reblogged posts
+            * ``"same-blog"``: Skip reblogged posts unless the original post
+              is from the same blog
+=========== =====
+
+
+extractor.tumblr.posts
+----------------------
+=========== =====
+Type        ``string`` or ``list`` of ``strings``
+Default     ``"all"``
+Example     ``"video,audio,link"`` or ``["video", "audio", "link"]``
+Description A (comma-separated) list of post types to extract images, etc. from.
+
+            Possible types are ``text``, ``quote``, ``link``, ``answer``,
+            ``video``, ``audio``, ``photo``, ``chat``.
+
+            You can use ``"all"`` instead of listing all types separately.
+=========== =====
+
+
+extractor.twitter.retweets
+--------------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Extract images from retweets.
+=========== =====
+
+
+extractor.twitter.videos
+------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Use `youtube-dl`_ to download from video tweets.
+=========== =====
+
+
+extractor.wallhaven.api-key
+---------------------------
+=========== =====
+Type        ``string``
+Default     ``null``
+Description Your `API Key `__ to use
+            your account's browsing settings and default filters when searching.
+
+            See https://wallhaven.cc/help/api for more information.
+=========== =====
+
+
+extractor.[booru].tags
+----------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Categorize tags by their respective types
+            and provide them as ``tags_<type>`` metadata fields.
+
+            Note: This requires 1 additional HTTP request for each post.
+=========== =====
+
+
+extractor.[manga-extractor].chapter-reverse
+-------------------------------------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Reverse the order of chapter URLs extracted from manga pages.
+
+            * ``true``: Start with the latest chapter
+            * ``false``: Start with the first chapter
+=========== =====
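+
+To round off this section: extractor-specific options simply nest under
+their category name, as in this sketch that combines several of the
+options documented above (values are illustrative):
+
+.. code::
+
+    {
+        "extractor": {
+            "tumblr": {
+                "posts": ["photo", "video"],
+                "reblogs": "same-blog"
+            },
+            "reddit": {
+                "comments": 0,
+                "recursion": 1
+            }
+        }
+    }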
+
+
+Downloader Options
+==================
+
+
+downloader.*.enabled
+--------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Enable/Disable this downloader module.
+=========== =====
+
+
+downloader.*.part
+-----------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description Controls the use of ``.part`` files during file downloads.
+
+            * ``true``: Write downloaded data into ``.part`` files and rename
+              them upon download completion. This mode additionally supports
+              resuming incomplete downloads.
+            * ``false``: Do not use ``.part`` files and write data directly
+              into the actual output files.
+=========== =====
+
+
+downloader.*.part-directory
+---------------------------
+=========== =====
+Type        |Path|_
+Default     ``null``
+Description Alternate location for ``.part`` files.
+
+            Missing directories will be created as needed.
+            If this value is ``null``, ``.part`` files are going to be stored
+            alongside the actual output files.
+=========== =====
+
+
+downloader.*.rate
+-----------------
+=========== =====
+Type        ``string``
+Default     ``null``
+Examples    ``"32000"``, ``"500k"``, ``"2.5M"``
+Description Maximum download rate in bytes per second.
+
+            Possible values are valid integer or floating-point numbers
+            optionally followed by one of ``k``, ``m``, ``g``, ``t`` or ``p``.
+            These suffixes are case-insensitive.
+=========== =====
+
+
+downloader.*.retries
+--------------------
+=========== =====
+Type        ``integer``
+Default     `extractor.*.retries`_
+Description Number of retries during file downloads.
+=========== =====
+
+
+downloader.*.timeout
+--------------------
+=========== =====
+Type        ``float`` or ``null``
+Default     `extractor.*.timeout`_
+Description Connection timeout during file downloads.
+=========== =====
+
+
+downloader.*.verify
+-------------------
+=========== =====
+Type        ``bool`` or ``string``
+Default     `extractor.*.verify`_
+Description Certificate validation during file downloads.
+=========== =====
+
+
+downloader.ytdl.format
+----------------------
+=========== =====
+Type        ``string``
+Default     youtube-dl's default, currently ``"bestvideo+bestaudio/best"``
+Description Video `format selection
+            `__
+            directly passed to youtube-dl.
+=========== =====
+
+
+downloader.ytdl.logging
+-----------------------
+=========== =====
+Type        ``bool``
+Default     ``true``
+Description | Route youtube-dl's output through gallery-dl's logging system.
+            | Otherwise youtube-dl will write its output directly to stdout/stderr.
+
+            Note: Set ``quiet`` and ``no_warnings`` in
+            `downloader.ytdl.raw-options`_ to ``true`` to suppress all output.
+=========== =====
+
+
+downloader.ytdl.raw-options
+---------------------------
+=========== =====
+Type        ``object``
+Example     .. code::
+
+                {
+                    "quiet": true,
+                    "writesubtitles": true,
+                    "merge_output_format": "mkv"
+                }
+
+Description | Additional options passed directly to the ``YoutubeDL`` constructor.
+            | All available options can be found in `youtube-dl's docstrings
+            `__.
+=========== ===== + + + +Output Options +============== + + +output.mode +----------- +=========== ===== +Type ``string`` +Default ``"auto"`` +Description Controls the output string format and status indicators. + + * ``"null"``: No output + * ``"pipe"``: Suitable for piping to other processes or files + * ``"terminal"``: Suitable for the standard Windows console + * ``"color"``: Suitable for terminals that understand ANSI escape codes and colors + * ``"auto"``: Automatically choose the best suitable output mode +=========== ===== + + +output.shorten +-------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Controls whether the output strings should be shortened to fit + on one console line. +=========== ===== + + +output.progress +--------------- +=========== ===== +Type ``bool`` or ``string`` +Default ``true`` +Description Controls the progress indicator when *gallery-dl* is run with + multiple URLs as arguments. + + * ``true``: Show the default progress indicator + (``"[{current}/{total}] {url}"``) + * ``false``: Do not show any progress indicator + * Any ``string``: Show the progress indicator using this + as a custom `format string`_. Possible replacement keys are + ``current``, ``total`` and ``url``. +=========== ===== + + +output.log +---------- +=========== ===== +Type ``string`` or |Logging Configuration|_ +Default ``"[{name}][{levelname}] {message}"`` +Description Configuration for standard logging output to stderr. + + If this is a simple ``string``, it specifies + the format string for logging messages. +=========== ===== + + +output.logfile +-------------- +=========== ===== +Type |Path|_ or |Logging Configuration|_ +Default ``null`` +Description File to write logging output to. +=========== ===== + + +output.unsupportedfile +---------------------- +=========== ===== +Type |Path|_ or |Logging Configuration|_ +Default ``null`` +Description File to write external URLs unsupported by *gallery-dl* to. + + The default format string here is ``"{message}"``. +=========== ===== + + +output.num-to-str +----------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Convert numeric values (``integer`` or ``float``) to ``string`` + before outputting them as JSON. +=========== ===== + + + +Postprocessor Options +===================== + + +classify +-------- + +Categorize files by filename extension + +classify.mapping +---------------- +=========== ===== +Type ``object`` +Default .. code:: + + { + "Pictures" : ["jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"], + "Video" : ["flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv", "webm", "vob", "wmv"], + "Music" : ["mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"], + "Archives" : ["zip", "rar", "7z", "tar", "gz", "bz2"] + } + +Description A mapping from directory names to filename extensions that should + be stored in them. + + Files with an extension not listed will be ignored and stored + in their default location. +=========== ===== + + +exec +---- + +Execute external commands. + +exec.async +---------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Controls whether to wait for a subprocess to finish + or to let it run asynchronously. +=========== ===== + +exec.command +------------ +=========== ===== +Type ``list`` of ``strings`` +Example ``["echo", "{user[account]}", "{id}"]`` +Description The command to run. + + Each element of this list is treated as a `format string`_ using + the files' metadata. 
+=========== ===== + + +metadata +-------- + +Write image metadata to separate files + +metadata.mode +------------- +=========== ===== +Type ``string`` +Default ``"json"`` +Description Select how to write metadata. + + * ``"json"``: all metadata using `json.dump() + `_ + * ``"tags"``: ``tags`` separated by newlines + * ``"custom"``: result of applying `metadata.format`_ to a file's + metadata dictionary +=========== ===== + +metadata.extension +------------------ +=========== ===== +Type ``string`` +Default ``"json"`` or ``"txt"`` +Description Filename extension for metadata files. +=========== ===== + +metadata.format +--------------- +=========== ===== +Type ``string`` +Example ``"tags:\n\n{tags:J\n}\n"`` +Description Custom format string to build content of metadata files. + + Note: Only applies for ``"mode": "custom"``. +=========== ===== + + +ugoira +------ + +Convert Pixiv Ugoira to WebM using `FFmpeg `__. + +ugoira.extension +---------------- +=========== ===== +Type ``string`` +Default ``"webm"`` +Description Filename extension for the resulting video files. +=========== ===== + +ugoira.ffmpeg-args +------------------ +=========== ===== +Type ``list`` of ``strings`` +Default ``null`` +Example ``["-c:v", "libvpx-vp9", "-an", "-b:v", "2M"]`` +Description Additional FFmpeg command-line arguments. +=========== ===== + +ugoira.ffmpeg-location +---------------------- +=========== ===== +Type |Path|_ +Default ``"ffmpeg"`` +Description Location of the ``ffmpeg`` (or ``avconv``) executable to use. +=========== ===== + +ugoira.ffmpeg-output +-------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Show FFmpeg output. +=========== ===== + +ugoira.ffmpeg-twopass +--------------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Enable Two-Pass encoding. +=========== ===== + +ugoira.framerate +---------------- +=========== ===== +Type ``string`` +Default ``"auto"`` +Description Controls the frame rate argument (``-r``) for FFmpeg + + * ``"auto"``: Automatically assign a fitting frame rate + based on delays between frames. + * any other ``string``: Use this value as argument for ``-r``. + * ``null`` or an empty ``string``: Don't set an explicit frame rate. +=========== ===== + +ugoira.keep-files +----------------- +=========== ===== +Type ``bool`` +Default ``false`` +Description Keep ZIP archives after conversion. +=========== ===== + +ugoira.libx264-prevent-odd +-------------------------- +=========== ===== +Type ``bool`` +Default ``true`` +Description Prevent ``"width/height not divisible by 2"`` errors + when using ``libx264`` or ``libx265`` encoders + by applying a simple cropping filter. See this `Stack Overflow + thread `__ + for more information. + + This option, when ``libx264/5`` is used, automatically + adds ``["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"]`` + to the list of FFmpeg command-line arguments + to reduce an odd width/height by 1 pixel and make them even. +=========== ===== + + +zip +--- + +Store files in a ZIP archive. + +zip.compression +--------------- +=========== ===== +Type ``string`` +Default ``"store"`` +Description Compression method to use when writing the archive. + + Possible values are ``"store"``, ``"zip"``, ``"bzip2"``, ``"lzma"``. +=========== ===== + +zip.extension +------------- +=========== ===== +Type ``string`` +Default ``"zip"`` +Description Filename extension for the created ZIP archive. 
+=========== =====
+
+zip.keep-files
+--------------
+=========== =====
+Type        ``bool``
+Default     ``false``
+Description Keep the actual files after writing them to a ZIP archive.
+=========== =====
+
+
+
+Miscellaneous Options
+=====================
+
+
+cache.file
+----------
+=========== =====
+Type        |Path|_
+Default     |tempfile.gettempdir()|_ + ``".gallery-dl.cache"``
+Description Path of the SQLite3 database used to cache login sessions,
+            cookies and API tokens across *gallery-dl* invocations.
+
+            Set this option to ``null`` or an invalid path to disable
+            this cache.
+=========== =====
+
+
+
+API Tokens & IDs
+================
+
+
+All configuration keys listed in this section have fully functional default
+values embedded into *gallery-dl* itself, but if things unexpectedly break
+or you want to use your own personal client credentials, you can follow these
+instructions to get an alternative set of API tokens and IDs.
+
+
+extractor.deviantart.client-id & .client-secret
+-----------------------------------------------
+=========== =====
+Type        ``string``
+How To      * login and visit DeviantArt's
+              `Applications & Keys <https://www.deviantart.com/developers/apps>`__
+              section
+            * click "Register your Application"
+            * scroll to "OAuth2 Redirect URI Whitelist (Required)"
+              and enter "https://mikf.github.io/gallery-dl/oauth-redirect.html"
+            * click "Save" (top right)
+            * copy ``client_id`` and ``client_secret`` of your new
+              application and put them in your configuration file
+=========== =====
+
+
+extractor.flickr.api-key & .api-secret
+--------------------------------------
+=========== =====
+Type        ``string``
+How To      * login and `Create an App <https://www.flickr.com/services/apps/create/apply/>`__
+              in Flickr's `App Garden <https://www.flickr.com/services/>`__
+            * click "APPLY FOR A NON-COMMERCIAL KEY"
+            * fill out the form with a random name and description
+              and click "SUBMIT"
+            * copy ``Key`` and ``Secret`` and put them in your configuration
+              file
+=========== =====
+
+
+extractor.pawoo.access-token
+----------------------------
+=========== =====
+Type        ``string``
+How To
+=========== =====
+
+
+extractor.reddit.client-id & .user-agent
+----------------------------------------
+=========== =====
+Type        ``string``
+How To      * login and visit the `apps <https://www.reddit.com/prefs/apps/>`__
+              section of your account's preferences
+            * click the "are you a developer? create an app..."
+              button
+            * fill out the form, choose "installed app", preferably set
+              "http://localhost:6414/" as "redirect uri" and finally click
+              "create app"
+            * copy the client id (third line, under your application's name and
+              "installed app") and put it in your configuration file
+            * use "``Python:<application name>:v1.0 (by /u/<username>)``" as
+              user-agent and replace ``<application name>`` and ``<username>``
+              accordingly (see Reddit's
+              `API access rules <https://github.com/reddit/reddit/wiki/API>`__)
+=========== =====
+
+
+extractor.smugmug.api-key & .api-secret
+---------------------------------------
+=========== =====
+Type        ``string``
+How To      * login and `Apply for an API Key <https://api.smugmug.com/api/developer/apply>`__
+            * use a random name and description,
+              set "Type" to "Application", "Platform" to "All",
+              and "Use" to "Non-Commercial"
+            * fill out the two checkboxes at the bottom and click "Apply"
+            * copy ``API Key`` and ``API Secret``
+              and put them in your configuration file
+=========== =====
+
+
+extractor.tumblr.api-key & .api-secret
+--------------------------------------
+=========== =====
+Type        ``string``
+How To      * login and visit Tumblr's
+              `Applications <https://www.tumblr.com/oauth/apps>`__ section
+            * click "Register application"
+            * fill out the form: use a random name and description, set
+              https://example.org/ as "Application Website" and "Default
+              callback URL"
+            * solve Google's "I'm not a robot" challenge and click "Register"
+            * click "Show secret key" (below "OAuth Consumer Key")
+            * copy your ``OAuth Consumer Key`` and ``Secret Key``
+              and put them in your configuration file
+=========== =====
+
+
+
+Custom Types
+============
+
+
+Path
+----
+=========== =====
+Type        ``string`` or ``list`` of ``strings``
+Examples    * ``"file.ext"``
+            * ``"~/path/to/file.ext"``
+            * ``"$HOME/path/to/file.ext"``
+            * ``["$HOME", "path", "to", "file.ext"]``
+Description A |Path|_ is a ``string`` representing the location of a file
+            or directory.
+
+            Simple `tilde expansion <https://docs.python.org/3/library/os.path.html#os.path.expanduser>`__
+            and `environment variable expansion <https://docs.python.org/3/library/os.path.html#os.path.expandvars>`__
+            are supported.
+
+            In Windows environments, backslashes (``"\"``) can, in addition to
+            forward slashes (``"/"``), be used as path separators.
+            Because backslashes are JSON's escape character,
+            they themselves have to be escaped.
+            The path ``C:\path\to\file.ext`` therefore has to be written as
+            ``"C:\\path\\to\\file.ext"`` if you want to use backslashes.
+=========== =====
+
+
+Logging Configuration
+---------------------
+=========== =====
+Type        ``object``
+
+Example     .. code::
+
+                {
+                    "format": "{asctime} {name}: {message}",
+                    "format-date": "%H:%M:%S",
+                    "path": "~/log.txt",
+                    "encoding": "ascii"
+                }
+
+Description Extended logging output configuration.
+
+            * format
+              * Format string for logging messages
+
+                In addition to the default
+                `LogRecord attributes <https://docs.python.org/3/library/logging.html#logrecord-attributes>`__,
+                it is also possible to access the current
+                `extractor `__
+                and `job `__
+                objects as well as their attributes
+                (e.g. ``"{extractor.url}"``)
+              * Default: ``"[{name}][{levelname}] {message}"``
+            * format-date
+              * Format string for ``{asctime}`` fields in logging messages
+                (see `strftime() directives <https://docs.python.org/3/library/time.html#time.strftime>`__)
+              * Default: ``"%Y-%m-%d %H:%M:%S"``
+            * level
+              * Minimum logging message level
+                (one of ``"debug"``, ``"info"``, ``"warning"``, ``"error"``, ``"exception"``)
+              * Default: ``"info"``
+            * path
+              * |Path|_ to the output file
+            * mode
+              * Mode in which the file is opened;
+                use ``"w"`` to truncate or ``"a"`` to append
+                (see `open() <https://docs.python.org/3/library/functions.html#open>`__)
+              * Default: ``"w"``
+            * encoding
+              * File encoding
+              * Default: ``"utf-8"``
+
+            Note: path, mode and encoding are only applied when configuring
+            logging output to a file.
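+
+            As a rough sketch (path and format are placeholder values), such
+            an object can be assigned to `output.logfile`_ to collect debug
+            messages in a file:
+
+            .. code::
+
+                {
+                    "path": "~/gallery-dl/log.txt",
+                    "mode": "w",
+                    "level": "debug",
+                    "format": "{asctime} {name}: {message}"
+                }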
+=========== ===== + + +Postprocessor Configuration +--------------------------- +=========== ===== +Type ``object`` + +Example .. code:: + + { + "name": "zip", + "compression": "store", + "extension": "cbz", + "whitelist": ["mangadex", "exhentai", "nhentai"] + } + +Description An object with the ``name`` of a post-processor and its options. + + See `Postprocessor Options`_ for a list of all available + post-processors and their respective options. + + You can also set a ``whitelist`` or ``blacklist`` to + only enable or disable a post-processor for the specified + extractor categories. +=========== ===== + + + +.. |.netrc| replace:: ``.netrc`` +.. |tempfile.gettempdir()| replace:: ``tempfile.gettempdir()`` +.. |requests.request()| replace:: ``requests.request()`` +.. |timeout| replace:: ``timeout`` +.. |verify| replace:: ``verify`` +.. |mature_content| replace:: ``mature_content`` +.. |webbrowser.open()| replace:: ``webbrowser.open()`` +.. |datetime.max| replace:: ``datetime.max`` +.. |Path| replace:: ``Path`` +.. |Logging Configuration| replace:: ``Logging Configuration`` +.. |Postprocessor Configuration| replace:: ``Postprocessor Configuration`` +.. |strptime| replace:: strftime() and strptime() Behavior + +.. _base-directory: `extractor.*.base-directory`_ +.. _skipped: `extractor.*.skip`_ +.. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_ +.. _date-format: extractor.reddit.date-format_ +.. _deviantart.metadata: extractor.deviantart.metadata_ + +.. _.netrc: https://stackoverflow.com/tags/.netrc/info +.. _tempfile.gettempdir(): https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir +.. _requests.request(): https://docs.python-requests.org/en/master/api/#requests.request +.. _timeout: https://docs.python-requests.org/en/latest/user/advanced/#timeouts +.. _verify: https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification +.. _`Requests' proxy documentation`: http://docs.python-requests.org/en/master/user/advanced/#proxies +.. _format string: https://docs.python.org/3/library/string.html#formatstrings +.. _format strings: https://docs.python.org/3/library/string.html#formatstrings +.. _strptime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior +.. _mature_content: https://www.deviantart.com/developers/http/v1/20160316/object/deviation +.. _webbrowser.open(): https://docs.python.org/3/library/webbrowser.html +.. _datetime.max: https://docs.python.org/3/library/datetime.html#datetime.datetime.max +.. _Authentication: https://github.com/mikf/gallery-dl#5authentication +.. 
_youtube-dl: https://github.com/ytdl-org/youtube-dl diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf new file mode 100644 index 0000000..a5270d2 --- /dev/null +++ b/docs/gallery-dl-example.conf @@ -0,0 +1,172 @@ +{ + "extractor": + { + "base-directory": "~/gallery-dl/", + "archive": "~/gallery-dl/archive.sqlite3", + "proxy": "http://10.10.1.10:3128", + + "postprocessors": [ + { + "name": "ugoira", + "whitelist": ["pixiv", "danbooru"], + "ffmpeg-twopass": true, + "ffmpeg-args": ["-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an"] + }, + { + "name": "metadata", + "whitelist": ["danbooru", "yandere", "sankaku"], + "mode": "tags" + } + ], + + "pixiv": + { + "archive": "~/gallery-dl/archive-pixiv.sqlite3", + + "filename": "{id}{num}.{extension}", + "directory": ["Pixiv", "Works", "{user[id]}"], + + "username": "foo", + "password": "bar", + + "favorite": + { + "directory": ["Pixiv", "Favorites", "{user[id]}"] + }, + + "bookmark": + { + "directory": ["Pixiv", "My Bookmarks"], + + "username": "foo123", + "password": "bar123" + } + }, + + "exhentai": + { + "cookies": + { + "ipb_member_id": "12345", + "ipb_pass_hash": "1234567890abcdef" + }, + + "proxy": + { + "http": "http://10.10.1.10:8080", + "https": "https://10.10.1.10:443" + }, + + "filename": "{num:>04}_{name}.{extension}", + "directory": ["{category!c}", "{title}"], + + "wait-min": 1.0, + "wait-max": 5.0 + }, + + "mangadex": + { + "postprocessors": [{ + "name": "zip", + "keep-files": false, + "compression": "zip" + }] + }, + + "flickr": + { + "access-token": "1234567890-abcdef", + "access-token-secret": "1234567890abcdef", + "size-max": 1920 + }, + + "reddit": + { + "morecomments": true, + "date-min": "2017-01", + "date-format": "%Y-%m", + "recursion": 1 + }, + + "sankaku": + { + "sleep": 2, + "wait-min": 5.0, + "wait-max": 5.0, + "cookies": "~/gallery-dl/cookies-sankaku.txt" + }, + + "tumblr": + { + "posts": "all", + "external": false, + "reblogs": false, + "inline": true, + + "likes": + { + "posts": "video,photo,link", + "external": true, + "reblogs": true + } + }, + + "mastodon": + { + "mastodon.xyz": + { + "access-token": "cab65529..." + }, + "tabletop.social": { + "access-token": "513a36c6..." 
+ }, + + "directory": ["mastodon", "{instance}", "{account[username]!l}"], + "filename": "{id}_{media[id]}.{extension}" + }, + + "foolslide": { + "otscans": {"root": "https://otscans.com/foolslide"}, + "helvetica": {"root": "https://helveticascans.com/r" } + }, + + "foolfuuka": { + "fireden-onion": {"root": "http://ydt6jy2ng3s3xg2e.onion"}, + "scalearchive": {"root": "https://archive.scaled.team" } + } + }, + + "downloader": + { + "part-directory": "/tmp/.download/", + "rate": "1M", + "retries": 3, + "timeout": 8.5 + }, + + "output": + { + "mode": "terminal", + "log": { + "format": "{name}: {message}", + "level": "info" + }, + "logfile": { + "path": "~/gallery-dl/log.txt", + "mode": "w", + "level": "debug" + }, + "unsupportedfile": { + "path": "~/gallery-dl/unsupported.txt", + "mode": "a", + "format": "{asctime} {message}", + "format-date": "%Y-%m-%d-%H-%M-%S" + } + }, + + "cache": { + "file": "~/gallery-dl/cache.sqlite3" + }, + + "netrc": true +} diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf new file mode 100644 index 0000000..c792e9e --- /dev/null +++ b/docs/gallery-dl.conf @@ -0,0 +1,172 @@ +{ + "extractor": + { + "base-directory": "./gallery-dl/", + "postprocessors": null, + "archive": null, + "cookies": null, + "proxy": null, + "skip": true, + "sleep": 0, + "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0", + + "artstation": + { + "external": false + }, + "danbooru": + { + "username": null, + "password": null + }, + "deviantart": + { + "refresh-token": null, + "extra": false, + "flat": true, + "folders": false, + "journals": "html", + "mature": true, + "metadata": false, + "original": true, + "wait-min": 0 + }, + "exhentai": + { + "username": null, + "password": null, + "original": true, + "wait-min": 3.0, + "wait-max": 6.0 + }, + "flickr": + { + "access-token": null, + "access-token-secret": null, + "videos": true, + "size-max": null + }, + "gelbooru": + { + "api": true + }, + "gfycat": + { + "format": "mp4" + }, + "idolcomplex": + { + "username": null, + "password": null, + "wait-min": 3.0, + "wait-max": 6.0 + }, + "imgur": + { + "mp4": true + }, + "kissmanga": + { + "captcha": "stop" + }, + "nijie": + { + "username": null, + "password": null + }, + "oauth": + { + "browser": true + }, + "pixiv": + { + "username": null, + "password": null, + "ugoira": true + }, + "reactor": + { + "wait-min": 3.0, + "wait-max": 6.0 + }, + "readcomiconline": + { + "captcha": "stop" + }, + "recursive": + { + "blacklist": ["directlink", "oauth", "recursive", "test"] + }, + "reddit": + { + "refresh-token": null, + "comments": 500, + "morecomments": false, + "date-min": 0, + "date-max": 253402210800, + "date-format": "%Y-%m-%dT%H:%M:%S", + "id-min": "0", + "id-max": "zik0zj", + "recursion": 0, + "user-agent": "Python:gallery-dl:0.8.4 (by /u/mikf1)" + }, + "sankaku": + { + "username": null, + "password": null, + "wait-min": 3.0, + "wait-max": 6.0 + }, + "seiga": + { + "username": null, + "password": null + }, + "tumblr": + { + "avatar": false, + "external": false, + "inline": true, + "posts": "all", + "reblogs": true + }, + "twitter": + { + "retweets": true, + "videos": false + }, + "wallhaven": + { + "api-key": null + }, + "booru": + { + "tags": false + } + }, + + "downloader": + { + "part": true, + "part-directory": null, + + "http": + { + "rate": null, + "retries": 5, + "timeout": 30.0, + "verify": true + } + }, + + "output": + { + "mode": "auto", + "progress": true, + "shorten": true, + "logfile": null, + "unsupportedfile": null + }, + + "netrc": false +} 
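The two files above serve different purposes: gallery-dl-example.conf shows a
customized setup, while gallery-dl.conf lists default values for the most
common options. Since values from configuration files loaded later are merged
over those loaded earlier, a personal configuration only needs the keys it
actually overrides; a minimal user config could be as small as the following
sketch (paths and values are placeholders, not recommendations):

    {
        "extractor": {
            "base-directory": "~/Downloads/gallery-dl/",
            "skip": false
        }
    }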
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst new file mode 100644 index 0000000..f47ed10 --- /dev/null +++ b/docs/supportedsites.rst @@ -0,0 +1,137 @@ +Supported Sites +=============== +==================== =================================== ================================================== ================ +Site URL Capabilities Authentication +==================== =================================== ================================================== ================ +35PHOTO https://35photo.pro/ Images from Users, Genres, individual Images +3dbooru http://behoimi.org/ Pools, Popular Images, Posts, Tag-Searches +4chan https://www.4chan.org/ Threads +4plebs https://archive.4plebs.org/ Threads +500px https://500px.com/ Images from Users, Galleries, individual Images +8chan https://8ch.net/ Threads +8muses https://www.8muses.com/ Albums +Adobe Portfolio https://www.myportfolio.com/ Galleries +arch.b4k.co https://arch.b4k.co/ Threads +Archive of Sins https://archiveofsins.com/ Threads +Archived.Moe https://archived.moe/ Threads +ArtStation https://www.artstation.com/ |artstation-C| +Behance https://www.behance.net/ Images from Users, Collections, Galleries +BobX http://www.bobx.com/dark/ Galleries, Idols +Danbooru https://danbooru.donmai.us/ Pools, Popular Images, Posts, Tag-Searches Optional +Desuarchive https://desuarchive.org/ Threads +DeviantArt https://www.deviantart.com/ |deviantart-C| Optional (OAuth) +Doki Reader https://kobato.hologfx.com/reader/ Chapters, Manga +Dynasty Reader https://dynasty-scans.com/ Chapters, individual Images, Search Results +e621 https://e621.net/ Pools, Popular Images, Posts, Tag-Searches +ExHentai https://exhentai.org/ Favorites, Galleries, Search Results Optional +Fallen Angels Scans https://www.fascans.com/ Chapters, Manga +Fashion Nova https://www.fashionnova.com/ Collections, Products +Fireden https://boards.fireden.net/ Threads +Flickr https://www.flickr.com/ |flickr-C| Optional (OAuth) +Futaba Channel https://www.2chan.net/ Threads +Gelbooru https://gelbooru.com/ Pools, Posts, Tag-Searches +Gfycat https://gfycat.com/ individual Images +HBrowse https://www.hbrowse.com/ Chapters, Manga +Hentai Cafe https://hentai.cafe/ Chapters, Manga +Hentai Foundry https://www.hentai-foundry.com/ |hentaifoundry-C| +Hentai2Read https://hentai2read.com/ Chapters, Manga +HentaiFox https://hentaifox.com/ Galleries, Search Results +HentaiHere https://hentaihere.com/ Chapters, Manga +Hentainexus https://hentainexus.com/ Galleries, Search Results +Hitomi.la https://hitomi.la/ Galleries +Hypnohub https://hypnohub.net/ Pools, Popular Images, Posts, Tag-Searches +Idol Complex https://idol.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional +ImageBam http://www.imagebam.com/ Galleries, individual Images +ImageFap https://imagefap.com/ Images from Users, Galleries, individual Images +imgbox https://imgbox.com/ Galleries, individual Images +imgth https://imgth.com/ Galleries +imgur https://imgur.com/ Albums, individual Images +Instagram https://www.instagram.com/ Images from Users, individual Images, Tag-Searches +Jaimini's Box https://jaiminisbox.com/reader/ Chapters, Manga +Joyreactor http://joyreactor.cc/ |joyreactor-C| +Keenspot http://www.keenspot.com/ Comics +Khinsider https://downloads.khinsider.com/ Soundtracks +Kirei Cake https://reader.kireicake.com/ Chapters, Manga +KissManga https://kissmanga.com/ Chapters, Manga +Komikcast https://komikcast.com/ Chapters, Manga +Konachan https://konachan.com/ Pools, Popular Images, Posts, Tag-Searches +livedoor 
Blog http://blog.livedoor.jp/ Blogs, Posts +Luscious https://luscious.net/ Albums, Search Results Optional +Manga Fox https://fanfox.net/ Chapters +Manga Here https://www.mangahere.cc/ Chapters, Manga +Manga Stream https://readms.net/ Chapters +MangaDex https://mangadex.org/ Chapters, Manga +Mangapanda https://www.mangapanda.com/ Chapters, Manga +MangaPark https://mangapark.me/ Chapters, Manga +Mangareader https://www.mangareader.net/ Chapters, Manga +Mangoxo https://www.mangoxo.com/ Albums, Channels Optional +Newgrounds https://www.newgrounds.com/ Images from Users, individual Images, Videos +Ngomik http://ngomik.in/ Chapters +nhentai https://nhentai.net/ Galleries, Search Results +Niconico Seiga https://seiga.nicovideo.jp/ Images from Users, individual Images Required +nijie https://nijie.info/ |nijie-C| Required +NSFWalbum.com https://nsfwalbum.com/ Albums +Nyafuu Archive https://archive.nyafuu.org/ Threads +Patreon https://www.patreon.com/ Images from Users, Creators +Pawoo https://pawoo.net/ Images from Users, Images from Statuses +Photobucket https://photobucket.com/ Albums, individual Images +Piczel https://piczel.tv/ Images from Users, Folders, individual Images +Pinterest https://www.pinterest.com/ Boards, Pins, pin.it Links, related Pins +Pixiv https://www.pixiv.net/ |pixiv-C| Required +Pixnet https://www.pixnet.net/ |pixnet-C| +Plurk https://www.plurk.com/ Posts, Timelines +Pornhub https://www.pornhub.com/ Images from Users, Galleries +Pornreactor http://pornreactor.cc/ |pornreactor-C| +PowerManga https://read.powermanga.org/ Chapters, Manga +Pururin https://pururin.io/ Galleries +Read Comic Online https://readcomiconline.to/ Comic-Issues, Comics +RebeccaBlackTech https://rbt.asia/ Threads +Reddit https://www.reddit.com/ individual Images, Submissions, Subreddits Optional (OAuth) +rule #34 https://rule34.paheal.net/ Posts, Tag-Searches +Rule 34 https://rule34.xxx/ Pools, Posts, Tag-Searches +Safebooru https://safebooru.org/ Pools, Posts, Tag-Searches +Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional +Sankaku Complex https://www.sankakucomplex.com/ Articles, Tag-Searches +Sen Manga https://raw.senmanga.com/ Chapters +Sense-Scans http://sensescans.com/reader/ Chapters, Manga +Sex.com https://www.sex.com/ Boards, Pins, Search Results +Simply Hentai https://www.simply-hentai.com/ Galleries, individual Images, Videos +SlickPic https://www.slickpic.com/ Images from Users, Albums +SlideShare https://www.slideshare.net/ Presentations +SmugMug https://www.smugmug.com/ |smugmug-C| Optional (OAuth) +The /b/ Archive https://thebarchive.com/ Threads +Tsumino https://www.tsumino.com/ Galleries, Search Results Optional +Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth) +Twitter https://twitter.com/ Media Timelines, Timelines, Tweets Optional +Wallhaven https://wallhaven.cc/ individual Images, Search Results +Warosu https://warosu.org/ Threads +Weibo https://www.weibo.com/ Images from Users, Images from Statuses +WikiArt.org https://www.wikiart.org/ Artists, Artworks +World Three http://www.slide.world-three.org/ Chapters, Manga +xHamster https://xhamster.com/ Images from Users, Galleries +XVideos https://www.xvideos.com/ Images from Users, Galleries +Yandere https://yande.re/ Pools, Popular Images, Posts, Tag-Searches +yaplog! 
https://yaplog.jp/ Blogs, Posts +|yuki-S| https://yuki.la/ Threads +Acidimg https://acidimg.cc/ individual Images +Imagetwist https://imagetwist.com/ individual Images +Imagevenue http://imagevenue.com/ individual Images +Imgspice https://imgspice.com/ individual Images +Imxto https://imx.to/ individual Images +Pixhost https://pixhost.to/ individual Images +Postimg https://postimages.org/ individual Images +Turboimagehost https://www.turboimagehost.com/ individual Images +もえぴりあ https://vanilla-rock.com/ Posts, Tag-Searches +==================== =================================== ================================================== ================ + +.. |artstation-C| replace:: Images from Users, Albums, Artwork Listings, Challenges, individual Images, Likes, Search Results +.. |deviantart-C| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images, Scraps, Sta.sh +.. |flickr-C| replace:: Images from Users, Albums, Favorites, Galleries, Groups, individual Images, Search Results +.. |hentaifoundry-C| replace:: Images from Users, Favorites, individual Images, Popular Images, Recent Images, Scraps +.. |joyreactor-C| replace:: Images from Users, Posts, Search Results, Tag-Searches +.. |nijie-C| replace:: Images from Users, Doujin, Favorites, individual Images +.. |pixiv-C| replace:: Images from Users, Favorites, Follows, pixiv.me Links, Rankings, Search Results, Individual Images +.. |pixnet-C| replace:: Images from Users, Folders, individual Images, Sets +.. |pornreactor-C| replace:: Images from Users, Posts, Search Results, Tag-Searches +.. |smugmug-C| replace:: Albums, individual Images, Images from Users and Folders +.. |yuki-S| replace:: yuki.la 4chan archive diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py new file mode 100644 index 0000000..3643a5c --- /dev/null +++ b/gallery_dl/__init__.py @@ -0,0 +1,255 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +from __future__ import unicode_literals, print_function + +__author__ = "Mike Fährmann" +__copyright__ = "Copyright 2014-2018 Mike Fährmann" +__license__ = "GPLv2" +__maintainer__ = "Mike Fährmann" +__email__ = "mike_faehrmann@web.de" + +import sys + +if sys.hexversion < 0x3040000: + sys.exit("Python 3.4+ required") + +import json +import logging +from . import version, config, option, output, extractor, job, util, exception + +__version__ = version.__version__ + + +def progress(urls, pformat): + """Wrapper around urls to output a simple progress indicator""" + if pformat is True: + pformat = "[{current}/{total}] {url}" + pinfo = {"total": len(urls)} + for pinfo["current"], pinfo["url"] in enumerate(urls, 1): + print(pformat.format_map(pinfo), file=sys.stderr) + yield pinfo["url"] + + +def parse_inputfile(file, log): + """Filter and process strings from an input file. + + Lines starting with '#' and empty lines will be ignored. + Lines starting with '-' will be interpreted as a key-value pair separated + by an '='. where 'key' is a dot-separated option name and 'value' is a + JSON-parsable value for it. These config options will be applied while + processing the next URL. + Lines starting with '-G' are the same as above, except these options will + be valid for all following URLs, i.e. they are Global. + Everything else will be used as potential URL. 
+
+
+    Example input file:
+
+    # setting global options
+    -G base-directory = "/tmp/"
+    -G skip = false
+
+    # setting local options for the next URL
+    -filename="spaces_are_optional.jpg"
+    -skip = true
+
+    https://example.org/
+
+    # next URL uses default filename and 'skip' is false.
+    https://example.com/index.htm
+    """
+    gconf = []
+    lconf = []
+
+    for line in file:
+        line = line.strip()
+
+        if not line or line[0] == "#":
+            # empty line or comment
+            continue
+
+        elif line[0] == "-":
+            # config spec
+            if len(line) >= 2 and line[1] == "G":
+                conf = gconf
+                line = line[2:]
+            else:
+                conf = lconf
+                line = line[1:]
+
+            key, sep, value = line.partition("=")
+            if not sep:
+                log.warning("input file: invalid <key>=<value> pair: %s", line)
+                continue
+
+            try:
+                value = json.loads(value.strip())
+            except ValueError as exc:
+                log.warning("input file: unable to parse '%s': %s", value, exc)
+                continue
+
+            conf.append((key.strip().split("."), value))
+
+        else:
+            # url
+            if gconf or lconf:
+                yield util.ExtendedUrl(line, gconf, lconf)
+                gconf = []
+                lconf = []
+            else:
+                yield line
+
+
+def main():
+    try:
+        if sys.stdout.encoding.lower() != "utf-8":
+            output.replace_std_streams()
+
+        parser = option.build_parser()
+        args = parser.parse_args()
+        log = output.initialize_logging(args.loglevel)
+
+        # configuration
+        if args.load_config:
+            config.load()
+        if args.cfgfiles:
+            config.load(args.cfgfiles, strict=True)
+        if args.yamlfiles:
+            config.load(args.yamlfiles, strict=True, fmt="yaml")
+        if args.postprocessors:
+            config.set(("postprocessors", ), args.postprocessors)
+        for key, value in args.options:
+            config.set(key, value)
+
+        # stream logging handler
+        output.configure_logging_handler(
+            "log", logging.getLogger().handlers[0])
+
+        # file logging handler
+        handler = output.setup_logging_handler(
+            "logfile", lvl=args.loglevel)
+        if handler:
+            logging.getLogger().addHandler(handler)
+
+        # loglevels
+        if args.loglevel >= logging.ERROR:
+            config.set(("output", "mode"), "null")
+        elif args.loglevel <= logging.DEBUG:
+            import platform
+            import subprocess
+            import os.path
+            import requests
+
+            head = ""
+            try:
+                out, err = subprocess.Popen(
+                    ("git", "rev-parse", "--short", "HEAD"),
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                    cwd=os.path.dirname(os.path.abspath(__file__)),
+                ).communicate()
+                if out and not err:
+                    head = " - Git HEAD: " + out.decode().rstrip()
+            except (OSError, subprocess.SubprocessError):
+                pass
+
+            log.debug("Version %s%s", __version__, head)
+            log.debug("Python %s - %s",
+                      platform.python_version(), platform.platform())
+            try:
+                log.debug("requests %s - urllib3 %s",
+                          requests.__version__,
+                          requests.packages.urllib3.__version__)
+            except AttributeError:
+                pass
+
+        if args.list_modules:
+            for module_name in extractor.modules:
+                print(module_name)
+        elif args.list_extractors:
+            for extr in extractor.extractors():
+                if not extr.__doc__:
+                    continue
+                print(extr.__name__)
+                print(extr.__doc__)
+                print("Category:", extr.category,
+                      "- Subcategory:", extr.subcategory)
+                test = next(extr._get_tests(), None)
+                if test:
+                    print("Example :", test[0])
+                print()
+        elif args.clear_cache:
+            from .
import cache + log = logging.getLogger("cache") + cnt = cache.clear() + + if cnt is None: + log.error("Database file not available") + else: + log.info( + "Deleted %d %s from '%s'", + cnt, "entry" if cnt == 1 else "entries", cache._path(), + ) + else: + if not args.urls and not args.inputfile: + parser.error( + "The following arguments are required: URL\n" + "Use 'gallery-dl --help' to get a list of all options.") + + if args.list_urls: + jobtype = job.UrlJob + jobtype.maxdepth = args.list_urls + else: + jobtype = args.jobtype or job.DownloadJob + + urls = args.urls + if args.inputfile: + try: + if args.inputfile == "-": + file = sys.stdin + else: + file = open(args.inputfile, encoding="utf-8") + urls += parse_inputfile(file, log) + file.close() + except OSError as exc: + log.warning("input file: %s", exc) + + # unsupported file logging handler + handler = output.setup_logging_handler( + "unsupportedfile", fmt="{message}") + if handler: + ulog = logging.getLogger("unsupported") + ulog.addHandler(handler) + ulog.propagate = False + job.Job.ulog = ulog + + pformat = config.get(("output", "progress"), True) + if pformat and len(urls) > 1 and args.loglevel < logging.ERROR: + urls = progress(urls, pformat) + + for url in urls: + try: + log.debug("Starting %s for '%s'", jobtype.__name__, url) + if isinstance(url, util.ExtendedUrl): + for key, value in url.gconfig: + config.set(key, value) + with config.apply(url.lconfig): + jobtype(url.value).run() + else: + jobtype(url).run() + except exception.NoExtractorError: + log.error("No suitable extractor found for '%s'", url) + + except KeyboardInterrupt: + print("\nKeyboardInterrupt", file=sys.stderr) + except BrokenPipeError: + pass + except IOError as exc: + import errno + if exc.errno != errno.EPIPE: + raise diff --git a/gallery_dl/__main__.py b/gallery_dl/__main__.py new file mode 100644 index 0000000..04ea9fe --- /dev/null +++ b/gallery_dl/__main__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2017 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import sys + +if __package__ is None and not hasattr(sys, "frozen"): + import os.path + path = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + sys.path.insert(0, os.path.realpath(path)) + +import gallery_dl + +if __name__ == "__main__": + gallery_dl.main() diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py new file mode 100644 index 0000000..a45f50e --- /dev/null +++ b/gallery_dl/aes.py @@ -0,0 +1,337 @@ +# -*- coding: utf-8 -*- + +# This is a stripped down version of youtube-dl's aes module. +# All credit for this code goes to the authors of the youtube-dl project. 
+# https://ytdl-org.github.io/youtube-dl/ +# https://github.com/ytdl-org/youtube-dl/ + +import base64 +from math import ceil + +BLOCK_SIZE_BYTES = 16 + + +def aes_cbc_decrypt(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {int[]} data cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + decrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + block += [0] * (BLOCK_SIZE_BYTES - len(block)) + + decrypted_block = aes_decrypt(block, expanded_key) + decrypted_data += xor(decrypted_block, previous_cipher_block) + previous_cipher_block = block + decrypted_data = decrypted_data[:len(data)] + + return decrypted_data + + +def aes_cbc_decrypt_text(data, key, iv): + """ + Decrypt with aes in CBC mode + + @param {string} data base64 encoded cipher + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {string} decrypted data as utf8 encoded string + """ + data = base64.standard_b64decode(bytes(data, "ascii")) + charcodes = aes_cbc_decrypt(list(data), key, iv) + last = charcodes[-1] + if last <= 16: + charcodes = charcodes[:-last] + return bytes(charcodes).decode() + + +def key_expansion(data): + """ + Generate key schedule + + @param {int[]} data 16/24/32-Byte cipher key + @returns {int[]} 176/208/240-Byte expanded key + """ + data = data[:] # copy + rcon_iteration = 1 + key_size_bytes = len(data) + expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES + + while len(data) < expanded_key_size_bytes: + temp = data[-4:] + temp = key_schedule_core(temp, rcon_iteration) + rcon_iteration += 1 + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + for _ in range(3): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + if key_size_bytes == 32: + temp = data[-4:] + temp = sub_bytes(temp) + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + + if key_size_bytes == 32: + rounds = 3 + elif key_size_bytes == 24: + rounds = 2 + else: + rounds = 0 + for _ in range(rounds): + temp = data[-4:] + data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes]) + data = data[:expanded_key_size_bytes] + + return data + + +def aes_decrypt(data, expanded_key): + """ + Decrypt one block with aes + + @param {int[]} data 16-Byte cipher + @param {int[]} expanded_key 176/208/240-Byte expanded key + @returns {int[]} 16-Byte state + """ + rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1 + + for i in range(rounds, 0, -1): + data = xor( + data, + expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + ) + if i != rounds: + data = mix_columns_inv(data) + data = shift_rows_inv(data) + data = sub_bytes_inv(data) + data = xor(data, expanded_key[:BLOCK_SIZE_BYTES]) + + return data + + +RCON = ( + 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, +) +SBOX = ( + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, + 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76, + 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, + 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC, + 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, + 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75, + 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 
0xE3, 0x2F, 0x84, + 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B, + 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, + 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8, + 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, + 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17, + 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, + 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB, + 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, + 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9, + 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, + 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A, + 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, + 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94, + 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, + 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16, +) +SBOX_INV = ( + 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, + 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, + 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, + 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, + 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, + 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, + 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, + 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, + 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, + 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, + 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, + 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, + 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, + 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, + 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, + 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, + 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, + 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, + 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, + 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, + 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, + 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, + 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, + 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, + 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, + 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, + 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, + 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, + 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, + 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d, +) +MIX_COLUMN_MATRIX = ( + (0x2, 0x3, 0x1, 0x1), + (0x1, 0x2, 0x3, 0x1), + (0x1, 0x1, 0x2, 0x3), + (0x3, 0x1, 0x1, 0x2), +) +MIX_COLUMN_MATRIX_INV = ( + (0xE, 0xB, 0xD, 0x9), + (0x9, 0xE, 0xB, 0xD), + (0xD, 0x9, 0xE, 0xB), + (0xB, 0xD, 0x9, 0xE), +) +RIJNDAEL_EXP_TABLE = ( + 0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF, + 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35, + 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4, + 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA, + 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26, + 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31, + 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC, + 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD, + 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7, + 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88, + 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 
0x7F, + 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A, + 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0, + 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3, + 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC, + 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0, + 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2, + 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41, + 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0, + 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75, + 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E, + 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80, + 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF, + 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54, + 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09, + 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA, + 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91, + 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E, + 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C, + 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17, + 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD, + 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01, +) +RIJNDAEL_LOG_TABLE = ( + 0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6, + 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03, + 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef, + 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1, + 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a, + 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78, + 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24, + 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e, + 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94, + 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38, + 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62, + 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10, + 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42, + 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba, + 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca, + 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57, + 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74, + 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8, + 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5, + 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0, + 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec, + 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7, + 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86, + 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d, + 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc, + 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1, + 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47, + 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab, + 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89, + 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5, + 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18, + 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07, +) + + +def sub_bytes(data): + return [SBOX[x] for x in data] + + +def sub_bytes_inv(data): + return [SBOX_INV[x] for x in data] + + +def rotate(data): + return data[1:] + [data[0]] + + +def key_schedule_core(data, rcon_iteration): + data = rotate(data) + data = sub_bytes(data) + data[0] = data[0] ^ RCON[rcon_iteration] + return data + + +def xor(data1, data2): + return [x ^ y for x, y in zip(data1, data2)] + + +def rijndael_mul(a, b): + if a == 0 or b == 0: + return 0 + return RIJNDAEL_EXP_TABLE[ + (RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF + ] + + +def mix_column(data, matrix): + data_mixed = [] + for row in range(4): + mixed = 0 + for column in range(4): + # xor is (+) and (-) + mixed ^= rijndael_mul(data[column], matrix[row][column]) + data_mixed.append(mixed) + return data_mixed + + +def mix_columns(data, matrix=MIX_COLUMN_MATRIX): + 
data_mixed = [] + for i in range(4): + column = data[i * 4: (i + 1) * 4] + data_mixed += mix_column(column, matrix) + return data_mixed + + +def mix_columns_inv(data): + return mix_columns(data, MIX_COLUMN_MATRIX_INV) + + +def shift_rows_inv(data): + data_shifted = [] + for column in range(4): + for row in range(4): + data_shifted.append(data[((column - row) & 0b11) * 4 + row]) + return data_shifted + + +__all__ = ['key_expansion', 'aes_cbc_decrypt', 'aes_cbc_decrypt_text'] diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py new file mode 100644 index 0000000..e6ba61a --- /dev/null +++ b/gallery_dl/cache.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Decorators to keep function results in an in-memory and database cache""" + +import sqlite3 +import pickle +import time +import functools +from . import config, util + + +class CacheDecorator(): + """Simplified in-memory cache""" + def __init__(self, func, keyarg): + self.func = func + self.cache = {} + self.keyarg = keyarg + + def __get__(self, instance, cls): + return functools.partial(self.__call__, instance) + + def __call__(self, *args, **kwargs): + key = "" if self.keyarg is None else args[self.keyarg] + try: + value = self.cache[key] + except KeyError: + value = self.cache[key] = self.func(*args, **kwargs) + return value + + def update(self, key, value): + self.cache[key] = value + + def invalidate(self, key): + try: + del self.cache[key] + except KeyError: + pass + + +class MemoryCacheDecorator(CacheDecorator): + """In-memory cache""" + def __init__(self, func, keyarg, maxage): + CacheDecorator.__init__(self, func, keyarg) + self.maxage = maxage + + def __call__(self, *args, **kwargs): + key = "" if self.keyarg is None else args[self.keyarg] + timestamp = int(time.time()) + try: + value, expires = self.cache[key] + except KeyError: + expires = 0 + if expires < timestamp: + value = self.func(*args, **kwargs) + expires = timestamp + self.maxage + self.cache[key] = value, expires + return value + + def update(self, key, value): + self.cache[key] = value, int(time.time()) + self.maxage + + +class DatabaseCacheDecorator(): + """Database cache""" + db = None + _init = True + + def __init__(self, func, keyarg, maxage): + self.key = "%s.%s" % (func.__module__, func.__name__) + self.func = func + self.cache = {} + self.keyarg = keyarg + self.maxage = maxage + + def __get__(self, obj, objtype): + return functools.partial(self.__call__, obj) + + def __call__(self, *args, **kwargs): + key = "" if self.keyarg is None else args[self.keyarg] + timestamp = int(time.time()) + + # in-memory cache lookup + try: + value, expires = self.cache[key] + if expires > timestamp: + return value + except KeyError: + pass + + # database lookup + fullkey = "%s-%s" % (self.key, key) + cursor = self.cursor() + try: + cursor.execute("BEGIN EXCLUSIVE") + except sqlite3.OperationalError: + pass # Silently swallow exception - workaround for Python 3.6 + try: + cursor.execute( + "SELECT value, expires FROM data WHERE key=? 
LIMIT 1", + (fullkey,), + ) + result = cursor.fetchone() + + if result and result[1] > timestamp: + value, expires = result + value = pickle.loads(value) + else: + value = self.func(*args, **kwargs) + expires = timestamp + self.maxage + cursor.execute( + "INSERT OR REPLACE INTO data VALUES (?,?,?)", + (fullkey, pickle.dumps(value), expires), + ) + finally: + self.db.commit() + self.cache[key] = value, expires + return value + + def update(self, key, value): + expires = int(time.time()) + self.maxage + self.cache[key] = value, expires + self.cursor().execute( + "INSERT OR REPLACE INTO data VALUES (?,?,?)", + ("%s-%s" % (self.key, key), pickle.dumps(value), expires), + ) + + def invalidate(self, key): + try: + del self.cache[key] + except KeyError: + pass + self.cursor().execute( + "DELETE FROM data WHERE key=? LIMIT 1", + ("%s-%s" % (self.key, key),), + ) + + def cursor(self): + if self._init: + self.db.execute( + "CREATE TABLE IF NOT EXISTS data " + "(key TEXT PRIMARY KEY, value TEXT, expires INTEGER)" + ) + DatabaseCacheDecorator._init = False + return self.db.cursor() + + +def memcache(maxage=None, keyarg=None): + if maxage: + def wrap(func): + return MemoryCacheDecorator(func, keyarg, maxage) + else: + def wrap(func): + return CacheDecorator(func, keyarg) + return wrap + + +def cache(maxage=3600, keyarg=None): + def wrap(func): + return DatabaseCacheDecorator(func, keyarg, maxage) + return wrap + + +def clear(): + """Delete all database entries""" + db = DatabaseCacheDecorator.db + + if db: + rowcount = 0 + cursor = db.cursor() + try: + cursor.execute("DELETE FROM data") + except sqlite3.OperationalError: + pass # database is not initialized, can't be modified, etc. + else: + rowcount = cursor.rowcount + db.commit() + cursor.execute("VACUUM") + return rowcount + + return None + + +def _path(): + path = config.get(("cache", "file"), -1) + + if path == -1: + import tempfile + import os.path + return os.path.join(tempfile.gettempdir(), ".gallery-dl.cache") + + return util.expand_path(path) + + +try: + DatabaseCacheDecorator.db = sqlite3.connect( + _path(), timeout=30, check_same_thread=False) +except (TypeError, sqlite3.OperationalError): + cache = memcache # noqa: F811 diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py new file mode 100644 index 0000000..b9bf32d --- /dev/null +++ b/gallery_dl/cloudflare.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Methods to access sites behind Cloudflare protection""" + +import re +import time +import operator +import collections +import urllib.parse +from . 
import text, exception +from .cache import memcache + + +def is_challenge(response): + return (response.status_code == 503 and + response.headers.get("Server", "").startswith("cloudflare") and + b"jschl-answer" in response.content) + + +def is_captcha(response): + return (response.status_code == 403 and + b'name="captcha-bypass"' in response.content) + + +def solve_challenge(session, response, kwargs): + """Solve Cloudflare challenge and get cfclearance cookie""" + parsed = urllib.parse.urlsplit(response.url) + root = parsed.scheme + "://" + parsed.netloc + + cf_kwargs = {} + headers = cf_kwargs["headers"] = collections.OrderedDict() + params = cf_kwargs["params"] = collections.OrderedDict() + + page = response.text + params["s"] = text.extract(page, 'name="s" value="', '"')[0] + params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0] + params["pass"] = text.extract(page, 'name="pass" value="', '"')[0] + params["jschl_answer"] = solve_js_challenge(page, parsed.netloc) + headers["Referer"] = response.url + + time.sleep(4) + + url = root + "/cdn-cgi/l/chk_jschl" + cf_kwargs["allow_redirects"] = False + cf_response = session.request("GET", url, **cf_kwargs) + + location = cf_response.headers.get("Location") + if not location: + import logging + log = logging.getLogger("cloudflare") + rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected" + log.error("%s response", rtype) + log.debug("Headers:\n%s", cf_response.headers) + log.debug("Content:\n%s", cf_response.text) + raise exception.StopExtraction() + + if location[0] == "/": + location = root + location + else: + location = re.sub(r"(https?):/(?!/)", r"\1://", location) + + for cookie in cf_response.cookies: + if cookie.name == "cf_clearance": + return location, cookie.domain, { + cookie.name: cookie.value, + "__cfduid" : response.cookies.get("__cfduid", ""), + } + return location, "", {} + + +def solve_js_challenge(page, netloc): + """Evaluate JS challenge in 'page' to get 'jschl_answer' value""" + + # build variable name + # e.g. '...f, wqnVscP={"DERKbJk":+(...' 
--> wqnVscP.DERKbJk + data, pos = text.extract_all(page, ( + ('var' , ',f, ', '='), + ('key' , '"' , '"'), + ('expr', ':' , '}'), + )) + variable = "{}.{}".format(data["var"], data["key"]) + vlength = len(variable) + + # evaluate the initial expression + solution = evaluate_expression(data["expr"], page, netloc) + + # iterator over all remaining expressions + # and combine their values in 'solution' + expressions = text.extract( + page, "'challenge-form');", "f.submit();", pos)[0] + for expr in expressions.split(";")[1:]: + + if expr.startswith(variable): + # select arithmetc function based on operator (+/-/*) + func = OPERATORS[expr[vlength]] + # evaluate the rest of the expression + value = evaluate_expression(expr[vlength+2:], page, netloc) + # combine expression value with our current solution + solution = func(solution, value) + + elif expr.startswith("a.value"): + if "t.length)" in expr: + # add length of hostname + solution += len(netloc) + if ".toFixed(" in expr: + # trim solution to 10 decimal places + # and strip trailing zeros + solution = "{:.10f}".format(solution).rstrip("0") + return solution + + +def evaluate_expression(expr, page, netloc, *, + split_re=re.compile(r"[(+]+([^)]*)\)")): + """Evaluate a single Javascript expression for the challenge""" + + if expr.startswith("function(p)"): + # get HTML element with ID k and evaluate the expression inside + # 'eval(eval("document.getElementById(k).innerHTML"))' + k, pos = text.extract(page, "k = '", "'") + e, pos = text.extract(page, 'id="'+k+'"', '<') + return evaluate_expression(e.partition(">")[2], page, netloc) + + if "/" in expr: + # split the expression in numerator and denominator subexpressions, + # evaluate them separately, + # and return their fraction-result + num, _, denom = expr.partition("/") + num = evaluate_expression(num, page, netloc) + denom = evaluate_expression(denom, page, netloc) + return num / denom + + if "function(p)" in expr: + # split initial expression and function code + initial, _, func = expr.partition("function(p)") + # evaluate said expression + initial = evaluate_expression(initial, page, netloc) + # get function argument and use it as index into 'netloc' + index = evaluate_expression(func[func.index("}")+1:], page, netloc) + return initial + ord(netloc[int(index)]) + + # iterate over all subexpressions, + # evaluate them, + # and accumulate their values in 'result' + result = "" + for subexpr in split_re.findall(expr) or (expr,): + result += str(sum( + VALUES[part] + for part in subexpr.split("[]") + )) + return int(result) + + +OPERATORS = { + "+": operator.add, + "-": operator.sub, + "*": operator.mul, +} + +VALUES = { + "": 0, + "+": 0, + "!+": 1, + "!!": 1, + "+!!": 1, +} + + +@memcache(keyarg=0) +def cookies(category): + return None diff --git a/gallery_dl/config.py b/gallery_dl/config.py new file mode 100644 index 0000000..da52f1e --- /dev/null +++ b/gallery_dl/config.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Global configuration module""" + +import sys +import json +import os.path +import logging +from . 
import util + +log = logging.getLogger("config") + + +# -------------------------------------------------------------------- +# internals + +_config = {} + +if os.name == "nt": + _default_configs = [ + r"%USERPROFILE%\gallery-dl\config.json", + r"%USERPROFILE%\gallery-dl.conf", + ] +else: + _default_configs = [ + "/etc/gallery-dl.conf", + "${HOME}/.config/gallery-dl/config.json", + "${HOME}/.gallery-dl.conf", + ] + + +# -------------------------------------------------------------------- +# public interface + +def load(files=None, strict=False, fmt="json"): + """Load JSON configuration files""" + if fmt == "yaml": + try: + import yaml + parsefunc = yaml.safe_load + except ImportError: + log.error("Could not import 'yaml' module") + return + else: + parsefunc = json.load + + for path in files or _default_configs: + path = util.expand_path(path) + try: + with open(path, encoding="utf-8") as file: + confdict = parsefunc(file) + except OSError as exc: + if strict: + log.error("%s", exc) + sys.exit(1) + except Exception as exc: + log.warning("Could not parse '%s': %s", path, exc) + if strict: + sys.exit(2) + else: + if not _config: + _config.update(confdict) + else: + util.combine_dict(_config, confdict) + + +def clear(): + """Reset configuration to an empty state""" + _config.clear() + + +def get(keys, default=None, conf=_config): + """Get the value of property 'key' or a default value""" + try: + for k in keys: + conf = conf[k] + return conf + except (KeyError, AttributeError): + return default + + +def interpolate(keys, default=None, conf=_config): + """Interpolate the value of 'key'""" + try: + lkey = keys[-1] + if lkey in conf: + return conf[lkey] + for k in keys: + if lkey in conf: + default = conf[lkey] + conf = conf[k] + return conf + except (KeyError, AttributeError): + return default + + +def set(keys, value, conf=_config): + """Set the value of property 'key' for this session""" + for k in keys[:-1]: + try: + conf = conf[k] + except KeyError: + temp = {} + conf[k] = temp + conf = temp + conf[keys[-1]] = value + + +def setdefault(keys, value, conf=_config): + """Set the value of property 'key' if it doesn't exist""" + for k in keys[:-1]: + try: + conf = conf[k] + except KeyError: + temp = {} + conf[k] = temp + conf = temp + return conf.setdefault(keys[-1], value) + + +def unset(keys, conf=_config): + """Unset the value of property 'key'""" + try: + for k in keys[:-1]: + conf = conf[k] + del conf[keys[-1]] + except (KeyError, AttributeError): + pass + + +class apply(): + """Context Manager: apply a collection of key-value pairs""" + _sentinel = object() + + def __init__(self, kvlist): + self.original = [] + self.kvlist = kvlist + + def __enter__(self): + for key, value in self.kvlist: + self.original.append((key, get(key, self._sentinel))) + set(key, value) + + def __exit__(self, etype, value, traceback): + for key, value in self.original: + if value is self._sentinel: + unset(key) + else: + set(key, value) diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py new file mode 100644 index 0000000..97972cd --- /dev/null +++ b/gallery_dl/downloader/__init__.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Downloader modules""" + +import importlib + +modules = [ + "http", + "text", + "ytdl", +] + + +def find(scheme): + """Return downloader class suitable for handling the given scheme""" + try: + return _cache[scheme] + except KeyError: + klass = None + try: + if scheme in modules: # prevent unwanted imports + module = importlib.import_module("." + scheme, __package__) + klass = module.__downloader__ + except (ImportError, AttributeError, TypeError): + pass + _cache[scheme] = klass + return klass + + +# -------------------------------------------------------------------- +# internals + +_cache = {} diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py new file mode 100644 index 0000000..4803c85 --- /dev/null +++ b/gallery_dl/downloader/common.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Common classes and constants used by downloader modules.""" + +import os +import time +import logging +from .. import config, util, exception +from requests.exceptions import RequestException +from ssl import SSLError + + +class DownloaderBase(): + """Base class for downloaders""" + scheme = "" + retries = 1 + + def __init__(self, extractor, output): + self.session = extractor.session + self.out = output + self.log = logging.getLogger("downloader." + self.scheme) + self.downloading = False + self.part = self.config("part", True) + self.partdir = self.config("part-directory") + + if self.partdir: + self.partdir = util.expand_path(self.partdir) + os.makedirs(self.partdir, exist_ok=True) + + def config(self, key, default=None): + """Interpolate config value for 'key'""" + return config.interpolate(("downloader", self.scheme, key), default) + + def download(self, url, pathfmt): + """Download the resource at 'url' and write it to a file-like object""" + try: + return self.download_impl(url, pathfmt) + except Exception: + print() + raise + finally: + # remove file from incomplete downloads + if self.downloading and not self.part: + try: + os.remove(pathfmt.temppath) + except (OSError, AttributeError): + pass + + def download_impl(self, url, pathfmt): + """Actual implementaion of the download process""" + adj_ext = None + tries = 0 + msg = "" + + if self.part: + pathfmt.part_enable(self.partdir) + + while True: + self.reset() + if tries: + self.log.warning("%s (%d/%d)", msg, tries, self.retries) + if tries >= self.retries: + return False + time.sleep(tries) + tries += 1 + + # check for .part file + filesize = pathfmt.part_size() + + # connect to (remote) source + try: + offset, size = self.connect(url, filesize) + except exception.DownloadRetry as exc: + msg = exc + continue + except exception.DownloadComplete: + break + except Exception as exc: + self.log.warning(exc) + return False + + # check response + if not offset: + mode = "w+b" + if filesize: + self.log.info("Unable to resume partial download") + else: + mode = "r+b" + self.log.info("Resuming download at byte %d", offset) + + # set missing filename extension + if not pathfmt.has_extension: + pathfmt.set_extension(self.get_extension()) + if pathfmt.exists(): + pathfmt.temppath = "" + return True + + self.out.start(pathfmt.path) + self.downloading = True + with pathfmt.open(mode) as file: + if offset: + file.seek(offset) + + # download content + try: + self.receive(file) + except (RequestException, 
SSLError) as exc: + msg = exc + print() + continue + + # check filesize + if size and file.tell() < size: + msg = "filesize mismatch ({} < {})".format( + file.tell(), size) + continue + + # check filename extension + adj_ext = self._check_extension(file, pathfmt) + + break + + self.downloading = False + if adj_ext: + pathfmt.set_extension(adj_ext) + return True + + def connect(self, url, offset): + """Connect to 'url' while respecting 'offset' if possible + + Returns a 2-tuple containing the actual offset and expected filesize. + If the returned offset-value is greater than zero, all received data + will be appended to the existing .part file. + Return '0' as second tuple-field to indicate an unknown filesize. + """ + + def receive(self, file): + """Write data to 'file'""" + + def reset(self): + """Reset internal state / cleanup""" + + def get_extension(self): + """Return a filename extension appropriate for the current request""" + + @staticmethod + def _check_extension(file, pathfmt): + """Check filename extension against fileheader""" + extension = pathfmt.keywords["extension"] + if extension in FILETYPE_CHECK: + file.seek(0) + header = file.read(8) + if len(header) >= 8 and not FILETYPE_CHECK[extension](header): + for ext, check in FILETYPE_CHECK.items(): + if ext != extension and check(header): + return ext + return None + + +FILETYPE_CHECK = { + "jpg": lambda h: h[0:2] == b"\xff\xd8", + "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", + "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97, +} diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py new file mode 100644 index 0000000..961c1a2 --- /dev/null +++ b/gallery_dl/downloader/http.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Downloader module for http:// and https:// URLs""" + +import time +import mimetypes +from requests.exceptions import ConnectionError, Timeout +from .common import DownloaderBase +from .. 
import text, exception + + +class HttpDownloader(DownloaderBase): + scheme = "http" + + def __init__(self, extractor, output): + DownloaderBase.__init__(self, extractor, output) + self.response = None + self.retries = self.config("retries", extractor._retries) + self.timeout = self.config("timeout", extractor._timeout) + self.verify = self.config("verify", extractor._verify) + self.rate = self.config("rate") + self.chunk_size = 16384 + + if self.rate: + self.rate = text.parse_bytes(self.rate) + if not self.rate: + self.log.warning("Invalid rate limit specified") + elif self.rate < self.chunk_size: + self.chunk_size = self.rate + + def connect(self, url, offset): + headers = {} + if offset: + headers["Range"] = "bytes={}-".format(offset) + + try: + self.response = self.session.request( + "GET", url, stream=True, headers=headers, allow_redirects=True, + timeout=self.timeout, verify=self.verify) + except (ConnectionError, Timeout) as exc: + raise exception.DownloadRetry(exc) + + code = self.response.status_code + if code == 200: # OK + offset = 0 + size = self.response.headers.get("Content-Length") + elif code == 206: # Partial Content + size = self.response.headers["Content-Range"].rpartition("/")[2] + elif code == 416: # Requested Range Not Satisfiable + raise exception.DownloadComplete() + elif code == 429 or 500 <= code < 600: # Server Error + raise exception.DownloadRetry( + "{} Server Error: {} for url: {}".format( + code, self.response.reason, url)) + else: + self.response.raise_for_status() + + return offset, text.parse_int(size) + + def receive(self, file): + if self.rate: + total = 0 # total amount of bytes received + start = time.time() # start time + + for data in self.response.iter_content(self.chunk_size): + file.write(data) + + if self.rate: + total += len(data) + expected = total / self.rate # expected elapsed time + delta = time.time() - start # actual elapsed time since start + if delta < expected: + # sleep if less time passed than expected + time.sleep(expected - delta) + + def reset(self): + if self.response: + self.response.close() + self.response = None + + def get_extension(self): + mtype = self.response.headers.get("Content-Type", "image/jpeg") + mtype = mtype.partition(";")[0] + + if mtype in MIMETYPE_MAP: + return MIMETYPE_MAP[mtype] + + exts = mimetypes.guess_all_extensions(mtype, strict=False) + if exts: + exts.sort() + return exts[-1][1:] + + self.log.warning( + "No filename extension found for MIME type '%s'", mtype) + return "txt" + + +MIMETYPE_MAP = { + "image/jpeg": "jpg", + "image/jpg": "jpg", + "image/png": "png", + "image/gif": "gif", + "image/bmp": "bmp", + "image/webp": "webp", + "image/svg+xml": "svg", + + "video/webm": "webm", + "video/ogg": "ogg", + "video/mp4": "mp4", + + "audio/wav": "wav", + "audio/x-wav": "wav", + "audio/webm": "webm", + "audio/ogg": "ogg", + "audio/mpeg": "mp3", + + "application/ogg": "ogg", + "application/octet-stream": "bin", +} + + +__downloader__ = HttpDownloader diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py new file mode 100644 index 0000000..ca33863 --- /dev/null +++ b/gallery_dl/downloader/text.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
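The rate limiting in HttpDownloader.receive() above is a simple average-rate throttle: after each chunk it compares the actual elapsed time against the time a transfer at exactly 'rate' bytes per second would have needed, and sleeps off the difference. A standalone sketch of the same idea (function and variable names here are illustrative, not part of this patch):

    import time

    def throttle(chunks, rate):
        """Yield 'chunks' while keeping average throughput <= rate bytes/s."""
        total = 0
        start = time.time()
        for data in chunks:
            yield data
            total += len(data)
            expected = total / rate      # minimum elapsed time at full rate
            delta = time.time() - start  # actual elapsed time
            if delta < expected:
                time.sleep(expected - delta)

Note that the constructor also clamps chunk_size down to the configured rate, so a single oversized chunk cannot overshoot the per-second budget by much.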
+ +"""Downloader module for text: URLs""" + +from .common import DownloaderBase + + +class TextDownloader(DownloaderBase): + scheme = "text" + + def __init__(self, extractor, output): + DownloaderBase.__init__(self, extractor, output) + self.content = b"" + + def connect(self, url, offset): + data = url.encode() + self.content = data[offset + 5:] + return offset, len(data) - 5 + + def receive(self, file): + file.write(self.content) + + def reset(self): + self.content = b"" + + @staticmethod + def get_extension(): + return "txt" + + +__downloader__ = TextDownloader diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py new file mode 100644 index 0000000..57a84d0 --- /dev/null +++ b/gallery_dl/downloader/ytdl.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Downloader module for URLs requiring youtube-dl support""" + +from youtube_dl import YoutubeDL +from .common import DownloaderBase +from .. import text +import os + + +class YoutubeDLDownloader(DownloaderBase): + scheme = "ytdl" + + def __init__(self, extractor, output): + DownloaderBase.__init__(self, extractor, output) + + options = { + "format": self.config("format") or None, + "ratelimit": text.parse_bytes(self.config("rate"), None), + "retries": self.config("retries", extractor._retries), + "socket_timeout": self.config("timeout", extractor._timeout), + "nocheckcertificate": not self.config("verify", extractor._verify), + "nopart": not self.part, + } + options.update(self.config("raw-options") or {}) + + if self.config("logging", True): + options["logger"] = self.log + + self.ytdl = YoutubeDL(options) + + def download(self, url, pathfmt): + try: + info_dict = self.ytdl.extract_info(url[5:], download=False) + except Exception: + return False + + if "entries" in info_dict: + index = pathfmt.keywords.get("_ytdl_index") + if index is None: + return self._download_playlist(pathfmt, info_dict) + else: + info_dict = info_dict["entries"][index] + return self._download_video(pathfmt, info_dict) + + def _download_video(self, pathfmt, info_dict): + if "url" in info_dict: + text.nameext_from_url(info_dict["url"], pathfmt.keywords) + pathfmt.set_extension(info_dict["ext"]) + if pathfmt.exists(): + pathfmt.temppath = "" + return True + if self.part and self.partdir: + pathfmt.temppath = os.path.join( + self.partdir, pathfmt.filename) + self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%") + + self.out.start(pathfmt.path) + try: + self.ytdl.process_info(info_dict) + except Exception: + self.log.debug("Traceback", exc_info=True) + return False + return True + + def _download_playlist(self, pathfmt, info_dict): + pathfmt.set_extension("%(playlist_index)s.%(ext)s") + self.ytdl.params["outtmpl"] = pathfmt.realpath + + for entry in info_dict["entries"]: + self.ytdl.process_info(entry) + return True + + +__downloader__ = YoutubeDLDownloader diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py new file mode 100644 index 0000000..3e86177 --- /dev/null +++ b/gallery_dl/exception.py @@ -0,0 +1,79 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Exception classes used by gallery-dl + +Class Hierarchy: + +Exception + +-- GalleryDLException + +-- ExtractionError + | +-- AuthenticationError + | +-- AuthorizationError + | +-- NotFoundError + | +-- HttpError + +-- DownloadError + | +-- DownloadComplete + | +-- DownloadRetry + +-- NoExtractorError + +-- FormatError + +-- FilterError + +-- StopExtraction +""" + + +class GalleryDLException(Exception): + """Base class for GalleryDL exceptions""" + + +class ExtractionError(GalleryDLException): + """Base class for exceptions during information extraction""" + + +class AuthenticationError(ExtractionError): + """Invalid or missing login information""" + + +class AuthorizationError(ExtractionError): + """Insufficient privileges to access a resource""" + + +class NotFoundError(ExtractionError): + """Requested resource (gallery/image) does not exist""" + + +class HttpError(ExtractionError): + """HTTP request during extraction failed""" + + +class DownloadError(GalleryDLException): + """Base class for exceptions during file downloads""" + + +class DownloadRetry(DownloadError): + """Download attempt failed and should be retried""" + + +class DownloadComplete(DownloadError): + """Output file of attempted download is already complete""" + + +class NoExtractorError(GalleryDLException): + """No extractor can handle the given URL""" + + +class FormatError(GalleryDLException): + """Error while building output path""" + + +class FilterError(GalleryDLException): + """Error while evaluating a filter expression""" + + +class StopExtraction(GalleryDLException): + """Extraction should stop""" diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py new file mode 100644 index 0000000..8df8645 --- /dev/null +++ b/gallery_dl/extractor/2chan.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.2chan.net/""" + +from .common import Extractor, Message +from .. 
import text


class FutabaThreadExtractor(Extractor):
    """Extractor for images from threads on www.2chan.net"""
    category = "2chan"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board_name}", "{thread}")
    filename_fmt = "{tim}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
    pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
    test = ("http://dec.2chan.net/70/res/947.htm", {
        "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
        "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
    })

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.server, self.board, self.thread = match.groups()

    def items(self):
        url = "https://{}.2chan.net/{}/res/{}.htm".format(
            self.server, self.board, self.thread)
        page = self.request(url).text
        data = self.metadata(page)
        yield Message.Version, 1
        yield Message.Directory, data
        for post in self.posts(page):
            if "filename" not in post:
                continue
            post.update(data)
            url = self.url_fmt.format_map(post)
            yield Message.Url, url, post

    def metadata(self, page):
        """Collect metadata for extractor-job"""
        title = text.extract(page, "<title>", "</title>")[0]
        title, _, boardname = title.rpartition(" - ")
        return {
            "server": self.server,
            "title": title,
            "board": self.board,
            "board_name": boardname[:-4],
            "thread": self.thread,
        }

    def posts(self, page):
        """Build a list of all post-objects"""
        page = text.extract(
            page, '<div class="thre"', '<div style="clear:left;">')[0]
        return [
            self.parse(post)
            for post in page.split('<table border=0>')
        ]

    def parse(self, post):
        """Build post-object by extracting data from an HTML post"""
        data = self._extract_post(post)
        if '<a href="/' in post:
            self._extract_image(post, data)
            data["tim"], _, data["extension"] = data["filename"].partition(".")
            data["time"] = data["tim"][:-3]
        return data

    @staticmethod
    def _extract_post(post):
        return text.extract_all(post, (
            ("post", 'class="csb">' , '<'),
            ("name", 'class="cnm">' , '<'),
            ("now" , 'class="cnw">' , '<'),
            ("no"  , 'class="cno">No.', '<'),
            (None  , '<blockquote', ''),
            ("com" , '>', '</blockquote>'),
        ))[0]

    @staticmethod
    def _extract_image(post, data):
        text.extract_all(post, (
            (None      , '_blank', ''),
            ("filename", '>', '<'),
            ("fsize"   , '(', ' '),
        ), 0, data)
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
new file mode 100644
index 0000000..50dbfe8
--- /dev/null
+++ b/gallery_dl/extractor/35photo.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://35photo.pro/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _35photoExtractor(Extractor):
+    category = "35photo"
+    directory_fmt = ("{category}", "{user}")
+    filename_fmt = "{id}{title:?_//}_{num:>02}.{extension}"
+    archive_fmt = "{id}_{num}"
+    root = "https://35photo.pro"
+
+    def items(self):
+        first = True
+        data = self.metadata()
+        yield Message.Version, 1
+
+        for photo_id in self.photos():
+            for photo in self._photo_data(photo_id):
+                photo.update(data)
+                url = photo["url"]
+                if first:
+                    first = False
+                    yield Message.Directory, photo
+                yield Message.Url, url, text.nameext_from_url(url, photo)
+
+    def metadata(self):
+        """Returns general metadata"""
+        return {}
+
+    def photos(self):
+        """Returns an iterable containing all relevant photo IDs"""
+
+    def _pagination(self, params, extra_ids=None):
+        url = "https://35photo.pro/show_block.php"
+        headers = {"Referer": self.root, "X-Requested-With": "XMLHttpRequest"}
+        params["type"] = "getNextPageData"
+
+        if "lastId" not in params:
+            params["lastId"] = "999999999"
+        if extra_ids:
+            yield from extra_ids
+        while params["lastId"]:
+            data = self.request(url, headers=headers, params=params).json()
+            yield from self._photo_ids(data["data"])
+            params["lastId"] = data["lastId"]
+
+    def _photo_data(self, photo_id):
+        params = {"method": "photo.getData", "photoId": photo_id}
+        data = self.request(
+            "https://api.35photo.pro/", params=params).json()["data"][photo_id]
+        info = {
+            "url"        : data["src"],
+            "id"         : data["photo_id"],
+            "title"      : data["photo_name"],
+            "description": data["photo_desc"],
+            "tags"       : data["tags"] or [],
+            "views"      : data["photo_see"],
+            "favorites"  : data["photo_fav"],
+            "score"      : data["photo_rating"],
+            "type"       : data["photo_type"],
+            "date"       : data["timeAdd"],
+            "user"       : data["user_login"],
+            "user_id"    : data["user_id"],
+            "user_name"  : data["user_name"],
+            "other"      : data["otherData"],
+        }
+
+        if "series" in data:
+            for info["num"], photo in enumerate(data["series"], 1):
+                info["url"] = photo["src"]
+                info["id_series"] = text.parse_int(photo["id"])
+                info["title_series"] = photo["title"] or ""
+                yield info.copy()
+        else:
+            info["num"] = 1
+            yield info
+
+    @staticmethod
+    def _photo_ids(page):
+        """Extract unique photo IDs and return them as sorted list"""
+        # searching for photo-id="..."
doesn't always work (see unit tests) + return sorted( + set(text.extract_iter(page, "/photo_", "/")), + key=text.parse_int, + reverse=True, + ) + + +class _35photoUserExtractor(_35photoExtractor): + """Extractor for all images of a user on 35photo.pro""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro" + r"/(?!photo_|genre_)([^/?&#]+)") + test = ( + ("https://35photo.pro/liya", { + "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg", + "count": 9, + }), + ("https://35photo.pro/suhoveev", { + # last photo ID (1267028) isn't given as 'photo-id="" + # there are only 23 photos without the last one + "count": ">= 33", + }), + ("https://en.35photo.pro/liya"), + ("https://ru.35photo.pro/liya"), + ) + + def __init__(self, match): + _35photoExtractor.__init__(self, match) + self.user = match.group(1) + self.user_id = 0 + + def metadata(self): + url = "{}/{}/".format(self.root, self.user) + page = self.request(url).text + self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0]) + return { + "user": self.user, + "user_id": self.user_id, + } + + def photos(self): + return self._pagination({ + "page": "photoUser", + "user_id": self.user_id, + }) + + +class _35photoGenreExtractor(_35photoExtractor): + """Extractor for images of a specific genre on 35photo.pro""" + subcategory = "genre" + directory_fmt = ("{category}", "Genre", "{genre}") + archive_fmt = "g{genre_id}_{id}_{num}" + pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/genre_(\d+)(/new/)?" + test = ( + ("https://35photo.pro/genre_109/", { + "range": "1-30", + }), + ("https://35photo.pro/genre_109/new/"), + ) + + def __init__(self, match): + _35photoExtractor.__init__(self, match) + self.genre_id, self.new = match.groups() + self.photo_ids = None + + def metadata(self): + url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/") + page = self.request(url).text + self.photo_ids = self._photo_ids(text.extract( + page, ' class="photo', '\n')[0]) + return { + "genre": text.extract(page, " genre - ", ". ")[0], + "genre_id": text.parse_int(self.genre_id), + } + + def photos(self): + return self._pagination({ + "page": "genre", + "community_id": self.genre_id, + "photo_rating": "0" if self.new else "50", + "lastId": self.photo_ids[-1], + }, self.photo_ids) + + +class _35photoImageExtractor(_35photoExtractor): + """Extractor for individual images from 35photo.pro""" + subcategory = "image" + pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/photo_(\d+)" + test = ("https://35photo.pro/photo_753340/", { + "count": 1, + "keyword": { + "url" : r"re:https://m\d+.35photo.pro/photos_main/.*.jpg", + "id" : 753340, + "title" : "Winter walk", + "description": str, + "tags" : list, + "views" : int, + "favorites" : int, + "score" : int, + "type" : 0, + "date" : "15 авг, 2014", + "user" : "liya", + "user_id" : 20415, + "user_name" : "Liya Mirzaeva", + "other" : str, + }, + }) + + def __init__(self, match): + _35photoExtractor.__init__(self, match) + self.photo_id = match.group(1) + + def photos(self): + return (self.photo_id,) diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py new file mode 100644 index 0000000..d0e59ad --- /dev/null +++ b/gallery_dl/extractor/3dbooru.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
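The _pagination() helper in the 35photo extractor above implements cursor-based paging: every show_block.php response carries a lastId value that is fed back into the next request, and an empty lastId ends the loop. A self-contained sketch of that protocol with plain requests (parameter names follow the extractor; error handling is omitted):

    import requests

    def photo_pages(user_id, last_id="999999999"):
        """Yield HTML fragments of a user's photo stream, page by page."""
        url = "https://35photo.pro/show_block.php"
        headers = {"X-Requested-With": "XMLHttpRequest"}
        while last_id:
            params = {"type": "getNextPageData", "page": "photoUser",
                      "user_id": user_id, "lastId": last_id}
            data = requests.get(url, headers=headers, params=params).json()
            yield data["data"]        # markup containing /photo_<id>/ links
            last_id = data["lastId"]  # falsy once the stream is exhausted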
+ +"""Extract images from http://behoimi.org/""" + +from . import booru + + +class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): + """Base class for 3dbooru extractors""" + category = "3dbooru" + api_url = "http://behoimi.org/post/index.json" + post_url = "http://behoimi.org/post/show/{}" + page_limit = 1000 + + def __init__(self, match): + super().__init__(match) + self.session.headers.update({ + "Referer": "http://behoimi.org/post/show/", + "Accept-Encoding": "identity", + }) + + +class ThreedeebooruTagExtractor(booru.TagMixin, + ThreedeebooruExtractor): + """Extractor for images from behoimi.org based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org/post" + r"(?:/(?:index)?)?\?tags=(?P[^&#]+)") + test = ("http://behoimi.org/post?tags=himekawa_azuru+dress", { + "url": "ecb30c6aaaf8a6ff8f55255737a9840832a483c1", + "content": "11cbda40c287e026c1ce4ca430810f761f2d0b2a", + }) + + +class ThreedeebooruPoolExtractor(booru.PoolMixin, + ThreedeebooruExtractor): + """Extractor for image-pools from behoimi.org""" + pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P\d+)" + test = ("http://behoimi.org/pool/show/27", { + "url": "da75d2d1475449d5ef0c266cb612683b110a30f2", + "content": "fd5b37c5c6c2de4b4d6f1facffdefa1e28176554", + }) + + +class ThreedeebooruPostExtractor(booru.PostMixin, + ThreedeebooruExtractor): + """Extractor for single images from behoimi.org""" + pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P\d+)" + test = ("http://behoimi.org/post/show/140852", { + "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6", + "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4", + "options": (("tags", True),), + "keyword": { + "tags_character": "furude_rika", + "tags_copyright": "higurashi_no_naku_koro_ni", + "tags_model": "himekawa_azuru", + "tags_general": str, + }, + }) + + +class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin, + ThreedeebooruExtractor): + """Extractor for popular images from behoimi.org""" + pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org" + r"/post/popular_(?Pby_(?:day|week|month)|recent)" + r"(?:\?(?P[^#]*))?") + test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", { + "url": "c70268dce441a9ccc3383c244ec15edb059f494f", + "count": 20, + }) + + def __init__(self, match): + super().__init__(match) + self.api_url = "http://behoimi.org/post/popular_{scale}.json".format( + scale=self.scale) diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py new file mode 100644 index 0000000..e387b33 --- /dev/null +++ b/gallery_dl/extractor/4chan.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images and videos from https://www.4chan.org/""" + +from . import chan +from .. 
import text + + +class FourchanThreadExtractor(chan.ChanThreadExtractor): + """Extractor for images from threads from 4chan.org""" + category = "4chan" + pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org" + r"/([^/]+)/thread/(\d+)") + test = ( + ("https://boards.4chan.org/tg/thread/15396072/", { + "url": "39082ad166161966d7ba8e37f2173a824eb540f0", + "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a", + "content": "20b7b51afa51c9c31a0020a0737b889532c8d7ec", + }), + ("https://boards.4channel.org/tg/thread/15396072/", { + "url": "39082ad166161966d7ba8e37f2173a824eb540f0", + "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a", + }), + ) + api_url = "https://a.4cdn.org/{board}/thread/{thread}.json" + file_url = "https://i.4cdn.org/{board}/{tim}{ext}" + + def update(self, post, data=None): + chan.ChanThreadExtractor.update(self, post, data) + post["filename"] = text.unescape(post["filename"]) diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py new file mode 100644 index 0000000..00b8ab5 --- /dev/null +++ b/gallery_dl/extractor/500px.py @@ -0,0 +1,238 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://500px.com/""" + +from .common import Extractor, Message +from .. import text + + +class _500pxExtractor(Extractor): + """Base class for 500px extractors""" + category = "500px" + directory_fmt = ("{category}", "{user[username]}") + filename_fmt = "{id}_{name}.{extension}" + archive_fmt = "{id}" + root = "https://500px.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.session.headers["Referer"] = self.root + "/" + + def items(self): + first = True + data = self.metadata() + yield Message.Version, 1 + + for photo in self.photos(): + url = photo["images"][-1]["url"] + fmt = photo["image_format"] + photo["extension"] = "jpg" if fmt == "jpeg" else fmt + if data: + photo.update(data) + if first: + first = False + yield Message.Directory, photo + yield Message.Url, url, photo + + def metadata(self): + """Returns general metadata""" + + def photos(self): + """Returns an iterable containing all relevant photo IDs""" + + def _extend(self, photos): + """Extend photos with additional metadata and higher resolution URLs""" + url = "https://api.500px.com/v1/photos" + params = { + "expanded_user_info" : "true", + "include_tags" : "true", + "include_geo" : "true", + "include_equipment_info": "true", + "vendor_photos" : "true", + "include_licensing" : "true", + "include_releases" : "true", + "liked_by" : "1", + "following_sample" : "100", + "image_size" : "32768", + "ids" : ",".join(str(p["id"]) for p in photos), + } + + data = self._api_call(url, params)["photos"] + for photo in photos: + pid = str(photo["id"]) + photo.update(data[pid]) + return photos + + def _api_call(self, url, params, csrf_token=None): + headers = {"Origin": self.root, "X-CSRF-Token": csrf_token} + return self.request(url, headers=headers, params=params).json() + + def _pagination(self, url, params, csrf): + params["page"] = 1 + while True: + data = self._api_call(url, params, csrf) + yield from self._extend(data["photos"]) + + if params["page"] >= data["total_pages"]: + return + params["page"] += 1 + + +class _500pxUserExtractor(_500pxExtractor): + """Extractor for photos from a user's photostream on 500px.com""" + subcategory = "user" + pattern = 
(r"(?:https?://)?500px\.com" + r"/(?!photo/)([^/?&#]+)/?(?:$|\?|#)") + test = ("https://500px.com/light_expression_photography", { + "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D5000/v2", + "range": "1-99", + "count": 99, + }) + + def __init__(self, match): + _500pxExtractor.__init__(self, match) + self.user = match.group(1) + + def photos(self): + # get csrf token and user id from webpage + url = "{}/{}".format(self.root, self.user) + page = self.request(url).text + csrf_token, pos = text.extract(page, 'csrf-token" content="', '"') + user_id , pos = text.extract(page, '/user/', '"', pos) + + # get user photos + url = "https://api.500px.com/v1/photos" + params = { + "feature" : "user", + "stream" : "photos", + "rpp" : "50", + "user_id" : user_id, + } + return self._pagination(url, params, csrf_token) + + +class _500pxGalleryExtractor(_500pxExtractor): + """Extractor for photo galleries on 500px.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user[username]}", "{gallery[name]}") + pattern = (r"(?:https?://)?500px\.com" + r"/(?!photo/)([^/?&#]+)/galleries/([^/?&#]+)") + test = ("https://500px.com/fashvamp/galleries/lera", { + "url": "8a520272ece83278166b4f8556f9c9da43c43c45", + "count": 3, + "keyword": { + "gallery": dict, + "user": dict, + }, + }) + + def __init__(self, match): + _500pxExtractor.__init__(self, match) + self.user_name, self.gallery_name = match.groups() + self.user_id = self.gallery_id = self.csrf_token = None + + def metadata(self): + # get csrf token and user id from webpage + url = "{}/{}/galleries/{}".format( + self.root, self.user_name, self.gallery_name) + page = self.request(url).text + self.csrf_token, pos = text.extract(page, 'csrf-token" content="', '"') + self.user_id , pos = text.extract(page, 'App.CuratorId =', '\n', pos) + self.user_id = self.user_id.strip() + + # get gallery metadata; transform gallery name into id + url = "https://api.500px.com/v1/users/{}/galleries/{}".format( + self.user_id, self.gallery_name) + params = { + # "include_user": "true", + "include_cover": "1", + "cover_size": "2048", + } + data = self._api_call(url, params, self.csrf_token) + self.gallery_id = data["gallery"]["id"] + return data + + def photos(self): + url = "https://api.500px.com/v1/users/{}/galleries/{}/items".format( + self.user_id, self.gallery_id) + params = { + "sort" : "position", + "sort_direction" : "asc", + "rpp" : "50", + } + return self._pagination(url, params, self.csrf_token) + + +class _500pxImageExtractor(_500pxExtractor): + """Extractor for individual images from 500px.com""" + subcategory = "image" + pattern = r"(?:https?://)?500px\.com/photo/(\d+)" + test = ("https://500px.com/photo/222049255/queen-of-coasts", { + "url": "d1eda7afeaa589f71f05b9bb5c0694e3ffb357cd", + "count": 1, + "keyword": { + "camera": "Canon EOS 600D", + "camera_info": dict, + "collections_count": int, + "comments": list, + "comments_count": int, + "converted": False, + "converted_bits": int, + "created_at": "2017-08-01T04:40:05-04:00", + "crop_version": 0, + "description": str, + "editored_by": dict, + "editors_choice": False, + "extension": "jpg", + "favorites_count": int, + "feature": "popular", + "feature_date": "2017-08-01T09:58:28+00:00", + "focal_length": "208", + "height": 3111, + "id": 222049255, + "image_format": "jpeg", + "image_url": str, + "images": list, + "iso": "100", + "lens": "EF-S55-250mm f/4-5.6 IS II", + "lens_info": dict, + "license_type": 0, + "licensed_at": None, + "liked": False, + "location": None, + "location_details": dict, + "name": 
"Queen Of Coasts", + "nsfw": False, + "privacy": False, + "profile": True, + "rating": float, + "sales_count": int, + "status": 1, + "store_download": False, + "store_height": 3111, + "store_width": 4637, + "tags": list, + "taken_at": "2017-05-04T13:36:51-04:00", + "times_viewed": int, + "url": "/photo/222049255/queen-of-coasts-by-olesya-nabieva", + "user": dict, + "user_id": 12847235, + "votes_count": int, + "watermark": True, + "width": 4637, + }, + }) + + def __init__(self, match): + _500pxExtractor.__init__(self, match) + self.photo_id = match.group(1) + + def photos(self): + photos = ({"id": self.photo_id},) + return self._extend(photos) diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py new file mode 100644 index 0000000..e526da3 --- /dev/null +++ b/gallery_dl/extractor/8chan.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images and videos from https://8ch.net/""" + +from . import chan + + +class InfinitychanThreadExtractor(chan.ChanThreadExtractor): + """Extractor for images from threads from 8ch.net""" + category = "8chan" + filename_fmt = "{time}-{filename}{ext}" + pattern = r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)" + test = ("https://8ch.net/builders/res/3.html", { + "url": "5d85c0509f907f217aea379f862b41bf3d01f645", + "keyword": "0c497190c0c0f826925fde09815351d01869c783", + }) + api_url = "https://8ch.net/{board}/res/{thread}.json" + file_url = "https://media.8ch.net/{board}/src/{tim}{ext}" + file_url_v2 = "https://media.8ch.net/file_store/{tim}{ext}" + + def build_url(self, post): + fmt = self.file_url if len(post["tim"]) < 64 else self.file_url_v2 + return fmt.format_map(post) diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py new file mode 100644 index 0000000..6fbf6b5 --- /dev/null +++ b/gallery_dl/extractor/8muses.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.8muses.com/""" + +from .common import Extractor, Message +from .. 
import text +import json + + +class _8musesAlbumExtractor(Extractor): + """Extractor for image albums on www.8muses.com""" + category = "8muses" + subcategory = "album" + directory_fmt = ("{category}", "{album[path]}") + filename_fmt = "{page:>03}.{extension}" + archive_fmt = "{hash}" + root = "https://www.8muses.com" + pattern = (r"(?:https?://)?(?:www\.)?8muses\.com" + r"(/comics/album/[^?&#]+)(\?[^#]+)?") + test = ( + ("https://www.8muses.com/comics/album/Fakku-Comics/santa/Im-Sorry", { + "url": "82449d6a26a29204695cba5d52c3ec60170bc159", + "keyword": { + "url" : str, + "hash" : str, + "page" : int, + "count": 16, + "album": { + "id" : 10457, + "title" : "Im Sorry", + "path" : "Fakku Comics/santa/Im Sorry", + "private": False, + "url" : str, + "parent" : 10454, + "views" : int, + "likes" : int, + "date" : "type:datetime", + }, + }, + }), + ("https://www.8muses.com/comics/album/Fakku-Comics/santa", { + "count": ">= 3", + "pattern": pattern, + "keyword": { + "url" : str, + "name" : str, + "private": False, + }, + }), + ("https://www.8muses.com/comics/album/Fakku-Comics/6?sort=az", { + "count": ">= 70", + "keyword": {"name": r"re:^[S-Zs-z]"}, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + self.params = match.group(2) or "" + + def items(self): + url = self.root + self.path + self.params + + while True: + data = self._unobfuscate(text.extract( + self.request(url).text, + 'id="ractive-public" type="text/plain">', '')[0]) + + images = data.get("pictures") + if images: + count = len(images) + album = self._make_album(data["album"]) + yield Message.Directory, {"album": album, "count": count} + for num, image in enumerate(images, 1): + url = self.root + "/image/fl/" + image["publicUri"] + img = { + "url" : url, + "page" : num, + "hash" : image["publicUri"], + "count" : count, + "album" : album, + "extension": "jpg", + } + yield Message.Url, url, img + + albums = data.get("albums") + if albums: + for album in albums: + url = self.root + "/comics/album/" + album["permalink"] + album = { + "url" : url, + "name" : album["name"], + "private": album["isPrivate"], + } + yield Message.Queue, url, album + + if data["page"] >= data["pages"]: + return + path, _, num = self.path.rstrip("/").rpartition("/") + path = path if num.isdecimal() else self.path + url = "{}{}/{}{}".format( + self.root, path, data["page"] + 1, self.params) + + def _make_album(self, album): + return { + "id" : album["id"], + "path" : album["path"], + "title" : album["name"], + "private": album["isPrivate"], + "url" : self.root + album["permalink"], + "parent" : text.parse_int(album["parentId"]), + "views" : text.parse_int(album["numberViews"]), + "likes" : text.parse_int(album["numberLikes"]), + "date" : text.parse_datetime( + album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"), + } + + @staticmethod + def _unobfuscate(data): + return json.loads("".join([ + chr(33 + (ord(c) + 14) % 94) if c != " " else c + for c in text.unescape(data.strip("\t\n\r !")) + ])) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py new file mode 100644 index 0000000..81d480e --- /dev/null +++ b/gallery_dl/extractor/__init__.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
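The _unobfuscate() helper in the 8muses extractor above is a ROT47 decoder in disguise: re-basing to the 94 printable ASCII code points (33-126) and adding 14 works out to a net shift of 47, half the ring, so the mapping is its own inverse and the same function both encodes and decodes. A short demonstration (rot47 is an illustrative name, not from this patch):

    def rot47(s):
        """Self-inverse shift-by-47 over the printable ASCII range."""
        return "".join(
            chr(33 + (ord(c) + 14) % 94) if c != " " else c
            for c in s
        )

    sample = '{"album": {"id": 10457}}'
    assert rot47(rot47(sample)) == sample  # applying it twice is a no-op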
+ +import re +import importlib + +modules = [ + "2chan", + "35photo", + "3dbooru", + "4chan", + "500px", + "8chan", + "8muses", + "artstation", + "behance", + "bobx", + "danbooru", + "deviantart", + "dynastyscans", + "e621", + "exhentai", + "fallenangels", + "flickr", + "gelbooru", + "gfycat", + "hbrowse", + "hentai2read", + "hentaicafe", + "hentaifoundry", + "hentaifox", + "hentaihere", + "hentainexus", + "hitomi", + "hypnohub", + "idolcomplex", + "imagebam", + "imagefap", + "imgbox", + "imgth", + "imgur", + "instagram", + "keenspot", + "khinsider", + "kissmanga", + "komikcast", + "konachan", + "livedoor", + "luscious", + "mangadex", + "mangafox", + "mangahere", + "mangapanda", + "mangapark", + "mangareader", + "mangastream", + "mangoxo", + "myportfolio", + "newgrounds", + "ngomik", + "nhentai", + "nijie", + "nsfwalbum", + "paheal", + "patreon", + "photobucket", + "piczel", + "pinterest", + "pixiv", + "pixnet", + "plurk", + "pornhub", + "pururin", + "reactor", + "readcomiconline", + "reddit", + "rule34", + "safebooru", + "sankaku", + "sankakucomplex", + "seiga", + "senmanga", + "sexcom", + "simplyhentai", + "slickpic", + "slideshare", + "smugmug", + "tsumino", + "tumblr", + "twitter", + "vanillarock", + "wallhaven", + "warosu", + "weibo", + "wikiart", + "xhamster", + "xvideos", + "yandere", + "yaplog", + "yuki", + "foolfuuka", + "foolslide", + "mastodon", + "shopify", + "imagehosts", + "directlink", + "recursive", + "oauth", + "test", +] + + +def find(url): + """Find a suitable extractor for the given URL""" + for cls in _list_classes(): + match = cls.pattern.match(url) + if match and cls not in _blacklist: + return cls(match) + return None + + +def add(cls): + """Add 'cls' to the list of available extractors""" + cls.pattern = re.compile(cls.pattern) + _cache.append(cls) + return cls + + +def add_module(module): + """Add all extractors in 'module' to the list of available extractors""" + classes = _get_classes(module) + for cls in classes: + cls.pattern = re.compile(cls.pattern) + _cache.extend(classes) + return classes + + +def extractors(): + """Yield all available extractor classes""" + return sorted( + _list_classes(), + key=lambda x: x.__name__ + ) + + +class blacklist(): + """Context Manager to blacklist extractor modules""" + def __init__(self, categories, extractors=None): + self.extractors = extractors or [] + for cls in _list_classes(): + if cls.category in categories: + self.extractors.append(cls) + + def __enter__(self): + _blacklist.update(self.extractors) + + def __exit__(self, etype, value, traceback): + _blacklist.clear() + + +# -------------------------------------------------------------------- +# internals + +_cache = [] +_blacklist = set() +_module_iter = iter(modules) + + +def _list_classes(): + """Yield all available extractor classes""" + yield from _cache + + for module_name in _module_iter: + module = importlib.import_module("."+module_name, __package__) + yield from add_module(module) + + +def _get_classes(module): + """Return a list of all extractor classes in a module""" + return [ + cls for cls in module.__dict__.values() if ( + hasattr(cls, "pattern") and cls.__module__ == module.__name__ + ) + ] diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py new file mode 100644 index 0000000..24197ad --- /dev/null +++ b/gallery_dl/extractor/artstation.py @@ -0,0 +1,369 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of 
the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.artstation.com/""" + +from .common import Extractor, Message +from .. import text, util, exception +import random +import string + + +class ArtstationExtractor(Extractor): + """Base class for artstation extractors""" + category = "artstation" + filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}" + directory_fmt = ("{category}", "{userinfo[username]}") + archive_fmt = "{asset[id]}" + root = "https://www.artstation.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) or match.group(2) + self.external = self.config("external", False) + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + + for project in self.projects(): + for asset in self.get_project_assets(project["hash_id"]): + asset.update(data) + adict = asset["asset"] + + if adict["has_embedded_player"] and self.external: + player = adict["player_embedded"] + url = text.extract(player, 'src="', '"')[0] + if not url.startswith(self.root): + yield Message.Url, "ytdl:" + url, asset + continue + + if adict["has_image"]: + url = adict["image_url"] + text.nameext_from_url(url, asset) + yield Message.Url, self._no_cache(url), asset + + def metadata(self): + """Return general metadata""" + return {"userinfo": self.get_user_info(self.user)} + + def projects(self): + """Return an iterable containing all relevant project IDs""" + + def get_project_assets(self, project_id): + """Return all assets associated with 'project_id'""" + url = "{}/projects/{}.json".format(self.root, project_id) + data = self.request(url).json() + + data["title"] = text.unescape(data["title"]) + data["description"] = text.unescape(text.remove_html( + data["description"])) + + assets = data["assets"] + del data["assets"] + + if len(assets) == 1: + data["asset"] = assets[0] + yield data + else: + for asset in assets: + data["asset"] = asset + yield data.copy() + + def get_user_info(self, username): + """Return metadata for a specific user""" + url = "{}/users/{}/quick.json".format(self.root, username.lower()) + response = self.request(url, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError("user") + return response.json() + + def _pagination(self, url, params=None): + if not params: + params = {} + params["page"] = 1 + total = 0 + + while True: + data = self.request(url, params=params).json() + yield from data["data"] + + total += len(data["data"]) + if total >= data["total_count"]: + return + + params["page"] += 1 + + @staticmethod + def _no_cache(url, alphabet=(string.digits + string.ascii_letters)): + """Cause a cache miss to prevent Cloudflare 'optimizations' + + Cloudflare's 'Polish' optimization strips image metadata and may even + recompress an image as lossy JPEG. This can be prevented by causing + a cache miss when requesting an image by adding a random dummy query + parameter. + + Ref: + https://github.com/r888888888/danbooru/issues/3528 + https://danbooru.donmai.us/forum_topics/14952 + """ + param = "gallerydl_no_cache=" + util.bencode( + random.getrandbits(64), alphabet) + sep = "&" if "?" in url else "?" + return url + sep + param + + +class ArtstationUserExtractor(ArtstationExtractor): + """Extractor for all projects of an artstation user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)(?:/albums/all)?" 
+ r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$") + test = ( + ("https://www.artstation.com/gaerikim/", { + "pattern": r"https://\w+\.artstation\.com/p/assets" + r"/images/images/\d+/\d+/\d+/large/[^/]+", + "count": ">= 6", + }), + ("https://www.artstation.com/gaerikim/albums/all/"), + ("https://gaerikim.artstation.com/"), + ("https://gaerikim.artstation.com/projects/"), + ) + + def projects(self): + url = "{}/users/{}/projects.json".format(self.root, self.user) + return self._pagination(url) + + +class ArtstationAlbumExtractor(ArtstationExtractor): + """Extractor for all projects in an artstation album""" + subcategory = "album" + directory_fmt = ("{category}", "{userinfo[username]}", "Albums", + "{album[id]} - {album[title]}") + archive_fmt = "a_{album[id]}_{asset[id]}" + pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)" + r"|((?!www)\w+)\.artstation\.com)/albums/(\d+)") + test = ( + ("https://www.artstation.com/huimeiye/albums/770899", { + "count": 2, + }), + ("https://www.artstation.com/huimeiye/albums/770898", { + "exception": exception.NotFoundError, + }), + ("https://huimeiye.artstation.com/albums/770899"), + ) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.album_id = text.parse_int(match.group(3)) + + def metadata(self): + userinfo = self.get_user_info(self.user) + album = None + + for album in userinfo["albums_with_community_projects"]: + if album["id"] == self.album_id: + break + else: + raise exception.NotFoundError("album") + + return { + "userinfo": userinfo, + "album": album + } + + def projects(self): + url = "{}/users/{}/projects.json".format(self.root, self.user) + params = {"album_id": self.album_id} + return self._pagination(url, params) + + +class ArtstationLikesExtractor(ArtstationExtractor): + """Extractor for liked projects of an artstation user""" + subcategory = "likes" + directory_fmt = ("{category}", "{userinfo[username]}", "Likes") + archive_fmt = "f_{userinfo[id]}_{asset[id]}" + pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)/likes/?") + test = ( + ("https://www.artstation.com/mikf/likes", { + "pattern": r"https://\w+\.artstation\.com/p/assets" + r"/images/images/\d+/\d+/\d+/large/[^/]+", + "count": 6, + }), + # no likes + ("https://www.artstation.com/sungchoi/likes", { + "count": 0, + }), + ) + + def projects(self): + url = "{}/users/{}/likes.json".format(self.root, self.user) + return self._pagination(url) + + +class ArtstationChallengeExtractor(ArtstationExtractor): + """Extractor for submissions of artstation challenges""" + subcategory = "challenge" + filename_fmt = "{submission_id}_{asset_id}_{filename}.{extension}" + directory_fmt = ("{category}", "Challenges", + "{challenge[id]} - {challenge[title]}") + archive_fmt = "c_{challenge[id]}_{asset_id}" + pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" + r"/contests/[^/?&#]+/challenges/(\d+)" + r"/?(?:\?sorting=([a-z]+))?") + test = ( + ("https://www.artstation.com/contests/thu-2017/challenges/20"), + (("https://www.artstation.com/contests/beyond-human" + "/challenges/23?sorting=winners"), { + "range": "1-30", + "count": 30, + }), + ) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.challenge_id = match.group(1) + self.sorting = match.group(2) or "popular" + + def items(self): + challenge_url = "{}/contests/_/challenges/{}.json".format( + self.root, self.challenge_id) + submission_url = 
"{}/contests/_/challenges/{}/submissions.json".format( + self.root, self.challenge_id) + update_url = "{}/contests/submission_updates.json".format( + self.root) + + challenge = self.request(challenge_url).json() + yield Message.Version, 1 + yield Message.Directory, {"challenge": challenge} + + params = {"sorting": self.sorting} + for submission in self._pagination(submission_url, params): + + params = {"submission_id": submission["id"]} + for update in self._pagination(update_url, params=params): + + del update["replies"] + update["challenge"] = challenge + for url in text.extract_iter( + update["body_presentation_html"], ' href="', '"'): + update["asset_id"] = self._id_from_url(url) + text.nameext_from_url(url, update) + yield Message.Url, self._no_cache(url), update + + @staticmethod + def _id_from_url(url): + """Get an image's submission ID from its URL""" + parts = url.split("/") + return text.parse_int("".join(parts[7:10])) + + +class ArtstationSearchExtractor(ArtstationExtractor): + """Extractor for artstation search results""" + subcategory = "search" + directory_fmt = ("{category}", "Searches", "{search[searchterm]}") + archive_fmt = "s_{search[searchterm]}_{asset[id]}" + pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com" + r"/search/?\?([^#]+)") + test = ("https://www.artstation.com/search?sorting=recent&q=ancient",) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + query = text.parse_query(match.group(1)) + self.searchterm = query.get("q", "") + self.order = query.get("sorting", "recent").lower() + + def metadata(self): + return {"search": { + "searchterm": self.searchterm, + "order": self.order, + }} + + def projects(self): + order = "likes_count" if self.order == "likes" else "published_at" + url = "{}/search/projects.json".format(self.root) + params = { + "direction": "desc", + "order": order, + "q": self.searchterm, + # "show_pro_first": "true", + } + return self._pagination(url, params) + + +class ArtstationArtworkExtractor(ArtstationExtractor): + """Extractor for projects on artstation's artwork page""" + subcategory = "artwork" + directory_fmt = ("{category}", "Artworks", "{artwork[sorting]!c}") + archive_fmt = "A_{asset[id]}" + pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com" + r"/artwork/?\?([^#]+)") + test = ("https://www.artstation.com/artwork?sorting=latest",) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.query = text.parse_query(match.group(1)) + + def metadata(self): + return {"artwork": self.query} + + def projects(self): + url = "{}/projects.json".format(self.root) + params = self.query.copy() + params["page"] = 1 + return self._pagination(url, params) + + +class ArtstationImageExtractor(ArtstationExtractor): + """Extractor for images from a single artstation project""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:" + r"(?:\w+\.)?artstation\.com/(?:artwork|projects|search)" + r"|artstn\.co/p)/(\w+)") + test = ( + ("https://www.artstation.com/artwork/LQVJr", { + "pattern": r"https?://\w+\.artstation\.com/p/assets" + r"/images/images/008/760/279/large/.+", + "content": "1f645ce7634e44675ebde8f6b634d36db0617d3c", + # SHA1 hash without _no_cache() + # "content": "2e8aaf6400aeff2345274f45e90b6ed3f2a0d946", + }), + # multiple images per project + ("https://www.artstation.com/artwork/Db3dy", { + "count": 4, + }), + # embedded youtube video + ("https://www.artstation.com/artwork/g4WPK", { + "range": "2", + "options": (("external", True),), + "pattern": 
"ytdl:https://www.youtube.com/embed/JNFfJtwwrU0", + }), + # alternate URL patterns + ("https://sungchoi.artstation.com/projects/LQVJr"), + ("https://artstn.co/p/LQVJr"), + ) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.project_id = match.group(1) + self.assets = None + + def metadata(self): + self.assets = list(ArtstationExtractor.get_project_assets( + self, self.project_id)) + self.user = self.assets[0]["user"]["username"] + return ArtstationExtractor.metadata(self) + + def projects(self): + return ({"hash_id": self.project_id},) + + def get_project_assets(self, project_id): + return self.assets diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py new file mode 100644 index 0000000..111d560 --- /dev/null +++ b/gallery_dl/extractor/behance.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.behance.net/""" + +from .common import Extractor, Message +from .. import text +import json + + +class BehanceExtractor(Extractor): + """Base class for behance extractors""" + category = "behance" + root = "https://www.behance.net" + + def items(self): + yield Message.Version, 1 + for gallery in self.galleries(): + gallery["_extractor"] = BehanceGalleryExtractor + yield Message.Queue, gallery["url"], self._update(gallery) + + def galleries(self): + """Return all relevant gallery URLs""" + + @staticmethod + def _update(data): + # compress data to simple lists + data["fields"] = [field["name"] for field in data["fields"]] + data["owners"] = [owner["display_name"] for owner in data["owners"]] + if "tags" in data: + data["tags"] = [tag["title"] for tag in data["tags"]] + + # backwards compatibility + data["gallery_id"] = data["id"] + data["title"] = data["name"] + data["user"] = ", ".join(data["owners"]) + + return data + + +class BehanceGalleryExtractor(BehanceExtractor): + """Extractor for image galleries from www.behance.net""" + subcategory = "gallery" + directory_fmt = ("{category}", "{owners:J, }", "{id} {name}") + filename_fmt = "{category}_{id}_{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + pattern = r"(?:https?://)?(?:www\.)?behance\.net/gallery/(\d+)" + test = ( + ("https://www.behance.net/gallery/17386197/A-Short-Story", { + "count": 2, + "url": "ab79bd3bef8d3ae48e6ac74fd995c1dfaec1b7d2", + "keyword": { + "id": 17386197, + "name": 're:"Hi". 
A short story about the important things ', + "owners": ["Place Studio", "Julio César Velazquez"], + "fields": ["Animation", "Character Design", "Directing"], + "tags": list, + "module": dict, + }, + }), + ("https://www.behance.net/gallery/21324767/Nevada-City", { + "count": 6, + "url": "0258fe194fe7d828d6f2c7f6086a9a0a4140db1d", + "keyword": {"owners": ["Alex Strohl"]}, + }), + ) + + def __init__(self, match): + BehanceExtractor.__init__(self, match) + self.gallery_id = match.group(1) + + def items(self): + data = self.get_gallery_data() + imgs = self.get_images(data) + data["count"] = len(imgs) + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], (url, module) in enumerate(imgs, 1): + data["module"] = module + data["extension"] = text.ext_from_url(url) + yield Message.Url, url, data + + def get_gallery_data(self): + """Collect gallery info dict""" + url = "{}/gallery/{}/a".format(self.root, self.gallery_id) + cookies = { + "_evidon_consent_cookie": + '{"consent_date":"2019-01-31T09:41:15.132Z"}', + "bcp": "815b5eee-8bdf-4898-ac79-33c2bcc0ed19", + "gk_suid": "66981391", + "gki": '{"feature_project_view":false,' + '"feature_discover_login_prompt":false,' + '"feature_project_login_prompt":false}', + "ilo0": "true", + } + page = self.request(url, cookies=cookies).text + + data = json.loads(text.extract( + page, 'id="beconfig-store_state">', '')[0]) + return self._update(data["project"]["project"]) + + @staticmethod + def get_images(data): + """Extract image results from an API response""" + results = [] + + for module in data["modules"]: + + if module["type"] == "image": + url = module["sizes"]["original"] + results.append((url, module)) + + elif module["type"] == "embed": + embed = module.get("original_embed") or module.get("embed") + url = "ytdl:" + text.extract(embed, 'src="', '"')[0] + results.append((url, module)) + + return results + + +class BehanceUserExtractor(BehanceExtractor): + """Extractor for a user's galleries from www.behance.net""" + subcategory = "user" + categorytransfer = True + pattern = r"(?:https?://)?(?:www\.)?behance\.net/([^/?&#]+)/?$" + test = ("https://www.behance.net/alexstrohl", { + "count": ">= 8", + "pattern": BehanceGalleryExtractor.pattern, + }) + + def __init__(self, match): + BehanceExtractor.__init__(self, match) + self.user = match.group(1) + + def galleries(self): + url = "{}/{}/projects".format(self.root, self.user) + headers = {"X-Requested-With": "XMLHttpRequest"} + params = {"offset": 0} + + while True: + data = self.request(url, headers=headers, params=params).json() + work = data["profile"]["activeSection"]["work"] + yield from work["projects"] + if not work["hasMore"]: + return + params["offset"] += len(work["projects"]) + + +class BehanceCollectionExtractor(BehanceExtractor): + """Extractor for a collection's galleries from www.behance.net""" + subcategory = "collection" + categorytransfer = True + pattern = r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)" + test = ("https://www.behance.net/collection/170615607/Sky", { + "count": ">= 13", + "pattern": BehanceGalleryExtractor.pattern, + }) + + def __init__(self, match): + BehanceExtractor.__init__(self, match) + self.collection_id = match.group(1) + + def galleries(self): + url = "{}/collection/{}/a".format(self.root, self.collection_id) + headers = {"X-Requested-With": "XMLHttpRequest"} + params = {} + + while True: + data = self.request(url, headers=headers, params=params).json() + yield from data["output"] + if not data.get("offset"): + return + 
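+            # the server echoes an "offset" cursor only while more results
+            # remain, so the check above doubles as the end-of-listing test;
+            # feeding the cursor back resumes the listing where it stopped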
params["offset"] = data["offset"] diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py new file mode 100644 index 0000000..67427a7 --- /dev/null +++ b/gallery_dl/extractor/bobx.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from http://www.bobx.com/dark/""" + +from .common import Extractor, Message +from .. import text + + +class BobxExtractor(Extractor): + """Base class for bobx extractors""" + category = "bobx" + root = "http://www.bobx.com" + per_page = 80 + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + + +class BobxGalleryExtractor(BobxExtractor): + """Extractor for individual image galleries on bobx.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{model}", "{title}") + filename_fmt = "{model}_{image_id}_{num:>03}.{extension}" + archive_fmt = "{image_id}" + pattern = (r"(?:https?://)?(?:www\.)?bobx\.com" + r"/([^/]+/[^/]+/photoset/[\w-]+)-\d+-\d+-\d+\.html") + test = ( + (("http://www.bobx.com/idol/mikoto-hibi" + "/photoset/wpb-2018-_11-0-2-8.html"), { + "url": "93972d6a661f6627e963d62c9d15531e6b36a389", + "keyword": "6c620862db494ed05e69356ba30e604b167b0670", + "content": "3f176b7fe752524cec21a763aa55567e41181e07", + }), + (("http://www.bobx.com/idol/nashiko-momotsuki" + "/photoset/wpb-net-_221---2018-08---magic-of-summer-0-10-10.html"), { + "url": "f5d6c0cd0881ae6f504c21a90d86e3464dc54e8e", + "keyword": "f4819c75f494044348889ecd27771508464c0f5f", + }), + ) + + def items(self): + num = 0 + while True: + url = "{}/{}-{}-10-8.html".format(self.root, self.path, num) + page = self.request(url, encoding="utf-8").text + + if num == 0: + data = self.metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + data["num"] = 0 + + for url in self.images(page): + url = text.urljoin(self.root, url.replace("-preview-", "-")) + data = text.nameext_from_url(url, data) + data["image_id"] = text.parse_int( + data["filename"].rpartition("-")[2]) + data["num"] += 1 + yield Message.Url, url, data + + num += self.per_page + if num >= data["count"]: + return + + @staticmethod + def metadata(page): + """Collect metadata for extractor-job""" + info = text.extract(page, "", "")[0] + model, _, info = info.partition(" in ") + info, _, count = info.rpartition(" of ") + title = info.rpartition(" - @")[0] + return { + "title": text.unquote(title), + "model": text.unquote(model), + "count": text.parse_int(count), + } + + @staticmethod + def images(page): + """Extract all image-urls""" + page = text.extract(page, "
self.page_limit: + pages = self.page_limit - self.page_start + self.page_start += pages + return pages * self.per_page + + def items(self): + data = self.get_metadata() + + yield Message.Version, 1 + yield Message.Directory, data + + self.reset_page() + while True: + images = self.parse_response( + self.request(self.api_url, params=self.params)) + + for image in images: + try: + url = image["file_url"] + except KeyError: + continue + if url.startswith("/"): + url = text.urljoin(self.api_url, url) + image.update(data) + if self.extags: + self.extended_tags(image) + yield Message.Url, url, text.nameext_from_url(url, image) + + if len(images) < self.per_page: + return + self.update_page(image) + + def reset_page(self): + """Initialize params to point to the first page""" + self.params["page"] = self.page_start + + def update_page(self, data): + """Update params to point to the next page""" + + def parse_response(self, response): + """Parse JSON API response""" + images = response.json() + if self.sort: + images.sort(key=operator.itemgetter("score", "id"), + reverse=True) + return images + + def get_metadata(self): + """Collect metadata for extractor-job""" + return {} + + def extended_tags(self, image, page=None): + """Retrieve extended tag information""" + if not page: + url = self.post_url.format(image["id"]) + page = self.request(url).text + tags = collections.defaultdict(list) + tags_html = text.extract(page, ''), "%Y-%m-%d %H:%M"), + "parent" : extr( + '>Parent:'), + ('artist', ''), + ('total' , ''), + ), values=data) + + if not data["manga"] and "Warning" in page: + msg = page.rpartition(">")[2].strip() + self.log.error("Site is not accessible: '%s'", msg) + raise exception.StopExtraction() + + tags = text.extract(page, 'class="listTable"', '
', 'Visible:', '<'), + "language" : extr( + '>Language:', ' '), + "gallery_size" : text.parse_bytes(extr( + '>File Size:', '<').rstrip("Bb")), + "count" : text.parse_int(extr( + '>Length:', ' ')), + } + + data["lang"] = util.language_to_code(data["language"]) + data["tags"] = [ + text.unquote(tag) + for tag in text.extract_iter(page, 'hentai.org/tag/', '"') + ] + + return data + + def image_from_page(self, page): + """Get image url and data from webpage""" + pos = page.index('
", "") + maximum, pos = text.extract(page, "", "", pos) + self._remaining = text.parse_int(maximum) - text.parse_int(current) + + @staticmethod + def _parse_image_info(url): + parts = url.split("/")[4].split("-") + return { + "width": text.parse_int(parts[2]), + "height": text.parse_int(parts[3]), + "size": text.parse_int(parts[1]), + "cost": 1, + } + + @staticmethod + def _parse_original_info(info): + parts = info.lstrip().split(" ") + size = text.parse_bytes(parts[3] + parts[4][0]) + return { + "width": text.parse_int(parts[0]), + "height": text.parse_int(parts[2]), + "size": size, + "cost": 1 + math.ceil(size * 5 / 1024 / 1024) + } + + +class ExhentaiSearchExtractor(ExhentaiExtractor): + """Extractor for exhentai search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/?\?(.*)$" + test = ( + ("https://exhentai.org/?f_search=touhou"), + (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0" + "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0" + "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), { + "pattern": ExhentaiGalleryExtractor.pattern, + "range": "1-30", + "count": 30, + }), + ) + + def __init__(self, match): + ExhentaiExtractor.__init__(self, match) + self.params = text.parse_query(match.group(2)) + self.params["page"] = text.parse_int(self.params.get("page")) + self.search_url = self.root + + def items(self): + self.login() + yield Message.Version, 1 + + while True: + last = None + page = self.request(self.search_url, params=self.params).text + + for gallery in ExhentaiGalleryExtractor.pattern.finditer(page): + url = gallery.group(0) + if url == last: + continue + last = url + yield Message.Queue, url, {} + + if 'class="ptdd">><' in page or ">No hits found
</p>
" in page: + return + self.params["page"] += 1 + self.wait() + + +class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): + """Extractor for favorited exhentai galleries""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?" + test = ( + ("https://exhentai.org/favorites.php"), + ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou" + "&f_apply=Search+Favorites"), + ) + + def __init__(self, match): + ExhentaiSearchExtractor.__init__(self, match) + self.search_url = self.root + "/favorites.php" diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py new file mode 100644 index 0000000..a2d8c04 --- /dev/null +++ b/gallery_dl/extractor/fallenangels.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters from https://www.fascans.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text, util +import json + + +class FallenangelsChapterExtractor(ChapterExtractor): + """Extractor for manga-chapters from fascans.com""" + category = "fallenangels" + pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com" + r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?") + test = ( + ("https://manga.fascans.com/manga/chronos-ruler/20/1", { + "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3", + "keyword": "2dfcc50020e32cd207be88e2a8fac0933e36bdfb", + }), + ("http://truyen.fascans.com/manga/hungry-marie/8", { + "url": "1f923d9cb337d5e7bbf4323719881794a951c6ae", + "keyword": "2bdb7334c0e3eceb9946ffd3132df679b4a94f6a", + }), + ("http://manga.fascans.com/manga/rakudai-kishi-no-eiyuutan/19.5", { + "keyword": "9fcca4c1a90d11f00764f62477ebe10bd408021c", + }), + ) + + def __init__(self, match): + self.version, self.manga, self.chapter, self.minor = match.groups() + url = "https://{}.fascans.com/manga/{}/{}/1".format( + self.version, self.manga, self.chapter) + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + lang = "vi" if self.version == "truyen" else "en" + return { + "manga" : extr('name="description" content="', ' Chapter '), + "title" : extr(': ', ' - Page 1'), + "chapter" : self.chapter, + "chapter_minor": self.minor or "", + "lang" : lang, + "language": util.code_to_language(lang), + } + + @staticmethod + def images(page): + return [ + (img["page_image"], None) + for img in json.loads( + text.extract(page, "var pages = ", ";")[0] + ) + ] + + +class FallenangelsMangaExtractor(MangaExtractor): + """Extractor for manga from fascans.com""" + chapterclass = FallenangelsChapterExtractor + category = "fallenangels" + pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$" + test = ( + ("http://manga.fascans.com/manga/trinity-seven", { + "url": "293057f264de6c438b979bd1c3de4719568db452", + "keyword": "50e0374dba60734230e4284b5ffdadef5104ae62", + }), + ("https://truyen.fascans.com/manga/rakudai-kishi-no-eiyuutan", { + "url": "51a731a6b82d5eb7a335fbae6b02d06aeb2ab07b", + "keyword": "2d2a2a5d9ea5925eb9a47bb13d848967f3af086c", + }), + ) + + def __init__(self, match): + url = "https://" + match.group(1) + self.lang = "vi" if match.group(2) == "truyen" else "en" + MangaExtractor.__init__(self, match, url) + + def chapters(self, page): + extr = text.extract_from(page) + results = [] + language = 
util.code_to_language(self.lang) + while extr('
  • ', '<') + title = extr('', '') + + manga, _, chapter = cha.rpartition(" ") + chapter, dot, minor = chapter.partition(".") + results.append((url, { + "manga" : manga, + "title" : text.unescape(title), + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(chapter), + "chapter_minor": dot + minor, + "lang" : self.lang, + "language": language, + })) + return results diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py new file mode 100644 index 0000000..d941d76 --- /dev/null +++ b/gallery_dl/extractor/flickr.py @@ -0,0 +1,503 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.flickr.com/""" + +from .common import Extractor, Message +from .. import text, oauth, util, exception + + +class FlickrExtractor(Extractor): + """Base class for flickr extractors""" + category = "flickr" + filename_fmt = "{category}_{id}.{extension}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = FlickrAPI(self) + self.item_id = match.group(1) + self.user = None + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for photo in self.photos(): + photo.update(data) + url = photo["url"] + yield Message.Url, url, text.nameext_from_url(url, photo) + + def metadata(self): + """Return general metadata""" + self.user = self.api.urls_lookupUser(self.item_id) + return {"user": self.user} + + def photos(self): + """Return an iterable with all relevant photo objects""" + + +class FlickrImageExtractor(FlickrExtractor): + """Extractor for individual images from flickr.com""" + subcategory = "image" + archive_fmt = "{id}" + pattern = (r"(?:https?://)?(?:" + r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/" + r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)" + r"|flic\.kr/p/([A-Za-z1-9]+))") + test = ( + ("https://www.flickr.com/photos/departingyyz/16089302239", { + "pattern": pattern, + "content": "0821a28ee46386e85b02b67cf2720063440a228c", + "keyword": { + "comments": int, + "description": str, + "extension": "jpg", + "filename": "16089302239_de18cd8017_b", + "id": 16089302239, + "height": 683, + "label": "Large", + "media": "photo", + "url": str, + "views": int, + "width": 1024, + }, + }), + ("https://www.flickr.com/photos/145617051@N08/46733161535", { + "count": 1, + "keyword": {"media": "video"}, + }), + ("http://c2.staticflickr.com/2/1475/24531000464_9a7503ae68_b.jpg", { + "pattern": pattern}), + ("https://farm2.static.flickr.com/1035/1188352415_cb139831d0.jpg", { + "pattern": pattern}), + ("https://flic.kr/p/FPVo9U", { + "pattern": pattern}), + ("https://www.flickr.com/photos/zzz/16089302238", { + "exception": exception.NotFoundError}), + ) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + if not self.item_id: + alphabet = ("123456789abcdefghijkmnopqrstu" + "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ") + self.item_id = util.bdecode(match.group(2), alphabet) + + def items(self): + photo = self.api.photos_getInfo(self.item_id) + + if photo["media"] == "video" and self.api.videos: + self.api._extract_video(photo) + else: + self.api._extract_photo(photo) + + photo["title"] = photo["title"]["_content"] + photo["comments"] = text.parse_int(photo["comments"]["_content"]) + photo["description"] = photo["description"]["_content"] + photo["tags"] = [t["raw"] for t in 
photo["tags"]["tag"]] + photo["date"] = text.parse_timestamp(photo["dateuploaded"]) + photo["views"] = text.parse_int(photo["views"]) + photo["id"] = text.parse_int(photo["id"]) + + if "location" in photo: + location = photo["location"] + for key, value in location.items(): + if isinstance(value, dict): + location[key] = value["_content"] + + url = photo["url"] + yield Message.Version, 1 + yield Message.Directory, photo + yield Message.Url, url, text.nameext_from_url(url, photo) + + +class FlickrAlbumExtractor(FlickrExtractor): + """Extractor for photo albums from flickr.com""" + subcategory = "album" + directory_fmt = ("{category}", "{subcategory}s", + "{album[id]} - {album[title]}") + archive_fmt = "a_{album[id]}_{id}" + pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/" + r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?") + test = ( + (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), { + "pattern": FlickrImageExtractor.pattern, + "count": 6, + }), + ("https://www.flickr.com/photos/shona_s/albums", { + "pattern": pattern, + "count": 2, + }), + ) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + self.album_id = match.group(2) + + def items(self): + if self.album_id: + return FlickrExtractor.items(self) + return self._album_items() + + def _album_items(self): + yield Message.Version, 1 + data = FlickrExtractor.metadata(self) + data["_extractor"] = FlickrAlbumExtractor + + for album in self.api.photosets_getList(self.user["nsid"]): + self.api._clean_info(album).update(data) + url = "https://www.flickr.com/photos/{}/albums/{}".format( + self.user["path_alias"], album["id"]) + yield Message.Queue, url, album + + def metadata(self): + data = FlickrExtractor.metadata(self) + data["album"] = self.api.photosets_getInfo( + self.album_id, self.user["nsid"]) + return data + + def photos(self): + return self.api.photosets_getPhotos(self.album_id) + + +class FlickrGalleryExtractor(FlickrExtractor): + """Extractor for photo galleries from flickr.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "galleries", + "{user[username]} {gallery[id]}") + archive_fmt = "g_{gallery[id]}_{id}" + pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/" + r"photos/([^/]+)/galleries/(\d+)") + test = (("https://www.flickr.com/photos/flickr/" + "galleries/72157681572514792/"), { + "pattern": FlickrImageExtractor.pattern, + "count": ">= 10", + }) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + self.gallery_id = match.group(2) + + def metadata(self): + data = FlickrExtractor.metadata(self) + data["gallery"] = self.api.galleries_getInfo(self.gallery_id) + return data + + def photos(self): + return self.api.galleries_getPhotos(self.gallery_id) + + +class FlickrGroupExtractor(FlickrExtractor): + """Extractor for group pools from flickr.com""" + subcategory = "group" + directory_fmt = ("{category}", "{subcategory}s", "{group[groupname]}") + archive_fmt = "G_{group[nsid]}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)" + test = ("https://www.flickr.com/groups/bird_headshots/", { + "pattern": FlickrImageExtractor.pattern, + "count": "> 150", + }) + + def metadata(self): + self.group = self.api.urls_lookupGroup(self.item_id) + return {"group": self.group} + + def photos(self): + return self.api.groups_pools_getPhotos(self.group["nsid"]) + + +class FlickrUserExtractor(FlickrExtractor): + """Extractor for the photostream of a flickr user""" + subcategory = "user" + directory_fmt = ("{category}", "{user[username]}") + archive_fmt = 
"u_{user[nsid]}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$" + test = ("https://www.flickr.com/photos/shona_s/", { + "pattern": FlickrImageExtractor.pattern, + "count": 28, + }) + + def photos(self): + return self.api.people_getPhotos(self.user["nsid"]) + + +class FlickrFavoriteExtractor(FlickrExtractor): + """Extractor for favorite photos of a flickr user""" + subcategory = "favorite" + directory_fmt = ("{category}", "{subcategory}s", "{user[username]}") + archive_fmt = "f_{user[nsid]}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites" + test = ("https://www.flickr.com/photos/shona_s/favorites", { + "pattern": FlickrImageExtractor.pattern, + "count": 4, + }) + + def photos(self): + return self.api.favorites_getList(self.user["nsid"]) + + +class FlickrSearchExtractor(FlickrExtractor): + """Extractor for flickr photos based on search results""" + subcategory = "search" + directory_fmt = ("{category}", "{subcategory}", "{search[text]}") + archive_fmt = "s_{search}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)" + test = ( + ("https://flickr.com/search/?text=mountain"), + ("https://flickr.com/search/?text=tree%20cloud%20house" + "&color_codes=4&styles=minimalism"), + ) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + self.search = text.parse_query(match.group(1)) + if "text" not in self.search: + self.search["text"] = "" + + def metadata(self): + return {"search": self.search} + + def photos(self): + return self.api.photos_search(self.search) + + +class FlickrAPI(oauth.OAuth1API): + """Minimal interface for the flickr API""" + API_URL = "https://api.flickr.com/services/rest/" + API_KEY = "ac4fd7aa98585b9eee1ba761c209de68" + API_SECRET = "3adb0f568dc68393" + FORMATS = [ + ("o", "Original" , None), + ("k", "Large 2048" , 2048), + ("h", "Large 1600" , 1600), + ("l", "Large" , 1024), + ("c", "Medium 800" , 800), + ("z", "Medium 640" , 640), + ("m", "Medium" , 500), + ("n", "Small 320" , 320), + ("s", "Small" , 240), + ("q", "Large Square", 150), + ("t", "Thumbnail" , 100), + ("s", "Square" , 75), + ] + VIDEO_FORMATS = { + "orig" : 9, + "1080p" : 8, + "720p" : 7, + "360p" : 6, + "288p" : 5, + "700" : 4, + "300" : 3, + "100" : 2, + "appletv" : 1, + "iphone_wifi": 0, + } + + def __init__(self, extractor): + oauth.OAuth1API.__init__(self, extractor) + + self.videos = extractor.config("videos", True) + self.maxsize = extractor.config("size-max") + if isinstance(self.maxsize, str): + for fmt, fmtname, fmtwidth in self.FORMATS: + if self.maxsize == fmt or self.maxsize == fmtname: + self.maxsize = fmtwidth + break + else: + self.maxsize = None + extractor.log.warning( + "Could not match '%s' to any format", self.maxsize) + if self.maxsize: + self.formats = [fmt for fmt in self.FORMATS + if not fmt[2] or fmt[2] <= self.maxsize] + else: + self.formats = self.FORMATS + self.formats = self.formats[:4] + + def favorites_getList(self, user_id): + """Returns a list of the user's favorite photos.""" + params = {"user_id": user_id} + return self._pagination("favorites.getList", params) + + def galleries_getInfo(self, gallery_id): + """Gets information about a gallery.""" + params = {"gallery_id": gallery_id} + gallery = self._call("galleries.getInfo", params)["gallery"] + return self._clean_info(gallery) + + def galleries_getPhotos(self, gallery_id): + """Return the list of photos for a gallery.""" + params = {"gallery_id": gallery_id} + return self._pagination("galleries.getPhotos", params) + + def 
groups_pools_getPhotos(self, group_id): + """Returns a list of pool photos for a given group.""" + params = {"group_id": group_id} + return self._pagination("groups.pools.getPhotos", params) + + def people_getPhotos(self, user_id): + """Return photos from the given user's photostream.""" + params = {"user_id": user_id} + return self._pagination("people.getPhotos", params) + + def photos_getInfo(self, photo_id): + """Get information about a photo.""" + params = {"photo_id": photo_id} + return self._call("photos.getInfo", params)["photo"] + + def photos_getSizes(self, photo_id): + """Returns the available sizes for a photo.""" + params = {"photo_id": photo_id} + sizes = self._call("photos.getSizes", params)["sizes"]["size"] + if self.maxsize: + for index, size in enumerate(sizes): + if index > 0 and (int(size["width"]) > self.maxsize or + int(size["height"]) > self.maxsize): + del sizes[index:] + break + return sizes + + def photos_search(self, params): + """Return a list of photos matching some criteria.""" + return self._pagination("photos.search", params.copy()) + + def photosets_getInfo(self, photoset_id, user_id): + """Gets information about a photoset.""" + params = {"photoset_id": photoset_id, "user_id": user_id} + photoset = self._call("photosets.getInfo", params)["photoset"] + return self._clean_info(photoset) + + def photosets_getList(self, user_id): + """Returns the photosets belonging to the specified user.""" + params = {"user_id": user_id} + return self._pagination_sets("photosets.getList", params) + + def photosets_getPhotos(self, photoset_id): + """Get the list of photos in a set.""" + params = {"photoset_id": photoset_id} + return self._pagination("photosets.getPhotos", params, "photoset") + + def urls_lookupGroup(self, groupname): + """Returns a group NSID, given the url to a group's page.""" + params = {"url": "https://www.flickr.com/groups/" + groupname} + group = self._call("urls.lookupGroup", params)["group"] + return {"nsid": group["id"], + "path_alias": groupname, + "groupname": group["groupname"]["_content"]} + + def urls_lookupUser(self, username): + """Returns a user NSID, given the url to a user's photos or profile.""" + params = {"url": "https://www.flickr.com/photos/" + username} + user = self._call("urls.lookupUser", params)["user"] + return {"nsid": user["id"], + "path_alias": username, + "username": user["username"]["_content"]} + + def video_getStreamInfo(self, video_id, secret=None): + """Returns all available video streams""" + params = {"photo_id": video_id} + if not secret: + secret = self._call("photos.getInfo", params)["photo"]["secret"] + params["secret"] = secret + stream = self._call("video.getStreamInfo", params)["streams"]["stream"] + return max(stream, key=lambda s: self.VIDEO_FORMATS.get(s["type"], 0)) + + def _call(self, method, params): + params["method"] = "flickr." 
+ method + params["format"] = "json" + params["nojsoncallback"] = "1" + if self.api_key: + params["api_key"] = self.api_key + data = self.request(self.API_URL, params=params).json() + if "code" in data: + if data["code"] == 1: + raise exception.NotFoundError(self.extractor.subcategory) + elif data["code"] == 98: + raise exception.AuthenticationError(data.get("message")) + elif data["code"] == 99: + raise exception.AuthorizationError() + self.log.error("API call failed: %s", data.get("message")) + raise exception.StopExtraction() + return data + + def _pagination(self, method, params, key="photos"): + params["extras"] = "description,date_upload,tags,views,media," + params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats) + params["page"] = 1 + + while True: + data = self._call(method, params)[key] + yield from map(self._extract_format, data["photo"]) + if params["page"] >= data["pages"]: + return + params["page"] += 1 + + def _pagination_sets(self, method, params): + params["page"] = 1 + + while True: + data = self._call(method, params)["photosets"] + yield from data["photoset"] + if params["page"] >= data["pages"]: + return + params["page"] += 1 + + def _extract_format(self, photo): + photo["description"] = photo["description"]["_content"].strip() + photo["views"] = text.parse_int(photo["views"]) + photo["date"] = text.parse_timestamp(photo["dateupload"]) + photo["tags"] = photo["tags"].split() + photo["id"] = text.parse_int(photo["id"]) + + if photo["media"] == "video" and self.videos: + return self._extract_video(photo) + + for fmt, fmtname, fmtwidth in self.formats: + key = "url_" + fmt + if key in photo: + photo["width"] = text.parse_int(photo["width_" + fmt]) + photo["height"] = text.parse_int(photo["height_" + fmt]) + if self.maxsize and (photo["width"] > self.maxsize or + photo["height"] > self.maxsize): + continue + photo["url"] = photo[key] + photo["label"] = fmtname + + # remove excess data + keys = [ + key for key in photo + if key.startswith(("url_", "width_", "height_")) + ] + for key in keys: + del photo[key] + break + else: + self._extract_photo(photo) + + return photo + + def _extract_photo(self, photo): + size = self.photos_getSizes(photo["id"])[-1] + photo["url"] = size["source"] + photo["label"] = size["label"] + photo["width"] = text.parse_int(size["width"]) + photo["height"] = text.parse_int(size["height"]) + return photo + + def _extract_video(self, photo): + stream = self.video_getStreamInfo(photo["id"], photo.get("secret")) + photo["url"] = stream["_content"] + photo["label"] = stream["type"] + photo["width"] = photo["height"] = 0 + return photo + + @staticmethod + def _clean_info(info): + info["title"] = info["title"]["_content"] + info["description"] = info["description"]["_content"] + return info diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py new file mode 100644 index 0000000..5f4c5b8 --- /dev/null +++ b/gallery_dl/extractor/foolfuuka.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for 4chan archives based on FoolFuuka""" + +from .common import Extractor, Message, SharedConfigMixin, generate_extractors +from .. 
import text +import itertools +import operator + + +class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): + """Base extractor for FoolFuuka based boards/archives""" + basecategory = "foolfuuka" + subcategory = "thread" + directory_fmt = ("{category}", "{board[shortname]}", + "{thread_num}{title:? - //}") + filename_fmt = "{media[media]}" + archive_fmt = "{board[shortname]}_{num}_{timestamp}" + pattern_fmt = r"/([^/]+)/thread/(\d+)" + external = "default" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + self.session.headers["Referer"] = self.root + if self.external == "direct": + self.remote = self._remote_direct + + def items(self): + op = True + yield Message.Version, 1 + for post in self.posts(): + if op: + yield Message.Directory, post + op = False + if not post["media"]: + continue + + media = post["media"] + url = media["media_link"] + + if not url and "remote_media_link" in media: + url = self.remote(media) + if url.startswith("/"): + url = self.root + url + + post["extension"] = url.rpartition(".")[2] + yield Message.Url, url, post + + def posts(self): + """Return an iterable with all posts in this thread""" + url = self.root + "/_/api/chan/thread/" + params = {"board": self.board, "num": self.thread} + data = self.request(url, params=params).json()[self.thread] + + # sort post-objects by key + posts = sorted(data.get("posts", {}).items()) + posts = map(operator.itemgetter(1), posts) + + return itertools.chain((data["op"],), posts) + + def remote(self, media): + """Resolve a remote media link""" + needle = '= 5 else "" + data["title"] = data["chapter_string"].partition(":")[2].strip() + return data + + +class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): + """Base class for chapter extractors for FoOlSlide based sites""" + directory_fmt = ( + "{category}", "{manga}", "{chapter_string}") + archive_fmt = "{id}" + pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" + decode = "default" + + def items(self): + page = self.request(self.chapter_url).text + data = self.metadata(page) + imgs = self.images(page) + + data["count"] = len(imgs) + data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"]) + + yield Message.Version, 1 + yield Message.Directory, data + for data["page"], image in enumerate(imgs, 1): + try: + url = image["url"] + del image["url"] + del image["chapter_id"] + del image["thumb_url"] + except KeyError: + pass + for key in ("height", "id", "size", "width"): + image[key] = text.parse_int(image[key]) + data.update(image) + text.nameext_from_url(data["filename"], data) + yield Message.Url, url, data + + def metadata(self, page): + extr = text.extract_from(page) + extr('
<h1 class="tbtitle dnone">
    ', '') + return self.parse_chapter_url(self.chapter_url, { + "manga" : text.unescape(extr('title="', '"')).strip(), + "chapter_string": text.unescape(extr('title="', '"')), + }) + + def images(self, page): + if self.decode == "base64": + base64_data = text.extract(page, 'atob("', '"')[0].encode() + data = base64.b64decode(base64_data).decode() + elif self.decode == "double": + pos = page.find("[{") + data = text.extract(page, " = ", ";", pos)[0] + else: + data = text.extract(page, "var pages = ", ";")[0] + return json.loads(data) + + +class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): + """Base class for manga extractors for FoOlSlide based sites""" + pattern_fmt = r"(/series/[^/?&#]+)" + + def chapters(self, page): + extr = text.extract_from(page) + manga = text.unescape(extr('
<h1 class="title">
    ', '
</h1>
    ')).strip() + author = extr('Author: ', 'Artist: ', '
    = 1", + "keyword": { + "chapter": int, + "chapter_minor": str, + "chapter_string": str, + "group": "PowerManga", + "lang": "en", + "language": "English", + "manga": "One Piece Digital Colour Comics", + "title": str, + "volume": int, + }, + }), + }, + "sensescans": { + "root": "http://sensescans.com/reader", + "pattern": r"(?:(?:www\.)?sensescans\.com/reader" + r"|reader\.sensescans\.com)", + "test-chapter": ( + (("http://sensescans.com/reader/read/" + "magi__labyrinth_of_magic/en/37/369/"), { + "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812", + "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988", + }), + (("http://reader.sensescans.com/read/" + "magi__labyrinth_of_magic/en/37/369/"), { + "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812", + "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988", + }), + ), + "test-manga": + ("http://sensescans.com/reader/series/hakkenden/", { + "url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2", + "keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23", + }), + }, + "worldthree": { + "root": "http://www.slide.world-three.org", + "pattern": r"(?:www\.)?slide\.world-three\.org", + "test-chapter": ( + (("http://www.slide.world-three.org" + "/read/black_bullet/en/2/7/page/1"), { + "url": "be2f04f6e2d311b35188094cfd3e768583271584", + "keyword": "967d536a65de4d52478d5b666a1760b181eddb6e", + }), + (("http://www.slide.world-three.org" + "/read/idolmster_cg_shuffle/en/0/4/2/"), { + "url": "6028ea5ca282744f925dfad92eeb98509f9cc78c", + "keyword": "f3cfe2ad3388991f1d045c85d0fa94795a7694dc", + }), + ), + "test-manga": + ("http://www.slide.world-three.org/series/black_bullet/", { + "url": "5743b93512d26e6b540d90a7a5d69208b6d4a738", + "keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120", + }), + }, + "_ckey": "chapterclass", +} + +generate_extractors(EXTRACTORS, globals(), ( + FoolslideChapterExtractor, + FoolslideMangaExtractor, +)) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py new file mode 100644 index 0000000..15bd0a8 --- /dev/null +++ b/gallery_dl/extractor/gelbooru.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://gelbooru.com/""" + +from . import booru +from .common import Message +from .. 
import text, util + + +class GelbooruExtractor(booru.XmlParserMixin, + booru.GelbooruPageMixin, + booru.BooruExtractor): + """Base class for gelbooru extractors""" + category = "gelbooru" + api_url = "https://gelbooru.com/index.php" + post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}" + pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}" + + def __init__(self, match): + super().__init__(match) + + self.use_api = self.config("api", True) + if self.use_api: + self.params.update({"page": "dapi", "s": "post", "q": "index"}) + else: + self.items = self.items_noapi + + def items_noapi(self): + data = self.get_metadata() + + yield Message.Version, 1 + yield Message.Directory, data + + for post in self.get_posts(): + post = self.get_post_data(post) + url = post["file_url"] + post.update(data) + yield Message.Url, url, text.nameext_from_url(url, post) + + def get_posts(self): + """Return an iterable containing all relevant post objects""" + + def get_post_data(self, post_id): + """Extract metadata of a single post""" + page = self.request(self.post_url.format(post_id)).text + data = text.extract_all(page, ( + (None , 'Id: ', '<'), + ("created_at", '
<li>Posted: ', '<'), + ("width" , '
<li>Size: ', 'x'), + ("height" , '', '<'), + ("source" , '
<li>Source: <a href="', '"'), + ("rating" , '<li>Rating: ', '<'), + (None , '
<li>Score: ', ''), + ("score" , '>', '<'), + ("file_url" , '
  • [^&#]+)") + test = ( + ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", { + "count": 5, + }), + ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", { + "options": (("api", False),), + "count": 5, + }), + ) + + def __init__(self, match): + super().__init__(match) + if not self.use_api: + self.per_page = 42 + + def get_posts(self): + url = "https://gelbooru.com/index.php?page=post&s=list" + params = {"tags": self.tags, "pid": self.page_start * self.per_page} + + while True: + page = self.request(url, params=params).text + ids = list(text.extract_iter(page, '\d+)") + test = ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { + "count": 6, + }) + + def get_posts(self): + return util.advance(self.posts, self.page_start) + + +class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor): + """Extractor for single images from gelbooru.com""" + pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" + r"\?page=post&s=view&id=(?P\d+)") + test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", { + "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "count": 1, + }) + + def get_posts(self): + return (self.post,) diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py new file mode 100644 index 0000000..1dcb3c8 --- /dev/null +++ b/gallery_dl/extractor/gfycat.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://gfycat.com/""" + +from .common import Extractor, Message + + +class GfycatExtractor(Extractor): + """Base class for gfycat extractors""" + category = "gfycat" + filename_fmt = "{category}_{gfyName}.{extension}" + archive_fmt = "{gfyName}" + root = "https://gfycat.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.formats = (self.config("format", "mp4"), "mp4", "webm", "gif") + + def _select_format(self, gfyitem): + for fmt in self.formats: + key = fmt + "Url" + if key in gfyitem: + url = gfyitem[key] + gfyitem["extension"] = url.rpartition(".")[2] + return url + return "" + + def _get_info(self, gfycat_id): + url = "https://api.gfycat.com/v1/gfycats/" + gfycat_id + return self.request(url).json()["gfyItem"] + + +class GfycatImageExtractor(GfycatExtractor): + """Extractor for individual images from gfycat.com""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:\w+\.)?gfycat\.com" + r"/(?:gifs/detail/|\w+/)?([A-Za-z]+)") + test = ( + ("https://gfycat.com/GrayGenerousCowrie", { + "url": "e0b5e1d7223108249b15c3c7898dd358dbfae045", + "content": "5786028e04b155baa20b87c5f4f77453cd5edc37", + "keyword": { + "gfyId": "graygenerouscowrie", + "gfyName": "GrayGenerousCowrie", + "gfyNumber": "755075459", + "title": "Bottom's up", + "userName": "jackson3oh3", + "createDate": 1495884169, + "md5": "a4796e05b0db9ba9ce5140145cd318aa", + "width": 400, + "height": 224, + "frameRate": 23, + "numFrames": 158, + "views": int, + }, + }), + (("https://thumbs.gfycat.com/SillyLameIsabellinewheatear" + "-size_restricted.gif"), { + "url": "13b32e6cc169d086577d7dd3fd36ee6cdbc02726", + }), + ("https://gfycat.com/detail/UnequaledHastyAnkole?tagname=aww", { + "url": "e24c9f69897fd223343782425a429c5cab6a768e", + }), + ("https://gfycat.com/gifs/detail/UnequaledHastyAnkole"), + ("https://gfycat.com/ifr/UnequaledHastyAnkole"), + 
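# the bare-URL test entries above and below carry no result dict; by the
# test conventions used in this codebase, such entries appear to only
# assert that the URL is matched by this extractor's pattern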
("https://gfycat.com/ru/UnequaledHastyAnkole"), + ) + + def __init__(self, match): + GfycatExtractor.__init__(self, match) + self.gfycat_id = match.group(1) + + def items(self): + gfyitem = self._get_info(self.gfycat_id) + yield Message.Version, 1 + yield Message.Directory, gfyitem + yield Message.Url, self._select_format(gfyitem), gfyitem diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py new file mode 100644 index 0000000..01793dc --- /dev/null +++ b/gallery_dl/extractor/hbrowse.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.hbrowse.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text, exception +import json + + +class HbrowseBase(): + """Base class for hbrowse extractors""" + category = "hbrowse" + root = "https://www.hbrowse.com" + + def parse_page(self, page, data): + """Parse metadata on 'page' and add it to 'data'""" + data, pos = text.extract_all(page, ( + ('manga' , '
  • ', '', '', ' '), + ('origin', '', '
    ', pos)[0] + + data["manga"] = text.unescape(data["manga"]) + data["total"] = text.parse_int(data["total"]) + data["artist"] = text.remove_html(data["artist"]) + data["origin"] = text.remove_html(data["origin"]) + data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"')) + return data + + +class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor): + """Extractor for manga-chapters from hbrowse.com""" + directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}") + filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" + "{page:>03}.{extension}") + archive_fmt = "{manga_id}_{chapter}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))" + test = ("https://www.hbrowse.com/10363/c00000", { + "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6", + "keyword": "274996f6c809e5250b6ff3abbc5147e29f89d9a5", + "content": "44578ebbe176c2c27434966aef22945787e2781e", + }) + + def __init__(self, match): + self.path, self.gid, self.chapter = match.groups() + self.path += "/" + ChapterExtractor.__init__(self, match) + + def metadata(self, page): + return self.parse_page(page, { + "manga_id": text.parse_int(self.gid), + "chapter": text.parse_int(self.chapter) + }) + + def images(self, page): + base = self.root + "/data" + self.path + json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]" + return [(base + name, None) for name in json.loads(json_data)] + + +class HbrowseMangaExtractor(HbrowseBase, MangaExtractor): + """Extractor for manga from hbrowse.com""" + chapterclass = HbrowseChapterExtractor + reverse = False + pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$" + test = ("https://www.hbrowse.com/10363", { + "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6", + "keyword": "4b15fda1858a69de1fbf5afddfe47dd893397312", + }) + + def chapters(self, page): + results = [] + data = self.parse_page(page, { + "manga_id": text.parse_int( + self.manga_url.rstrip("/").rpartition("/")[2]) + }) + + pos = 0 + needle = '\nView ', '<', pos) + data["chapter"] = text.parse_int(url.rpartition("/")[2][1:]) + data["title"] = title + results.append((text.urljoin(self.root, url), data.copy())) diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py new file mode 100644 index 0000000..354acbf --- /dev/null +++ b/gallery_dl/extractor/hentai2read.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract hentai-manga from https://hentai2read.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text +import json +import re + + +class Hentai2readBase(): + """Base class for hentai2read extractors""" + category = "hentai2read" + root = "https://hentai2read.com" + + +class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): + """Extractor for a single manga chapter from hentai2read.com""" + archive_fmt = "{chapter_id}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+/(\d+))" + test = ("https://hentai2read.com/amazon_elixir/1/", { + "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", + "keyword": "ff84b8f751f0e4ee37717efc4332ff1db71951d9", + }) + + def __init__(self, match): + self.chapter = match.group(2) + ChapterExtractor.__init__(self, match) + + def metadata(self, page): + title, pos = text.extract(page, "", "") + manga_id, pos = text.extract(page, 'data-mid="', '"', pos) + chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) + match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - " + r"(\d+): (.+) . Page 1 ", title) + return { + "manga": match.group(1), + "manga_id": text.parse_int(manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), + "type": match.group(2), + "author": match.group(3), + "title": match.group(5), + "lang": "en", + "language": "English", + } + + @staticmethod + def images(page): + images = text.extract(page, "'images' : ", ",\n")[0] + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] + + +class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor): + """Extractor for hmanga from hentai2read.com""" + chapterclass = Hentai2readChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+)/?$" + test = ( + ("https://hentai2read.com/amazon_elixir/", { + "url": "273073752d418ec887d7f7211e42b832e8c403ba", + "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac", + }), + ("https://hentai2read.com/oshikage_riot/", { + "url": "6595f920a3088a15c2819c502862d45f8eb6bea6", + "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36", + }), + ) + + def chapters(self, page): + results = [] + manga, pos = text.extract( + page, '', '') + mtype, pos = text.extract( + page, '[', ']', pos) + manga_id = text.parse_int(text.extract( + page, 'data-mid="', '"', pos)[0]) + + while True: + chapter_id, pos = text.extract(page, ' data-cid="', '"', pos) + if not chapter_id: + return results + _ , pos = text.extract(page, ' href="', '"', pos) + url, pos = text.extract(page, ' href="', '"', pos) + chapter, pos = text.extract(page, '>', '<', pos) + + chapter, _, title = text.unescape(chapter).strip().partition(" - ") + results.append((url, { + "manga_id": manga_id, "manga": manga, "type": mtype, + "chapter_id": text.parse_int(chapter_id), + "chapter": text.parse_int(chapter), + "title": title, "lang": "en", "language": "English", + })) diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py new file mode 100644 index 0000000..e95467b --- /dev/null +++ b/gallery_dl/extractor/hentaicafe.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hentai.cafe/""" + +from . import foolslide +from .. 
import text +from ..cache import memcache +import re + + +class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor): + """Extractor for manga-chapters from hentai.cafe""" + category = "hentaicafe" + directory_fmt = ("{category}", "{manga}") + pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe" + r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)") + test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", { + "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2", + "keyword": "6913608267d883c82b887303b9ced13821188329", + }) + root = "https://hentai.cafe" + + def metadata(self, page): + info = text.unescape(text.extract(page, '', '')[0]) + manga, _, chapter_string = info.partition(" :: ") + + data = self._data(self.chapter_url.split("/")[5]) + data["manga"] = manga + data["chapter_string"] = chapter_string.rstrip(" :") + return self.parse_chapter_url(self.chapter_url, data) + + @memcache(keyarg=1) + def _data(self, manga): + return {"artist": [], "tags": []} + + +class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): + """Extractor for manga from hentai.cafe""" + category = "hentaicafe" + pattern = (r"(?:https?://)?" + r"(?:www\.)?hentai\.cafe" + r"((?:/manga/series)?/[^/?&#]+)/?$") + test = ( + # single chapter + ("https://hentai.cafe/hazuki-yuuto-summer-blues/", { + "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b", + "keyword": "eb9f98544098c961bd8cf5dbe69e6da51c4fb2f6", + }), + # multi-chapter + ("https://hentai.cafe/saitom-saitom-box/", { + "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", + "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb", + }), + # foolslide URL + ("https://hentai.cafe/manga/series/saitom-box/", { + "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", + "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c", + }), + ) + root = "https://hentai.cafe" + reverse = False + chapterclass = HentaicafeChapterExtractor + + def chapters(self, page): + if "/manga/series/" in self.manga_url: + chapters = foolslide.FoolslideMangaExtractor.chapters(self, page) + chapters.reverse() + return chapters + + tags , pos = text.extract(page, "

    Tags: ", "
    ") + artist, pos = text.extract(page, "\nArtists: ", "
    ", pos) + manga , pos = text.extract(page, "/manga/read/", "/", pos) + data = { + "tags" : text.split_html(tags)[::2], + "artist": text.split_html(artist), + } + HentaicafeChapterExtractor._data(manga).update(data) + + return [ + (url, data) + for url in re.findall( + r'
    ', '') + _ , pos = text.extract(page, 'id="picBox"', '', pos) + width , pos = text.extract(page, 'width="', '"', pos) + height, pos = text.extract(page, 'height="', '"', pos) + url , pos = text.extract(page, 'src="', '"', pos) + + title, _, artist = title.rpartition(" - ")[0].rpartition(" by ") + + data = text.nameext_from_url(url, { + "title": text.unescape(title), + "artist": text.unescape(artist), + "index": text.parse_int(index), + "width": text.parse_int(width), + "height": text.parse_int(height), + }) + if not data["extension"]: + data["extension"] = "jpg" + return text.urljoin(self.root, url), data + + def set_filters(self): + """Set site-internal filters to show all images""" + token = text.unquote(text.extract( + self.session.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0]) + data = { + "YII_CSRF_TOKEN": token, + "rating_nudity": 3, + "rating_violence": 3, + "rating_profanity": 3, + "rating_racism": 3, + "rating_sex": 3, + "rating_spoilers": 3, + "rating_yaoi": 1, + "rating_yuri": 1, + "rating_teen": 1, + "rating_guro": 1, + "rating_furry": 1, + "rating_beast": 1, + "rating_male": 1, + "rating_female": 1, + "rating_futa": 1, + "rating_other": 1, + "rating_scat": 1, + "rating_incest": 1, + "rating_rape": 1, + "filter_media": "A", + "filter_order": "date_new", + "filter_type": 0, + } + url = self.root + "/site/filters" + self.request(url, method="POST", data=data) + + +class HentaifoundryUserExtractor(HentaifoundryExtractor): + """Extractor for all images of a hentai-foundry-user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/(?:pictures/user/([^/]+)(?:/page/(\d+))?/?$" + r"|user/([^/]+)/profile)") + test = ( + ("https://www.hentai-foundry.com/pictures/user/Tenpura", { + "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28", + "keyword": "63ad576f87f82fa166ca4676761762f7f8496cf5", + }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/page/3"), + ("https://www.hentai-foundry.com/user/Tenpura/profile"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__( + self, match, match.group(1) or match.group(3), match.group(2)) + self.page_url = "{}/pictures/user/{}".format(self.root, self.user) + + def get_job_metadata(self): + page = self.request(self.page_url + "?enterAgree=1").text + count = text.extract(page, ">Pictures (", ")")[0] + return {"user": self.user, "count": text.parse_int(count)} + + +class HentaifoundryScrapsExtractor(HentaifoundryExtractor): + """Extractor for scrap images of a hentai-foundry-user""" + subcategory = "scraps" + directory_fmt = ("{category}", "{user}", "Scraps") + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/pictures/user/([^/]+)/scraps(?:/page/(\d+))?") + test = ( + ("https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", { + "url": "00a11e30b73ff2b00a1fba0014f08d49da0a68ec", + "keyword": "410c6c900cfd23a8dd1e53dfcc97a79ea68c3359", + }), + ("https://www.hentai-foundry.com" + "/pictures/user/Evulchibi/scraps/page/3"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__( + self, match, match.group(1), match.group(2)) + self.page_url = "{}/pictures/user/{}/scraps".format( + self.root, self.user) + + def get_job_metadata(self): + page = self.request(self.page_url + "?enterAgree=1").text + count = text.extract(page, ">Scraps (", ")")[0] + return {"user": self.user, "count": text.parse_int(count)} + + +class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): + """Extractor for favorite images of a hentai-foundry-user""" + subcategory = "favorite" + 
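# favorites are filed into a separate "Favorites" directory and use an
# "f_"-prefixed archive key (see directory_fmt/archive_fmt just below),
# presumably so their archive entries stay distinct from the "r_"/"p_"
# prefixed keys of the recent/popular extractors further down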
directory_fmt = ("{category}", "{user}", "Favorites") + archive_fmt = "f_{user}_{index}" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/user/([^/]+)/faves/pictures(?:/page/(\d+))?") + test = ( + ("https://www.hentai-foundry.com/user/Tenpura/faves/pictures", { + "url": "56f9ae2e89fe855e9fe1da9b81e5ec6212b0320b", + "keyword": "2b9478725e66d46ea043fa87476bbd28546958e7", + }), + ("https://www.hentai-foundry.com" + "/user/Tenpura/faves/pictures/page/3"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__( + self, match, match.group(1), match.group(2)) + self.page_url = "{}/user/{}/faves/pictures".format( + self.root, self.user) + + +class HentaifoundryRecentExtractor(HentaifoundryExtractor): + """Extractor for 'Recent Pictures' on hentaifoundry.com""" + subcategory = "recent" + directory_fmt = ("{category}", "Recent Pictures", "{date}") + archive_fmt = "r_{index}" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/pictures/recent/(\d+-\d+-\d+)(?:/page/(\d+))?") + test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20",) + + def __init__(self, match): + HentaifoundryExtractor.__init__(self, match, "", match.group(2)) + self.date = match.group(1) + self.page_url = "{}/pictures/recent/{}".format(self.root, self.date) + + def get_job_metadata(self): + self.request(self.root + "/?enterAgree=1") + return {"date": self.date} + + +class HentaifoundryPopularExtractor(HentaifoundryExtractor): + """Extractor for popular images on hentaifoundry.com""" + subcategory = "popular" + directory_fmt = ("{category}", "Popular Pictures") + archive_fmt = "p_{index}" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/pictures/popular(?:/page/(\d+))?") + test = ("http://www.hentai-foundry.com/pictures/popular",) + + def __init__(self, match): + HentaifoundryExtractor.__init__(self, match, "", match.group(1)) + self.page_url = self.root + "/pictures/popular" + + +class HentaifoundryImageExtractor(HentaifoundryExtractor): + """Extractor for a single image from hentaifoundry.com""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com" + r"/(?:pictures/user|[^/])/([^/]+)/(\d+)") + test = ( + (("https://www.hentai-foundry.com" + "/pictures/user/Tenpura/407501/shimakaze"), { + "url": "fbf2fd74906738094e2575d2728e8dc3de18a8a3", + "keyword": "cbb9381e6c2acce58db4adf4efc0ad7d138bddc4", + "content": "91bf01497c39254b6dfb234a18e8f01629c77fd1", + }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/", { + "exception": exception.HttpError, + }), + ("https://pictures.hentai-foundry.com" + "/t/Tenpura/407501/Tenpura-407501-shimakaze.png"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__(self, match, match.group(1)) + self.index = match.group(2) + + def items(self): + post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( + self.root, self.user, self.index) + url, data = self.get_image_metadata(post_url) + data["user"] = self.user + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, url, data + + def skip(self, _): + return 0 diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py new file mode 100644 index 0000000..cf4871f --- /dev/null +++ b/gallery_dl/extractor/hentaifox.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software 
Foundation. + +"""Extractors for https://hentaifox.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text + + +class HentaifoxBase(): + """Base class for hentaifox extractors""" + category = "hentaifox" + root = "https://hentaifox.com" + + +class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): + """Extractor for image galleries on hentaifox.com""" + pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" + test = ("https://hentaifox.com/gallery/56622/", { + "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", + "count": 24, + "keyword": "38f8517605feb6854d48833297da6b05c6541b69", + }) + + def __init__(self, match): + GalleryExtractor.__init__(self, match) + self.gallery_id = match.group(2) + + def metadata(self, page, split=text.split_html): + extr = text.extract_from(page) + + return { + "gallery_id": text.parse_int(self.gallery_id), + "title" : text.unescape(extr("
<h1>
    ", "
</h1>
    ")), + "parody" : split(extr(">Parodies:" , ""))[::2], + "characters": split(extr(">Characters:", ""))[::2], + "tags" : split(extr(">Tags:" , ""))[::2], + "artist" : split(extr(">Artists:" , ""))[::2], + "group" : split(extr(">Groups:" , ""))[::2], + "type" : text.remove_html(extr(">Category:", "")), + "language" : "English", + "lang" : "en", + } + + def images(self, page): + return [ + (text.urljoin(self.root, url.replace("t.", ".")), None) + for url in text.extract_iter(page, 'data-src="', '"') + ] + + +class HentaifoxSearchExtractor(HentaifoxBase, Extractor): + """Extractor for search results and listings on hentaifox.com""" + subcategory = "search" + pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com" + r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)") + test = ( + ("https://hentaifox.com/parody/touhou-project/"), + ("https://hentaifox.com/character/reimu-hakurei/"), + ("https://hentaifox.com/artist/distance/"), + ("https://hentaifox.com/search/touhou/"), + ("https://hentaifox.com/tag/full-colour/", { + "pattern": HentaifoxGalleryExtractor.pattern, + "count": ">= 40", + "keyword": { + "url": str, + "gallery_id": int, + "thumbnail": r"re:https://i\d*.hentaifox.com/\d+/\d+/thumb\.", + "title": str, + "tags": list, + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + + def items(self): + yield Message.Version, 1 + for gallery in self.galleries(): + yield Message.Queue, gallery["url"], gallery + + def galleries(self): + url = "{}/{}/".format(self.root, self.path) + + while True: + page = self.request(url).text + info, gpos = text.extract( + page, 'class="galleries_overview">', 'class="clear">') + + for ginfo in text.extract_iter(info, '
    ", "")[0] + chapter_id = text.extract(page, 'report/C', '"')[0] + pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " + match = re.match(pattern, title) + return { + "manga": match.group(1), + "manga_id": text.parse_int(self.manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), + "type": match.group(2), + "title": match.group(3), + "author": match.group(4), + "lang": "en", + "language": "English", + } + + @staticmethod + def images(page): + images = text.extract(page, "var rff_imageList = ", ";")[0] + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] + + +class HentaihereMangaExtractor(HentaihereBase, MangaExtractor): + """Extractor for hmanga from hentaihere.com""" + chapterclass = HentaihereChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com(/m/S\d+)/?$" + test = ( + ("https://hentaihere.com/m/S13812", { + "url": "d1ba6e28bb2162e844f8559c2b2725ba0a093559", + "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac", + }), + ("https://hentaihere.com/m/S7608", { + "url": "6c5239758dc93f6b1b4175922836c10391b174f7", + "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36", + }), + ) + + def chapters(self, page): + results = [] + manga_id = text.parse_int( + self.manga_url.rstrip("/").rpartition("/")[2][1:]) + manga, pos = text.extract( + page, '', '') + mtype, pos = text.extract( + page, '[', ']', pos) + + while True: + marker, pos = text.extract( + page, '
  • ', '', pos) + if marker is None: + return results + url, pos = text.extract(page, '\n', '<', pos) + chapter_id, pos = text.extract(page, '/C', '"', pos) + chapter, _, title = text.unescape(chapter).strip().partition(" - ") + results.append((url, { + "manga_id": manga_id, "manga": manga, "type": mtype, + "chapter_id": text.parse_int(chapter_id), + "chapter": text.parse_int(chapter), + "title": title, "lang": "en", "language": "English", + })) diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py new file mode 100644 index 0000000..d875817 --- /dev/null +++ b/gallery_dl/extractor/hentainexus.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hentainexus.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, util +import json + + +class HentainexusGalleryExtractor(GalleryExtractor): + """Extractor for image galleries on hentainexus.com""" + category = "hentainexus" + root = "https://hentainexus.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" + r"/(?:view|read)/(\d+)") + test = ( + ("https://hentainexus.com/view/5688", { + "url": "746d0043e20030f1171aae5ea113176607302517", + "keyword": "b05986369fbaf29cfa08b118960d92c49e59524b", + }), + ("https://hentainexus.com/read/5688"), + ) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/view/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + rmve = text.remove_html + extr = text.extract_from(page) + data = { + "gallery_id" : text.parse_int(self.gallery_id), + "tags" : extr('"og:description" content="', '"').split(", "), + "thumbnail" : extr('"og:image" content="', '"'), + "title" : extr('
<h1 class="title">', '</h1>
    '), + "artist" : rmve(extr('viewcolumn">Artist' , '')), + "book" : rmve(extr('viewcolumn">Book' , '')), + "language" : rmve(extr('viewcolumn">Language' , '')), + "magazine" : rmve(extr('viewcolumn">Magazine' , '')), + "parody" : rmve(extr('viewcolumn">Parody' , '')), + "publisher" : rmve(extr('viewcolumn">Publisher' , '')), + "description": rmve(extr('viewcolumn">Description', '')), + } + data["lang"] = util.language_to_code(data["language"]) + return data + + def images(self, page): + url = "{}/read/{}".format(self.root, self.gallery_id) + extr = text.extract_from(self.request(url).text) + urls = extr("initReader(", "]") + "]" + return [(url, None) for url in json.loads(urls)] + + +class HentainexusSearchExtractor(Extractor): + """Extractor for search results on hentainexus.com""" + category = "hentainexus" + subcategory = "search" + root = "https://hentainexus.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" + r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$") + test = ( + ("https://hentainexus.com/?q=tag:%22heart+pupils%22%20tag:group", { + "pattern": HentainexusGalleryExtractor.pattern, + "count": ">= 50", + }), + ("https://hentainexus.com/page/3?q=tag:%22heart+pupils%22"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.params = text.parse_query(match.group(1)) + + def items(self): + yield Message.Version, 1 + params = self.params + path = "/" + + while path: + page = self.request(self.root + path, params=params).text + extr = text.extract_from(page) + data = {"_extractor": HentainexusGalleryExtractor} + + while True: + gallery_id = extr('
    ', '<').strip()), + "artist" : self._prep(extr('
<h2>', '</h2>
    ')), + "group" : self._prep(extr('Group', '')), + "type" : self._prep_1(extr('Type', '')), + "language" : self._prep_1(extr('Language', '')), + "parody" : self._prep(extr('Series', '')), + "characters": self._prep(extr('Characters', '')), + "tags" : self._prep(extr('Tags', '')), + "date" : self._date(extr('', '')), + } + if data["language"] == "N/a": + data["language"] = None + data["lang"] = util.language_to_code(data["language"]) + return data + + def images(self, page): + # see https://ltn.hitomi.la/common.js + offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0 + subdomain = chr(97 + offset) + "a" + base = "https://" + subdomain + ".hitomi.la/galleries/" + + # set Referer header before image downloads (#239) + self.session.headers["Referer"] = self.chapter_url + + # handle Game CG galleries with scenes (#321) + scenes = text.extract(page, "var scene_indexes = [", "]")[0] + if scenes and scenes.strip(): + url = "{}/reader/{}.html".format(self.root, self.gallery_id) + page = self.request(url).text + begin, end = ">//g.hitomi.la/galleries/", "
  • " + else: + begin, end = "'//tn.hitomi.la/smalltn/", ".jpg'," + + return [ + (base + urlpart, None) + for urlpart in text.extract_iter(page, begin, end) + ] + + @staticmethod + def _prep(value): + return [ + text.unescape(string.capwords(v)) + for v in text.extract_iter(value or "", '.html">', '<') + ] + + @staticmethod + def _prep_1(value): + return text.remove_html(value).capitalize() + + @staticmethod + def _date(value): + return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z") diff --git a/gallery_dl/extractor/hypnohub.py b/gallery_dl/extractor/hypnohub.py new file mode 100644 index 0000000..bf2db96 --- /dev/null +++ b/gallery_dl/extractor/hypnohub.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hypnohub.net/""" + +from . import booru + + +class HypnohubExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): + """Base class for hypnohub extractors""" + category = "hypnohub" + api_url = "https://hypnohub.net/post.json" + post_url = "https://hypnohub.net/post/show/{}" + + +class HypnohubTagExtractor(booru.TagMixin, HypnohubExtractor): + """Extractor for images from hypnohub.net based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net" + r"/post\?(?:[^&#]*&)*tags=(?P[^&#]+)") + test = ("https://hypnohub.net/post?tags=gonoike_biwa", { + "url": "6bebc4318489ee37e0c3b814352acd6783ba95d6", + }) + + +class HypnohubPoolExtractor(booru.PoolMixin, HypnohubExtractor): + """Extractor for image-pools from hypnohub.net""" + pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/pool/show/(?P\d+)" + test = ("https://hypnohub.net/pool/show/61", { + "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf", + }) + + +class HypnohubPostExtractor(booru.PostMixin, HypnohubExtractor): + """Extractor for single images from hypnohub.net""" + pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/post/show/(?P\d+)" + test = ("https://hypnohub.net/post/show/73964", { + "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", + "options": (("tags", True),), + "keyword": { + "tags_artist": "gonoike_biwa icontrol_(manipper)", + "tags_character": "komaru_naegi", + "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode", + "tags_general": str, + }, + }) + + +class HypnohubPopularExtractor(booru.MoebooruPopularMixin, HypnohubExtractor): + """Extractor for popular images from hypnohub.net""" + pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net" + r"/post/popular_(?Pby_(?:day|week|month)|recent)" + r"(?:\?(?P[^#]*))?") + test = ( + ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", { + "count": 20, + }), + ("https://hypnohub.net/post/popular_recent"), + ) + + def __init__(self, match): + super().__init__(match) + self.api_url = "https://hypnohub.net/post/popular_{scale}.json".format( + scale=self.scale) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py new file mode 100644 index 0000000..dcb4a54 --- /dev/null +++ b/gallery_dl/extractor/idolcomplex.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://idol.sankakucomplex.com/""" + +from . 
import sankaku + + +class IdolcomplexExtractor(sankaku.SankakuExtractor): + """Base class for idolcomplex extractors""" + category = "idolcomplex" + cookiedomain = "idol.sankakucomplex.com" + subdomain = "idol" + + +class IdolcomplexTagExtractor(IdolcomplexExtractor, + sankaku.SankakuTagExtractor): + """Extractor for images from idol.sankakucomplex.com by search-tags""" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)" + test = ( + ("https://idol.sankakucomplex.com/?tags=lyumos+wreath", { + "count": ">= 6", + "pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" + r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", + }), + ("https://idol.sankakucomplex.com" + "/?tags=lyumos+wreath&page=3&next=694215"), + ) + + +class IdolcomplexPoolExtractor(IdolcomplexExtractor, + sankaku.SankakuPoolExtractor): + """Extractor for image-pools from idol.sankakucomplex.com""" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)" + test = ("https://idol.sankakucomplex.com/pool/show/145", { + "count": 3, + }) + + +class IdolcomplexPostExtractor(IdolcomplexExtractor, + sankaku.SankakuPostExtractor): + """Extractor for single images from idol.sankakucomplex.com""" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)" + test = ("https://idol.sankakucomplex.com/post/show/694215", { + "content": "694ec2491240787d75bf5d0c75d0082b53a85afd", + "options": (("tags", True),), + "keyword": { + "tags_character": "shani_(the_witcher)", + "tags_copyright": "the_witcher", + "tags_idol": str, + "tags_medium": str, + "tags_general": str, + }, + }) diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py new file mode 100644 index 0000000..6980185 --- /dev/null +++ b/gallery_dl/extractor/imagebam.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from http://www.imagebam.com/""" + +from .common import Extractor, Message +from .. import text, exception + + +class ImagebamExtractor(Extractor): + """Base class for imagebam extractors""" + category = "imagebam" + root = "http://www.imagebam.com" + + def get_image_data(self, page_url, data): + """Fill 'data' and return image URL""" + page = self.request(page_url).text + image_url = text.extract(page, 'property="og:image" content="', '"')[0] + data["extension"] = image_url.rpartition(".")[2] + data["image_key"] = page_url.rpartition("/")[2] + data["image_id"] = data["image_key"][6:] + return image_url + + def request_page(self, url): + """Retrieve the main part of a gallery page""" + page = self.request(text.urljoin(self.root, url)).text + return text.extract(page, "
    ", "
    ")[0] + + +class ImagebamGalleryExtractor(ImagebamExtractor): + """Extractor for image galleries from imagebam.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{title} - {gallery_key}") + filename_fmt = "{num:>03}-{image_key}.{extension}" + archive_fmt = "{gallery_key}_{image_key}" + pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)" + test = ( + ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { + "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", + "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a", + "content": "596e6bfa157f2c7169805d50075c2986549973a8", + }), + ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", { + # more than 100 images; see issue #219 + "count": 107, + "url": "f92ce5b17676b6ea69288f0aef26f4cdbea7fd8d", + }), + ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + ImagebamExtractor.__init__(self, match) + self.gallery_key = match.group(1) + + def items(self): + url = "{}/gallery/{}".format(self.root, self.gallery_key) + page = self.request_page(url) + if not page or ">Error<" in page: + raise exception.NotFoundError("gallery") + + data = self.get_metadata(page) + imgs = self.get_image_pages(page) + data["count"] = len(imgs) + data["gallery_key"] = self.gallery_key + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], page_url in enumerate(imgs, 1): + image_url = self.get_image_data(page_url, data) + yield Message.Url, image_url, data + + @staticmethod + def get_metadata(page): + """Return gallery metadata""" + return text.extract_all(page, ( + ("title" , "'> ", " ", ""), + ("description", ":#FCFCFC;'>", ""), + ))[0] + + def get_image_pages(self, page): + """Return a list of all image pages""" + pages = [] + while True: + pages.extend(text.extract_iter(page, "\n
    0: + url = text.extract(page, "', '')[0]) + parts = info["contentUrl"].rsplit("/", 3) + return text.nameext_from_url(parts[3], { + "url": info["contentUrl"], + "title": text.unescape(info["name"]), + "uploader": info["author"], + "date": info["datePublished"], + "width": text.parse_int(info["width"]), + "height": text.parse_int(info["height"]), + "gallery_id": text.parse_int(parts[1]), + "image_id": text.parse_int(parts[2]), + }) + + +class ImagefapUserExtractor(ImagefapExtractor): + """Extractor for all galleries from a user at imagefap.com""" + subcategory = "user" + categorytransfer = True + pattern = (r"(?:https?://)?(?:www\.)?imagefap\.com/" + r"(?:profile(?:\.php\?user=|/)([^/?&#]+)" + r"|usergallery\.php\?userid=(\d+))") + test = ( + ("https://www.imagefap.com/profile/LucyRae/galleries", { + "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd", + }), + ("https://www.imagefap.com/usergallery.php?userid=1862791", { + "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd", + }), + ("https://www.imagefap.com/profile.php?user=LucyRae"), + ) + + def __init__(self, match): + ImagefapExtractor.__init__(self, match) + self.user, self.user_id = match.groups() + + def items(self): + yield Message.Version, 1 + for gid, name in self.get_gallery_data(): + url = "{}/gallery/{}".format(self.root, gid) + data = { + "gallery_id": text.parse_int(gid), + "title": text.unescape(name), + "_extractor": ImagefapGalleryExtractor, + } + yield Message.Queue, url, data + + def get_gallery_data(self): + """Yield all gallery_ids of a specific user""" + folders = self.get_gallery_folders() + url = "{}/ajax_usergallery_folder.php".format(self.root) + params = {"userid": self.user_id} + for folder_id in folders: + params["id"] = folder_id + page = self.request(url, params=params).text + + pos = 0 + while True: + gid, pos = text.extract(page, '", "<", pos) + yield gid, name + + def get_gallery_folders(self): + """Create a list of all folder_ids of a specific user""" + if self.user: + url = "{}/profile/{}/galleries".format(self.root, self.user) + else: + url = "{}/usergallery.php?userid={}".format( + self.root, self.user_id) + page = self.request(url).text + self.user_id, pos = text.extract(page, '?userid=', '"') + folders, pos = text.extract(page, ' id="tgl_all" value="', '"', pos) + return folders.split("|")[:-1] diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py new file mode 100644 index 0000000..954c1f0 --- /dev/null +++ b/gallery_dl/extractor/imagehosts.py @@ -0,0 +1,251 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Collection of extractors for various imagehosts""" + +from .common import Extractor, Message, SharedConfigMixin +from .. 
import text, exception +from ..cache import memcache +from os.path import splitext + + +class ImagehostImageExtractor(SharedConfigMixin, Extractor): + """Base class for single-image extractors for various imagehosts""" + basecategory = "imagehost" + subcategory = "image" + archive_fmt = "{token}" + https = False + method = "post" + params = "simple" + cookies = None + encoding = None + + def __init__(self, match): + Extractor.__init__(self, match) + self.page_url = "http{}://{}".format( + "s" if self.https else "", match.group(1)) + self.token = match.group(2) + if self.params == "simple": + self.params = { + "imgContinue": "Continue+to+image+...+", + } + elif self.params == "complex": + self.params = { + "op": "view", + "id": self.token, + "pre": "1", + "adb": "1", + "next": "Continue+to+image+...+", + } + else: + self.params = {} + self.method = "get" + + def items(self): + page = self.request( + self.page_url, + method=self.method, + data=self.params, + cookies=self.cookies, + encoding=self.encoding, + ).text + + url, filename = self.get_info(page) + data = text.nameext_from_url(filename, {"token": self.token}) + if self.https and url.startswith("http:"): + url = "https:" + url[5:] + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, url, data + + def get_info(self, page): + """Find image-url and string to get filename from""" + + +class ImxtoImageExtractor(ImagehostImageExtractor): + """Extractor for single images from imx.to""" + category = "imxto" + pattern = (r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)" + r"/(?:i/|img-)(\w+)(\.html)?)") + test = ( + ("https://imx.to/i/1qdeva", { # new-style URL + "url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130", + "keyword": "1153a986c939d7aed599905588f5c940048bc517", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ("https://imx.to/img-57a2050547b97.html", { # old-style URL + "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", + "keyword": "fd2240aee77a21b8252d5b829a1f7e542f927f09", + "content": "54592f2635674c25677c6872db3709d343cdf92f", + }), + ("https://img.yt/img-57a2050547b97.html", { # img.yt domain + "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", + }), + ("https://imx.to/img-57a2050547b98.html", { + "exception": exception.NotFoundError, + }), + ) + https = True + encoding = "utf-8" + + def __init__(self, match): + ImagehostImageExtractor.__init__(self, match) + if "/img-" in self.page_url: + self.page_url = self.page_url.replace("img.yt", "imx.to") + self.url_ext = True + else: + self.url_ext = False + + def get_info(self, page): + url, pos = text.extract( + page, '
    ', '', '<', pos) + return url, text.unescape(filename) + + +class TurboimagehostImageExtractor(ImagehostImageExtractor): + """Extractor for single images from www.turboimagehost.com""" + category = "turboimagehost" + pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com" + r"/p/(\d+)/[^/?&#]+\.html)") + test = ("https://www.turboimagehost.com/p/39078423/test--.png.html", { + "url": "b94de43612318771ced924cb5085976f13b3b90e", + "keyword": "704757ca8825f51cec516ec44c1e627c1f2058ca", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }) + https = True + params = None + + def get_info(self, page): + url = text.extract(page, 'src="', '"', page.index("   ', ' of '), + (None , 'class="image-container"', ''), + ("filename" , ' title="', '"'), + ))[0] + + @staticmethod + def get_image_url(page): + """Extract download-url""" + pos = page.index(">Image") + return text.extract(page, '', page)
+
+        title = text.extract(page, ", "")[0] + title, _, count = title.rpartition(" - ") + return { + "gallery_key": self.gallery_key, + "title": text.unescape(title), + "count": count[:-7], + } + + def get_image_keys(self): + return self.image_keys + + +class ImgboxImageExtractor(ImgboxExtractor): + """Extractor for single images from imgbox.com""" + subcategory = "image" + archive_fmt = "{image_key}" + pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})" + test = ( + ("https://imgbox.com/qHhw7lpG", { + "url": "d931f675a9b848fa7cb9077d6c2b14eb07bdb80f", + "keyword": "dfc72310026b45f3feb4f9cada20c79b2575e1af", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ("https://imgbox.com/qHhw7lpH", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + ImgboxExtractor.__init__(self, match) + self.image_key = match.group(1) + + def get_image_keys(self): + return (self.image_key,) + + @staticmethod + def get_image_metadata(page): + data = ImgboxExtractor.get_image_metadata(page) + if not data["filename"]: + raise exception.NotFoundError("image") + return data diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py new file mode 100644 index 0000000..a97f2e0 --- /dev/null +++ b/gallery_dl/extractor/imgth.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://imgth.com/""" + +from .common import Extractor, Message +from .. import text + + +class ImgthGalleryExtractor(Extractor): + """Extractor for image galleries from imgth.com""" + category = "imgth" + subcategory = "gallery" + directory_fmt = ("{category}", "{gallery_id} {title}") + filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{num}" + pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)" + test = ("http://imgth.com/gallery/37/wallpaper-anime", { + "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", + "keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.gid = match.group(1) + self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/" + + def items(self): + page = self.request(self.url_base + "0").text + data = self.metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], url in enumerate(self.images(page), 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + def images(self, page): + """Yield all image urls for this gallery""" + pnum = 0 + while True: + thumbs = text.extract(page, '
      ', '
    ')[0] + for url in text.extract_iter(thumbs, '' not in page: + return + pnum += 1 + page = self.request(self.url_base + str(pnum)).text + + def metadata(self, page): + """Collect metadata for extractor-job""" + return text.extract_all(page, ( + ("title", '
<h1>', '</h1>
    '), + ("count", 'total of images in this gallery: ', ' '), + ("date" , 'created on ', ' by <'), + (None , 'href="/users/', ''), + ("user" , '>', '<'), + ), values={"gallery_id": self.gid})[0] diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py new file mode 100644 index 0000000..0468c0b --- /dev/null +++ b/gallery_dl/extractor/imgur.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://imgur.com/""" + +from .common import Extractor, Message +from .. import text, exception +import json + + +class ImgurExtractor(Extractor): + """Base class for imgur extractors""" + category = "imgur" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item_id = match.group(1) + self.mp4 = self.config("mp4", True) + + def _get_data(self, urlpart): + response = self.request("https://imgur.com/" + urlpart, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError(self.subcategory) + data = text.extract(response.text, "image : ", ",\n")[0] + return self._clean(json.loads(data)) + + def _prepare(self, image): + image["ext"] = image["ext"].partition("?")[0] + if image["ext"] == ".gif" and ( + (self.mp4 and image["prefer_video"]) or self.mp4 == "always"): + image["ext"] = ".mp4" + url = "https://i.imgur.com/" + image["hash"] + image["ext"] + image["extension"] = image["ext"][1:] + return url + + @staticmethod + def _clean(data): + try: + del data["adConfig"] + del data["isAd"] + except KeyError: + pass + return data + + +class ImgurImageExtractor(ImgurExtractor): + """Extractor for individual images from imgur.com""" + subcategory = "image" + filename_fmt = "{category}_{hash}{title:?_//}.{extension}" + archive_fmt = "{hash}" + pattern = (r"(?:https?://)?(?:www\.|[im]\.|)?imgur\.com" + r"/(?!gallery)(\w{7}|\w{5})[sbtmlh]?\.?") + test = ( + ("https://imgur.com/21yMxCS", { + "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + "keyword": { + "animated": False, + "datetime": "2016-11-10 14:24:35", + "description": str, + "ext": ".png", + "extension": "png", + "hash": "21yMxCS", + "height": "32", + "is_moderated": False, + "is_safe": False, + "is_viral": 0, + "looping": False, + "mimetype": "image/png", + "name": None, + "prefer_video": False, + "size": 182, + "source": "", + "title": "Test", + "video_host": None, + "video_source": None, + "width": "64", + }, + }), + ("http://imgur.com/0gybAXR", { # gifv/mp4 video + "url": "a2220eb265a55b0c95e0d3d721ec7665460e3fd7", + "content": "a3c080e43f58f55243ab830569ba02309d59abfc", + }), + ("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1' + "url": "73f361b50753ab25da64160aa50bc5d139480d45", + }), + ("https://imgur.com/zzzzzzz", { # not found + "exception": exception.NotFoundError, + }), + ("https://www.imgur.com/21yMxCS"), # www + ("https://m.imgur.com/21yMxCS"), # mobile + ("https://imgur.com/zxaY6"), # 5 character key + ("https://i.imgur.com/21yMxCS.png"), # direct link + ("https://i.imgur.com/21yMxCSh.png"), # direct link thumbnail + ("https://i.imgur.com/zxaY6.gif"), # direct link (short) + ("https://i.imgur.com/zxaY6s.gif"), # direct link (short; thumb) + ) + + def items(self): + image = self._get_data(self.item_id) + url = self._prepare(image) + + yield Message.Version, 1 + yield 
Message.Directory, image + yield Message.Url, url, image + + +class ImgurAlbumExtractor(ImgurExtractor): + """Extractor for image albums from imgur.com""" + subcategory = "album" + directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}") + filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}" + archive_fmt = "{album[hash]}_{hash}" + pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com" + r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})") + test = ( + ("https://imgur.com/a/TcBmP", { + "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", + "keyword": { + "album": { + "album_cover": "693j2Kr", + "album_description": None, + "cover": "693j2Kr", + "datetime": "2015-10-09 10:37:50", + "description": None, + "hash": "TcBmP", + "id": "TcBmP", + "is_album": True, + "num_images": "19", + "title": "138", + "title_clean": "TcBmP", + "views": str, + }, + "animated": bool, + "datetime": str, + "extension": str, + "hash": str, + "height": int, + "num": int, + "prefer_video": bool, + "size": int, + "title": str, + "width": int, + }, + }), + ("https://imgur.com/gallery/eD9CT", { # large album + "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937", + }), + ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash + "url": "695ef0c950023362a0163ee5041796300db76674", + }), + ("https://imgur.com/t/unmuted/YMqBcua", { # unmuted URL + "url": "86b4747f8147cec7602f0214e267309af73a8655", + }), + ("https://imgur.com/a/TcBmQ", { + "exception": exception.NotFoundError, + }), + ("https://www.imgur.com/a/TcBmP"), # www + ("https://m.imgur.com/a/TcBmP"), # mobile + ) + + def items(self): + album = self._get_data("a/" + self.item_id + "/all") + images = album["album_images"]["images"] + del album["album_images"] + + if int(album["num_images"]) > len(images): + url = ("https://imgur.com/ajaxalbums/getimages/" + + self.item_id + "/hit.json") + images = self.request(url).json()["data"]["images"] + + yield Message.Version, 1 + yield Message.Directory, {"album": album, "count": len(images)} + for num, image in enumerate(images, 1): + url = self._prepare(image) + image["num"] = num + image["album"] = album + yield Message.Url, url, image diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py new file mode 100644 index 0000000..871236b --- /dev/null +++ b/gallery_dl/extractor/instagram.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.instagram.com/""" + +import hashlib +import json +from .common import Extractor, Message +from .. 
import text + + +class InstagramExtractor(Extractor): + """Base class for instagram extractors""" + category = "instagram" + directory_fmt = ("{category}", "{username}") + filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}" + archive_fmt = "{media_id}" + root = "https://www.instagram.com" + + def get_metadata(self): + return {} + + def items(self): + yield Message.Version, 1 + + metadata = self.get_metadata() + for data in self.instagrams(): + data.update(metadata) + yield Message.Directory, data + + if data['typename'] == 'GraphImage': + yield Message.Url, data['display_url'], \ + text.nameext_from_url(data['display_url'], data) + elif data['typename'] == 'GraphVideo': + yield Message.Url, \ + 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data + + def _extract_shared_data(self, page): + return json.loads(text.extract(page, + 'window._sharedData = ', ';')[0]) + + def _extract_postpage(self, url): + page = self.request(url).text + shared_data = self._extract_shared_data(page) + media = shared_data['entry_data']['PostPage'][0]['graphql']['shortcode_media'] + + common = { + 'date': text.parse_timestamp(media['taken_at_timestamp']), + 'likes': text.parse_int(media['edge_media_preview_like']['count']), + 'owner_id': media['owner']['id'], + 'username': media['owner']['username'], + 'fullname': media['owner']['full_name'], + 'description': text.parse_unicode_escapes('\n'.join( + edge['node']['text'] + for edge in media['edge_media_to_caption']['edges'] + )), + } + + medias = [] + if media['__typename'] == 'GraphSidecar': + yi = 0 + for n in media['edge_sidecar_to_children']['edges']: + children = n['node'] + media_data = { + 'media_id': children['id'], + 'shortcode': children['shortcode'], + 'typename': children['__typename'], + 'display_url': children['display_url'], + 'height': text.parse_int(children['dimensions']['height']), + 'width': text.parse_int(children['dimensions']['width']), + 'sidecar_media_id': media['id'], + 'sidecar_shortcode': media['shortcode'], + } + if children['__typename'] == 'GraphVideo': + media_data["_ytdl_index"] = yi + yi += 1 + media_data.update(common) + medias.append(media_data) + + else: + media_data = { + 'media_id': media['id'], + 'shortcode': media['shortcode'], + 'typename': media['__typename'], + 'display_url': media['display_url'], + 'height': text.parse_int(media['dimensions']['height']), + 'width': text.parse_int(media['dimensions']['width']), + } + media_data.update(common) + medias.append(media_data) + + return medias + + def _extract_page(self, url, page_type): + shared_data_fields = { + 'ProfilePage': { + 'node': 'user', + 'node_id': 'id', + 'edge_to_medias': 'edge_owner_to_timeline_media', + 'variables_id': 'id', + 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41', + }, + 'TagPage': { + 'node': 'hashtag', + 'node_id': 'name', + 'edge_to_medias': 'edge_hashtag_to_media', + 'variables_id': 'tag_name', + 'query_hash': 'f92f56d47dc7a55b606908374b43a314', + }, + } + + page = self.request(url).text + shared_data = self._extract_shared_data(page) + psdf = shared_data_fields[page_type] + + while True: + # Deal with different structure of pages: the first page + # has interesting data in `entry_data', next pages in `data'. 
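+            # Hedged sketch of the two layouts handled below (paths taken
+            # from this function's own accesses; the exact nesting is an
+            # assumption about Instagram's responses, not guaranteed):
+            #   first page: shared_data['entry_data'][page_type][0]['graphql']
+            #   next pages: shared_data['data']
+            # both then lead to [psdf['node']][psdf['edge_to_medias']]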
+ if 'entry_data' in shared_data: + base_shared_data = shared_data['entry_data'][page_type][0]['graphql'] + + # variables_id is available only in the first page + variables_id = base_shared_data[psdf['node']][psdf['node_id']] + else: + base_shared_data = shared_data['data'] + + medias = base_shared_data[psdf['node']][psdf['edge_to_medias']] + has_next_page = medias['page_info']['has_next_page'] + shortcodes = [n['node']['shortcode'] for n in medias['edges']] + + for s in shortcodes: + url = '{}/p/{}/'.format(self.root, s) + yield from self._extract_postpage(url) + + if not has_next_page: + break + + end_cursor = medias['page_info']['end_cursor'] + variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format( + psdf['variables_id'], + variables_id, + end_cursor, + ) + headers = { + "X-Requested-With": "XMLHttpRequest", + "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(), + } + url = '{}/graphql/query/?query_hash={}&variables={}'.format( + self.root, + psdf['query_hash'], + variables, + ) + shared_data = self.request(url, headers=headers).json() + + def _extract_profilepage(self, url): + yield from self._extract_page(url, 'ProfilePage') + + def _extract_tagpage(self, url): + yield from self._extract_page(url, 'TagPage') + + +class InstagramImageExtractor(InstagramExtractor): + """Extractor for PostPage""" + subcategory = "image" + pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)" + test = ( + # GraphImage + ("https://www.instagram.com/p/BqvsDleB3lV/", { + "pattern": r"https://[^/]+\.(cdninstagram\.com|fbcdn\.net)" + r"/vp/[0-9a-f]+/[0-9A-F]+/t51.2885-15/e35" + r"/44877605_725955034447492_3123079845831750529_n.jpg", + "keyword": { + "date": "type:datetime", + "description": str, + "height": int, + "likes": int, + "media_id": "1922949326347663701", + "shortcode": "BqvsDleB3lV", + "typename": "GraphImage", + "username": "instagram", + "width": int, + } + }), + + # GraphSidecar + ("https://www.instagram.com/p/BoHk1haB5tM/", { + "count": 5, + "keyword": { + "sidecar_media_id": "1875629777499953996", + "sidecar_shortcode": "BoHk1haB5tM", + "likes": int, + "username": "instagram", + } + }), + + # GraphVideo + ("https://www.instagram.com/p/Bqxp0VSBgJg/", { + "url": "8f38c1cf460c9804842f7306c487410f33f82e7e", + "keyword": { + "date": "type:datetime", + "description": str, + "height": int, + "likes": int, + "media_id": "1923502432034620000", + "shortcode": "Bqxp0VSBgJg", + "typename": "GraphVideo", + "username": "instagram", + "width": int, + } + }), + + # GraphSidecar with 2 embedded GraphVideo objects + ("https://www.instagram.com/p/BtOvDOfhvRr/", { + "count": 2, + "url": "e290d4180a58ae50c910d51d3b04d5f5c4622cd7", + "keyword": { + "sidecar_media_id": "1967717017113261163", + "sidecar_shortcode": "BtOvDOfhvRr", + "_ytdl_index": int, + } + }) + ) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.shortcode = match.group(1) + + def instagrams(self): + url = '{}/p/{}/'.format(self.root, self.shortcode) + return self._extract_postpage(url) + + +class InstagramUserExtractor(InstagramExtractor): + """Extractor for ProfilePage""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)") + test = ("https://www.instagram.com/instagram/", { + "range": "1-12", + "count": ">= 12", + }) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.username = match.group(1) + + def instagrams(self): + url = '{}/{}/'.format(self.root, self.username) + return 
self._extract_profilepage(url) + + +class InstagramTagExtractor(InstagramExtractor): + """Extractor for TagPage""" + subcategory = "tag" + directory_fmt = ("{category}", "{subcategory}", "{tag}") + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/explore/tags/([^/?&#]+)") + test = ("https://www.instagram.com/explore/tags/instagram/", { + "range": "1-12", + "count": ">= 12", + }) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.tag = match.group(1) + + def get_metadata(self): + return {"tag": self.tag} + + def instagrams(self): + url = '{}/explore/tags/{}/'.format(self.root, self.tag) + return self._extract_tagpage(url) diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py new file mode 100644 index 0000000..5902333 --- /dev/null +++ b/gallery_dl/extractor/keenspot.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for http://www.keenspot.com/""" + +from .common import Extractor, Message +from .. import text + + +class KeenspotComicExtractor(Extractor): + """Extractor for webcomics from keenspot.com""" + category = "keenspot" + subcategory = "comic" + directory_fmt = ("{category}", "{comic}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{comic}_{filename}" + pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?" + test = ( + ("http://marksmen.keenspot.com/", { # link + "range": "1-3", + "url": "83bcf029103bf8bc865a1988afa4aaeb23709ba6", + }), + ("http://barkercomic.keenspot.com/", { # id + "range": "1-3", + "url": "c4080926db18d00bac641fdd708393b7d61379e6", + }), + ("http://crowscare.keenspot.com/", { # id v2 + "range": "1-3", + "url": "a00e66a133dd39005777317da90cef921466fcaa" + }), + ("http://supernovas.keenspot.com/", { # ks + "range": "1-3", + "url": "de21b12887ef31ff82edccbc09d112e3885c3aab" + }), + ("http://twokinds.keenspot.com/comic/1066/", { # "random" access + "range": "1-3", + "url": "97e2a6ed8ba1709314f2449f84b6b1ce5db21c04", + }) + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.comic = match.group(1).lower() + self.path = match.group(2) + self.root = "http://" + self.comic + ".keenspot.com" + + self._needle = "" + self._image = 'class="ksc"' + self._next = self._next_needle + + def items(self): + data = {"comic": self.comic} + yield Message.Version, 1 + yield Message.Directory, data + + url = self._first(self.request(self.root + "/").text) + if self.path: + url = self.root + self.path + + prev = None + ilen = len(self._image) + while url and url != prev: + prev = url + page = self.request(text.urljoin(self.root, url)).text + + pos = 0 + while True: + pos = page.find(self._image, pos) + if pos < 0: + break + img, pos = text.extract(page, 'src="', '"', pos + ilen) + if img.endswith(".js"): + continue + if img[0] == "/": + img = self.root + img + elif "youtube.com/" in img: + img = "ytdl:" + img + yield Message.Url, img, text.nameext_from_url(img, data) + + url = self._next(page) + + def _first(self, page): + if self.comic == "brawlinthefamily": + self._next = self._next_brawl + self._image = '
    ' + return "http://brawlinthefamily.keenspot.com/comic/theshowdown/" + + url = text.extract(page, '= 0: + self._next = self._next_id + return text.rextract(page, 'href="', '"', pos)[0] + + pos = page.find('>FIRST PAGE<') + if pos >= 0: + if self.comic == "lastblood": + self._next = self._next_lastblood + self._image = '
    ' + else: + self._next = self._next_id + return text.rextract(page, 'href="', '"', pos)[0] + + pos = page.find('
    = 0: + self._needle = 'First Comic<') # twokinds + if pos >= 0: + self._image = '' + self._needle = 'class="navarchive"' + return text.rextract(page, 'href="', '"', pos)[0] + + pos = page.find('id="flip_FirstDay"') # flipside + if pos >= 0: + self._image = 'class="flip_Pages ksc"' + self._needle = 'id="flip_ArcButton"' + return text.rextract(page, 'href="', '"', pos)[0] + + self.log.error("Unrecognized page layout") + return None + + def _next_needle(self, page): + pos = page.index(self._needle) + len(self._needle) + return text.extract(page, 'href="', '"', pos)[0] + + @staticmethod + def _next_link(page): + return text.extract(page, '= 0 else None + + @staticmethod + def _next_lastblood(page): + pos = page.index("link rel='next'") + return text.extract(page, "href='", "'", pos)[0] + + @staticmethod + def _next_brawl(page): + pos = page.index("comic-nav-next") + url = text.rextract(page, 'href="', '"', pos)[0] + return None if "?random" in url else url diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py new file mode 100644 index 0000000..c9e6959 --- /dev/null +++ b/gallery_dl/extractor/khinsider.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract soundtracks from https://downloads.khinsider.com/""" + +from .common import Extractor, Message, AsynchronousMixin +from .. import text, exception + + +class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): + """Extractor for soundtracks from khinsider.com""" + category = "khinsider" + subcategory = "soundtrack" + directory_fmt = ("{category}", "{album}") + archive_fmt = "{album}_{filename}.{extension}" + pattern = (r"(?:https?://)?downloads\.khinsider\.com" + r"/game-soundtracks/album/([^/?&#]+)") + test = (("https://downloads.khinsider.com" + "/game-soundtracks/album/horizon-riders-wii"), { + "pattern": r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+" + r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3", + "count": 1, + "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679", + }) + root = "https://downloads.khinsider.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.album = match.group(1) + + def items(self): + url = (self.root + "/game-soundtracks/album/" + self.album) + page = self.request(url, encoding="utf-8").text + data = self.get_job_metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + for url, track in self.get_album_tracks(page): + track.update(data) + yield Message.Url, url, track + + def get_job_metadata(self, page): + """Collect metadata for extractor-job""" + if "Download all songs at once:" not in page: + raise exception.NotFoundError("soundtrack") + data = text.extract_all(page, ( + ("album", "Album name: ", ""), + ("count", "Number of Files: ", ""), + ("size" , "Total Filesize: ", ""), + ("date" , "Date added: ", ""), + ("type" , "Album type: ", ""), + ))[0] + data["album"] = text.unescape(data["album"]) + return data + + def get_album_tracks(self, page): + """Collect url and metadata for all tracks of a soundtrack""" + page = text.extract(page, '', '
    ')[0] + for num, url in enumerate(text.extract_iter( + page, '
    ", "")[0].strip() + manga, cinfo = title.split("\n")[1:3] + data = { + "manga": manga.strip(), + "chapter_string": cinfo.strip(), + "chapter_id": text.parse_int(self.chapter_id), + "lang": "en", + "language": "English", + } + return self.parse_chapter_string(data) + + def images(self, page): + self.session.headers["Referer"] = None + try: + key = self.build_aes_key(page) + iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0, + 0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3) + return [ + (aes.aes_cbc_decrypt_text( + data, key, iv).partition("&")[0], None) + for data in text.extract_iter( + page, 'lstImages.push(wrapKA("', '"' + ) + ] + except UnicodeDecodeError: + self.log.error("Failed to decrypt image URLs") + except (ValueError, IndexError): + self.log.error("Failed to get AES key") + return [] + + def build_aes_key(self, page): + chko = self._chko_from_external_script() + + for script in self._scripts(page): + for stmt in [s.strip() for s in script.split(";")]: + + if stmt.startswith("var _"): + name, _, value = stmt[4:].partition(" = ") + name += "[0]" + value = ast.literal_eval(value)[0] + + elif stmt.startswith("chko = "): + stmt = stmt[7:] + if stmt == name: + chko = value + elif stmt == "chko + " + name: + chko = chko + value + elif stmt == name + " + chko": + chko = value + chko + else: + self.log.warning("unrecognized expression: '%s'", stmt) + + elif stmt.startswith("key = "): + pass + + else: + self.log.warning("unrecognized statement: '%s'", stmt) + + return list(hashlib.sha256(chko.encode("ascii")).digest()) + + @staticmethod + def _scripts(page): + end = 0 + while True: + pos = page.find("key = ", end) + if pos == -1: + return + beg = page.rindex('', pos) + yield page[beg:end] + + @cache(maxage=3600) + def _chko_from_external_script(self): + script = self.request(self.root + "/Scripts/lo.js").text + + pos = script.index("var chko") + var = text.extract(script, "=", "[", pos)[0].lstrip() + idx = text.extract(script, "[", "]", pos)[0] + + pos = script.index(var) + lst = text.extract(script, "=", ";", pos)[0] + return ast.literal_eval(lst.strip())[int(idx)] + + +class KissmangaMangaExtractor(KissmangaBase, MangaExtractor): + """Extractor for manga from kissmanga.com""" + chapterclass = KissmangaChapterExtractor + pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" + r"(/Manga/[^/?&#]+/?)$") + test = ( + ("https://kissmanga.com/Manga/Dropout", { + "url": "9e3a6f715b229aa3fafa42a1d5da5d65614cb532", + "keyword": "32b09711c28b481845acc32e3bb6054cfc90224d", + }), + ("https://kissmanga.com/manga/feng-shen-ji"), # lowercase + ) + + def chapters(self, page): + results = [] + manga, pos = text.extract(page, ' class="barTitle">', '\ninformation') + page , pos = text.extract(page, ' class="listing">', '', pos) + manga = manga.strip() + needle = '" title="Read ' + manga + ' ' + manga = text.unescape(manga) + + for item in text.extract_iter(page, ''): + url, _, chapter = item.partition(needle) + data = { + "manga": manga, "chapter_string": chapter, + "chapter_id": text.parse_int(url.rpartition("=")[2]), + "lang": "en", "language": "English", + } + self.parse_chapter_string(data) + results.append((self.root + url, data)) + return results diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py new file mode 100644 index 0000000..8541e4f --- /dev/null +++ b/gallery_dl/extractor/komikcast.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under 
the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://komikcast.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +import re + + +class KomikcastBase(): + """Base class for komikcast extractors""" + category = "komikcast" + root = "https://komikcast.com" + + @staticmethod + def parse_chapter_string(chapter_string, data=None): + """Parse 'chapter_string' value and add its info to 'data'""" + if not data: + data = {} + + match = re.match( + r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?", + text.unescape(chapter_string), + ) + manga, chapter, data["chapter_minor"], title = match.groups() + + if manga: + data["manga"] = manga.partition(" Chapter ")[0] + if title and title.lower() != "bahasa indonesia": + data["title"] = title.strip() + else: + data["title"] = "" + data["chapter"] = text.parse_int(chapter) + data["lang"] = "id" + data["language"] = "Indonesian" + + return data + + +class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): + """Extractor for manga-chapters from komikcast.com""" + pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?&#]+/)" + test = ( + (("https://komikcast.com/chapter/" + "apotheosis-chapter-02-2-bahasa-indonesia/"), { + "url": "f6b43fbc027697749b3ea1c14931c83f878d7936", + "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", + }), + (("https://komikcast.com/chapter/" + "tonari-no-kashiwagi-san-chapter-18b/"), { + "url": "aff90dd21dbb945a726778b10bdef522af7c42fe", + "keyword": "19b5783864c4299913de436513b124b028b557c1", + }), + (("https://komikcast.com/chapter/090-eko-to-issho-chapter-1/"), { + "url": "cda104a32ea2b06b3d6b096726622f519ed1fa33", + }), + ) + + def metadata(self, page): + info = text.extract(page, '', "")[0] + return self.parse_chapter_string(info) + + @staticmethod + def images(page): + readerarea = text.extract( + page, '
    ', '')), + } + + def images(self, page): + iframe = self.root + "/iframe_image.php?id=" + backend = self.root + "/backend.php" + for image_id in text.extract_iter(page, 'data-img-id="', '"'): + spirit = text.extract(self.request( + iframe + image_id).text, 'giraffe.annihilate("', '"')[0] + params = {"spirit": self._annihilate(spirit), "photo": image_id} + data = self.request(backend, params=params).json() + yield data[0], { + "id" : text.parse_int(image_id), + "width" : text.parse_int(data[1]), + "height": text.parse_int(data[2]), + } + + @staticmethod + def _annihilate(value, base=6): + # undo the site's per-character XOR obfuscation of the 'spirit' value + return "".join( + chr(ord(char) ^ base) + for char in value + ) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py new file mode 100644 index 0000000..e26eae1 --- /dev/null +++ b/gallery_dl/extractor/oauth.py @@ -0,0 +1,375 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Utility classes to set up OAuth and link a user's account to gallery-dl""" + +from .common import Extractor, Message +from . import deviantart, flickr, reddit, smugmug, tumblr +from .. import text, oauth, config, exception +from ..cache import cache +import os +import urllib.parse + + +class OAuthBase(Extractor): + """Base class for OAuth Helpers""" + category = "oauth" + redirect_uri = "http://localhost:6414/" + + def __init__(self, match): + Extractor.__init__(self, match) + self.client = None + + def oauth_config(self, key, default=None): + return config.interpolate( + ("extractor", self.subcategory, key), default) + + def recv(self): + """Open local HTTP server and recv callback parameters""" + import socket + print("Waiting for response. (Cancel with Ctrl+c)") + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind(("localhost", 6414)) + server.listen(1) + + # workaround for ctrl+c not working during server.accept on Windows + if os.name == "nt": + server.settimeout(1.0) + while True: + try: + self.client = server.accept()[0] + break + except socket.timeout: + pass + server.close() + + data = self.client.recv(1024).decode() + path = data.split(" ", 2)[1] + return text.parse_query(path.partition("?")[2]) + + def send(self, msg): + """Send 'msg' to the socket opened in 'recv()'""" + print(msg) + self.client.send(b"HTTP/1.1 200 OK\r\n\r\n" + msg.encode()) + self.client.close() + + def open(self, url, params): + """Open 'url' in browser and return response parameters""" + import webbrowser + url += "?" 
+ urllib.parse.urlencode(params) + if not self.config("browser", True) or not webbrowser.open(url): + print("Please open this URL in your browser:") + print(url, end="\n\n", flush=True) + return self.recv() + + def _oauth1_authorization_flow( + self, request_token_url, authorize_url, access_token_url): + """Perform the OAuth 1.0a authorization flow""" + # get a request token + params = {"oauth_callback": self.redirect_uri} + data = self.session.get(request_token_url, params=params).text + + data = text.parse_query(data) + self.session.auth.token_secret = data["oauth_token_secret"] + + # get the user's authorization + params = {"oauth_token": data["oauth_token"], "perms": "read"} + data = self.open(authorize_url, params) + + # exchange the request token for an access token + data = self.session.get(access_token_url, params=data).text + + data = text.parse_query(data) + self.send(OAUTH1_MSG_TEMPLATE.format( + category=self.subcategory, + token=data["oauth_token"], + token_secret=data["oauth_token_secret"], + )) + + def _oauth2_authorization_code_grant( + self, client_id, client_secret, auth_url, token_url, + scope="read", key="refresh_token", auth=True, + message_template=None): + """Perform an OAuth2 authorization code grant""" + + state = "gallery-dl_{}_{}".format( + self.subcategory, + oauth.nonce(8), + ) + + auth_params = { + "client_id": client_id, + "response_type": "code", + "state": state, + "redirect_uri": self.redirect_uri, + "duration": "permanent", + "scope": scope, + } + + # receive an authorization code + params = self.open(auth_url, auth_params) + + # check authorization response + if state != params.get("state"): + self.send("'state' mismatch: expected {}, got {}.".format( + state, params.get("state") + )) + return + if "error" in params: + self.send(params["error"]) + return + + # exchange the authorization code for a token + data = { + "grant_type": "authorization_code", + "code": params["code"], + "redirect_uri": self.redirect_uri, + } + + if auth: + auth = (client_id, client_secret) + else: + auth = None + data["client_id"] = client_id + data["client_secret"] = client_secret + + data = self.session.post(token_url, data=data, auth=auth).json() + + # check token response + if "error" in data: + self.send(data["error"]) + return + + # display token + part = key.partition("_")[0] + template = message_template or OAUTH2_MSG_TEMPLATE + self.send(template.format( + category=self.subcategory, + key=part, + Key=part.capitalize(), + token=data[key], + instance=getattr(self, "instance", ""), + client_id=client_id, + client_secret=client_secret, + )) + + +class OAuthDeviantart(OAuthBase): + subcategory = "deviantart" + pattern = "oauth:deviantart$" + redirect_uri = "https://mikf.github.io/gallery-dl/oauth-redirect.html" + + def items(self): + yield Message.Version, 1 + + self._oauth2_authorization_code_grant( + self.oauth_config( + "client-id", deviantart.DeviantartAPI.CLIENT_ID), + self.oauth_config( + "client-secret", deviantart.DeviantartAPI.CLIENT_SECRET), + "https://www.deviantart.com/oauth2/authorize", + "https://www.deviantart.com/oauth2/token", + scope="browse", + ) + + +class OAuthFlickr(OAuthBase): + subcategory = "flickr" + pattern = "oauth:flickr$" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.session = oauth.OAuth1Session( + self.oauth_config("api-key", flickr.FlickrAPI.API_KEY), + self.oauth_config("api-secret", flickr.FlickrAPI.API_SECRET), + ) + + def items(self): + yield Message.Version, 1 + + self._oauth1_authorization_flow( + 
"https://www.flickr.com/services/oauth/request_token", + "https://www.flickr.com/services/oauth/authorize", + "https://www.flickr.com/services/oauth/access_token", + ) + + +class OAuthReddit(OAuthBase): + subcategory = "reddit" + pattern = "oauth:reddit$" + + def items(self): + yield Message.Version, 1 + + self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT + self._oauth2_authorization_code_grant( + self.oauth_config("client-id", reddit.RedditAPI.CLIENT_ID), + "", + "https://www.reddit.com/api/v1/authorize", + "https://www.reddit.com/api/v1/access_token", + scope="read", + ) + + +class OAuthSmugmug(OAuthBase): + subcategory = "smugmug" + pattern = "oauth:smugmug$" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.session = oauth.OAuth1Session( + self.oauth_config("api-key", smugmug.SmugmugAPI.API_KEY), + self.oauth_config("api-secret", smugmug.SmugmugAPI.API_SECRET), + ) + + def items(self): + yield Message.Version, 1 + + self._oauth1_authorization_flow( + "https://api.smugmug.com/services/oauth/1.0a/getRequestToken", + "https://api.smugmug.com/services/oauth/1.0a/authorize", + "https://api.smugmug.com/services/oauth/1.0a/getAccessToken", + ) + + +class OAuthTumblr(OAuthBase): + subcategory = "tumblr" + pattern = "oauth:tumblr$" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.session = oauth.OAuth1Session( + self.oauth_config("api-key", tumblr.TumblrAPI.API_KEY), + self.oauth_config("api-secret", tumblr.TumblrAPI.API_SECRET), + ) + + def items(self): + yield Message.Version, 1 + + self._oauth1_authorization_flow( + "https://www.tumblr.com/oauth/request_token", + "https://www.tumblr.com/oauth/authorize", + "https://www.tumblr.com/oauth/access_token", + ) + + +class OAuthMastodon(OAuthBase): + subcategory = "mastodon" + pattern = "oauth:mastodon:(?:https?://)?([^/?&#]+)" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.instance = match.group(1) + + def items(self): + yield Message.Version, 1 + + application = self.oauth_config(self.instance) + if not application: + application = self._register(self.instance) + + self._oauth2_authorization_code_grant( + application["client-id"], + application["client-secret"], + "https://{}/oauth/authorize".format(self.instance), + "https://{}/oauth/token".format(self.instance), + key="access_token", + message_template=MASTODON_MSG_TEMPLATE, + ) + + @cache(maxage=10*365*24*3600, keyarg=1) + def _register(self, instance): + self.log.info("Registering application for '%s'", instance) + + url = "https://{}/api/v1/apps".format(instance) + data = { + "client_name": "gdl:" + oauth.nonce(8), + "redirect_uris": self.redirect_uri, + "scopes": "read", + } + data = self.session.post(url, data=data).json() + + if "client_id" not in data or "client_secret" not in data: + self.log.error("Failed to register new application: '%s'", data) + raise exception.StopExtraction() + + data["client-id"] = data.pop("client_id") + data["client-secret"] = data.pop("client_secret") + + self.log.info("client-id:\n%s", data["client-id"]) + self.log.info("client-secret:\n%s", data["client-secret"]) + + return data + + +OAUTH1_MSG_TEMPLATE = """ +Your Access Token and Access Token Secret are + +{token} +{token_secret} + +Put these values into your configuration file as +'extractor.{category}.access-token' and +'extractor.{category}.access-token-secret'. 
+ +Example: +{{ + "extractor": {{ + "{category}": {{ + "access-token": "{token}", + "access-token-secret": "{token_secret}" + }} + }} +}} +""" + + +OAUTH2_MSG_TEMPLATE = """ +Your {Key} Token is + +{token} + +Put this value into your configuration file as +'extractor.{category}.{key}-token'. + +Example: +{{ + "extractor": {{ + "{category}": {{ + "{key}-token": "{token}" + }} + }} +}} +""" + + +MASTODON_MSG_TEMPLATE = """ +Your {Key} Token is + +{token} + +Put this value into your configuration file as +'extractor.mastodon.{instance}.{key}-token'. + +You can also add your 'client-id' and 'client-secret' values +if you want to register another account in the future. + +Example: +{{ + "extractor": {{ + "mastodon": {{ + "{instance}": {{ + "{key}-token": "{token}", + "client-id": "{client_id}", + "client-secret": "{client_secret}" + }} + }} + }} +}} +""" diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py new file mode 100644 index 0000000..a4731d0 --- /dev/null +++ b/gallery_dl/extractor/paheal.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://rule34.paheal.net/""" + +from .common import Extractor, Message, SharedConfigMixin +from .. import text + + +class PahealExtractor(SharedConfigMixin, Extractor): + """Base class for paheal extractors""" + basecategory = "booru" + category = "paheal" + filename_fmt = "{category}_{id}_{md5}.{extension}" + archive_fmt = "{id}" + root = "https://rule34.paheal.net" + + def items(self): + yield Message.Version, 1 + yield Message.Directory, self.get_metadata() + + for data in self.get_posts(): + url = data["file_url"] + for key in ("id", "width", "height"): + data[key] = text.parse_int(data[key]) + data["tags"] = text.unquote(data["tags"]) + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_metadata(self): + """Return general metadata""" + return {} + + def get_posts(self): + """Return an iterable containing data of all relevant posts""" + + +class PahealTagExtractor(PahealExtractor): + """Extractor for images from rule34.paheal.net by search-tags""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" + r"/post/list/([^/?&#]+)") + test = ("https://rule34.paheal.net/post/list/k-on/1", { + "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", + "count": ">= 15" + }) + per_page = 70 + + def __init__(self, match): + PahealExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1)) + + def get_metadata(self): + return {"search_tags": self.tags} + + def get_posts(self): + pnum = 1 + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + page = self.request(url).text + + for post in text.extract_iter( + page, 'Next<" not in page: + return + pnum += 1 + + @staticmethod + def _extract_data(post): + pid , pos = text.extract(post, '', '"') + data, pos = text.extract(post, 'title="', '"', pos) + md5 , pos = text.extract(post, '/_thumbs/', '/', pos) + url , pos = text.extract(post, '= 25", + "keyword": { + "attachments": list, + "comment_count": int, + "content": str, + "creator": dict, + "date": "type:datetime", + "id": int, + "like_count": int, + "post_type": str, + "published_at": str, + "title": str, + }, + }) + + def __init__(self, 
match): + PatreonExtractor.__init__(self, match) + self.creator = match.group(1).lower() + + def posts(self): + url = "{}/{}".format(self.root, self.creator) + page = self.request(url).text + campaign_id = text.extract(page, "/campaign/", "/")[0] + + url = self._build_url("posts", ( + "&sort=-published_at" + "&filter[is_draft]=false" + "&filter[contains_exclusive_posts]=true" + "&filter[campaign_id]=" + campaign_id + )) + return self._pagination(url) + + +class PatreonUserExtractor(PatreonExtractor): + """Extractor for media from creators supported by you""" + subcategory = "user" + pattern = r"(?:https?://)?(?:www\.)?patreon\.com/home$" + test = ("https://www.patreon.com/home",) + + def posts(self): + url = self._build_url("stream", ( + "&page[cursor]=null" + "&filter[is_following]=true" + )) + return self._pagination(url) diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py new file mode 100644 index 0000000..83f75a3 --- /dev/null +++ b/gallery_dl/extractor/photobucket.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://photobucket.com/""" + +from .common import Extractor, Message +from .. import text, exception +import base64 +import json + + +class PhotobucketAlbumExtractor(Extractor): + """Extractor for albums on photobucket.com""" + category = "photobucket" + subcategory = "album" + directory_fmt = ("{category}", "{username}", "{location}") + filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}" + archive_fmt = "{id}" + pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)" + r"/user/[^/?&#]+/library/[^?&#]*") + test = ( + ("https://s258.photobucket.com/user/focolandia/library/", { + "pattern": r"https?://[oi]+\d+.photobucket.com/albums/hh280/", + "count": ">= 39" + }), + # subalbums of main "directory" + ("https://s271.photobucket.com/user/lakerfanryan/library/", { + "options": (("image-filter", "False"),), + "pattern": pattern, + "count": 1, + }), + # subalbums of subalbum without images + ("https://s271.photobucket.com/user/lakerfanryan/library/Basketball", { + "pattern": pattern, + "count": ">= 9", + }), + # private (missing JSON data) + ("https://s1277.photobucket.com/user/sinisterkat44/library/", { + "count": 0, + }), + ("https://s1110.photobucket.com/user/chndrmhn100/library/" + "Chandu%20is%20the%20King?sort=3&page=1"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.album_path = "" + self.root = "https://" + match.group(1) + self.session.headers["Referer"] = self.url + + def items(self): + yield Message.Version, 1 + for image in self.images(): + image["titleOrFilename"] = text.unescape(image["titleOrFilename"]) + image["title"] = text.unescape(image["title"]) + image["extension"] = image["ext"] + yield Message.Directory, image + yield Message.Url, image["fullsizeUrl"], image + + if self.config("subalbums", True): + for album in self.subalbums(): + album["_extractor"] = PhotobucketAlbumExtractor + yield Message.Queue, album["url"], album + + def images(self): + """Yield all images of the current album""" + url = self.url + params = {"sort": "3", "page": 1} + + while True: + page = self.request(url, params=params).text + json_data = text.extract(page, "collectionData:", ",\n")[0] + if not json_data: + msg = text.extract(page, 
'libraryPrivacyBlock">', "</div>
    ")[0] + msg = ' ("{}")'.format(text.remove_html(msg)) if msg else "" + self.log.error("Unable to get JSON data%s", msg) + return + data = json.loads(json_data) + + yield from data["items"]["objects"] + + if data["total"] <= data["offset"] + data["pageSize"]: + self.album_path = data["currentAlbumPath"] + return + params["page"] += 1 + + def subalbums(self): + """Return all subalbum objects""" + url = self.root + "/component/Albums-SubalbumList" + params = { + "albumPath": self.album_path, + "fetchSubAlbumsOnly": "true", + "deferCollapsed": "true", + "json": "1", + } + + data = self.request(url, params=params).json() + return data["body"].get("subAlbums", ()) + + +class PhotobucketImageExtractor(Extractor): + """Extractor for individual images from photobucket.com""" + category = "photobucket" + subcategory = "image" + directory_fmt = ("{category}", "{username}") + filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}" + archive_fmt = "{username}_{id}" + pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com" + r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)" + r"|/user/([^/?&#]+)/media/[^?&#]+\.html)") + test = ( + (("https://s271.photobucket.com/user/lakerfanryan" + "/media/Untitled-3-1.jpg.html"), { + "url": "3b647deeaffc184cc48c89945f67574559c9051f", + "keyword": "a2de4e60d584912537b8025c01bdd1d20bdea735", + }), + (("https://s271.photobucket.com/user/lakerfanryan" + "/media/IsotopeswBros.jpg.html?sort=3&o=2"), { + "url": "12c1890c09c9cdb8a88fba7eec13f324796a8d7b", + "keyword": "61200a223df6c06f45ac3d30c88b3f5b048ce9a8", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) or match.group(3) + self.media_id = match.group(2) + self.session.headers["Referer"] = self.url + + def items(self): + url = "https://photobucket.com/galleryd/search.php" + params = {"userName": self.user, "searchTerm": "", "ref": ""} + + if self.media_id: + params["mediaId"] = self.media_id + else: + params["url"] = self.url + + # retry API call up to 5 times, since it can randomly fail + tries = 0 + while tries < 5: + data = self.request(url, method="POST", params=params).json() + image = data["mediaDocuments"] + if "message" not in image: + break # success + tries += 1 + self.log.debug("'%s'", image["message"]) + else: + self.log.error("%s", image["message"]) + raise exception.StopExtraction() + + # adjust metadata entries to be at least somewhat similar + # to what the 'album' extractor provides + if "media" in image: + image = image["media"][image["mediaIndex"]] + image["albumView"] = data["mediaDocuments"]["albumView"] + image["username"] = image["ownerId"] + else: + image["fileUrl"] = image.pop("imageUrl") + + image.setdefault("title", "") + image.setdefault("description", "") + name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".") + image["ext"] = image["extension"] = ext + image["titleOrFilename"] = image["title"] or name + image["tags"] = image.pop("clarifaiTagList", []) + + mtype, _, mid = base64.b64decode(image["id"]).partition(b":") + image["pictureId"] = mid.decode() if mtype == b"mediaId" else "" + + yield Message.Version, 1 + yield Message.Directory, image + yield Message.Url, image["fileUrl"], image diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py new file mode 100644 index 0000000..6a5c41c --- /dev/null +++ b/gallery_dl/extractor/piczel.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it 
under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://piczel.tv/""" + +from .common import Extractor, Message +from .. import text + + +class PiczelExtractor(Extractor): + """Base class for piczel extractors""" + category = "piczel" + directory_fmt = ("{category}", "{user[username]}") + filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + root = "https://piczel.tv" + api_root = "https://apollo.piczel.tv" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item_id = match.group(1) + + def items(self): + first = True + yield Message.Version, 1 + for image in self.unpack(self.get_images()): + if first: + yield Message.Directory, image + first = False + path = image["image"]["image"]["url"] + url = "{}/static/{}".format(self.api_root, path) + yield Message.Url, url, text.nameext_from_url(url, image) + + @staticmethod + def unpack(images): + """Unpack 'images' into individual image objects""" + for image in images: + if image["multi"]: + multi = image["images"] + del image["images"] + for image["num"], img in enumerate(multi): + image["image"] = img + yield image + else: + image["num"] = 0 + yield image + + def get_images(self): + """Return an iterable with all relevant image objects""" + + +class PiczelUserExtractor(PiczelExtractor): + """Extractor for all images from a user's gallery""" + subcategory = "user" + pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$" + test = ("https://piczel.tv/gallery/Lulena", { + "count": ">= 13", + }) + + def get_images(self): + url = "{}/api/users/{}/gallery".format(self.api_root, self.item_id) + return self.request(url).json() + + +class PiczelFolderExtractor(PiczelExtractor): + """Extractor for images inside a user's folder""" + subcategory = "folder" + directory_fmt = ("{category}", "{user[username]}", "{folder[name]}") + archive_fmt = "f{folder[id]}_{id}_{num}" + pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv" + r"/gallery/(?!image)[^/?&#]+/(\d+)") + test = ("https://piczel.tv/gallery/Lulena/1114", { + "count": ">= 4", + }) + + def get_images(self): + url = "{}/api/gallery/folder/{}".format(self.api_root, self.item_id) + images = self.request(url).json() + images.reverse() + return images + + +class PiczelImageExtractor(PiczelExtractor): + """Extractor for individual images""" + subcategory = "image" + pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)" + test = ("https://piczel.tv/gallery/image/7807", { + "url": "9b9e416b6ab7e58676fab84453d5028f306ece34", + "content": "df9a053a24234474a19bce2b7e27e0dec23bff87", + "keyword": { + "created_at": "2018-07-22T05:13:58.000Z", + "description": None, + "extension": "png", + "favorites_count": int, + "folder": dict, + "folder_id": 1113, + "id": 7807, + "is_flash": False, + "is_video": False, + "multi": False, + "nsfw": False, + "num": 0, + "password_protected": False, + "tags": "fanart, commission, altair, recreators, ", + "title": "Altair", + "user": dict, + "views": int, + }, + }) + + def get_images(self): + url = "{}/api/gallery/image/{}".format(self.api_root, self.item_id) + return (self.request(url).json(),) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py new file mode 100644 index 0000000..fa8cd48 --- /dev/null +++ b/gallery_dl/extractor/pinterest.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it 
and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.pinterest.com/""" + +from .common import Extractor, Message +from .. import text, exception +import json + + +BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.\w+" + + +class PinterestExtractor(Extractor): + """Base class for pinterest extractors""" + category = "pinterest" + filename_fmt = "{category}_{id}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = PinterestAPI(self) + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + + for pin in self.pins(): + if "images" in pin: + url, pin_data = self.data_from_pin(pin) + pin_data.update(data) + yield Message.Url, url, pin_data + + def metadata(self): + """Return general metadata""" + + def pins(self): + """Return all relevant pin-objects""" + + @staticmethod + def data_from_pin(pin): + """Get image url and metadata from a pin-object""" + img = pin["images"]["orig"] + url = img["url"] + pin["width"] = img["width"] + pin["height"] = img["height"] + return url, text.nameext_from_url(url, pin) + + +class PinterestPinExtractor(PinterestExtractor): + """Extractor for images from a single pin from pinterest.com""" + subcategory = "pin" + pattern = BASE_PATTERN + r"/pin/([^/?#&]+)(?!.*#related$)" + test = ( + ("https://www.pinterest.com/pin/858146903966145189/", { + "url": "afb3c26719e3a530bb0e871c480882a801a4e8a5", + # image version depends on CDN server used + # "content": "d3e24bc9f7af585e8c23b9136956bd45a4d9b947", + # "content": "4c435a66f6bb82bb681db2ecc888f76cf6c5f9ca", + }), + ("https://www.pinterest.com/pin/858146903966145188/", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.pin_id = match.group(1) + self.pin = None + + def metadata(self): + self.pin = self.api.pin(self.pin_id) + return self.data_from_pin(self.pin)[1] + + def pins(self): + return (self.pin,) + + +class PinterestBoardExtractor(PinterestExtractor): + """Extractor for images from a board from pinterest.com""" + subcategory = "board" + directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") + archive_fmt = "{board[id]}_{id}" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)(?!.*#related$)" + test = ( + ("https://www.pinterest.com/g1952849/test-/", { + "pattern": r"https://i\.pinimg\.com/originals/", + "count": 2, + }), + ("https://www.pinterest.com/g1952848/test/", { + "exception": exception.GalleryDLException, + }), + ) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.user = text.unquote(match.group(1)) + self.board = text.unquote(match.group(2)) + self.board_id = 0 + + def metadata(self): + board = self.api.board(self.user, self.board) + self.board_id = board["id"] + return {"board": board} + + def pins(self): + return self.api.board_pins(self.board_id) + + +class PinterestRelatedPinExtractor(PinterestPinExtractor): + """Extractor for related pins of another pin from pinterest.com""" + subcategory = "related-pin" + directory_fmt = ("{category}", "related {original_pin[id]}") + pattern = BASE_PATTERN + r"/pin/([^/?#&]+).*#related$" + test = ("https://www.pinterest.com/pin/858146903966145189/#related", { + "range": "31-50", + "count": 20, + }) + + def metadata(self): + pin = self.api.pin(self.pin_id) + return {"original_pin": self.data_from_pin(pin)[1]} 
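A minimal, self-contained sketch (not part of the upstream diff) of why two patterns exist for the same /pin/ URL: the trailing '#related' fragment decides whether the single-pin or the related-pin extractor claims it. BASE, PIN and RELATED are illustrative local names; the regex strings are copied from the classes above:

import re

BASE = r"(?:https?://)?(?:\w+\.)?pinterest\.\w+"
PIN = re.compile(BASE + r"/pin/([^/?#&]+)(?!.*#related$)")
RELATED = re.compile(BASE + r"/pin/([^/?#&]+).*#related$")

url = "https://www.pinterest.com/pin/858146903966145189/#related"
assert RELATED.match(url) and not PIN.match(url)   # '#related' routes here
assert PIN.match(url[:-len("#related")])           # plain pin URL does not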
+ + def pins(self): + return self.api.pin_related(self.pin_id) + + +class PinterestRelatedBoardExtractor(PinterestBoardExtractor): + """Extractor for related pins of a board from pinterest.com""" + subcategory = "related-board" + directory_fmt = ("{category}", "{board[owner][username]}", + "{board[name]}", "related") + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+).*#related$" + test = ("https://www.pinterest.com/g1952849/test-/#related", { + "range": "31-50", + "count": 20, + }) + + def pins(self): + return self.api.board_related(self.board_id) + + +class PinterestPinitExtractor(PinterestExtractor): + """Extractor for images from a pin.it URL""" + subcategory = "pinit" + pattern = r"(?:https?://)?pin\.it/([^/?#&]+)" + + test = ( + ("https://pin.it/Hvt8hgT", { + "url": "8daad8558382c68f0868bdbd17d05205184632fa", + }), + ("https://pin.it/Hvt8hgS", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.shortened_id = match.group(1) + + def items(self): + url = "https://api.pinterest.com/url_shortener/{}/redirect".format( + self.shortened_id) + response = self.request(url, method="HEAD", allow_redirects=False) + location = response.headers.get("Location") + if not location or location in ("https://api.pinterest.com/None", + "https://pin.it/None", + "https://www.pinterest.com"): + raise exception.NotFoundError("pin") + yield Message.Queue, location, {} + + +class PinterestAPI(): + """Minimal interface for the Pinterest Web API + + For a better and more complete implementation in PHP, see + - https://github.com/seregazhuk/php-pinterest-bot + """ + + BASE_URL = "https://www.pinterest.com" + HEADERS = { + "Accept" : "application/json, text/javascript, " + "*/*, q=0.01", + "Accept-Language" : "en-US,en;q=0.5", + "X-Pinterest-AppState": "active", + "X-APP-VERSION" : "cb1c7f9", + "X-Requested-With" : "XMLHttpRequest", + "Origin" : BASE_URL + "/", + } + + def __init__(self, extractor): + self.extractor = extractor + + def pin(self, pin_id): + """Query information about a pin""" + options = {"id": pin_id, "field_set_key": "detailed"} + return self._call("Pin", options)["resource_response"]["data"] + + def pin_related(self, pin_id): + """Yield related pins of another pin""" + options = {"pin": pin_id, "add_vase": True, "pins_only": True} + return self._pagination("RelatedPinFeed", options) + + def board(self, user, board): + """Query information about a board""" + options = {"slug": board, "username": user, + "field_set_key": "detailed"} + return self._call("Board", options)["resource_response"]["data"] + + def board_pins(self, board_id): + """Yield all pins of a specific board""" + options = {"board_id": board_id} + return self._pagination("BoardFeed", options) + + def board_related(self, board_id): + """Yield related pins of a specific board""" + options = {"board_id": board_id, "add_vase": True} + return self._pagination("BoardRelatedPixieFeed", options) + + def _call(self, resource, options): + url = "{}/resource/{}Resource/get/".format(self.BASE_URL, resource) + params = {"data": json.dumps({"options": options}), "source_url": ""} + + response = self.extractor.request( + url, params=params, headers=self.HEADERS, expect=range(400, 500)) + + try: + data = response.json() + except ValueError: + data = {} + + if 200 <= response.status_code < 400 and not response.history: + return data + + if response.status_code == 404 or response.history: + resource = self.extractor.subcategory.rpartition("-")[2] + raise 
exception.NotFoundError(resource) + self.extractor.log.error("API request failed") + self.extractor.log.debug("%s", response.text) + raise exception.StopExtraction() + + def _pagination(self, resource, options): + while True: + data = self._call(resource, options) + yield from data["resource_response"]["data"] + + try: + bookmarks = data["resource"]["options"]["bookmarks"] + if (not bookmarks or bookmarks[0] == "-end-" or + bookmarks[0].startswith("Y2JOb25lO")): + return + options["bookmarks"] = bookmarks + except KeyError: + return diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py new file mode 100644 index 0000000..af29c4b --- /dev/null +++ b/gallery_dl/extractor/pixiv.py @@ -0,0 +1,517 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images and ugoira from https://www.pixiv.net/""" + +from .common import Extractor, Message +from .. import text, exception +from ..cache import cache +from datetime import datetime, timedelta + + +class PixivExtractor(Extractor): + """Base class for pixiv extractors""" + category = "pixiv" + directory_fmt = ("{category}", "{user[id]} {user[account]}") + filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}" + archive_fmt = "{id}{num}.{extension}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = PixivAppAPI(self) + self.user_id = -1 + self.load_ugoira = self.config("ugoira", True) + + def items(self): + metadata = self.get_metadata() + yield Message.Version, 1 + + for work in self.works(): + if not work["user"]["id"]: + continue + + meta_single_page = work["meta_single_page"] + meta_pages = work["meta_pages"] + del work["meta_single_page"] + del work["image_urls"] + del work["meta_pages"] + work["num"] = "" + work["tags"] = [tag["name"] for tag in work["tags"]] + work["date"] = text.parse_datetime(work["create_date"]) + work.update(metadata) + + yield Message.Directory, work + + if work["type"] == "ugoira": + if not self.load_ugoira: + continue + ugoira = self.api.ugoira_metadata(work["id"]) + + url = ugoira["zip_urls"]["medium"].replace( + "_ugoira600x600", "_ugoira1920x1080") + work["frames"] = ugoira["frames"] + work["extension"] = "zip" + yield Message.Url, url, work + + elif work["page_count"] == 1: + url = meta_single_page["original_image_url"] + work["extension"] = url.rpartition(".")[2] + yield Message.Url, url, work + + else: + for num, img in enumerate(meta_pages): + url = img["image_urls"]["original"] + work["num"] = "_p{:02}".format(num) + work["extension"] = url.rpartition(".")[2] + yield Message.Url, url, work + + def works(self): + """Return an iterable containing all relevant 'work'-objects""" + + def get_metadata(self, user=None): + """Collect metadata for extractor-job""" + if not user: + user = self.api.user_detail(self.user_id) + return {"user": user} + + +class PixivUserExtractor(PixivExtractor): + """Extractor for works of a pixiv-user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/" + r"(?:member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?" 
+ r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))") + test = ( + ("http://www.pixiv.net/member_illust.php?id=173530", { + "url": "852c31ad83b6840bacbce824d85f2a997889efb7", + }), + # illusts with specific tag + (("https://www.pixiv.net/member_illust.php?id=173530" + "&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), { + "url": "25b1cd81153a8ff82eec440dd9f20a4a22079658", + }), + ("http://www.pixiv.net/member_illust.php?id=173531", { + "exception": exception.NotFoundError, + }), + ("https://www.pixiv.net/u/173530"), + ("https://www.pixiv.net/user/173530"), + ("https://www.pixiv.net/mypage.php#id=173530"), + ("https://www.pixiv.net/#id=173530"), + ("https://touch.pixiv.net/member_illust.php?id=173530"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.user_id = match.group(1) or match.group(3) + self.query = text.parse_query(match.group(2)) + + def works(self): + works = self.api.user_illusts(self.user_id) + + if "tag" in self.query: + tag = text.unquote(self.query["tag"]).lower() + works = ( + work for work in works + if tag in [t["name"].lower() for t in work["tags"]] + ) + + return works + + +class PixivMeExtractor(PixivExtractor): + """Extractor for pixiv.me URLs""" + subcategory = "me" + pattern = r"(?:https?://)?pixiv\.me/([^/?&#]+)" + test = ( + ("https://pixiv.me/del_shannon", { + "url": "0b1a18c3e3553c44ee6e0ccc36a7fd906c498e8f", + }), + ("https://pixiv.me/del_shanno", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.account = match.group(1) + + def items(self): + url = "https://pixiv.me/" + self.account + response = self.request( + url, method="HEAD", allow_redirects=False, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError("user") + yield Message.Version, 1 + yield Message.Queue, response.headers["Location"], {} + + +class PixivWorkExtractor(PixivExtractor): + """Extractor for a single pixiv work/illustration""" + subcategory = "work" + pattern = (r"(?:https?://)?(?:(?:www\.|touch\.)?pixiv\.net" + r"/member(?:_illust)?\.php\?(?:[^&]+&)*illust_id=(\d+)" + r"|(?:i(?:\d+\.pixiv|\.pximg)\.net" + r"/(?:(?:.*/)?img-[^/]+/img/\d{4}(?:/\d\d){5}|img\d+/img/[^/]+)" + r"|img\d*\.pixiv\.net/img/[^/]+|(?:www\.)?pixiv\.net/i)/(\d+))") + test = ( + (("http://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=966412"), { + "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba", + "content": "69a8edfb717400d1c2e146ab2b30d2c235440c5a", + }), + (("http://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=966411"), { + "exception": exception.NotFoundError, + }), + # ugoira + (("https://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=66806629"), { + "url": "7267695a985c4db8759bebcf8d21dbdd2d2317ef", + "keywords": {"frames": list}, + }), + ("http://i1.pixiv.net/c/600x600/img-master" + "/img/2008/06/13/00/29/13/966412_p0_master1200.jpg"), + ("https://i.pximg.net/img-original" + "/img/2017/04/25/07/33/29/62568267_p0.png"), + ("https://www.pixiv.net/i/966412"), + ("http://img.pixiv.net/img/soundcross/42626136.jpg"), + ("http://i2.pixiv.net/img76/img/snailrin/42672235.jpg"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.illust_id = match.group(1) or match.group(2) + self.load_ugoira = True + self.work = None + + def works(self): + return (self.work,) + + def get_metadata(self, user=None): + self.work = self.api.illust_detail(self.illust_id) + return PixivExtractor.get_metadata(self, self.work["user"]) + + +class 
PixivFavoriteExtractor(PixivExtractor): + """Extractor for all favorites/bookmarks of a pixiv-user""" + subcategory = "favorite" + directory_fmt = ("{category}", "bookmarks", + "{user_bookmark[id]} {user_bookmark[account]}") + archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/bookmark\.php(?:\?([^#]*))?") + test = ( + ("https://www.pixiv.net/bookmark.php?id=173530", { + "url": "e717eb511500f2fa3497aaee796a468ecf685cc4", + }), + # bookmarks with specific tag + (("https://www.pixiv.net/bookmark.php?id=3137110" + "&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), { + "count": 2, + }), + # own bookmarks + ("https://www.pixiv.net/bookmark.php", { + "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba", + }), + # touch URLs + ("https://touch.pixiv.net/bookmark.php?id=173530"), + ("https://touch.pixiv.net/bookmark.php"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.query = text.parse_query(match.group(1)) + if "id" not in self.query: + self.subcategory = "bookmark" + + def works(self): + tag = None + restrict = "public" + + if "tag" in self.query: + tag = text.unquote(self.query["tag"]) + if "rest" in self.query and self.query["rest"] == "hide": + restrict = "private" + + return self.api.user_bookmarks_illust(self.user_id, tag, restrict) + + def get_metadata(self, user=None): + if "id" in self.query: + user = self.api.user_detail(self.query["id"]) + else: + self.api.login() + user = self.api.user + + self.user_id = user["id"] + return {"user_bookmark": user} + + +class PixivRankingExtractor(PixivExtractor): + """Extractor for pixiv ranking pages""" + subcategory = "ranking" + archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}" + directory_fmt = ("{category}", "rankings", + "{ranking[mode]}", "{ranking[date]}") + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/ranking\.php(?:\?([^#]*))?") + test = ( + ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"), + ("https://www.pixiv.net/ranking.php"), + ("https://touch.pixiv.net/ranking.php"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.query = match.group(1) + self.mode = self.date = None + + def works(self): + return self.api.illust_ranking(self.mode, self.date) + + def get_metadata(self, user=None): + query = text.parse_query(self.query) + + mode = query.get("mode", "daily").lower() + mode_map = { + "daily": "day", + "daily_r18": "day_r18", + "weekly": "week", + "weekly_r18": "week_r18", + "monthly": "month", + "male": "day_male", + "male_r18": "day_male_r18", + "female": "day_female", + "female_r18": "day_female_r18", + "original": "week_original", + "rookie": "week_rookie", + "r18g": "week_r18g", + } + if mode not in mode_map: + self.log.warning("invalid mode '%s'", mode) + mode = "daily" + self.mode = mode_map[mode] + + date = query.get("date") + if date: + if len(date) == 8 and date.isdecimal(): + date = "{}-{}-{}".format(date[0:4], date[4:6], date[6:8]) + else: + self.log.warning("invalid date '%s'", date) + date = None + if not date: + date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d") + self.date = date + + return {"ranking": { + "mode": mode, + "date": self.date, + }} + + +class PixivSearchExtractor(PixivExtractor): + """Extractor for pixiv search results""" + subcategory = "search" + archive_fmt = "s_{search[word]}_{id}{num}.{extension}" + directory_fmt = ("{category}", "search", "{search[word]}") + pattern = 
(r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/search\.php\?([^#]+)") + test = ( + ("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original"), + ("https://touch.pixiv.net/search.php?word=Original"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.query = match.group(1) + self.word = self.sort = self.target = None + + def works(self): + return self.api.search_illust(self.word, self.sort, self.target) + + def get_metadata(self, user=None): + query = text.parse_query(self.query) + + if "word" in query: + self.word = text.unescape(query["word"]) + else: + self.log.error("missing search term") + raise exception.StopExtraction() + + sort = query.get("order", "date_d") + sort_map = { + "date": "date_asc", + "date_d": "date_desc", + } + if sort not in sort_map: + self.log.warning("invalid sort order '%s'", sort) + sort = "date_d" + self.sort = sort_map[sort] + + target = query.get("s_mode", "s_tag") + target_map = { + "s_tag": "partial_match_for_tags", + "s_tag_full": "exact_match_for_tags", + "s_tc": "title_and_caption", + } + if target not in target_map: + self.log.warning("invalid search target '%s'", target) + target = "s_tag" + self.target = target_map[target] + + return {"search": { + "word": self.word, + "sort": self.sort, + "target": self.target, + }} + + +class PixivFollowExtractor(PixivExtractor): + """Extractor for new illustrations from your followed artists""" + subcategory = "follow" + archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}" + directory_fmt = ("{category}", "following") + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/bookmark_new_illust\.php") + test = ( + ("https://www.pixiv.net/bookmark_new_illust.php"), + ("https://touch.pixiv.net/bookmark_new_illust.php"), + ) + + def works(self): + return self.api.illust_follow() + + def get_metadata(self, user=None): + self.api.login() + return {"user_follow": self.api.user} + + +class PixivAppAPI(): + """Minimal interface for the Pixiv App API for mobile devices + + For a more complete implementation or documentation, see + - https://github.com/upbit/pixivpy + - https://gist.github.com/ZipFile/3ba99b47162c23f8aea5d5942bb557b1 + """ + CLIENT_ID = "MOBrBDS8blbauoSck0ZfDbtuzpyT" + CLIENT_SECRET = "lsACyCD94FhDUtGTXi3QzcFE2uU1hqtDaKeqrdwj" + + def __init__(self, extractor): + self.extractor = extractor + self.log = extractor.log + self.username, self.password = extractor._get_auth_info() + self.user = None + + self.client_id = extractor.config( + "client-id", self.CLIENT_ID) + self.client_secret = extractor.config( + "client-secret", self.CLIENT_SECRET) + + extractor.session.headers.update({ + "App-OS": "ios", + "App-OS-Version": "10.3.1", + "App-Version": "6.7.1", + "User-Agent": "PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)", + "Referer": "https://app-api.pixiv.net/", + }) + + def login(self): + """Login and gain an access token""" + self.user, auth = self._login_impl(self.username, self.password) + self.extractor.session.headers["Authorization"] = auth + + @cache(maxage=3600, keyarg=1) + def _login_impl(self, username, password): + url = "https://oauth.secure.pixiv.net/auth/token" + data = { + "client_id": self.client_id, + "client_secret": self.client_secret, + "get_secure_url": 1, + } + refresh_token = _refresh_token_cache(username) + + if refresh_token: + self.log.info("Refreshing access token") + data["grant_type"] = "refresh_token" + data["refresh_token"] = refresh_token + else: + self.log.info("Logging in as %s", username) + data["grant_type"] = "password" + 
data["username"] = username + data["password"] = password + + response = self.extractor.request( + url, method="POST", data=data, expect=(400,)) + if response.status_code >= 400: + raise exception.AuthenticationError() + + data = response.json()["response"] + if not refresh_token: + _refresh_token_cache.update(username, data["refresh_token"]) + return data["user"], "Bearer " + data["access_token"] + + def illust_detail(self, illust_id): + params = {"illust_id": illust_id} + return self._call("v1/illust/detail", params)["illust"] + + def illust_follow(self, restrict="all"): + params = {"restrict": restrict} + return self._pagination("v2/illust/follow", params) + + def illust_ranking(self, mode="day", date=None): + params = {"mode": mode, "date": date} + return self._pagination("v1/illust/ranking", params) + + def search_illust(self, word, sort=None, target=None, duration=None): + params = {"word": word, "search_target": target, + "sort": sort, "duration": duration} + return self._pagination("v1/search/illust", params) + + def user_bookmarks_illust(self, user_id, tag=None, restrict="public"): + params = {"user_id": user_id, "tag": tag, "restrict": restrict} + return self._pagination("v1/user/bookmarks/illust", params) + + def user_detail(self, user_id): + params = {"user_id": user_id} + return self._call("v1/user/detail", params)["user"] + + def user_illusts(self, user_id): + params = {"user_id": user_id} + return self._pagination("v1/user/illusts", params) + + def ugoira_metadata(self, illust_id): + params = {"illust_id": illust_id} + return self._call("v1/ugoira/metadata", params)["ugoira_metadata"] + + def _call(self, endpoint, params=None): + url = "https://app-api.pixiv.net/" + endpoint + + self.login() + response = self.extractor.request( + url, params=params, expect=range(400, 500)) + + if 200 <= response.status_code < 400: + return response.json() + if response.status_code == 404: + raise exception.NotFoundError() + self.log.error("API request failed: %s", response.text) + raise exception.StopExtraction() + + def _pagination(self, endpoint, params): + while True: + data = self._call(endpoint, params) + yield from data["illusts"] + + if not data["next_url"]: + return + query = data["next_url"].rpartition("?")[2] + params = text.parse_query(query) + + +@cache(maxage=10*365*24*3600, keyarg=0) +def _refresh_token_cache(username): + return None diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py new file mode 100644 index 0000000..9cada6b --- /dev/null +++ b/gallery_dl/extractor/pixnet.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.pixnet.net/""" + +from .common import Extractor, Message +from .. 
import text + + +BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet.net" + + +class PixnetExtractor(Extractor): + """Base class for pixnet extractors""" + category = "pixnet" + filename_fmt = "{num:>03}_{id}.{extension}" + archive_fmt = "{id}" + url_fmt = "" + + def __init__(self, match): + Extractor.__init__(self, match) + self.blog, self.item_id = match.groups() + self.root = "https://{}.pixnet.net".format(self.blog) + + def items(self): + url = self.url_fmt.format(self.root, self.item_id) + page = self.request(url, encoding="utf-8").text + user = text.extract(page, '')[0] + if "href" not in pnext: + return + url = self.root + text.extract(pnext, 'href="', '"')[0] + page = self.request(url, encoding="utf-8").text + + +class PixnetImageExtractor(PixnetExtractor): + """Extractor for a single photo from pixnet.net""" + subcategory = "image" + filename_fmt = "{id}.{extension}" + directory_fmt = ("{category}", "{blog}") + pattern = BASE_PATTERN + r"/album/photo/(\d+)" + test = ("https://albertayu773.pixnet.net/album/photo/159443828", { + "url": "156564c422138914c9fa5b42191677b45c414af4", + "keyword": "19971bcd056dfef5593f4328a723a9602be0f087", + "content": "0e097bdf49e76dd9b9d57a016b08b16fa6a33280", + }) + + def items(self): + url = "https://api.pixnet.cc/oembed" + params = { + "url": "https://{}.pixnet.net/album/photo/{}".format( + self.blog, self.item_id), + "format": "json", + } + + data = self.request(url, params=params).json() + data["id"] = text.parse_int( + data["url"].rpartition("/")[2].partition("-")[0]) + data["filename"], _, data["extension"] = data["title"].rpartition(".") + data["blog"] = self.blog + data["user"] = data.pop("author_name") + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, data["url"], data + + +class PixnetSetExtractor(PixnetExtractor): + """Extractor for images from a pixnet set""" + subcategory = "set" + url_fmt = "{}/album/set/{}" + directory_fmt = ("{category}", "{blog}", + "{folder_id} {folder_title}", "{set_id} {set_title}") + pattern = BASE_PATTERN + r"/album/set/(\d+)" + test = ( + ("https://albertayu773.pixnet.net/album/set/15078995", { + "url": "6535712801af47af51110542f4938a7cef44557f", + "keyword": "bf25d59e5b0959cb1f53e7fd2e2a25f2f67e5925", + }), + ("https://anrine910070.pixnet.net/album/set/5917493", { + "url": "b3eb6431aea0bcf5003432a4a0f3a3232084fc13", + "keyword": "bf7004faa1cea18cf9bd856f0955a69be51b1ec6", + }), + ) + + def items(self): + url = self.url_fmt.format(self.root, self.item_id) + page = self.request(url, encoding="utf-8").text + data = self.metadata(page) + + yield Message.Version, 1 + yield Message.Directory, data + for num, info in enumerate(self._pagination(page), 1): + url, pos = text.extract(info, ' href="', '"') + src, pos = text.extract(info, ' src="', '"', pos) + alt, pos = text.extract(info, ' alt="', '"', pos) + + photo = { + "id": text.parse_int(url.rpartition("/")[2].partition("#")[0]), + "url": src.replace("_s.", "."), + "num": num, + "filename": alt, + "extension": src.rpartition(".")[2], + } + photo.update(data) + yield Message.Url, photo["url"], photo + + def metadata(self, page): + user , pos = text.extract(page, '', '<', pos) + sid , pos = text.extract(page, '/set/', '"', pos) + sname, pos = text.extract(page, '>', '<', pos) + return { + "blog": self.blog, + "user": user.rpartition(" (")[0], + "folder_id" : text.parse_int(fid, ""), + "folder_title": text.unescape(fname).strip(), + "set_id" : text.parse_int(sid), + "set_title" : text.unescape(sname), + } + + +class 
PixnetFolderExtractor(PixnetExtractor): + """Extractor for all sets in a pixnet folder""" + subcategory = "folder" + url_fmt = "{}/album/folder/{}" + pattern = BASE_PATTERN + r"/album/folder/(\d+)" + test = ("https://albertayu773.pixnet.net/album/folder/1405768", { + "pattern": PixnetSetExtractor.pattern, + "count": ">= 15", + }) + + +class PixnetUserExtractor(PixnetExtractor): + """Extractor for all sets and folders of a pixnet user""" + subcategory = "user" + url_fmt = "{}{}/album/list" + pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?&#])" + test = ( + ("https://albertayu773.pixnet.net/"), + ("https://albertayu773.pixnet.net/blog"), + ("https://albertayu773.pixnet.net/album"), + ("https://albertayu773.pixnet.net/album/list", { + "pattern": PixnetFolderExtractor.pattern, + "count": ">= 30", + }), + ("https://anrine910070.pixnet.net/album/list", { + "pattern": PixnetSetExtractor.pattern, + "count": ">= 14", + }), + ) diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py new file mode 100644 index 0000000..325c6a0 --- /dev/null +++ b/gallery_dl/extractor/plurk.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.plurk.com/""" + +from .common import Extractor, Message +from .. import text, extractor, exception +import datetime +import json +import re + + +class PlurkExtractor(Extractor): + """Base class for plurk extractors""" + category = "plurk" + root = "https://www.plurk.com" + + def items(self): + urls = self._urls_ex if self.config("comments", False) else self._urls + + yield Message.Version, 1 + with extractor.blacklist(("plurk",)): + for plurk in self.plurks(): + for url in urls(plurk): + yield Message.Queue, url, plurk + + def plurks(self): + """Return an iterable with all relevant 'plurk' objects""" + + @staticmethod + def _urls(obj): + """Extract URLs from a 'plurk' object""" + return text.extract_iter(obj["content"], ' href="', '"') + + def _urls_ex(self, plurk): + """Extract URLs from a 'plurk' and its comments""" + yield from self._urls(plurk) + for comment in self._comments(plurk): + yield from self._urls(comment) + + def _comments(self, plurk): + """Return an iterable with a 'plurk's comments""" + url = "https://www.plurk.com/Responses/get" + data = {"plurk_id": plurk["id"], "count": "200"} + + while True: + info = self.request(url, "POST", data=data).json() + yield from info["responses"] + if not info["has_newer"]: + return + data["from_response_id"] = info["responses"][-1]["id"] + + @staticmethod + def _load(data): + if not data: + raise exception.NotFoundError("user") + return json.loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data)) + + +class PlurkTimelineExtractor(PlurkExtractor): + """Extractor for URLs from all posts in a Plurk timeline""" + subcategory = "timeline" + pattern = r"(?:https?://)?(?:www\.)?plurk\.com/(?!p/)(\w+)/?(?:$|[?&#])" + test = ("https://www.plurk.com/plurkapi", { + "pattern": r"https?://.+", + "count": ">= 23" + }) + + def __init__(self, match): + PlurkExtractor.__init__(self, match) + self.user = match.group(1) + + def plurks(self): + url = "{}/{}".format(self.root, self.user) + page = self.request(url).text + user_id, pos = text.extract(page, '"user_id":', ',') + plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0]) + + url = 
"https://www.plurk.com/TimeLine/getPlurks" + data = {"user_id": user_id.strip()} + headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"} + + while plurks: + yield from plurks + + offset = datetime.datetime.strptime( + plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z") + data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z") + response = self.request(url, "POST", headers=headers, data=data) + plurks = response.json()["plurks"] + + +class PlurkPostExtractor(PlurkExtractor): + """Extractor for URLs from a Plurk post""" + subcategory = "post" + pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)" + test = ( + ("https://www.plurk.com/p/i701j1", { + "url": "2115f208564591b8748525c2807a84596aaaaa5f", + "count": 3, + }), + ("https://www.plurk.com/p/i701j1", { + "options": (("comments", True),), + "count": ">= 210", + }), + ) + + def __init__(self, match): + PlurkExtractor.__init__(self, match) + self.plurk_id = match.group(1) + + def plurks(self): + url = "{}/p/{}".format(self.root, self.plurk_id) + page = self.request(url).text + user, pos = text.extract(page, " GLOBAL = ", "\n") + data, pos = text.extract(page, "plurk = ", ";\n", pos) + + data = self._load(data) + data["user"] = self._load(user)["page_user"] + return (data,) diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py new file mode 100644 index 0000000..40816b3 --- /dev/null +++ b/gallery_dl/extractor/pornhub.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.pornhub.com/""" + +from .common import Extractor, Message +from .. import text, exception + + +BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com" + + +class PornhubExtractor(Extractor): + """Base class for pornhub extractors""" + category = "pornhub" + root = "https://www.pornhub.com" + + +class PornhubGalleryExtractor(PornhubExtractor): + """Extractor for image galleries on pornhub.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}") + filename_fmt = "{num:>03}_{id}.{extension}" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/album/(\d+)" + test = ( + ("https://www.pornhub.com/album/1708982", { + "pattern": r"https://\w+.phncdn.com/pics/albums/\d+/\d+/\d+/\d+/", + "count": 93, + "keyword": { + "id": int, + "num": int, + "score": int, + "views": int, + "caption": str, + "user": "Unknown", + "gallery": { + "id" : 1708982, + "score": int, + "views": int, + "tags" : list, + "title": "Random Hentai", + }, + }, + }), + ("https://www.pornhub.com/album/37180171", { + "exception": exception.AuthorizationError, + }), + ) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.gallery_id = match.group(1) + self._first = None + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for num, image in enumerate(self.images(), 1): + url = image["url"] + image.update(data) + image["num"] = num + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + url = "{}/album/{}".format( + self.root, self.gallery_id) + extr = text.extract_from(self.request(url).text) + + title = extr("", "") + score = extr('
    ', '<') + tags = extr('
    = 8", + }), + ("https://www.pornhub.com/users/flyings0l0/"), + ("https://www.pornhub.com/users/flyings0l0/photos/public"), + ("https://www.pornhub.com/users/flyings0l0/photos/private"), + ("https://www.pornhub.com/users/flyings0l0/photos/favorites"), + ("https://www.pornhub.com/model/bossgirl/photos"), + ) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.type, self.user, self.cat = match.groups() + + def items(self): + url = "{}/{}/{}/photos/{}/ajax".format( + self.root, self.type, self.user, self.cat or "public") + params = {"page": 1} + headers = { + "Referer": url[:-5], + "X-Requested-With": "XMLHttpRequest", + } + + data = {"_extractor": PornhubGalleryExtractor} + yield Message.Version, 1 + while True: + page = self.request( + url, method="POST", headers=headers, params=params).text + if not page: + return + for gid in text.extract_iter(page, 'id="albumphoto', '"'): + yield Message.Queue, self.root + "/album/" + gid, data + params["page"] += 1 diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py new file mode 100644 index 0000000..fa4eb81 --- /dev/null +++ b/gallery_dl/extractor/pururin.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://pururin.io/""" + +from .common import GalleryExtractor +from .. import text, util +import json + + +class PururinGalleryExtractor(GalleryExtractor): + """Extractor for image galleries on pururin.io""" + category = "pururin" + pattern = r"(?:https?://)?(?:www\.)?pururin\.io/(?:gallery|read)/(\d+)" + test = ( + ("https://pururin.io/gallery/38661/iowant-2", { + "pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg", + "keyword": { + "title" : "Iowant 2!!", + "title_en" : "Iowant 2!!", + "title_jp" : "", + "gallery_id": 38661, + "count" : 19, + "artist" : ["Shoda Norihiro"], + "group" : ["Obsidian Order"], + "parody" : ["Kantai Collection"], + "characters": ["Iowa", "Teitoku"], + "tags" : list, + "type" : "Doujinshi", + "collection": "", + "convention": "C92", + "rating" : float, + "uploader" : "demo", + "scanlator" : "", + "lang" : "en", + "language" : "English", + } + }), + ("https://pururin.io/gallery/7661/unisis-team-vanilla", { + "count": 17, + }), + ) + root = "https://pururin.io" + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/gallery/{}/x".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + self._ext = "" + self._cnt = 0 + + def metadata(self, page): + extr = text.extract_from(page) + + def _lst(key, e=extr): + return [ + text.unescape(item) + for item in text.extract_iter(e(key, ""), 'title="', '"') + ] + + def _str(key, e=extr): + return text.unescape(text.extract( + e(key, ""), 'title="', '"')[0] or "") + + url = "{}/read/{}/01/x".format(self.root, self.gallery_id) + page = self.request(url).text + info = json.loads(text.unescape(text.extract( + page, ':gallery="', '"')[0])) + self._ext = info["image_extension"] + self._cnt = info["total_pages"] + + data = { + "gallery_id": text.parse_int(self.gallery_id), + "title" : info["title"] or info.get("j_title") or "", + "title_en" : info["title"], + "title_jp" : info.get("j_title") or "", + "artist" : _lst("Artist"), + "group" : _lst("Circle"), + "parody" : _lst("Parody"), + "tags" : _lst("Contents"), + "type" : 
_str("Category"), + "characters": _lst("Character"), + "collection": _str("Collection"), + "language" : _str("Language"), + "scanlator" : _str("Scanlator"), + "convention": _str("Convention"), + "uploader" : text.remove_html(extr("Uploader", "")), + "rating" : text.parse_float(extr(" :rating='" , "'")), + } + data["lang"] = util.language_to_code(data["language"]) + return data + + def images(self, _): + ufmt = "https://cdn.pururin.io/assets/images/data/{}/{{}}.{}".format( + self.gallery_id, self._ext) + return [(ufmt.format(num), None) for num in range(1, self._cnt + 1)] diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py new file mode 100644 index 0000000..59d502a --- /dev/null +++ b/gallery_dl/extractor/reactor.py @@ -0,0 +1,338 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Generic extractors for *reactor sites""" + +from .common import Extractor, Message, SharedConfigMixin +from .. import text +import urllib.parse +import random +import time +import json + + +BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)" + + +class ReactorExtractor(SharedConfigMixin, Extractor): + """Base class for *reactor.cc extractors""" + basecategory = "reactor" + filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" + archive_fmt = "{post_id}_{num}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "http://" + match.group(1) + self.session.headers["Referer"] = self.root + + self.wait_min = self.config("wait-min", 3) + self.wait_max = self.config("wait-max", 6) + if self.wait_max < self.wait_min: + self.wait_max = self.wait_min + + if not self.category: + # set category based on domain name + netloc = urllib.parse.urlsplit(self.root).netloc + self.category = netloc.rpartition(".")[0] + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(): + for image in self._parse_post(post): + url = image["url"] + image.update(data) + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + """Collect metadata for extractor-job""" + return {} + + def posts(self): + """Return all relevant post-objects""" + return self._pagination(self.url) + + def _pagination(self, url): + while True: + time.sleep(random.uniform(self.wait_min, self.wait_max)) + + response = self.request(url) + if response.history: + # sometimes there is a redirect from + # the last page of a listing (.../tag//1) + # to the first page (.../tag/) + # which could cause an endless loop + cnt_old = response.history[0].url.count("/") + cnt_new = response.url.count("/") + if cnt_old == 5 and cnt_new == 4: + return + page = response.text + + yield from text.extract_iter( + page, '
    ', '
    ') + + try: + pos = page.index("class='next'") + pos = page.rindex("class='current'", 0, pos) + url = self.root + text.extract(page, "href='", "'", pos)[0] + except (ValueError, TypeError): + return + + def _parse_post(self, post): + post, _, script = post.partition('")[0].rstrip("\n\r;")) + + +class XhamsterUserExtractor(XhamsterExtractor): + """Extractor for all galleries of an xhamster user""" + subcategory = "user" + pattern = BASE_PATTERN + r"/users/([^/?&#]+)(?:/photos)?/?(?:$|[?#])" + test = ( + ("https://xhamster.com/users/nickname68/photos", { + "pattern": XhamsterGalleryExtractor.pattern, + "count": 50, + "range": "1-50", + }), + ("https://xhamster.com/users/nickname68"), + ) + + def __init__(self, match): + XhamsterExtractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + yield Message.Version, 1 + url = "{}/users/{}/photos".format(self.root, self.user) + data = {"_extractor": XhamsterGalleryExtractor} + + while url: + extr = text.extract_from(self.request(url).text) + while True: + url = extr('thumb-image-container" href="', '"') + if not url: + break + yield Message.Queue, url, data + url = extr('data-page="next" href="', '"') diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py new file mode 100644 index 0000000..7eec18b --- /dev/null +++ b/gallery_dl/extractor/xvideos.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.xvideos.com/""" + +from .common import Extractor, Message +from .. import text, exception +import json + + +class XvideosExtractor(Extractor): + """Base class for xvideos extractors""" + category = "xvideos" + root = "https://www.xvideos.com" + + def get_page(self, url, codes=(403, 404)): + response = self.request(url, expect=codes) + if response.status_code in codes: + raise exception.NotFoundError(self.subcategory) + return response.text + + +class XvideosGalleryExtractor(XvideosExtractor): + """Extractor for user profile galleries from xvideos.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user[name]}", "{title}") + filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{num}" + pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" + r"/profiles/([^/?&#]+)/photos/(\d+)") + test = ( + (("https://www.xvideos.com/profiles" + "/pervertedcouple/photos/751031/random_stuff"), { + "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7", + "keyword": "8d637b372c6231cc4ada92dd5918db5fdbd06520", + }), + ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + XvideosExtractor.__init__(self, match) + self.user, self.gid = match.groups() + + def items(self): + url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid) + page = self.get_page(url) + data = self.get_metadata(page) + imgs = self.get_images(page) + data["count"] = len(imgs) + yield Message.Version, 1 + yield Message.Directory, data + for url in imgs: + data["num"] = text.parse_int(url.rsplit("_", 2)[1]) + data["extension"] = url.rpartition(".")[2] + yield Message.Url, url, data + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + data = text.extract_all(page, ( + ("userid" , '"id_user":', ','), + ("display", 
'"display":"', '"'), + ("title" , '"title":"', '"'), + ("descr" , '', ''), + ("tags" , 'Tagged:', '<'), + ))[0] + + return { + "user": { + "id": text.parse_int(data["userid"]), + "name": self.user, + "display": data["display"], + "description": data["descr"].strip(), + }, + "tags": text.unescape(data["tags"] or "").strip().split(", "), + "title": text.unescape(data["title"]), + "gallery_id": text.parse_int(self.gid), + } + + @staticmethod + def get_images(page): + """Return a list of all image urls for this gallery""" + return list(text.extract_iter( + page, '")[0])["data"] + + if not isinstance(data["galleries"], dict): + return + if "0" in data["galleries"]: + del data["galleries"]["0"] + + galleries = [ + { + "gallery_id": text.parse_int(gid), + "title": text.unescape(gdata["title"]), + "count": gdata["nb_pics"], + "_extractor": XvideosGalleryExtractor, + } + for gid, gdata in data["galleries"].items() + ] + galleries.sort(key=lambda x: x["gallery_id"]) + + yield Message.Version, 1 + for gallery in galleries: + url = "https://www.xvideos.com/profiles/{}/photos/{}".format( + self.user, gallery["gallery_id"]) + yield Message.Queue, url, gallery diff --git a/gallery_dl/extractor/yandere.py b/gallery_dl/extractor/yandere.py new file mode 100644 index 0000000..623e7a8 --- /dev/null +++ b/gallery_dl/extractor/yandere.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://yande.re/""" + +from . import booru + + +class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): + """Base class for yandere extractors""" + category = "yandere" + api_url = "https://yande.re/post.json" + post_url = "https://yande.re/post/show/{}" + + +class YandereTagExtractor(booru.TagMixin, YandereExtractor): + """Extractor for images from yande.re based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?yande\.re" + r"/post\?(?:[^&#]*&)*tags=(?P[^&#]+)") + test = ("https://yande.re/post?tags=ouzoku+armor", { + "content": "59201811c728096b2d95ce6896fd0009235fe683", + }) + + +class YanderePoolExtractor(booru.PoolMixin, YandereExtractor): + """Extractor for image-pools from yande.re""" + pattern = r"(?:https?://)?(?:www\.)?yande\.re/pool/show/(?P\d+)" + test = ("https://yande.re/pool/show/318", { + "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68", + }) + + +class YanderePostExtractor(booru.PostMixin, YandereExtractor): + """Extractor for single images from yande.re""" + pattern = r"(?:https?://)?(?:www\.)?yande\.re/post/show/(?P\d+)" + test = ("https://yande.re/post/show/51824", { + "content": "59201811c728096b2d95ce6896fd0009235fe683", + "options": (("tags", True),), + "keyword": { + "tags_artist": "sasaki_tamaru", + "tags_circle": "softhouse_chara", + "tags_copyright": "ouzoku", + "tags_general": str, + }, + }) + + +class YanderePopularExtractor(booru.MoebooruPopularMixin, YandereExtractor): + """Extractor for popular images from yande.re""" + pattern = (r"(?:https?://)?(?:www\.)?yande\.re" + r"/post/popular_(?Pby_(?:day|week|month)|recent)" + r"(?:\?(?P[^#]*))?") + test = ( + ("https://yande.re/post/popular_by_month?month=6&year=2014", { + "count": 40, + }), + ("https://yande.re/post/popular_recent"), + ) + + def __init__(self, match): + super().__init__(match) + self.api_url = "https://yande.re/post/popular_{scale}.json".format( + 
scale=self.scale) diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py new file mode 100644 index 0000000..b3c5501 --- /dev/null +++ b/gallery_dl/extractor/yaplog.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://yaplog.jp/""" + +from .common import Extractor, Message, AsynchronousMixin +from .. import text, util + + +class YaplogExtractor(AsynchronousMixin, Extractor): + """Base class for yaplog extractors""" + category = "yaplog" + root = "https://yaplog.jp" + filename_fmt = "{post[id]}_{post[title]}_{id}.{extension}" + directory_fmt = ("{category}", "{post[user]}") + archive_fmt = "{post[user]}_{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + yield Message.Version, 1 + for post, urls in self.posts(): + yield Message.Directory, {"post": post} + for num, url in enumerate(urls, 1): + page = self.request(url).text if num > 1 else url + iurl = text.extract(page, '', '<') + date , pos = text.extract(page, 'class="date">' , '<', pos) + pid , pos = text.extract(page, '/archive/' , '"', pos) + prev , pos = text.extract(page, 'class="last"> search /archive/ page for the rest + url = "{}/{}/archive/{}".format(self.root, self.user, pid) + page = self.request(url).text + + base = "{}/{}/image/{}/".format(self.root, self.user, pid) + for part in util.advance(text.extract_iter( + page, base, '"', pos), 24): + urls.append(base + part) + + return prev, urls, { + "id" : text.parse_int(pid), + "title": text.unescape(title[:-3]), + "user" : self.user, + "date" : date, + } + + +class YaplogBlogExtractor(YaplogExtractor): + """Extractor for a user's blog on yaplog.jp""" + subcategory = "blog" + pattern = r"(?:https?://)?(?:www\.)?yaplog\.jp/(\w+)/?(?:$|[?&#])" + test = ("https://yaplog.jp/omitakashi3", { + "pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/", + "count": ">= 2", + }) + + def posts(self): + url = "{}/{}/image/".format(self.root, self.user) + while url: + url, images, data = self._parse_post(url) + yield data, images + + +class YaplogPostExtractor(YaplogExtractor): + """Extractor for images from a blog post on yaplog.jp""" + subcategory = "post" + pattern = (r"(?:https?://)?(?:www\.)?yaplog\.jp" + r"/(\w+)/(?:archive|image)/(\d+)") + test = ("https://yaplog.jp/imamiami0726/image/1299", { + "url": "896cae20fa718735a57e723c48544e830ff31345", + "keyword": "f8d8781e61c4c38238a7622d6df6c905f864e5d3", + }) + + def __init__(self, match): + YaplogExtractor.__init__(self, match) + self.post_id = match.group(2) + + def posts(self): + url = "{}/{}/image/{}".format(self.root, self.user, self.post_id) + _, images, data = self._parse_post(url) + return ((data, images),) diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py new file mode 100644 index 0000000..0844c40 --- /dev/null +++ b/gallery_dl/extractor/yuki.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://yuki.la/""" + +from .common import Extractor, Message +from .. 
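# YaplogExtractor._parse_post() above uses util.advance() to skip the first 24
# thumbnail links before collecting the remaining ones. advance() simply
# fast-forwards an iterable by 'num' steps (its definition appears in
# gallery_dl/util.py near the end of this patch):
from gallery_dl import util

it = util.advance(range(30), 24)
print(list(it))   # [24, 25, 26, 27, 28, 29]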
import text + + +class YukiThreadExtractor(Extractor): + """Extractor for images from threads on yuki.la""" + category = "yuki" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread}{title:? - //}") + filename_fmt = "{time}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?yuki\.la/([^/?&#]+)/(\d+)" + test = ( + ("https://yuki.la/gd/309639", { + "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9", + "keyword": "01cbe29ae207a5cb7556bcbd5ed481ecdaf32727", + "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", + }), + ("https://yuki.la/a/159767162", { + "url": "cd94d0eb646d279c3b7efb9b7898888e5d44fa93", + "keyword": "7a4ff90e423c74bd3126fb65d13015decec2fa45", + }), + # old thread - missing board name in title and multi-line HTML + ("https://yuki.la/gif/6877752", { + "url": "3dbb2f8453490d002416c5fc2fe95b56c129faf9", + "keyword": "563ef4ae80134d845dddaed7ebe56f5fc41056be", + }), + # even older thread - no thread title + ("https://yuki.la/a/9357051", { + "url": "010560bf254bd485e48366c3531728bda4b22583", + "keyword": "7b736c41e307dcfcb84ef495f29299a6ddd06d67", + }), + ) + root = "https://yuki.la" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/{}".format(self.root, self.board, self.thread) + page = self.request(url).text + data = self.get_metadata(page) + + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(page): + if "image" in post: + for key in ("w", "h", "no", "time"): + post[key] = text.parse_int(post[key]) + post.update(data) + yield Message.Url, post["image"], post + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + title = text.extract(page, "", "")[0] + try: + title, boardname, _ = title.rsplit(" - ", 2) + except ValueError: + title = boardname = "" + else: + title = title.partition(" - ")[2] + if not title: + title, boardname = boardname, "" + return { + "board": self.board, + "board_name": boardname, + "thread": text.parse_int(self.thread), + "title": text.unescape(title), + } + + def posts(self, page): + """Build a list of all post-objects""" + return [ + self.parse(post) for post in text.extract_iter( + page, '
    ', ''), + ("time", 'data-utc="', '"'), + ("now" , '>', ' <'), + )) + data["com"] = text.unescape(text.remove_html( + post[post.index("
    ")[2])) + return data + + @staticmethod + def _extract_image(post, data): + text.extract_all(post, ( + (None , '>File:', ''), + ("fullname", '', '<'), + ("fsize" , '(', ', '), + ("w" , '', 'x'), + ("h" , '', ')'), + ), 0, data) + filename = data["fullname"] or data["filename"] + data["filename"] = text.unescape(filename.rpartition(".")[0]) + data["image"] = "https:" + data["image"] + del data["fullname"] diff --git a/gallery_dl/job.py b/gallery_dl/job.py new file mode 100644 index 0000000..667b9b3 --- /dev/null +++ b/gallery_dl/job.py @@ -0,0 +1,492 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import sys +import time +import logging +from . import extractor, downloader, postprocessor +from . import config, text, util, output, exception +from .extractor.message import Message + + +class Job(): + """Base class for Job-types""" + ulog = None + + def __init__(self, extr, parent=None): + if isinstance(extr, str): + extr = extractor.find(extr) + if not extr: + raise exception.NoExtractorError() + + self.extractor = extr + extr.log.extractor = extr + extr.log.job = self + extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url) + + # url predicates + self.pred_url = self._prepare_predicates( + "image", [util.UniquePredicate()], True) + + # queue predicates + self.pred_queue = self._prepare_predicates( + "chapter", [], False) + + # category transfer + if parent and parent.extractor.config( + "category-transfer", parent.extractor.categorytransfer): + self.extractor.category = parent.extractor.category + self.extractor.subcategory = parent.extractor.subcategory + + # user-supplied metadata + self.userkwds = self.extractor.config("keywords") + + def run(self): + """Execute or run the job""" + try: + log = self.extractor.log + for msg in self.extractor: + self.dispatch(msg) + except exception.AuthenticationError as exc: + msg = str(exc) or "Please provide a valid username/password pair." + log.error("Authentication failed: %s", msg) + except exception.AuthorizationError: + log.error("You do not have permission to access the resource " + "at '%s'", self.extractor.url) + except exception.NotFoundError as exc: + res = str(exc) or "resource (gallery/image/user)" + log.error("The %s at '%s' does not exist", res, self.extractor.url) + except exception.HttpError as exc: + err = exc.args[0] + if isinstance(err, Exception): + err = "{}: {}".format(err.__class__.__name__, err) + log.error("HTTP request failed: %s", err) + except exception.FormatError as exc: + err, obj = exc.args + log.error("Applying %s format string failed: %s: %s", + obj, err.__class__.__name__, err) + except exception.FilterError as exc: + err = exc.args[0] + log.error("Evaluating filter expression failed: %s: %s", + err.__class__.__name__, err) + except exception.StopExtraction: + pass + except OSError as exc: + log.error("Unable to download data: %s: %s", + exc.__class__.__name__, exc) + log.debug("", exc_info=True) + except Exception as exc: + log.error(("An unexpected error occurred: %s - %s. 
" + "Please run gallery-dl again with the --verbose flag, " + "copy its output and report this issue on " + "https://github.com/mikf/gallery-dl/issues ."), + exc.__class__.__name__, exc) + log.debug("", exc_info=True) + self.handle_finalize() + + def dispatch(self, msg): + """Call the appropriate message handler""" + if msg[0] == Message.Url: + _, url, kwds = msg + if self.pred_url(url, kwds): + self.update_kwdict(kwds) + self.handle_url(url, kwds) + + elif msg[0] == Message.Directory: + self.update_kwdict(msg[1]) + self.handle_directory(msg[1]) + + elif msg[0] == Message.Queue: + _, url, kwds = msg + if self.pred_queue(url, kwds): + self.handle_queue(url, kwds) + + elif msg[0] == Message.Urllist: + _, urls, kwds = msg + if self.pred_url(urls[0], kwds): + self.update_kwdict(kwds) + self.handle_urllist(urls, kwds) + + elif msg[0] == Message.Version: + if msg[1] != 1: + raise "unsupported message-version ({}, {})".format( + self.extractor.category, msg[1] + ) + # TODO: support for multiple message versions + + def handle_url(self, url, keywords): + """Handle Message.Url""" + + def handle_urllist(self, urls, keywords): + """Handle Message.Urllist""" + self.handle_url(urls[0], keywords) + + def handle_directory(self, keywords): + """Handle Message.Directory""" + + def handle_queue(self, url, keywords): + """Handle Message.Queue""" + + def handle_finalize(self): + """Handle job finalization""" + + def update_kwdict(self, kwdict): + """Update 'kwdict' with additional metadata""" + kwdict["category"] = self.extractor.category + kwdict["subcategory"] = self.extractor.subcategory + if self.userkwds: + kwdict.update(self.userkwds) + + def _prepare_predicates(self, target, predicates, skip=True): + pfilter = self.extractor.config(target + "-filter") + if pfilter: + try: + pred = util.FilterPredicate(pfilter, target) + except (SyntaxError, ValueError, TypeError) as exc: + self.extractor.log.warning(exc) + else: + predicates.append(pred) + + prange = self.extractor.config(target + "-range") + if prange: + try: + pred = util.RangePredicate(prange) + except ValueError as exc: + self.extractor.log.warning( + "invalid %s range: %s", target, exc) + else: + if skip and pred.lower > 1 and not pfilter: + pred.index += self.extractor.skip(pred.lower - 1) + predicates.append(pred) + + return util.build_predicate(predicates) + + def _write_unsupported(self, url): + if self.ulog: + self.ulog.info(url) + + @staticmethod + def _filter(kwdict): + """Return a copy of 'kwdict' without "private" entries""" + return {k: v for k, v in kwdict.items() if k[0] != "_"} + + +class DownloadJob(Job): + """Download images into appropriate directory/filename locations""" + + def __init__(self, url, parent=None): + Job.__init__(self, url, parent) + self.log = logging.getLogger("download") + self.pathfmt = None + self.archive = None + self.sleep = None + self.downloaders = {} + self.postprocessors = None + self.out = output.select() + + def handle_url(self, url, keywords, fallback=None): + """Download the resource specified in 'url'""" + # prepare download + self.pathfmt.set_keywords(keywords) + + if self.postprocessors: + for pp in self.postprocessors: + pp.prepare(self.pathfmt) + + if self.pathfmt.exists(self.archive): + self.handle_skip() + return + + if self.sleep: + time.sleep(self.sleep) + + # download from URL + if not self.download(url): + + # use fallback URLs if available + for num, url in enumerate(fallback or (), 1): + self.log.info("Trying fallback URL #%d", num) + if self.download(url): + break + else: + # download 
failed + self.log.error( + "Failed to download %s", self.pathfmt.filename or url) + return + + if not self.pathfmt.temppath: + self.handle_skip() + return + + # run post processors + if self.postprocessors: + for pp in self.postprocessors: + pp.run(self.pathfmt) + + # download succeeded + self.pathfmt.finalize() + self.out.success(self.pathfmt.path, 0) + if self.archive: + self.archive.add(keywords) + self._skipcnt = 0 + + def handle_urllist(self, urls, keywords): + """Download the resource specified in 'url'""" + fallback = iter(urls) + url = next(fallback) + self.handle_url(url, keywords, fallback) + + def handle_directory(self, keywords): + """Set and create the target directory for downloads""" + if not self.pathfmt: + self.initialize(keywords) + else: + self.pathfmt.set_directory(keywords) + + def handle_queue(self, url, keywords): + if "_extractor" in keywords: + extr = keywords["_extractor"].from_url(url) + else: + extr = extractor.find(url) + if extr: + self.__class__(extr, self).run() + else: + self._write_unsupported(url) + + def handle_finalize(self): + if self.postprocessors: + for pp in self.postprocessors: + pp.finalize() + + def handle_skip(self): + self.out.skip(self.pathfmt.path) + if self._skipexc: + self._skipcnt += 1 + if self._skipcnt >= self._skipmax: + raise self._skipexc() + + def download(self, url): + """Download 'url'""" + scheme = url.partition(":")[0] + downloader = self.get_downloader(scheme) + if downloader: + return downloader.download(url, self.pathfmt) + self._write_unsupported(url) + return False + + def get_downloader(self, scheme): + """Return a downloader suitable for 'scheme'""" + if scheme == "https": + scheme = "http" + try: + return self.downloaders[scheme] + except KeyError: + pass + + klass = downloader.find(scheme) + if klass and config.get(("downloader", scheme, "enabled"), True): + instance = klass(self.extractor, self.out) + else: + instance = None + self.log.error("'%s:' URLs are not supported/enabled", scheme) + self.downloaders[scheme] = instance + return instance + + def initialize(self, keywords=None): + """Delayed initialization of PathFormat, etc.""" + self.pathfmt = util.PathFormat(self.extractor) + if keywords: + self.pathfmt.set_directory(keywords) + self.sleep = self.extractor.config("sleep") + + skip = self.extractor.config("skip", True) + if skip: + self._skipexc = None + if isinstance(skip, str): + skip, _, smax = skip.partition(":") + if skip == "abort": + self._skipexc = exception.StopExtraction + elif skip == "exit": + self._skipexc = sys.exit + self._skipcnt = 0 + self._skipmax = text.parse_int(smax) + else: + self.pathfmt.exists = lambda x=None: False + + archive = self.extractor.config("archive") + if archive: + path = util.expand_path(archive) + self.archive = util.DownloadArchive(path, self.extractor) + + postprocessors = self.extractor.config("postprocessors") + if postprocessors: + self.postprocessors = [] + for pp_dict in postprocessors: + whitelist = pp_dict.get("whitelist") + blacklist = pp_dict.get("blacklist") + if (whitelist and self.extractor.category not in whitelist or + blacklist and self.extractor.category in blacklist): + continue + name = pp_dict.get("name") + pp_cls = postprocessor.find(name) + if not pp_cls: + postprocessor.log.warning("module '%s' not found", name) + continue + try: + pp_obj = pp_cls(self.pathfmt, pp_dict) + except Exception as exc: + postprocessor.log.error( + "'%s' initialization failed: %s: %s", + name, exc.__class__.__name__, exc) + else: + self.postprocessors.append(pp_obj) + 
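# initialize() above accepts "skip" config values like true, "abort", or
# "abort:3", where the optional ":N" suffix bounds the number of consecutive
# skips tolerated before the run stops. The same parsing in isolation
# (text.parse_int("") is 0, so a bare "abort" stops on the first skipped file):
from gallery_dl import text

def parse_skip(value):
    mode, _, smax = value.partition(":")
    return mode, text.parse_int(smax)

print(parse_skip("abort:3"))   # ('abort', 3)
print(parse_skip("exit"))      # ('exit', 0)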
self.extractor.log.debug( + "Active postprocessor modules: %s", self.postprocessors) + + +class SimulationJob(DownloadJob): + """Simulate the extraction process without downloading anything""" + + def handle_url(self, url, keywords, fallback=None): + self.pathfmt.set_keywords(keywords) + self.out.skip(self.pathfmt.path) + if self.sleep: + time.sleep(self.sleep) + if self.archive: + self.archive.add(keywords) + + def handle_directory(self, keywords): + if not self.pathfmt: + self.initialize() + + +class KeywordJob(Job): + """Print available keywords""" + + def handle_url(self, url, keywords): + print("\nKeywords for filenames and --filter:") + print("------------------------------------") + self.print_keywords(keywords) + raise exception.StopExtraction() + + def handle_directory(self, keywords): + print("Keywords for directory names:") + print("-----------------------------") + self.print_keywords(keywords) + + def handle_queue(self, url, keywords): + if not keywords: + self.extractor.log.info( + "This extractor delegates work to other extractors " + "and does not provide any keywords on its own. Try " + "'gallery-dl -K \"%s\"' instead.", url) + else: + print("Keywords for --chapter-filter:") + print("------------------------------") + self.print_keywords(keywords) + if self.extractor.categorytransfer: + print() + KeywordJob(url, self).run() + raise exception.StopExtraction() + + @staticmethod + def print_keywords(keywords, prefix=""): + """Print key-value pairs with formatting""" + suffix = "]" if prefix else "" + for key, value in sorted(keywords.items()): + if key[0] == "_": + continue + key = prefix + key + suffix + + if isinstance(value, dict): + KeywordJob.print_keywords(value, key + "[") + + elif isinstance(value, list): + if value and isinstance(value[0], dict): + KeywordJob.print_keywords(value[0], key + "[][") + else: + print(key, "[]", sep="") + for val in value: + print(" -", val) + + else: + # string or number + print(key, "\n ", value, sep="") + + +class UrlJob(Job): + """Print download urls""" + maxdepth = 1 + + def __init__(self, url, parent=None, depth=1): + Job.__init__(self, url, parent) + self.depth = depth + if depth >= self.maxdepth: + self.handle_queue = self.handle_url + + @staticmethod + def handle_url(url, _): + print(url) + + @staticmethod + def handle_urllist(urls, _): + prefix = "" + for url in urls: + print(prefix, url, sep="") + prefix = "| " + + def handle_queue(self, url, _): + try: + UrlJob(url, self, self.depth + 1).run() + except exception.NoExtractorError: + self._write_unsupported(url) + + +class DataJob(Job): + """Collect extractor results and dump them""" + + def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True): + Job.__init__(self, url, parent) + self.file = file + self.data = [] + self.ascii = config.get(("output", "ascii"), ensure_ascii) + + def run(self): + # collect data + try: + for msg in self.extractor: + self.dispatch(msg) + except exception.StopExtraction: + pass + except Exception as exc: + self.data.append((exc.__class__.__name__, str(exc))) + except BaseException: + pass + + # convert numbers to string + if config.get(("output", "num-to-str"), False): + for msg in self.data: + util.transform_dict(msg[-1], util.number_to_string) + + # dump to 'file' + util.dump_json(self.data, self.file, self.ascii, 2) + + def handle_url(self, url, kwdict): + self.data.append((Message.Url, url, self._filter(kwdict))) + + def handle_urllist(self, urls, kwdict): + self.data.append((Message.Urllist, list(urls), self._filter(kwdict))) + + 
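# DataJob below is what -j/--dump-json (see option.py further down in this
# patch) runs: it records every message and serializes the list as JSON.
# Driving it directly (the URL is illustrative; note that handle_finalize()
# closes the file object passed in):
from gallery_dl import job

with open("dump.json", "w") as fp:
    job.DataJob("https://yande.re/post/show/51824", file=fp).run()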
def handle_directory(self, kwdict): + self.data.append((Message.Directory, self._filter(kwdict))) + + def handle_queue(self, url, kwdict): + self.data.append((Message.Queue, url, self._filter(kwdict))) + + def handle_finalize(self): + self.file.close() diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py new file mode 100644 index 0000000..58126ac --- /dev/null +++ b/gallery_dl/oauth.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""OAuth helper functions and classes""" + +import hmac +import time +import base64 +import random +import string +import hashlib +import urllib.parse + +import requests +import requests.auth + +from . import text + + +def nonce(size, alphabet=string.ascii_letters): + """Generate a nonce value with 'size' characters""" + return "".join(random.choice(alphabet) for _ in range(size)) + + +def quote(value, quote=urllib.parse.quote): + """Quote 'value' according to the OAuth1.0 standard""" + return quote(value, "~") + + +def concat(*args): + """Concatenate 'args' as expected by OAuth1.0""" + return "&".join(quote(item) for item in args) + + +class OAuth1Session(requests.Session): + """Extension to requests.Session to support OAuth 1.0""" + + def __init__(self, consumer_key, consumer_secret, + token=None, token_secret=None): + + requests.Session.__init__(self) + self.auth = OAuth1Client( + consumer_key, consumer_secret, + token, token_secret, + ) + + def rebuild_auth(self, prepared_request, response): + if "Authorization" in prepared_request.headers: + del prepared_request.headers["Authorization"] + prepared_request.prepare_auth(self.auth) + + +class OAuth1Client(requests.auth.AuthBase): + """OAuth1.0a authentication""" + + def __init__(self, consumer_key, consumer_secret, + token=None, token_secret=None): + + self.consumer_key = consumer_key + self.consumer_secret = consumer_secret + self.token = token + self.token_secret = token_secret + + def __call__(self, request): + oauth_params = [ + ("oauth_consumer_key", self.consumer_key), + ("oauth_nonce", nonce(16)), + ("oauth_signature_method", "HMAC-SHA1"), + ("oauth_timestamp", str(int(time.time()))), + ("oauth_version", "1.0"), + ] + if self.token: + oauth_params.append(("oauth_token", self.token)) + + signature = self.generate_signature(request, oauth_params) + oauth_params.append(("oauth_signature", signature)) + + request.headers["Authorization"] = "OAuth " + ",".join( + key + '="' + value + '"' for key, value in oauth_params) + + return request + + def generate_signature(self, request, params): + """Generate 'oauth_signature' value""" + url, _, query = request.url.partition("?") + + params = params.copy() + for key, value in text.parse_query(query).items(): + params.append((quote(key), quote(value))) + params.sort() + query = "&".join("=".join(item) for item in params) + + message = concat(request.method, url, query).encode() + key = concat(self.consumer_secret, self.token_secret or "").encode() + signature = hmac.new(key, message, hashlib.sha1).digest() + + return quote(base64.b64encode(signature).decode()) + + +class OAuth1API(): + """Base class for OAuth1.0 based API interfaces""" + API_KEY = None + API_SECRET = None + + def __init__(self, extractor): + self.log = extractor.log + self.extractor = extractor + + api_key = extractor.config("api-key", self.API_KEY) + api_secret = 
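# OAuth1Client.generate_signature() above follows the HMAC-SHA1 scheme of
# RFC 5849: percent-encode, sort, join, then sign with
# "consumer_secret&token_secret". The same computation spelled out with this
# module's helpers (all key and URL values are dummies):
import base64
import hashlib
import hmac

from gallery_dl import oauth

params = [
    ("oauth_consumer_key", "consumer-key"),
    ("oauth_nonce", oauth.nonce(16)),
    ("oauth_signature_method", "HMAC-SHA1"),
    ("oauth_timestamp", "1562056425"),
    ("oauth_version", "1.0"),
]
params.sort()
query = "&".join("=".join(pair) for pair in params)

message = oauth.concat("GET", "https://api.example.org/resource", query).encode()
key = oauth.concat("consumer-secret", "").encode()
signature = hmac.new(key, message, hashlib.sha1).digest()
print(oauth.quote(base64.b64encode(signature).decode()))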
extractor.config("api-secret", self.API_SECRET) + token = extractor.config("access-token") + token_secret = extractor.config("access-token-secret") + + if api_key and api_secret and token and token_secret: + self.log.debug("Using OAuth1.0 authentication") + self.session = OAuth1Session( + api_key, api_secret, token, token_secret) + self.api_key = None + else: + self.log.debug("Using api_key authentication") + self.session = extractor.session + self.api_key = api_key + + def request(self, url, method="GET", *, expect=range(400, 500), **kwargs): + kwargs["expect"] = expect + kwargs["session"] = self.session + return self.extractor.request(url, method, **kwargs) diff --git a/gallery_dl/option.py b/gallery_dl/option.py new file mode 100644 index 0000000..f23b79d --- /dev/null +++ b/gallery_dl/option.py @@ -0,0 +1,304 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Command line option parsing""" + +import argparse +import logging +import json +from . import job, version + + +class ConfigAction(argparse.Action): + """Set argparse results as config values""" + def __call__(self, parser, namespace, values, option_string=None): + namespace.options.append(((self.dest,), values)) + + +class ConfigConstAction(argparse.Action): + """Set argparse const values as config values""" + def __call__(self, parser, namespace, values, option_string=None): + namespace.options.append(((self.dest,), self.const)) + + +class ParseAction(argparse.Action): + """Parse = options and set them as config values""" + def __call__(self, parser, namespace, values, option_string=None): + key, _, value = values.partition("=") + try: + value = json.loads(value) + except ValueError: + pass + key = key.split(".") + namespace.options.append((key, value)) + + +class Formatter(argparse.HelpFormatter): + """Custom HelpFormatter class to customize help output""" + def __init__(self, *args, **kwargs): + super().__init__(max_help_position=50, *args, **kwargs) + + def _format_action_invocation(self, action): + opts = action.option_strings[:] + if opts: + if action.nargs != 0: + args_string = self._format_args(action, "ARG") + opts[-1] += " " + args_string + return ', '.join(opts) + else: + return self._metavar_formatter(action, action.dest)(1)[0] + + +def build_parser(): + """Build and configure an ArgumentParser object""" + parser = argparse.ArgumentParser( + usage="%(prog)s [OPTION]... 
URL...", + formatter_class=Formatter, + add_help=False, + ) + + general = parser.add_argument_group("General Options") + general.add_argument( + "-h", "--help", + action="help", + help="Print this help message and exit", + ) + general.add_argument( + "--version", + action="version", version=version.__version__, + help="Print program version and exit", + ) + general.add_argument( + "-d", "--dest", + dest="base-directory", metavar="DEST", action=ConfigAction, + help="Destination directory", + ) + general.add_argument( + "-i", "--input-file", + dest="inputfile", metavar="FILE", + help="Download URLs found in FILE ('-' for stdin)", + ) + general.add_argument( + "--cookies", + dest="cookies", metavar="FILE", action=ConfigAction, + help="File to load additional cookies from", + ) + general.add_argument( + "--proxy", + dest="proxy", metavar="URL", action=ConfigAction, + help="Use the specified proxy", + ) + general.add_argument( + "--clear-cache", + dest="clear_cache", action="store_true", + help="Delete all cached login sessions, cookies, etc.", + ) + + output = parser.add_argument_group("Output Options") + output.add_argument( + "-q", "--quiet", + dest="loglevel", default=logging.INFO, + action="store_const", const=logging.ERROR, + help="Activate quiet mode", + ) + output.add_argument( + "-v", "--verbose", + dest="loglevel", + action="store_const", const=logging.DEBUG, + help="Print various debugging information", + ) + output.add_argument( + "-g", "--get-urls", + dest="list_urls", action="count", + help="Print URLs instead of downloading", + ) + output.add_argument( + "-j", "--dump-json", + dest="jobtype", action="store_const", const=job.DataJob, + help="Print JSON information", + ) + output.add_argument( + "-s", "--simulate", + dest="jobtype", action="store_const", const=job.SimulationJob, + help="Simulate data extraction; do not download anything", + ) + output.add_argument( + "-K", "--list-keywords", + dest="jobtype", action="store_const", const=job.KeywordJob, + help=("Print a list of available keywords and example values " + "for the given URLs"), + ) + output.add_argument( + "--list-modules", + dest="list_modules", action="store_true", + help="Print a list of available extractor modules", + ) + output.add_argument( + "--list-extractors", + dest="list_extractors", action="store_true", + help=("Print a list of extractor classes " + "with description, (sub)category and example URL"), + ) + output.add_argument( + "--write-log", + dest="logfile", metavar="FILE", action=ConfigAction, + help="Write logging output to FILE", + ) + output.add_argument( + "--write-unsupported", + dest="unsupportedfile", metavar="FILE", action=ConfigAction, + help=("Write URLs, which get emitted by other extractors but cannot " + "be handled, to FILE"), + ) + + downloader = parser.add_argument_group("Downloader Options") + downloader.add_argument( + "-r", "--limit-rate", + dest="rate", metavar="RATE", action=ConfigAction, + help="Maximum download rate (e.g. 
500k or 2.5M)", + ) + downloader.add_argument( + "-R", "--retries", + dest="retries", metavar="RETRIES", type=int, action=ConfigAction, + help="Number of retries (default: 5)", + ) + downloader.add_argument( + "--http-timeout", + dest="timeout", metavar="SECONDS", type=float, action=ConfigAction, + help="Timeout for HTTP connections (default: 30.0)", + ) + downloader.add_argument( + "--sleep", + dest="sleep", metavar="SECONDS", type=float, action=ConfigAction, + help="Number of seconds to sleep before each download", + ) + downloader.add_argument( + "--no-part", + dest="part", nargs=0, action=ConfigConstAction, const=False, + help="Do not use .part files", + ) + downloader.add_argument( + "--no-check-certificate", + dest="verify", nargs=0, action=ConfigConstAction, const=False, + help="Disable HTTPS certificate validation", + ) + downloader.add_argument( + "--abort-on-skip", + dest="skip", nargs=0, action=ConfigConstAction, const="abort", + help=("Abort extractor run if a file download would normally be " + "skipped, i.e. if a file with the same filename already exists"), + ) + + configuration = parser.add_argument_group("Configuration Options") + configuration.add_argument( + "-c", "--config", + dest="cfgfiles", metavar="FILE", action="append", + help="Additional configuration files", + ) + configuration.add_argument( + "--config-yaml", + dest="yamlfiles", metavar="FILE", action="append", + help=argparse.SUPPRESS, + ) + configuration.add_argument( + "-o", "--option", + dest="options", metavar="OPT", action=ParseAction, default=[], + help="Additional '<key>=<value>' option values", + ) + configuration.add_argument( + "--ignore-config", + dest="load_config", action="store_false", + help="Do not read the default configuration files", + ) + + authentication = parser.add_argument_group("Authentication Options") + authentication.add_argument( + "-u", "--username", + dest="username", metavar="USER", action=ConfigAction, + help="Username to login with", + ) + authentication.add_argument( + "-p", "--password", + dest="password", metavar="PASS", action=ConfigAction, + help="Password belonging to the given username", + ) + authentication.add_argument( + "--netrc", + dest="netrc", nargs=0, action=ConfigConstAction, const=True, + help="Enable .netrc authentication data", + ) + + selection = parser.add_argument_group("Selection Options") + selection.add_argument( + "--download-archive", + dest="archive", metavar="FILE", action=ConfigAction, + help=("Record all downloaded files in the archive file and " + "skip downloading any file already in it."), + ) + selection.add_argument( + "--range", + dest="image-range", metavar="RANGE", action=ConfigAction, + help=("Index-range(s) specifying which images to download. " + "For example '5-10' or '1,3-5,10-'"), + ) + selection.add_argument( + "--chapter-range", + dest="chapter-range", metavar="RANGE", action=ConfigAction, + help=("Like '--range', but applies to manga-chapters " + "and other delegated URLs"), + ) + selection.add_argument( + "--filter", + dest="image-filter", metavar="EXPR", action=ConfigAction, + help=("Python expression controlling which images to download. " + "Files for which the expression evaluates to False are ignored. " + "Available keys are the filename-specific ones listed by '-K'. 
" + "Example: --filter \"image_width >= 1000 and " + "rating in ('s', 'q')\""), + ) + selection.add_argument( + "--chapter-filter", + dest="chapter-filter", metavar="EXPR", action=ConfigAction, + help=("Like '--filter', but applies to manga-chapters " + "and other delegated URLs"), + ) + + postprocessor = parser.add_argument_group("Post-processing Options") + postprocessor.add_argument( + "--zip", + dest="postprocessors", + action="append_const", const={"name": "zip"}, + help="Store downloaded files in a ZIP archive", + ) + postprocessor.add_argument( + "--ugoira-conv", + dest="postprocessors", + action="append_const", const={"name": "ugoira", "ffmpeg-args": ( + "-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an")}, + help="Convert Pixiv Ugoira to WebM (requires FFmpeg)", + ) + postprocessor.add_argument( + "--write-metadata", + dest="postprocessors", + action="append_const", const={"name": "metadata"}, + help="Write metadata to separate JSON files", + ) + postprocessor.add_argument( + "--write-tags", + dest="postprocessors", + action="append_const", const={"name": "metadata", "mode": "tags"}, + help="Write image tags to separate text files", + ) + + parser.add_argument( + "urls", + metavar="URL", nargs="*", + help=argparse.SUPPRESS, + ) + + return parser diff --git a/gallery_dl/output.py b/gallery_dl/output.py new file mode 100644 index 0000000..327b69a --- /dev/null +++ b/gallery_dl/output.py @@ -0,0 +1,221 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import os +import sys +import shutil +import logging +from . import config, util + + +# -------------------------------------------------------------------- +# Logging + +LOG_FORMAT = "[{name}][{levelname}] {message}" +LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S" +LOG_LEVEL = logging.INFO + + +class Logger(logging.Logger): + """Custom logger that includes extractor and job info in log records""" + extractor = util.NONE + job = util.NONE + + def makeRecord(self, name, level, fn, lno, msg, args, exc_info, + func=None, extra=None, sinfo=None, + factory=logging._logRecordFactory): + rv = factory(name, level, fn, lno, msg, args, exc_info, func, sinfo) + rv.extractor = self.extractor + rv.job = self.job + return rv + + +def initialize_logging(loglevel): + """Setup basic logging functionality before configfiles have been loaded""" + # convert levelnames to lowercase + for level in (10, 20, 30, 40, 50): + name = logging.getLevelName(level) + logging.addLevelName(level, name.lower()) + + # register custom Logging class + logging.Logger.manager.setLoggerClass(Logger) + + # setup basic logging to stderr + formatter = logging.Formatter(LOG_FORMAT, LOG_FORMAT_DATE, "{") + handler = logging.StreamHandler() + handler.setFormatter(formatter) + handler.setLevel(loglevel) + root = logging.getLogger() + root.setLevel(logging.NOTSET) + root.addHandler(handler) + + return logging.getLogger("gallery-dl") + + +def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL): + """Setup a new logging handler""" + opts = config.interpolate(("output", key)) + if not opts: + return None + if not isinstance(opts, dict): + opts = {"path": opts} + + path = opts.get("path") + mode = opts.get("mode", "w") + encoding = opts.get("encoding", "utf-8") + try: + path = util.expand_path(path) + handler = logging.FileHandler(path, mode, encoding) + except (OSError, ValueError) 
as exc: + logging.getLogger("gallery-dl").warning( + "%s: %s", key, exc) + return None + except TypeError as exc: + logging.getLogger("gallery-dl").warning( + "%s: missing or invalid path (%s)", key, exc) + return None + + level = opts.get("level", lvl) + logfmt = opts.get("format", fmt) + datefmt = opts.get("format-date", LOG_FORMAT_DATE) + formatter = logging.Formatter(logfmt, datefmt, "{") + handler.setFormatter(formatter) + handler.setLevel(level) + + return handler + + +def configure_logging_handler(key, handler): + """Configure a logging handler""" + opts = config.interpolate(("output", key)) + if not opts: + return + if isinstance(opts, str): + opts = {"format": opts} + if handler.level == LOG_LEVEL and "level" in opts: + handler.setLevel(opts["level"]) + if "format" in opts or "format-date" in opts: + logfmt = opts.get("format", LOG_FORMAT) + datefmt = opts.get("format-date", LOG_FORMAT_DATE) + formatter = logging.Formatter(logfmt, datefmt, "{") + handler.setFormatter(formatter) + + +# -------------------------------------------------------------------- +# Utility functions + +def replace_std_streams(errors="replace"): + """Replace standard streams and set their error handlers to 'errors'""" + for name in ("stdout", "stdin", "stderr"): + stream = getattr(sys, name) + setattr(sys, name, stream.__class__( + stream.buffer, + errors=errors, + newline=stream.newlines, + line_buffering=stream.line_buffering, + )) + + +# -------------------------------------------------------------------- +# Downloader output + +def select(): + """Automatically select a suitable output class""" + pdict = { + "default": PipeOutput, + "pipe": PipeOutput, + "term": TerminalOutput, + "terminal": TerminalOutput, + "color": ColorOutput, + "null": NullOutput, + } + omode = config.get(("output", "mode"), "auto").lower() + if omode in pdict: + return pdict[omode]() + elif omode == "auto": + if hasattr(sys.stdout, "isatty") and sys.stdout.isatty(): + return ColorOutput() if ANSI else TerminalOutput() + else: + return PipeOutput() + else: + raise Exception("invalid output mode: " + omode) + + +class NullOutput(): + + def start(self, path): + """Print a message indicating the start of a download""" + + def skip(self, path): + """Print a message indicating that a download has been skipped""" + + def success(self, path, tries): + """Print a message indicating the completion of a download""" + + +class PipeOutput(NullOutput): + + def skip(self, path): + print(CHAR_SKIP, path, sep="", flush=True) + + def success(self, path, tries): + print(path, flush=True) + + +class TerminalOutput(NullOutput): + + def __init__(self): + self.short = config.get(("output", "shorten"), True) + if self.short: + self.width = shutil.get_terminal_size().columns - OFFSET + + def start(self, path): + print(self.shorten(" " + path), end="", flush=True) + + def skip(self, path): + print(self.shorten(CHAR_SKIP + path)) + + def success(self, path, tries): + print("\r", self.shorten(CHAR_SUCCESS + path), sep="") + + def shorten(self, txt): + """Reduce the length of 'txt' to the width of the terminal""" + if self.short and len(txt) > self.width: + hwidth = self.width // 2 - OFFSET + return "".join(( + txt[:hwidth-1], + CHAR_ELLIPSIES, + txt[-hwidth-(self.width % 2):] + )) + return txt + + +class ColorOutput(TerminalOutput): + + def start(self, path): + print(self.shorten(path), end="", flush=True) + + def skip(self, path): + print("\033[2m", self.shorten(path), "\033[0m", sep="") + + def success(self, path, tries): + print("\r\033[1;32m", 
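# setup_logging_handler() above reads its options from the ("output", <key>)
# config path, either as a plain path string or as a dict. The recognized keys
# mirror the opts.get() calls in that function; as a Python literal:
logfile_options = {
    "path": "~/gallery-dl.log",                 # expanded via util.expand_path()
    "mode": "w",
    "encoding": "utf-8",
    "level": "debug",                           # handler level, default LOG_LEVEL
    "format": "{asctime} {name}: {message}",    # style-"{" format string
    "format-date": "%H:%M:%S",
}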
self.shorten(path), "\033[0m", sep="") + + +if os.name == "nt": + ANSI = os.environ.get("TERM") == "ANSI" + OFFSET = 1 + CHAR_SKIP = "# " + CHAR_SUCCESS = "* " + CHAR_ELLIPSIES = "..." +else: + ANSI = True + OFFSET = 0 + CHAR_SKIP = "# " + CHAR_SUCCESS = "✔ " + CHAR_ELLIPSIES = "…" diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py new file mode 100644 index 0000000..093f8e0 --- /dev/null +++ b/gallery_dl/postprocessor/__init__.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Post-processing modules""" + +import importlib +import logging + +modules = [ + "classify", + "exec", + "metadata", + "ugoira", + "zip", +] + +log = logging.getLogger("postprocessor") + + +def find(name): + """Return a postprocessor class with the given name""" + try: + return _cache[name] + except KeyError: + klass = None + try: + if name in modules: # prevent unwanted imports + module = importlib.import_module("." + name, __package__) + klass = module.__postprocessor__ + except (ImportError, AttributeError, TypeError): + pass + _cache[name] = klass + return klass + + +# -------------------------------------------------------------------- +# internals + +_cache = {} diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py new file mode 100644 index 0000000..62460d3 --- /dev/null +++ b/gallery_dl/postprocessor/classify.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Categorize files by file extension""" + +from .common import PostProcessor +import os + + +class ClassifyPP(PostProcessor): + + DEFAULT_MAPPING = { + "Music" : ("mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"), + "Video" : ("flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv", + "webm", "vob", "wmv"), + "Pictures" : ("jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"), + "Archives" : ("zip", "rar", "7z", "tar", "gz", "bz2"), + } + + def __init__(self, pathfmt, options): + PostProcessor.__init__(self) + mapping = options.get("mapping", self.DEFAULT_MAPPING) + + self.mapping = { + ext: directory + for directory, exts in mapping.items() + for ext in exts + } + + def prepare(self, pathfmt): + ext = pathfmt.keywords.get("extension") + + if ext in self.mapping: + self._dir = pathfmt.realdirectory + os.sep + self.mapping[ext] + pathfmt.realpath = self._dir + os.sep + pathfmt.filename + else: + self._dir = None + + def run(self, pathfmt): + if self._dir: + os.makedirs(self._dir, exist_ok=True) + + +__postprocessor__ = ClassifyPP diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py new file mode 100644 index 0000000..c642f0f --- /dev/null +++ b/gallery_dl/postprocessor/common.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Common classes and constants used by postprocessor modules.""" + +from . 
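# ClassifyPP below inverts its directory -> extensions table once in
# __init__() so that each file lookup is a single dict access:
DEFAULT_MAPPING = {
    "Pictures": ("jpg", "png"),
    "Video": ("mp4", "webm"),
}
mapping = {
    ext: directory
    for directory, exts in DEFAULT_MAPPING.items()
    for ext in exts
}
print(mapping["png"])   # Pictures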
import log + + +class PostProcessor(): + """Base class for postprocessors""" + log = log + + def prepare(self, pathfmt): + """ """ + + def run(self, pathfmt): + """Execute the postprocessor for a file""" + + def finalize(self): + """Cleanup""" diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py new file mode 100644 index 0000000..c86b480 --- /dev/null +++ b/gallery_dl/postprocessor/exec.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Execute processes""" + +from .common import PostProcessor +import subprocess + + +class ExecPP(PostProcessor): + + def __init__(self, pathfmt, options): + PostProcessor.__init__(self) + + try: + self.args = options["command"] + self.args[0] # test if 'args' is subscriptable + except (KeyError, IndexError, TypeError): + raise TypeError("option 'command' must be a non-empty list") + + if options.get("async", False): + self._exec = subprocess.Popen + + def run(self, pathfmt): + self._exec([ + arg.format_map(pathfmt.keywords) + for arg in self.args + ]) + + def _exec(self, args): + retcode = subprocess.Popen(args).wait() + if retcode: + self.log.warning( + "executing '%s' returned non-zero exit status %d", + " ".join(args), retcode) + + +__postprocessor__ = ExecPP diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py new file mode 100644 index 0000000..77be9c7 --- /dev/null +++ b/gallery_dl/postprocessor/metadata.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Write metadata to JSON files""" + +from .common import PostProcessor +from .. 
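# ExecPP below format_map()s every element of "command" with the file's
# metadata before spawning the process. A postprocessor config entry using it
# (the command and placeholder choice are illustrative; any key printed by
# 'gallery-dl -K' can appear in braces):
postprocessors = [{
    "name": "exec",
    "command": ["echo", "{category}", "{filename}.{extension}"],
    "async": False,
}]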
import util + + +class MetadataPP(PostProcessor): + + def __init__(self, pathfmt, options): + PostProcessor.__init__(self) + + mode = options.get("mode", "json") + ext = "txt" + + if mode == "custom": + self.write = self._write_custom + self.formatter = util.Formatter(options.get("format")) + elif mode == "tags": + self.write = self._write_tags + else: + self.write = self._write_json + self.indent = options.get("indent", 4) + self.ascii = options.get("ascii", False) + ext = "json" + + self.extension = options.get("extension", ext) + + def run(self, pathfmt): + path = "{}.{}".format(pathfmt.realpath, self.extension) + with open(path, "w", encoding="utf-8") as file: + self.write(file, pathfmt) + + def _write_custom(self, file, pathfmt): + output = self.formatter.format_map(pathfmt.keywords) + file.write(output) + + def _write_tags(self, file, pathfmt): + kwds = pathfmt.keywords + tags = kwds.get("tags") or kwds.get("tag_string") + + if not tags: + return + + if not isinstance(tags, list): + taglist = tags.split(", ") + if len(taglist) < len(tags) / 16: + taglist = tags.split(" ") + tags = taglist + + file.write("\n".join(tags)) + file.write("\n") + + def _write_json(self, file, pathfmt): + util.dump_json(pathfmt.keywords, file, self.ascii, self.indent) + + +__postprocessor__ = MetadataPP diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py new file mode 100644 index 0000000..bd8c5ad --- /dev/null +++ b/gallery_dl/postprocessor/ugoira.py @@ -0,0 +1,132 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Convert pixiv ugoira to webm""" + +from .common import PostProcessor +from .. 
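# MetadataPP above selects its writer from "mode" ("json", "tags", or
# "custom" with a "format" string). A config entry writing one tag per line
# into a .txt file next to each download:
postprocessors = [{
    "name": "metadata",
    "mode": "tags",
    "extension": "txt",
}]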
import util +import collections +import subprocess +import tempfile +import zipfile +import os + + +class UgoiraPP(PostProcessor): + + def __init__(self, pathfmt, options): + PostProcessor.__init__(self) + self.extension = options.get("extension") or "webm" + self.args = options.get("ffmpeg-args") or () + self.twopass = options.get("ffmpeg-twopass", False) + self.output = options.get("ffmpeg-output", True) + self.delete = not options.get("keep-files", False) + + ffmpeg = options.get("ffmpeg-location") + self.ffmpeg = util.expand_path(ffmpeg) if ffmpeg else "ffmpeg" + + rate = options.get("framerate", "auto") + if rate != "auto": + self.calculate_framerate = lambda _: (None, rate) + + if options.get("libx264-prevent-odd", True): + # get last video-codec argument + vcodec = None + for index, arg in enumerate(self.args): + arg, _, stream = arg.partition(":") + if arg == "-vcodec" or arg in ("-c", "-codec") and ( + not stream or stream.partition(":")[0] in ("v", "V")): + vcodec = self.args[index + 1] + # use filter if libx264/5 is explicitly or implicitly used + self.prevent_odd = ( + vcodec in ("libx264", "libx265") or + not vcodec and self.extension.lower() in ("mp4", "mkv")) + else: + self.prevent_odd = False + + def prepare(self, pathfmt): + self._frames = None + + if pathfmt.keywords["extension"] != "zip": + return + + if "frames" in pathfmt.keywords: + self._frames = pathfmt.keywords["frames"] + elif "pixiv_ugoira_frame_data" in pathfmt.keywords: + self._frames = pathfmt.keywords["pixiv_ugoira_frame_data"]["data"] + else: + return + + if self.delete: + pathfmt.set_extension(self.extension) + + def run(self, pathfmt): + if not self._frames: + return + + rate_in, rate_out = self.calculate_framerate(self._frames) + + with tempfile.TemporaryDirectory() as tempdir: + # extract frames + with zipfile.ZipFile(pathfmt.temppath) as zfile: + zfile.extractall(tempdir) + + # write ffconcat file + ffconcat = tempdir + "/ffconcat.txt" + with open(ffconcat, "w") as file: + file.write("ffconcat version 1.0\n") + for frame in self._frames: + file.write("file '{}'\n".format(frame["file"])) + file.write("duration {}\n".format(frame["delay"] / 1000)) + if self.extension != "gif": + # repeat the last frame to prevent it from only being + # displayed for a very short amount of time + file.write("file '{}'\n".format(self._frames[-1]["file"])) + + # collect command-line arguments + args = [self.ffmpeg] + if rate_in: + args += ["-r", str(rate_in)] + args += ["-i", ffconcat] + if rate_out: + args += ["-r", str(rate_out)] + if self.prevent_odd: + args += ["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"] + if self.args: + args += self.args + self.log.debug("ffmpeg args: %s", args) + + # invoke ffmpeg + pathfmt.set_extension(self.extension) + if self.twopass: + if "-f" not in args: + args += ["-f", self.extension] + args += ["-passlogfile", tempdir + "/ffmpeg2pass", "-pass"] + self._exec(args + ["1", "-y", os.devnull]) + self._exec(args + ["2", pathfmt.realpath]) + else: + args.append(pathfmt.realpath) + self._exec(args) + + if self.delete: + pathfmt.delete = True + else: + pathfmt.set_extension("zip") + + def _exec(self, args): + out = None if self.output else subprocess.DEVNULL + return subprocess.Popen(args, stdout=out, stderr=out).wait() + + @staticmethod + def calculate_framerate(framelist): + counter = collections.Counter(frame["delay"] for frame in framelist) + fps = "1000/{}".format(min(counter)) + return (fps, None) if len(counter) == 1 else (None, fps) + + +__postprocessor__ = UgoiraPP diff --git 
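# UgoiraPP.calculate_framerate() above turns pixiv's per-frame delays into
# FFmpeg rate arguments: uniform delays give an exact input rate (-r before
# -i), mixed delays fall back to an output rate. Copied out for illustration:
import collections

def calculate_framerate(framelist):
    counter = collections.Counter(frame["delay"] for frame in framelist)
    fps = "1000/{}".format(min(counter))
    return (fps, None) if len(counter) == 1 else (None, fps)

print(calculate_framerate([{"delay": 40}] * 3))             # ('1000/40', None)
print(calculate_framerate([{"delay": 40}, {"delay": 80}]))  # (None, '1000/40')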
a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py new file mode 100644 index 0000000..3a0c323 --- /dev/null +++ b/gallery_dl/postprocessor/zip.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Store files in ZIP archives""" + +from .common import PostProcessor +import zipfile +import os + + +class ZipPP(PostProcessor): + + COMPRESSION_ALGORITHMS = { + "store": zipfile.ZIP_STORED, + "zip": zipfile.ZIP_DEFLATED, + "bzip2": zipfile.ZIP_BZIP2, + "lzma": zipfile.ZIP_LZMA, + } + + def __init__(self, pathfmt, options): + PostProcessor.__init__(self) + self.delete = not options.get("keep-files", False) + self.ext = "." + options.get("extension", "zip") + algorithm = options.get("compression", "store") + if algorithm not in self.COMPRESSION_ALGORITHMS: + self.log.warning( + "unknown compression algorithm '%s'; falling back to 'store'", + algorithm) + algorithm = "store" + + self.path = pathfmt.realdirectory + self.zfile = zipfile.ZipFile( + self.path + self.ext, "a", + self.COMPRESSION_ALGORITHMS[algorithm], True) + + def run(self, pathfmt): + # 'NameToInfo' is not officially documented, but it's available + # for all supported Python versions and using it directly is a lot + # better than calling getinfo() + if pathfmt.filename not in self.zfile.NameToInfo: + self.zfile.write(pathfmt.temppath, pathfmt.filename) + pathfmt.delete = self.delete + + def finalize(self): + self.zfile.close() + + if self.delete: + try: + os.rmdir(self.path) + except OSError: + pass + + if not self.zfile.NameToInfo: + try: + os.unlink(self.zfile.filename) + except OSError: + pass + + +__postprocessor__ = ZipPP diff --git a/gallery_dl/text.py b/gallery_dl/text.py new file mode 100644 index 0000000..151fa30 --- /dev/null +++ b/gallery_dl/text.py @@ -0,0 +1,278 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
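# ZipPP above checks ZipFile.NameToInfo - an undocumented but long-stable
# CPython attribute - to avoid storing the same arcname twice. The same test
# with the standard library alone:
import zipfile

with zipfile.ZipFile("demo.zip", "a", zipfile.ZIP_STORED, True) as zfile:
    if "001.jpg" not in zfile.NameToInfo:
        zfile.writestr("001.jpg", b"\xff\xd8...")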
+ +"""Collection of functions that work on strings/text""" + +import re +import html +import os.path +import datetime +import urllib.parse + + +INVALID_XML_CHARS = ( + "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", + "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12", + "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a", + "\x1b", "\x1c", "\x1d", "\x1e", "\x1f", +) + + +def clean_xml(xmldata, repl=""): + """Replace/Remove invalid control characters in 'xmldata'""" + if not isinstance(xmldata, str): + try: + xmldata = "".join(xmldata) + except TypeError: + return "" + for char in INVALID_XML_CHARS: + if char in xmldata: + xmldata = xmldata.replace(char, repl) + return xmldata + + +def remove_html(txt): + """Remove html-tags from a string""" + try: + return " ".join(re.sub("<[^>]+>", " ", txt).split()) + except TypeError: + return "" + + +def split_html(txt, sep=None): + """Split input string by html-tags""" + try: + return [ + x.strip() for x in re.split("<[^>]+>", txt) + if x and not x.isspace() + ] + except TypeError: + return [] + + +def filename_from_url(url): + """Extract the last part of an URL to use as a filename""" + try: + return urllib.parse.urlsplit(url).path.rpartition("/")[2] + except (TypeError, AttributeError): + return "" + + +def ext_from_url(url): + """Extract the filename extension of an URL""" + filename = filename_from_url(url) + ext = os.path.splitext(filename)[1] + return ext[1:].lower() + + +def nameext_from_url(url, data=None): + """Extract the last part of an URL and fill 'data' accordingly""" + if data is None: + data = {} + name = unquote(filename_from_url(url)) + data["filename"], ext = os.path.splitext(name) + data["extension"] = ext[1:].lower() + return data + + +def clean_path_windows(path): + """Remove illegal characters from a path-segment (Windows)""" + try: + return re.sub(r'[<>:"\\/|?*]', "_", path) + except TypeError: + return "" + + +def clean_path_posix(path): + """Remove illegal characters from a path-segment (Posix)""" + try: + return path.replace("/", "_") + except AttributeError: + return "" + + +def extract(txt, begin, end, pos=0): + """Extract the text between 'begin' and 'end' from 'txt' + + Args: + txt: String to search in + begin: First string to be searched for + end: Second string to be searched for after 'begin' + pos: Starting position for searches in 'txt' + + Returns: + The string between the two search-strings 'begin' and 'end' beginning + with position 'pos' in 'txt' as well as the position after 'end'. 
+ + If at least one of 'begin' or 'end' is not found, None and the original + value of 'pos' is returned + + Examples: + extract("abcde", "b", "d") -> "c" , 4 + extract("abcde", "b", "d", 3) -> None, 3 + """ + try: + first = txt.index(begin, pos) + len(begin) + last = txt.index(end, first) + return txt[first:last], last+len(end) + except (ValueError, TypeError, AttributeError): + return None, pos + + +def rextract(txt, begin, end, pos=-1): + try: + lbeg = len(begin) + first = txt.rindex(begin, 0, pos) + last = txt.index(end, first + lbeg) + return txt[first + lbeg:last], first + except (ValueError, TypeError, AttributeError): + return None, pos + + +def extract_all(txt, rules, pos=0, values=None): + """Calls extract for each rule and returns the result in a dict""" + if values is None: + values = {} + for key, begin, end in rules: + result, pos = extract(txt, begin, end, pos) + if key: + values[key] = result + return values, pos + + +def extract_iter(txt, begin, end, pos=0): + """Yield values that would be returned by repeated calls of extract()""" + index = txt.index + lbeg = len(begin) + lend = len(end) + try: + while True: + first = index(begin, pos) + lbeg + last = index(end, first) + pos = last + lend + yield txt[first:last] + except (ValueError, TypeError, AttributeError): + return + + +def extract_from(txt, pos=0, default=""): + """Returns a function object that extracts from 'txt'""" + def extr(begin, end, index=txt.index, txt=txt): + nonlocal pos + try: + first = index(begin, pos) + len(begin) + last = index(end, first) + pos = last + len(end) + return txt[first:last] + except (ValueError, TypeError, AttributeError): + return default + return extr + + +def parse_unicode_escapes(txt): + """Convert JSON Unicode escapes in 'txt' into actual characters""" + if "\\u" in txt: + return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt) + return txt + + +def _hex_to_char(match): + return chr(int(match.group(1), 16)) + + +def parse_bytes(value, default=0, suffixes="bkmgtp"): + """Convert a bytes-amount ("500k", "2.5M", ...) 
to int""" + try: + last = value[-1].lower() + except (TypeError, KeyError, IndexError): + return default + + if last in suffixes: + mul = 1024 ** suffixes.index(last) + value = value[:-1] + else: + mul = 1 + + try: + return round(float(value) * mul) + except ValueError: + return default + + +def parse_int(value, default=0): + """Convert 'value' to int""" + if not value: + return default + try: + return int(value) + except (ValueError, TypeError): + return default + + +def parse_float(value, default=0.0): + """Convert 'value' to float""" + if not value: + return default + try: + return float(value) + except (ValueError, TypeError): + return default + + +def parse_query(qs): + """Parse a query string into key-value pairs""" + result = {} + try: + for key, value in urllib.parse.parse_qsl(qs): + if key not in result: + result[key] = value + except AttributeError: + pass + return result + + +def parse_timestamp(ts, default=None): + """Create a datetime object from a unix timestamp""" + try: + return datetime.datetime.utcfromtimestamp(int(ts)) + except (TypeError, ValueError, OverflowError): + return default + + +def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z"): + """Create a datetime object by parsing 'date_string'""" + try: + if format.endswith("%z") and date_string[-3] == ":": + # workaround for Python < 3.7: +00:00 -> +0000 + ds = date_string[:-3] + date_string[-2:] + else: + ds = date_string + d = datetime.datetime.strptime(ds, format) + o = d.utcoffset() + if o is not None: + d = d.replace(tzinfo=None) - o # convert to naive UTC + return d + except (TypeError, IndexError, KeyError): + return None + except (ValueError, OverflowError): + return date_string + + +if os.name == "nt": + clean_path = clean_path_windows +else: + clean_path = clean_path_posix + + +urljoin = urllib.parse.urljoin + +quote = urllib.parse.quote +unquote = urllib.parse.unquote + +escape = html.escape +unescape = html.unescape diff --git a/gallery_dl/util.py b/gallery_dl/util.py new file mode 100644 index 0000000..5c0ae41 --- /dev/null +++ b/gallery_dl/util.py @@ -0,0 +1,673 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Utility functions and classes""" + +import re +import os +import sys +import json +import shutil +import string +import _string +import sqlite3 +import datetime +import operator +import itertools +import urllib.parse +from . 
import text, exception
+
+
+def bencode(num, alphabet="0123456789"):
+    """Encode an integer into a base-N encoded string"""
+    data = ""
+    base = len(alphabet)
+    while num:
+        num, remainder = divmod(num, base)
+        data = alphabet[remainder] + data
+    return data
+
+
+def bdecode(data, alphabet="0123456789"):
+    """Decode a base-N encoded string ( N = len(alphabet) )"""
+    num = 0
+    base = len(alphabet)
+    for c in data:
+        num *= base
+        num += alphabet.index(c)
+    return num
+
+
+def advance(iterable, num):
+    """Advance the iterable by 'num' steps"""
+    iterator = iter(iterable)
+    next(itertools.islice(iterator, num, num), None)
+    return iterator
+
+
+def raises(obj):
+    """Returns a function that raises 'obj' as exception"""
+    def wrap():
+        raise obj
+    return wrap
+
+
+def combine_dict(a, b):
+    """Recursively combine the contents of 'b' into 'a'"""
+    for key, value in b.items():
+        if key in a and isinstance(value, dict) and isinstance(a[key], dict):
+            combine_dict(a[key], value)
+        else:
+            a[key] = value
+    return a
+
+
+def transform_dict(a, func):
+    """Recursively apply 'func' to all values in 'a'"""
+    for key, value in a.items():
+        if isinstance(value, dict):
+            transform_dict(value, func)
+        else:
+            a[key] = func(value)
+
+
+def number_to_string(value, numbers=(int, float)):
+    """Convert numbers (int, float) to string; return everything else as is"""
+    return str(value) if value.__class__ in numbers else value
+
+
+def to_string(value):
+    """str() with "better" defaults"""
+    if not value:
+        return ""
+    if value.__class__ is list:
+        try:
+            return ", ".join(value)
+        except Exception:
+            return ", ".join(map(str, value))
+    return str(value)
+
+
+def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4):
+    """Serialize 'obj' as JSON and write it to 'fp'"""
+    json.dump(
+        obj, fp,
+        ensure_ascii=ensure_ascii,
+        indent=indent,
+        default=str,
+        sort_keys=True,
+    )
+    fp.write("\n")
+
+
+def expand_path(path):
+    """Expand environment variables and tildes (~)"""
+    if not path:
+        return path
+    if not isinstance(path, str):
+        path = os.path.join(*path)
+    return os.path.expandvars(os.path.expanduser(path))
+
+
+def code_to_language(code, default=None):
+    """Map an ISO 639-1 language code to its actual name"""
+    return CODES.get((code or "").lower(), default)
+
+
+def language_to_code(lang, default=None):
+    """Map a language name to its ISO 639-1 code"""
+    if lang is None:
+        return default
+    lang = lang.capitalize()
+    for code, language in CODES.items():
+        if language == lang:
+            return code
+    return default
+
+
+CODES = {
+    "ar": "Arabic",
+    "bg": "Bulgarian",
+    "ca": "Catalan",
+    "cs": "Czech",
+    "da": "Danish",
+    "de": "German",
+    "el": "Greek",
+    "en": "English",
+    "es": "Spanish",
+    "fi": "Finnish",
+    "fr": "French",
+    "he": "Hebrew",
+    "hu": "Hungarian",
+    "id": "Indonesian",
+    "it": "Italian",
+    "jp": "Japanese",
+    "ko": "Korean",
+    "ms": "Malay",
+    "nl": "Dutch",
+    "no": "Norwegian",
+    "pl": "Polish",
+    "pt": "Portuguese",
+    "ro": "Romanian",
+    "ru": "Russian",
+    "sv": "Swedish",
+    "th": "Thai",
+    "tr": "Turkish",
+    "vi": "Vietnamese",
+    "zh": "Chinese",
+}
+
+SPECIAL_EXTRACTORS = {"oauth", "recursive", "test"}
+
+
+class UniversalNone():
+    """None-style object that supports more operations than None itself"""
+    __slots__ = ()
+
+    def __getattribute__(self, _):
+        return self
+
+    def __getitem__(self, _):
+        return self
+
+    @staticmethod
+    def __bool__():
+        return False
+
+    @staticmethod
+    def __str__():
+        return "None"
+
+    __repr__ = __str__
+
+
+NONE = UniversalNone()
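+
+
+def _util_demo():
+    # Illustrative usage sketch, not part of upstream: bdecode() inverts
+    # bencode() for any alphabet, and combine_dict() merges nested dicts
+    # recursively. The function is never called at import time.
+    assert bencode(255, "0123456789abcdef") == "ff"
+    assert bdecode("ff", "0123456789abcdef") == 255
+    assert combine_dict({"a": {"x": 1}}, {"a": {"y": 2}}) == \
+        {"a": {"x": 1, "y": 2}}
+
+
+def 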
build_predicate(predicates): + if not predicates: + return lambda url, kwds: True + elif len(predicates) == 1: + return predicates[0] + else: + return ChainPredicate(predicates) + + +class RangePredicate(): + """Predicate; True if the current index is in the given range""" + def __init__(self, rangespec): + self.ranges = self.optimize_range(self.parse_range(rangespec)) + self.index = 0 + + if self.ranges: + self.lower, self.upper = self.ranges[0][0], self.ranges[-1][1] + else: + self.lower, self.upper = 0, 0 + + def __call__(self, url, kwds): + self.index += 1 + + if self.index > self.upper: + raise exception.StopExtraction() + + for lower, upper in self.ranges: + if lower <= self.index <= upper: + return True + return False + + @staticmethod + def parse_range(rangespec): + """Parse an integer range string and return the resulting ranges + + Examples: + parse_range("-2,4,6-8,10-") -> [(1,2), (4,4), (6,8), (10,INTMAX)] + parse_range(" - 3 , 4- 4, 2-6") -> [(1,3), (4,4), (2,6)] + """ + ranges = [] + + for group in rangespec.split(","): + if not group: + continue + first, sep, last = group.partition("-") + if not sep: + beg = end = int(first) + else: + beg = int(first) if first.strip() else 1 + end = int(last) if last.strip() else sys.maxsize + ranges.append((beg, end) if beg <= end else (end, beg)) + + return ranges + + @staticmethod + def optimize_range(ranges): + """Simplify/Combine a parsed list of ranges + + Examples: + optimize_range([(2,4), (4,6), (5,8)]) -> [(2,8)] + optimize_range([(1,1), (2,2), (3,6), (8,9))]) -> [(1,6), (8,9)] + """ + if len(ranges) <= 1: + return ranges + + ranges.sort() + riter = iter(ranges) + result = [] + + beg, end = next(riter) + for lower, upper in riter: + if lower > end+1: + result.append((beg, end)) + beg, end = lower, upper + elif upper > end: + end = upper + result.append((beg, end)) + return result + + +class UniquePredicate(): + """Predicate; True if given URL has not been encountered before""" + def __init__(self): + self.urls = set() + + def __call__(self, url, kwds): + if url.startswith("text:"): + return True + if url not in self.urls: + self.urls.add(url) + return True + return False + + +class FilterPredicate(): + """Predicate; True if evaluating the given expression returns True""" + globalsdict = { + "parse_int": text.parse_int, + "urlsplit": urllib.parse.urlsplit, + "datetime": datetime.datetime, + "abort": raises(exception.StopExtraction()), + "re": re, + } + + def __init__(self, filterexpr, target="image"): + name = "<{} filter>".format(target) + self.codeobj = compile(filterexpr, name, "eval") + + def __call__(self, url, kwds): + try: + return eval(self.codeobj, self.globalsdict, kwds) + except exception.GalleryDLException: + raise + except Exception as exc: + raise exception.FilterError(exc) + + +class ChainPredicate(): + """Predicate; True if all of its predicates return True""" + def __init__(self, predicates): + self.predicates = predicates + + def __call__(self, url, kwds): + for pred in self.predicates: + if not pred(url, kwds): + return False + return True + + +class ExtendedUrl(): + """URL with attached config key-value pairs""" + def __init__(self, url, gconf, lconf): + self.value, self.gconfig, self.lconfig = url, gconf, lconf + + def __str__(self): + return self.value + + +class Formatter(): + """Custom, extended version of string.Formatter + + This string formatter implementation is a mostly performance-optimized + variant of the original string.Formatter class. 
Unnecessary features have
+    been removed (positional arguments, unused argument check) and new
+    formatting options have been added.
+
+    Extra Conversions:
+    - "l": calls str.lower on the target value
+    - "u": calls str.upper
+    - "c": calls str.capitalize
+    - "C": calls string.capwords
+    - "U": calls urllib.parse.unquote
+    - "S": calls util.to_string()
+    - Example: {f!l} -> "example"; {f!u} -> "EXAMPLE"
+
+    Extra Format Specifiers:
+    - "?<before>/<after>/":
+        Adds <before> and <after> to the actual value if it evaluates to True.
+        Otherwise the whole replacement field becomes an empty string.
+        Example: {f:?-+/+-/} -> "-+Example+-" (if "f" contains "Example")
+                             -> ""            (if "f" is None, 0, "")
+
+    - "L<maxlen>/<replacement>/":
+        Replaces the output with <replacement> if its length (in characters)
+        exceeds <maxlen>. Otherwise everything is left as is.
+        Example: {f:L5/too long/} -> "foo"      (if "f" is "foo")
+                                  -> "too long" (if "f" is "foobar")
+
+    - "J<separator>/":
+        Joins elements of a list (or string) using <separator>
+        Example: {f:J - /} -> "a - b - c" (if "f" is ["a", "b", "c"])
+
+    - "R<old>/<new>/":
+        Replaces all occurrences of <old> with <new>
+        Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r")
+    """
+    CONVERSIONS = {
+        "l": str.lower,
+        "u": str.upper,
+        "c": str.capitalize,
+        "C": string.capwords,
+        "U": urllib.parse.unquote,
+        "S": to_string,
+        "s": str,
+        "r": repr,
+        "a": ascii,
+    }
+
+    def __init__(self, format_string, default=None):
+        self.default = default
+        self.result = []
+        self.fields = []
+
+        for literal_text, field_name, format_spec, conversion in \
+                _string.formatter_parser(format_string):
+            if literal_text:
+                self.result.append(literal_text)
+            if field_name:
+                self.fields.append((
+                    len(self.result),
+                    self._field_access(field_name, format_spec, conversion)
+                ))
+                self.result.append("")
+
+    def format_map(self, kwargs):
+        """Apply 'kwargs' to the initial format_string and return its result"""
+        for index, func in self.fields:
+            self.result[index] = func(kwargs)
+        return "".join(self.result)
+
+    def _field_access(self, field_name, format_spec, conversion):
+        first, rest = _string.formatter_field_name_split(field_name)
+
+        funcs = []
+        for is_attr, key in rest:
+            if is_attr:
+                func = operator.attrgetter
+            elif ":" in key:
+                func = self._slicegetter
+            else:
+                func = operator.itemgetter
+            funcs.append(func(key))
+
+        if conversion:
+            funcs.append(self.CONVERSIONS[conversion])
+
+        if format_spec:
+            if format_spec[0] == "?":
+                func = self._format_optional
+            elif format_spec[0] == "L":
+                func = self._format_maxlen
+            elif format_spec[0] == "J":
+                func = self._format_join
+            elif format_spec[0] == "R":
+                func = self._format_replace
+            else:
+                func = self._format_default
+            fmt = func(format_spec)
+        else:
+            fmt = str
+
+        if funcs:
+            return self._apply(first, funcs, fmt)
+        return self._apply_simple(first, fmt)
+
+    def _apply_simple(self, key, fmt):
+        def wrap(obj):
+            if key in obj:
+                obj = obj[key]
+            else:
+                obj = self.default
+            return fmt(obj)
+        return wrap
+
+    def _apply(self, key, funcs, fmt):
+        def wrap(obj):
+            try:
+                obj = obj[key]
+                for func in funcs:
+                    obj = func(obj)
+            except Exception:
+                obj = self.default
+            return fmt(obj)
+        return wrap
+
+    @staticmethod
+    def _slicegetter(key):
+        start, _, stop = key.partition(":")
+        stop, _, step = stop.partition(":")
+        start = int(start) if start else None
+        stop = int(stop) if stop else None
+        step = int(step) if step else None
+        return operator.itemgetter(slice(start, stop, step))
+
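+    # Usage sketch (illustrative, not part of upstream): the extra
+    # conversions and format specifiers documented above, in action.
+    #
+    #     Formatter("{f!u}").format_map({"f": "example"})      -> "EXAMPLE"
+    #     Formatter("{f:J - /}").format_map({"f": ["a", "b"]}) -> "a - b"
+    #     Formatter("{f:R /_/}").format_map({"f": "f o o"})    -> "f_o_o"
+
+    @staticmethod
+    def _format_optional(format_spec):
+        def wrap(obj):
+            if not obj:
+                return ""
+            return before + format(obj, format_spec) + after
+        before, 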
after, format_spec = format_spec.split("/", 2) + before = before[1:] + return wrap + + @staticmethod + def _format_maxlen(format_spec): + def wrap(obj): + obj = format(obj, format_spec) + return obj if len(obj) <= maxlen else replacement + maxlen, replacement, format_spec = format_spec.split("/", 2) + maxlen = text.parse_int(maxlen[1:]) + return wrap + + @staticmethod + def _format_join(format_spec): + def wrap(obj): + obj = separator.join(obj) + return format(obj, format_spec) + separator, _, format_spec = format_spec.partition("/") + separator = separator[1:] + return wrap + + @staticmethod + def _format_replace(format_spec): + def wrap(obj): + obj = obj.replace(old, new) + return format(obj, format_spec) + old, new, format_spec = format_spec.split("/", 2) + old = old[1:] + return wrap + + @staticmethod + def _format_default(format_spec): + def wrap(obj): + return format(obj, format_spec) + return wrap + + +class PathFormat(): + + def __init__(self, extractor): + self.filename_fmt = extractor.config( + "filename", extractor.filename_fmt) + self.directory_fmt = extractor.config( + "directory", extractor.directory_fmt) + self.kwdefault = extractor.config("keywords-default") + + try: + self.formatter = Formatter(self.filename_fmt, self.kwdefault) + except Exception as exc: + raise exception.FormatError(exc, "filename") + + self.delete = False + self.has_extension = False + self.keywords = {} + self.filename = "" + self.directory = self.realdirectory = "" + self.path = self.realpath = self.temppath = "" + + self.basedirectory = expand_path( + extractor.config("base-directory", (".", "gallery-dl"))) + if os.altsep: + self.basedirectory = self.basedirectory.replace(os.altsep, os.sep) + + def open(self, mode="wb"): + """Open file and return a corresponding file object""" + return open(self.temppath, mode) + + def exists(self, archive=None): + """Return True if the file exists on disk or in 'archive'""" + if (archive and archive.check(self.keywords) or + self.has_extension and os.path.exists(self.realpath)): + if not self.has_extension: + # adjust display name + self.set_extension("") + if self.path[-1] == ".": + self.path = self.path[:-1] + return True + return False + + def set_directory(self, keywords): + """Build directory path and create it if necessary""" + try: + segments = [ + text.clean_path( + Formatter(segment, self.kwdefault) + .format_map(keywords).strip()) + for segment in self.directory_fmt + ] + except Exception as exc: + raise exception.FormatError(exc, "directory") + + self.directory = os.path.join( + self.basedirectory, + *segments + ) + + # remove trailing path separator; + # occurs if the last argument to os.path.join() is an empty string + if self.directory[-1] == os.sep: + self.directory = self.directory[:-1] + + self.realdirectory = self.adjust_path(self.directory) + os.makedirs(self.realdirectory, exist_ok=True) + + def set_keywords(self, keywords): + """Set filename keywords""" + self.keywords = keywords + self.temppath = "" + self.has_extension = bool(keywords.get("extension")) + if self.has_extension: + self.build_path() + + def set_extension(self, extension, real=True): + """Set the 'extension' keyword""" + self.has_extension = real + self.keywords["extension"] = extension + self.build_path() + + def build_path(self): + """Use filename-keywords and directory to build a full path""" + try: + self.filename = text.clean_path( + self.formatter.format_map(self.keywords)) + except Exception as exc: + raise exception.FormatError(exc, "filename") + + filename = os.sep + 
self.filename + self.path = self.directory + filename + self.realpath = self.realdirectory + filename + if not self.temppath: + self.temppath = self.realpath + + def part_enable(self, part_directory=None): + """Enable .part file usage""" + if self.has_extension: + self.temppath += ".part" + else: + self.set_extension("part", False) + if part_directory: + self.temppath = os.path.join( + part_directory, + os.path.basename(self.temppath), + ) + + def part_size(self): + """Return size of .part file""" + try: + return os.stat(self.temppath).st_size + except OSError: + pass + return 0 + + def finalize(self): + """Move tempfile to its target location""" + if self.delete: + self.delete = False + os.unlink(self.temppath) + return + + if self.temppath == self.realpath: + return + + try: + os.replace(self.temppath, self.realpath) + return + except OSError: + pass + + shutil.copyfile(self.temppath, self.realpath) + os.unlink(self.temppath) + + @staticmethod + def adjust_path(path): + """Enable longer-than-260-character paths on windows""" + return "\\\\?\\" + os.path.abspath(path) if os.name == "nt" else path + + +class DownloadArchive(): + + def __init__(self, path, extractor): + con = sqlite3.connect(path) + con.isolation_level = None + self.cursor = con.cursor() + self.cursor.execute("CREATE TABLE IF NOT EXISTS archive " + "(entry PRIMARY KEY) WITHOUT ROWID") + self.keygen = (extractor.category + extractor.config( + "archive-format", extractor.archive_fmt) + ).format_map + + def check(self, kwdict): + """Return True if item described by 'kwdict' exists in archive""" + key = self.keygen(kwdict) + self.cursor.execute( + "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,)) + return self.cursor.fetchone() + + def add(self, kwdict): + """Add item described by 'kwdict' to archive""" + key = self.keygen(kwdict) + self.cursor.execute( + "INSERT OR IGNORE INTO archive VALUES (?)", (key,)) diff --git a/gallery_dl/version.py b/gallery_dl/version.py new file mode 100644 index 0000000..4167bc4 --- /dev/null +++ b/gallery_dl/version.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +__version__ = "1.8.7" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f9f5cd8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +requests>=2.11.0 diff --git a/scripts/bash_completion.py b/scripts/bash_completion.py new file mode 100755 index 0000000..69e6a79 --- /dev/null +++ b/scripts/bash_completion.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
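+#
+# Usage sketch (not from upstream docs): run this script from the repository
+# root, then source the generated file to enable tab completion:
+#
+#     $ python3 scripts/bash_completion.py
+#     $ source gallery-dl.bash_completion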
+ +"""Generate bash completion script from gallery-dl's argument parser""" + +import util +from gallery_dl import option + + +TEMPLATE = """_gallery_dl() +{ + local cur prev + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + + if [[ "${prev}" =~ ^(%(fileopts)s)$ ]]; then + COMPREPLY=( $(compgen -f -- "${cur}") ) + elif [[ "${prev}" =~ ^(%(diropts)s)$ ]]; then + COMPREPLY=( $(compgen -d -- "${cur}") ) + else + COMPREPLY=( $(compgen -W "%(opts)s" -- "${cur}") ) + fi +} + +complete -F _gallery_dl gallery-dl +""" + +opts = [] +diropts = [] +fileopts = [] +for action in option.build_parser()._actions: + + if action.metavar in ("DEST",): + diropts.extend(action.option_strings) + + elif action.metavar in ("FILE", "CFG"): + fileopts.extend(action.option_strings) + + for opt in action.option_strings: + if opt.startswith("--"): + opts.append(opt) + +PATH = util.path("gallery-dl.bash_completion") +with open(PATH, "w", encoding="utf-8") as file: + file.write(TEMPLATE % { + "opts" : " ".join(opts), + "diropts" : "|".join(diropts), + "fileopts": "|".join(fileopts), + }) diff --git a/scripts/build_testresult_db.py b/scripts/build_testresult_db.py new file mode 100755 index 0000000..fda9f64 --- /dev/null +++ b/scripts/build_testresult_db.py @@ -0,0 +1,56 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""Collect results of extractor unit tests""" + +import sys +import os.path +import datetime + +import util +from gallery_dl import extractor, job, config +from test.test_results import setup_test_config + + +# filter test cases + +tests = [ + (idx, extr, url, result) + + for extr in extractor.extractors() + if hasattr(extr, "test") and extr.test + if len(sys.argv) <= 1 or extr.category in sys.argv + + for idx, (url, result) in enumerate(extr._get_tests()) + if result +] + + +# setup target directory + +path = util.path("archive", "testdb", str(datetime.date.today())) +os.makedirs(path, exist_ok=True) + + +for idx, extr, url, result in tests: + + # filename + name = "{}-{}-{}.json".format(extr.category, extr.subcategory, idx) + print(name) + + # config values + setup_test_config() + + if "options" in result: + for key, value in result["options"]: + config.set(key.split("."), value) + if "range" in result: + config.set(("image-range",), result["range"]) + config.set(("chapter-range",), result["range"]) + + # write test data + try: + with open(os.path.join(path, name), "w") as outfile: + job.DataJob(url, file=outfile, ensure_ascii=False).run() + except KeyboardInterrupt: + sys.exit() diff --git a/scripts/create_test_data.py b/scripts/create_test_data.py new file mode 100755 index 0000000..14ab0c0 --- /dev/null +++ b/scripts/create_test_data.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Create testdata for extractor tests""" + +import argparse + +import util # noqa +from gallery_dl import extractor +from test.test_results import ResultJob, setup_test_config + + +TESTDATA_FMT = """ + test = ("{}", {{ + "url": "{}", + "keyword": "{}", + "content": "{}", + }}) +""" + +TESTDATA_EXCEPTION_FMT = """ + test = ("{}", {{ + "exception": exception.{}, + }}) +""" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--content", action="store_true") + parser.add_argument("--recreate", action="store_true") + parser.add_argument("urls", nargs="*") + args = parser.parse_args() + + if args.recreate: + urls = [ + test[0] + for extr in extractor.extractors() if extr.category in args.urls + for test in extr.test + ] + else: + urls = args.urls + + setup_test_config() + + for url in urls: + tjob = ResultJob(url, content=args.content) + try: + tjob.run() + except Exception as exc: + fmt = TESTDATA_EXCEPTION_FMT + data = (exc.__class__.__name__,) + else: + fmt = TESTDATA_FMT + data = (tjob.hash_url.hexdigest(), + tjob.hash_keyword.hexdigest(), + tjob.hash_content.hexdigest()) + print(tjob.extractor.__class__.__name__) + print(fmt.format(url, *data)) + + +if __name__ == '__main__': + main() diff --git a/scripts/hook-gallery_dl.py b/scripts/hook-gallery_dl.py new file mode 100644 index 0000000..d549019 --- /dev/null +++ b/scripts/hook-gallery_dl.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- + +from gallery_dl import extractor, downloader, postprocessor + +hiddenimports = [ + package.__name__ + "." + module + for package in (extractor, downloader, postprocessor) + for module in package.modules +] diff --git a/scripts/man.py b/scripts/man.py new file mode 100755 index 0000000..91608a3 --- /dev/null +++ b/scripts/man.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Generate man pages""" + +import re +import datetime + +import util +import gallery_dl.option +import gallery_dl.version + + +def build_gallery_dl_1(path=None): + + OPTS_FMT = """.TP\n.B "{}" {}\n{}""" + + TEMPLATE = r""" +.TH "GALLERY-DL" "1" "%(date)s" "%(version)s" "gallery-dl Manual" +.\" disable hyphenation +.nh + +.SH NAME +gallery-dl \- download image-galleries and -collections + +.SH SYNOPSIS +.B gallery-dl +[OPTION]... URL... + +.SH DESCRIPTION +.B gallery-dl +is a command-line program to download image-galleries and -collections +from several image hosting sites. It is a cross-platform tool +with many configuration options and powerful filenaming capabilities. + +.SH OPTIONS +%(options)s + +.SH EXAMPLES +.TP +gallery-dl \f[I]URL\f[] +Download images from \f[I]URL\f[]. +.TP +gallery-dl -g -u -p \f[I]URL\f[] +Print direct URLs from a site that requires authentication. +.TP +gallery-dl --filter 'type == "ugoira"' --range '2-4' \f[I]URL\f[] +Apply filter and range expressions. This will only download +the second, third, and fourth file where its type value is equal to "ugoira". +.TP +gallery-dl r:\f[I]URL\f[] +Scan \f[I]URL\f[] for other URLs and invoke \f[B]gallery-dl\f[] on them. +.TP +gallery-dl oauth:\f[I]SITE\-NAME\f[] +Gain OAuth authentication tokens for +.IR deviantart , +.IR flickr , +.IR reddit , +.IR smugmug ", and" +.IR tumblr . + +.SH FILES +.TP +.I /etc/gallery-dl.conf +The system wide configuration file. 
+.TP
+.I ~/.config/gallery-dl/config.json
+Per user configuration file.
+.TP
+.I ~/.gallery-dl.conf
+Alternate per user configuration file.
+
+.SH BUGS
+https://github.com/mikf/gallery-dl/issues
+
+.SH AUTHORS
+Mike Fährmann <mike_faehrmann@web.de>
+.br
+and https://github.com/mikf/gallery-dl/graphs/contributors
+
+.SH "SEE ALSO"
+.BR gallery-dl.conf (5)
+"""
+
+    options = []
+    for action in gallery_dl.option.build_parser()._actions:
+        if action.help.startswith("=="):
+            continue
+        options.append(OPTS_FMT.format(
+            ", ".join(action.option_strings).replace("-", r"\-"),
+            r"\f[I]{}\f[]".format(action.metavar) if action.metavar else "",
+            action.help,
+        ))
+
+    if not path:
+        path = util.path("gallery-dl.1")
+    with open(path, "w", encoding="utf-8") as file:
+        file.write(TEMPLATE.lstrip() % {
+            "options": "\n".join(options),
+            "version": gallery_dl.version.__version__,
+            "date"   : datetime.datetime.now().strftime("%Y-%m-%d"),
+        })
+
+
+def build_gallery_dl_conf_5(path=None):
+
+    TEMPLATE = r"""
+.TH "GALLERY-DL.CONF" "5" "%(date)s" "%(version)s" "gallery-dl Manual"
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+
+.SH NAME
+gallery-dl.conf \- gallery-dl configuration file
+
+.SH DESCRIPTION
+gallery-dl will search for configuration files in the following places
+every time it is started, unless
+.B --ignore-config
+is specified:
+.PP
+.RS 4
+.nf
+.I /etc/gallery-dl.conf
+.I $HOME/.config/gallery-dl/config.json
+.I $HOME/.gallery-dl.conf
+.fi
+.RE
+.PP
+It is also possible to specify additional configuration files with the
+.B -c/--config
+command-line option or to add further option values with
+.B -o/--option
+as <key>=<value> pairs.
+
+Configuration files are JSON-based and therefore don't allow any ordinary
+comments, but, since unused keys are simply ignored, it is possible to
+utilize those as makeshift comments by setting their values to arbitrary
+strings. 
+
+.SH EXAMPLE
+{
+.RS 4
+"base-directory": "/tmp/",
+.br
+"extractor": {
+.RS 4
+"pixiv": {
+.RS 4
+"directory": ["Pixiv", "Works", "{user[id]}"],
+.br
+"filename": "{id}{num}.{extension}",
+.br
+"username": "foo",
+.br
+"password": "bar"
+.RE
+},
+.br
+"flickr": {
+.RS 4
+"_comment": "OAuth keys for account 'foobar'",
+.br
+"access-token": "0123456789-0123456789abcdef",
+.br
+"access-token-secret": "fedcba9876543210"
+.RE
+}
+.RE
+},
+.br
+"downloader": {
+.RS 4
+"retries": 3,
+.br
+"timeout": 2.5
+.RE
+}
+.RE
+}
+
+%(options)s
+
+.SH BUGS
+https://github.com/mikf/gallery-dl/issues
+
+.SH AUTHORS
+Mike Fährmann <mike_faehrmann@web.de>
+.br
+and https://github.com/mikf/gallery-dl/graphs/contributors
+
+.SH "SEE ALSO"
+.BR gallery-dl (1)
+"""
+
+    sections = parse_docs_configuration()
+    content = []
+
+    for sec_name, section in sections.items():
+        content.append(".SH " + sec_name.upper())
+
+        for opt_name, option in section.items():
+            content.append(".SS " + opt_name)
+
+            for field, text in option.items():
+                if field in ("Type", "Default"):
+                    content.append('.IP "{}:" {}'.format(field, len(field)+2))
+                    content.append(strip_rst(text))
+                else:
+                    content.append('.IP "{}:" 4'.format(field))
+                    content.append(strip_rst(text, field != "Example"))
+
+    if not path:
+        path = util.path("gallery-dl.conf.5")
+    with open(path, "w", encoding="utf-8") as file:
+        file.write(TEMPLATE.lstrip() % {
+            "options": "\n".join(content),
+            "version": gallery_dl.version.__version__,
+            "date"   : datetime.datetime.now().strftime("%Y-%m-%d"),
+        })
+
+
+def parse_docs_configuration():
+
+    doc_path = util.path("docs", "configuration.rst")
+    with open(doc_path, encoding="utf-8") as file:
+        doc_lines = file.readlines()
+
+    sections = {}
+    sec_name = None
+    options = None
+    opt_name = None
+    opt_desc = None
+    name = None
+    last = last2 = None
+    for line in doc_lines:
+
+        # start of new section
+        if re.match(r"^=+$", line):
+            if sec_name and options:
+                sections[sec_name] = options
+            sec_name = last.strip()
+            options = {}
+
+        elif re.match(r"^=+ =+$", line):
+            # start of option table
+            if re.match(r"^-+$", last):
+                opt_name = last2.strip()
+                opt_desc = {}
+            # end of option table
+            elif opt_desc:
+                options[opt_name] = opt_desc
+                opt_name = None
+                name = None
+
+        # inside option table
+        elif opt_name:
+            if line[0].isalpha():
+                name, _, line = line.partition(" ")
+                opt_desc[name] = ""
+            line = line.strip()
+            if line.startswith(("* ", "- ")):
+                line = "\n" + line
+            elif line.startswith("| "):
+                line = line[2:] + "\n.br"
+            opt_desc[name] += line + "\n"
+
+        last2 = last
+        last = line
+    sections[sec_name] = options
+
+    return sections
+
+
+def strip_rst(text, extended=True, *, ITALIC=r"\\f[I]\1\\f[]", REGULAR=r"\1"):
+
+    text = text.replace("\\", "\\\\")
+
+    # ``foo``
+    repl = ITALIC if extended else REGULAR
+    text = re.sub(r"``([^`]+)``", repl, text)
+    # |foo|_
+    text = re.sub(r"\|([^|]+)\|_*", ITALIC, text)
+    # `foo`_
+    text = re.sub(r"`([^`]+)`_+", ITALIC, text)
+    # `foo`
+    text = re.sub(r"`([^`]+)`", REGULAR, text)
+    # foo_
+    text = re.sub(r"([A-Za-z0-9-]+)_+(?=\s)", ITALIC, text)
+    # -------
+    text = re.sub(r"---+", "", text)
+
+    return text
+
+
+if __name__ == "__main__":
+    build_gallery_dl_1()
+    build_gallery_dl_conf_5()
diff --git a/scripts/pyinstaller.py b/scripts/pyinstaller.py
new file mode 100755
index 0000000..879ae50
--- /dev/null
+++ b/scripts/pyinstaller.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""Build a standalone executable using PyInstaller"""
+
+import PyInstaller.__main__
+import util
+
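+# For reference, the run() call below is roughly equivalent to this manual
+# invocation from the repository root (assumed equivalence, not part of
+# upstream docs; the output name depends on the platform):
+#
+#     pyinstaller --onefile --console --name gallery-dl.bin \
+#         --additional-hooks-dir scripts --distpath dist \
+#         --workpath build --specpath build gallery_dl/__main__.py
+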
+PyInstaller.__main__.run([ + "--onefile", + "--console", + "--name", "gallery-dl." + ("exe" if PyInstaller.is_win else "bin"), + "--additional-hooks-dir", util.path("scripts"), + "--distpath", util.path("dist"), + "--workpath", util.path("build"), + "--specpath", util.path("build"), + util.path("gallery_dl", "__main__.py"), +]) diff --git a/scripts/release.sh b/scripts/release.sh new file mode 100755 index 0000000..ef444e0 --- /dev/null +++ b/scripts/release.sh @@ -0,0 +1,167 @@ +#!/bin/bash +set -e + +prompt() { + echo "root: ${ROOTDIR} old: ${OLDVERSION} - new: ${NEWVERSION}" + read -n 1 -r -p "Proceed? [Y/n] " P + echo + if [ "$P" == y -o "$P" == Y -o -z "$P" ]; then + return 0 + else + exit 1 + fi +} + +cleanup() { + cd "${ROOTDIR}" + echo Removing old build directory + + if [ -d ./build ]; then + rm -rf ./build + fi +} + +update() { + cd "${ROOTDIR}" + echo Updating version to ${NEWVERSION} + + sed -i "s#\"${PYVERSION}\"#\"${NEWVERSION}\"#" "gallery_dl/version.py" + sed -i "s#v${OLDVERSION}#v${NEWVERSION}#" "${README}" +} + +update-dev() { + cd "${ROOTDIR}" + + IFS="." read MAJOR MINOR BUILD <<< "${NEWVERSION}" + BUILD=$((BUILD+1)) + # update version to -dev + sed -i "s#\"${NEWVERSION}\"#\"${MAJOR}.${MINOR}.${BUILD}-dev\"#" "gallery_dl/version.py" + # add 'unreleased' line to changelog + sed -i "2i\\\n## Unreleased" "${CHANGELOG}" + + git add "gallery_dl/version.py" "${CHANGELOG}" +} + +build-python() { + cd "${ROOTDIR}" + echo Building bdist_wheel and sdist + + python setup.py bdist_wheel sdist +} + +build-linux() { + cd "${ROOTDIR}" + echo Building Linux executable + + make executable +} + +build-windows() { + cd "${ROOTDIR}/dist" + echo Building Windows executable + + # remove old executable + rm -f "gallery-dl.exe" + + # build windows exe in vm + ln -fs "${ROOTDIR}" /tmp/ + vmstart "Windows 7" & + disown + while [ ! -e "gallery-dl.exe" ] ; do + sleep 5 + done + sleep 2 + + # check exe version + OUTPUT="$(wine gallery-dl.exe --version)" + if [[ ! "${OUTPUT%?}" == "${NEWVERSION}" ]]; then + echo "exe version mismatch: ${OUTPUT} != ${NEWVERSION}" + exit 3 + fi +} + +sign() { + cd "${ROOTDIR}/dist" + echo Signing files + + gpg --detach-sign --armor gallery_dl-${NEWVERSION}-py3-none-any.whl + gpg --detach-sign --armor gallery_dl-${NEWVERSION}.tar.gz + gpg --detach-sign --yes gallery-dl.exe + gpg --detach-sign --yes gallery-dl.bin +} + +changelog() { + cd "${ROOTDIR}" + echo Updating "${CHANGELOG}" + + # - replace "#NN" with link to actual issue + # - insert new version and date + sed -i \ + -e "s*\([( ]\)#\([0-9]\+\)*\1[#\2](https://github.com/mikf/gallery-dl/issues/\2)*g" \ + -e "s*^## [Uu]nreleased*## ${NEWVERSION} - $(date +%Y-%m-%d)*" \ + "${CHANGELOG}" +} + +supportedsites() { + cd "${ROOTDIR}" + echo Checking if "${SUPPORTEDSITES}" is up to date + + ./scripts/supportedsites.py + if ! 
git diff --quiet "${SUPPORTEDSITES}"; then + echo "updated ${SUPPORTEDSITES} contains changes" + exit 4 + fi +} + +git-upload() { + cd "${ROOTDIR}" + echo Pushing changes to github + + git add "gallery_dl/version.py" "${README}" "${CHANGELOG}" + git commit -S -m "release version ${NEWVERSION}" + git tag -s -m "version ${NEWVERSION}" "v${NEWVERSION}" + git push + git push origin "v${NEWVERSION}" +} + +pypi-upload() { + cd "${ROOTDIR}/dist" + echo Uploading to PyPI + + twine upload gallery_dl-${NEWVERSION}* +} + + +ROOTDIR="$(realpath "$(dirname "$0")/..")/" +README="README.rst" +CHANGELOG="CHANGELOG.md" +SUPPORTEDSITES="./docs/supportedsites.rst" + +LASTTAG="$(git describe --abbrev=0 --tags)" +OLDVERSION="${LASTTAG#v}" +PYVERSION="$(python -c "import gallery_dl as g; print(g.__version__)")" + +if [[ "$1" ]]; then + NEWVERSION="$1" +else + NEWVERSION="${PYVERSION%-dev}" +fi + +if [[ ! $NEWVERSION =~ [0-9]+\.[0-9]+\.[0-9]+(-[a-z]+(\.[0-9]+)?)?$ ]]; then + echo "invalid version: $NEWVERSION" + exit 2 +fi + + +prompt +supportedsites +cleanup +update +build-python +build-linux +build-windows +sign +changelog +git-upload +pypi-upload +update-dev diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh new file mode 100755 index 0000000..334671e --- /dev/null +++ b/scripts/run_tests.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +TESTS_CORE=(config cookies downloader extractor oauth text util) +TESTS_RESULTS=(results) + + +# select tests +case "${1:-${GALLERYDL_TESTS:-core}}" in + core) TESTS=( ${TESTS_CORE[@]} );; + results) TESTS=( ${TESTS_RESULTS[@]} );; + *) TESTS=( );; +esac + + +# transform each array element to test_###.py +TESTS=( ${TESTS[@]/#/test_} ) +TESTS=( ${TESTS[@]/%/.py} ) + + +# run 'nosetests' with selected tests +# (or all tests if ${TESTS} is empty) +nosetests --verbose -w "${DIR}/../test" ${TESTS[@]} diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py new file mode 100755 index 0000000..f326617 --- /dev/null +++ b/scripts/supportedsites.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +"""Generate a reStructuredText document with all supported sites""" + +import sys +import collections + +import util +from gallery_dl import extractor + + +CATEGORY_MAP = { + "2chan" : "Futaba Channel", + "35photo" : "35PHOTO", + "archivedmoe" : "Archived.Moe", + "archiveofsins" : "Archive of Sins", + "artstation" : "ArtStation", + "b4k" : "arch.b4k.co", + "bobx" : "BobX", + "deviantart" : "DeviantArt", + "dokireader" : "Doki Reader", + "dynastyscans" : "Dynasty Reader", + "e621" : "e621", + "exhentai" : "ExHentai", + "fallenangels" : "Fallen Angels Scans", + "fashionnova" : "Fashion Nova", + "hbrowse" : "HBrowse", + "hentai2read" : "Hentai2Read", + "hentaicafe" : "Hentai Cafe", + "hentaifoundry" : "Hentai Foundry", + "hentaifox" : "HentaiFox", + "hentaihere" : "HentaiHere", + "hitomi" : "Hitomi.la", + "idolcomplex" : "Idol Complex", + "imagebam" : "ImageBam", + "imagefap" : "ImageFap", + "imgbox" : "imgbox", + "imgth" : "imgth", + "imgur" : "imgur", + "jaiminisbox" : "Jaimini's Box", + "kireicake" : "Kirei Cake", + "kissmanga" : "KissManga", + "livedoor" : "livedoor Blog", + "mangadex" : "MangaDex", + "mangafox" : "Manga Fox", + "mangahere" : "Manga Here", + "mangapark" : "MangaPark", + "mangastream" : "Manga Stream", + "myportfolio" : "Adobe Portfolio", + "nhentai" : "nhentai", + "nijie" : "nijie", + "nsfwalbum" : "NSFWalbum.com", + "nyafuu" : "Nyafuu Archive", + "paheal" : "rule #34", + 
"powermanga" : "PowerManga", + "readcomiconline": "Read Comic Online", + "rbt" : "RebeccaBlackTech", + "rule34" : "Rule 34", + "sankaku" : "Sankaku Channel", + "sankakucomplex" : "Sankaku Complex", + "seaotterscans" : "Sea Otter Scans", + "seiga" : "Niconico Seiga", + "senmanga" : "Sen Manga", + "sensescans" : "Sense-Scans", + "sexcom" : "Sex.com", + "simplyhentai" : "Simply Hentai", + "slickpic" : "SlickPic", + "slideshare" : "SlideShare", + "smugmug" : "SmugMug", + "thebarchive" : "The /b/ Archive", + "vanillarock" : "もえぴりあ", + "wikiart" : "WikiArt.org", + "worldthree" : "World Three", + "xhamster" : "xHamster", + "xvideos" : "XVideos", + "yaplog" : "yaplog!", + "yuki" : "yuki.la 4chan archive", +} + +SUBCATEGORY_MAP = { + "artwork": "Artwork Listings", + "artists": "", + "doujin" : "Doujin", + "gallery": "Galleries", + "image" : "individual Images", + "issue" : "Comic-Issues", + "manga" : "Manga", + "me" : "pixiv.me Links", + "media" : "Media Timelines", + "path" : "Images from Users and Folders", + "pinit" : "pin.it Links", + "popular": "Popular Images", + "recent" : "Recent Images", + "search" : "Search Results", + "stash" : "Sta.sh", + "status" : "Images from Statuses", + "tag" : "Tag-Searches", + "user" : "Images from Users", + "work" : "Individual Images", + "related-pin" : "related Pins", + "related-board": "", +} + +AUTH_MAP = { + "danbooru" : "Optional", + "deviantart" : "Optional (OAuth)", + "exhentai" : "Optional", + "flickr" : "Optional (OAuth)", + "idolcomplex": "Optional", + "luscious" : "Optional", + "mangoxo" : "Optional", + "nijie" : "Required", + "pixiv" : "Required", + "reddit" : "Optional (OAuth)", + "sankaku" : "Optional", + "seiga" : "Required", + "smugmug" : "Optional (OAuth)", + "tsumino" : "Optional", + "tumblr" : "Optional (OAuth)", + "twitter" : "Optional", +} + +IGNORE_LIST = ( + "directlink", + "oauth", + "recursive", + "test", +) + + +def domain(cls): + """Return the web-domain related to an extractor class""" + url = sys.modules[cls.__module__].__doc__.split()[-1] + if url.startswith("http"): + return url + + if hasattr(cls, "root") and cls.root: + return cls.root + "/" + + if hasattr(cls, "https"): + scheme = "https" if cls.https else "http" + netloc = cls.__doc__.split()[-1] + return "{}://{}/".format(scheme, netloc) + + test = next(cls._get_tests(), None) + if test: + url = test[0] + return url[:url.find("/", 8)+1] + + return "" + + +def category_text(cls): + """Return a human-readable representation of a category""" + c = cls.category + return CATEGORY_MAP.get(c) or c.capitalize() + + +def subcategory_text(cls): + """Return a human-readable representation of a subcategory""" + sc = cls.subcategory + if sc in SUBCATEGORY_MAP: + return SUBCATEGORY_MAP[sc] + sc = sc.capitalize() + return sc if sc.endswith("s") else sc + "s" + + +def category_key(cls): + """Generate sorting keys by category""" + key = category_text(cls).lower() + if cls.__module__.endswith(".imagehosts"): + key = "zz" + key + return key + + +def subcategory_key(cls): + """Generate sorting keys by subcategory""" + if cls.subcategory in ("user", "issue"): + return "A" + return cls.subcategory + + +def build_extractor_list(): + """Generate a sorted list of lists of extractor classes""" + extractors = collections.defaultdict(list) + + # get lists of extractor classes grouped by category + for extr in extractor.extractors(): + if not extr.category or extr.category in IGNORE_LIST: + continue + extractors[extr.category].append(extr) + + # sort extractor lists with the same category + for extrlist 
in extractors.values(): + extrlist.sort(key=subcategory_key) + + # sort lists by category + return sorted( + extractors.values(), + key=lambda lst: category_key(lst[0]), + ) + + +# define table columns +COLUMNS = ( + ("Site", 20, + lambda x: category_text(x[0])), + ("URL" , 35, + lambda x: domain(x[0])), + ("Capabilities", 50, + lambda x: ", ".join(subcategory_text(extr) for extr in x + if subcategory_text(extr))), + ("Authentication", 16, + lambda x: AUTH_MAP.get(x[0].category, "")), +) + + +def write_output(fobj, columns, extractors): + + def pad(output, col, category=None): + size = col[1] + output = output if isinstance(output, str) else col[2](output) + + if len(output) > size: + sub = "|{}-{}|".format(category, col[0][0]) + subs.append((sub, output)) + output = sub + + return output + " " * (size - len(output)) + + w = fobj.write + subs = [] + + # caption + w("Supported Sites\n") + w("===============\n") + + # table head + sep = " ".join("=" * c[1] for c in columns) + "\n" + w(sep) + w(" ".join(pad(c[0], c) for c in columns).strip() + "\n") + w(sep) + + # table body + for lst in extractors: + w(" ".join( + pad(col[2](lst), col, lst[0].category) + for col in columns + ).strip()) + w("\n") + + # table bottom + w(sep) + w("\n") + + # substitutions + for sub, value in subs: + w(".. {} replace:: {}\n".format(sub, value)) + + +outfile = sys.argv[1] if len(sys.argv) > 1 else "supportedsites.rst" +with open(util.path("docs", outfile), "w") as file: + write_output(file, COLUMNS, build_extractor_list()) diff --git a/scripts/util.py b/scripts/util.py new file mode 100644 index 0000000..bfbd6cb --- /dev/null +++ b/scripts/util.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +import sys +import os.path + +ROOTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +sys.path.insert(0, os.path.realpath(ROOTDIR)) + + +def path(*segments, join=os.path.join): + return join(ROOTDIR, *segments) diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..c8d5cea --- /dev/null +++ b/setup.cfg @@ -0,0 +1,3 @@ +[flake8] +exclude = gallery_dl/__init__.py,gallery_dl/__main__.py,setup.py,build,scripts,archive +ignore = E203,E226,W504 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8299811 --- /dev/null +++ b/setup.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals, print_function + +import sys +import os.path +import warnings + +if sys.hexversion < 0x3040000: + sys.exit("Python 3.4+ required") + +try: + from setuptools import setup + has_setuptools = True +except ImportError: + from distutils.core import setup + has_setuptools = False + + +def read(fname): + path = os.path.join(os.path.dirname(__file__), fname) + with open(path, encoding="utf-8") as file: + return file.read() + +def check_file(fname): + if os.path.exists(fname): + return True + warnings.warn( + "Not including file '{}' since it is not present. 
" + "Run 'make' to build all automatically generated files.".format(fname) + ) + return False + + +# get version without importing the package +exec(read("gallery_dl/version.py")) + +DESCRIPTION = ("Command-line program to download image-galleries and " + "-collections from several image hosting sites") +LONG_DESCRIPTION = read("README.rst") + +if "py2exe" in sys.argv: + try: + import py2exe + except ImportError: + sys.exit("Error importing 'py2exe'") + params = { + "console": [{ + "script": "./gallery_dl/__main__.py", + "dest_base": "gallery-dl", + "version": __version__, + "description": DESCRIPTION, + "comments": LONG_DESCRIPTION, + "product_name": "gallery-dl", + "product_version": __version__, + }], + "options": {"py2exe": { + "bundle_files": 0, + "compressed": 1, + "optimize": 1, + "dist_dir": ".", + "packages": ["gallery_dl"], + "dll_excludes": ["w9xpopen.exe"], + }}, + "zipfile": None, + } +elif has_setuptools: + params = { + "entry_points": { + "console_scripts": [ + "gallery-dl = gallery_dl:main" + ] + } + } +else: + params = { + "scripts": ["bin/gallery-dl"] + } + +data_files = [ + (path, [f for f in files if check_file(f)]) + for (path, files) in [ + ('etc/bash_completion.d', ['gallery-dl.bash_completion']), + ('share/man/man1' , ['gallery-dl.1']), + ('share/man/man5' , ['gallery-dl.conf.5']), + ] +] + + +setup( + name="gallery_dl", + version=__version__, + description=DESCRIPTION, + long_description=LONG_DESCRIPTION, + url="https://github.com/mikf/gallery-dl", + download_url="https://github.com/mikf/gallery-dl/releases/latest", + author="Mike Fährmann", + author_email="mike_faehrmann@web.de", + maintainer="Mike Fährmann", + maintainer_email="mike_faehrmann@web.de", + license="GPLv2", + python_requires=">=3.4", + install_requires=[ + "requests>=2.11.0", + ], + packages=[ + "gallery_dl", + "gallery_dl.extractor", + "gallery_dl.downloader", + "gallery_dl.postprocessor", + ], + data_files=data_files, + keywords="image gallery downloader crawler scraper", + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", + "Operating System :: Microsoft :: Windows", + "Operating System :: POSIX", + "Operating System :: MacOS", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Internet :: WWW/HTTP", + "Topic :: Multimedia :: Graphics", + "Topic :: Utilities", + ], + test_suite="test", + **params +) diff --git a/snap/local/launchers/gallery-dl-launch b/snap/local/launchers/gallery-dl-launch new file mode 100755 index 0000000..908f303 --- /dev/null +++ b/snap/local/launchers/gallery-dl-launch @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# This is the maintainence launcher for the snap, make necessary runtime environment changes to make the snap work here. You may also insert security confinement/deprecation/obsoletion notice of the snap here. + +set \ + -o errexit \ + -o errtrace \ + -o nounset \ + -o pipefail + +# Use user's real home directory for canonical configuration path access +declare REALHOME="$( + getent passwd "${USER}" \ + | cut --delimiter=: --fields=6 +)" +HOME="${REALHOME}" + +if ! test -f "${SNAP_USER_COMMON}"/marker_disable_interface_warning; then + # Warn if the `removable-media` interface isn't connected + if ! 
ls /media &>/dev/null; then + printf -- \ + "It seems that this snap isn't connected to the \`removable-media\` security confinement interface. If you want to save the files under \`/media\`, \`/run/media\`, or \`/mnt\` directories you need to connect this snap to the \`removable-media\` interface by running the following command in a terminal:\\n\\n sudo snap connect %s:removable-media\\n\\n" \ + "${SNAP_NAME}" \ + >&2 + printf -- \ + "To disable this warning create an empty file at the following path:\\n\\n %s/marker_disable_interface_warning\\n\\n" \ + "${SNAP_USER_COMMON}" \ + >&2 + fi +fi + +# Finally run the next part of the command chain +exec "${@}" diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml new file mode 100644 index 0000000..f8e26fa --- /dev/null +++ b/snap/snapcraft.yaml @@ -0,0 +1,110 @@ +%YAML 1.1 +--- +# Snapcraft Recipe for gallery-dl +# ------------------------------ +# This file is in the YAML data serialization format: +# http://yaml.org +# For the spec. of writing this file refer the following documentation: +# * The snapcraft format +# https://docs.snapcraft.io/the-snapcraft-format/8337 +# * Snap Documentation +# https://docs.snapcraft.io +# * Topics under the doc category in the Snapcraft Forum +# https://forum.snapcraft.io/c/doc +# For support refer to the snapcraft section in the Snapcraft Forum: +# https://forum.snapcraft.io/c/snapcraft +name: gallery-dl +license: GPL-2.0 +base: core +summary: Download image-galleries and -collections from several image hosting sites +description: | + `gallery-dl` is a command-line program to download image-galleries and -collections from several image hosting sites (see [Supported Sites][1]). It is a cross-platform tool with many configuration options and powerful filenaming capabilities. + + [1]: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst + +adopt-info: gallery-dl +confinement: strict +grade: stable + +plugs: + # For `xdg-open` command access for opening OAuth authentication webpages + desktop: + + # Storage access + home: + removable-media: # Non-A/C + + # Network access + network: + + # For network service for recieving OAuth callback tokens + network-bind: + + # Configuration access + config-gallery-dl: + interface: personal-files + read: + - $HOME/.config/gallery-dl + - $HOME/.gallery-dl.conf + etc-gallery-dl: + interface: system-files + read: + - /etc/gallery-dl.conf + +parts: + # Launcher programs to fix problems at runtime + launchers: + source: snap/local/launchers + plugin: dump + organize: + '*': bin/ + + # Check out the tagged release revision if it isn’t promoted to the stable channel + # https://forum.snapcraft.io/t/selective-checkout-check-out-the-tagged-release-revision-if-it-isnt-promoted-to-the-stable-channel/10617 + selective-checkout: + plugin: nil + build-packages: + - git + stage-snaps: + - selective-checkout + prime: + - -* + + gallery-dl: + after: + - selective-checkout + + source: . 
+ override-pull: | + snapcraftctl pull + $SNAPCRAFT_STAGE/scriptlets/selective-checkout + + plugin: python + build-packages: + - make + python-packages: + - youtube_dl + override-build: | + # build manpages and bash completion + make man completion + + snapcraftctl build + + ffmpeg: + plugin: nil + stage-packages: + - ffmpeg + +apps: + gallery-dl: + adapter: full + command-chain: + - bin/gallery-dl-launch + command: bin/gallery-dl + completer: etc/bash_completion.d/gallery-dl.bash_completion + environment: + LANG: C.UTF-8 + LC_ALL: C.UTF-8 + + # Satisfy FFmpeg's libpulsecommon dependency + LD_LIBRARY_PATH: $LD_LIBRARY_PATH:$SNAP/usr/lib/$SNAPCRAFT_ARCH_TRIPLET/pulseaudio diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/test_config.py b/test/test_config.py new file mode 100644 index 0000000..8cdb3da --- /dev/null +++ b/test/test_config.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2015-2017 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import unittest +import gallery_dl.config as config +import os +import tempfile + + +class TestConfig(unittest.TestCase): + + def setUp(self): + fd, self._configfile = tempfile.mkstemp() + with os.fdopen(fd, "w") as file: + file.write('{"a": "1", "b": {"a": 2, "c": "text"}}') + config.load((self._configfile,)) + + def tearDown(self): + config.clear() + os.remove(self._configfile) + + def test_get(self): + self.assertEqual(config.get(["a"]), "1") + self.assertEqual(config.get(["b", "c"]), "text") + self.assertEqual(config.get(["d"]), None) + self.assertEqual(config.get(["e", "f", "g"], 123), 123) + + def test_interpolate(self): + self.assertEqual(config.interpolate(["a"]), "1") + self.assertEqual(config.interpolate(["b", "a"]), "1") + self.assertEqual(config.interpolate(["b", "c"], "2"), "text") + self.assertEqual(config.interpolate(["b", "d"], "2"), "2") + config.set(["d"], 123) + self.assertEqual(config.interpolate(["b", "d"], "2"), 123) + self.assertEqual(config.interpolate(["d", "d"], "2"), 123) + + def test_set(self): + config.set(["b", "c"], [1, 2, 3]) + config.set(["e", "f", "g"], value=234) + self.assertEqual(config.get(["b", "c"]), [1, 2, 3]) + self.assertEqual(config.get(["e", "f", "g"]), 234) + + def test_setdefault(self): + config.setdefault(["b", "c"], [1, 2, 3]) + config.setdefault(["e", "f", "g"], value=234) + self.assertEqual(config.get(["b", "c"]), "text") + self.assertEqual(config.get(["e", "f", "g"]), 234) + + def test_unset(self): + config.unset(["a"]) + config.unset(["b", "c"]) + config.unset(["c", "d"]) + self.assertEqual(config.get(["a"]), None) + self.assertEqual(config.get(["b", "a"]), 2) + self.assertEqual(config.get(["b", "c"]), None) + + def test_apply(self): + options = ( + (["b", "c"], [1, 2, 3]), + (["e", "f", "g"], 234), + ) + + self.assertEqual(config.get(["b", "c"]), "text") + self.assertEqual(config.get(["e", "f", "g"]), None) + + with config.apply(options): + self.assertEqual(config.get(["b", "c"]), [1, 2, 3]) + self.assertEqual(config.get(["e", "f", "g"]), 234) + + self.assertEqual(config.get(["b", "c"]), "text") + self.assertEqual(config.get(["e", "f", "g"]), None) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_cookies.py b/test/test_cookies.py new file mode 100644 index 0000000..a786df6 --- /dev/null +++ b/test/test_cookies.py @@ -0,0 
+1,130 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2017 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import unittest +from unittest import mock + +import logging +import tempfile +import http.cookiejar +from os.path import join + +import gallery_dl.config as config +import gallery_dl.extractor as extractor + +CKEY = ("cookies",) + + +class TestCookiejar(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.path = tempfile.TemporaryDirectory() + + cls.cookiefile = join(cls.path.name, "cookies.txt") + with open(cls.cookiefile, "w") as file: + file.write("""# HTTP Cookie File +.example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE +""") + + cls.invalid_cookiefile = join(cls.path.name, "invalid.txt") + with open(cls.invalid_cookiefile, "w") as file: + file.write("""# asd +.example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE +""") + + @classmethod + def tearDownClass(cls): + cls.path.cleanup() + config.clear() + + def test_cookiefile(self): + config.set(CKEY, self.cookiefile) + + cookies = extractor.find("test:").session.cookies + self.assertEqual(len(cookies), 1) + + cookie = next(iter(cookies)) + self.assertEqual(cookie.domain, ".example.org") + self.assertEqual(cookie.path , "/") + self.assertEqual(cookie.name , "NAME") + self.assertEqual(cookie.value , "VALUE") + + def test_invalid_cookiefile(self): + self._test_warning(self.invalid_cookiefile, http.cookiejar.LoadError) + + def test_invalid_filename(self): + self._test_warning(join(self.path.name, "nothing"), FileNotFoundError) + + def _test_warning(self, filename, exc): + config.set(CKEY, filename) + log = logging.getLogger("test") + with mock.patch.object(log, "warning") as mock_warning: + cookies = extractor.find("test:").session.cookies + self.assertEqual(len(cookies), 0) + self.assertEqual(mock_warning.call_count, 1) + self.assertEqual(mock_warning.call_args[0][0], "cookies: %s") + self.assertIsInstance(mock_warning.call_args[0][1], exc) + + +class TestCookiedict(unittest.TestCase): + + def setUp(self): + self.cdict = {"NAME1": "VALUE1", "NAME2": "VALUE2"} + config.set(CKEY, self.cdict) + + def tearDown(self): + config.clear() + + def test_dict(self): + cookies = extractor.find("test:").session.cookies + self.assertEqual(len(cookies), len(self.cdict)) + self.assertEqual(sorted(cookies.keys()), sorted(self.cdict.keys())) + self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values())) + + def test_domain(self): + for category in ["exhentai", "nijie", "sankaku", "seiga"]: + extr = _get_extractor(category) + cookies = extr.session.cookies + for key in self.cdict: + self.assertTrue(key in cookies) + for c in cookies: + self.assertEqual(c.domain, extr.cookiedomain) + + +class TestCookieLogin(unittest.TestCase): + + def tearDown(self): + config.clear() + + def test_cookie_login(self): + extr_cookies = { + "exhentai": ("ipb_member_id", "ipb_pass_hash"), + "nijie" : ("nemail", "nlogin"), + "sankaku" : ("login", "pass_hash"), + "seiga" : ("user_session",), + } + for category, cookienames in extr_cookies.items(): + cookies = {name: "value" for name in cookienames} + config.set(CKEY, cookies) + extr = _get_extractor(category) + with mock.patch.object(extr, "_login_impl") as mock_login: + extr.login() + mock_login.assert_not_called() + + +def _get_extractor(category): + for extr in extractor.extractors(): + if extr.category == category and 
hasattr(extr, "_login_impl"): + url = next(extr._get_tests())[0] + return extr.from_url(url) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_downloader.py b/test/test_downloader.py new file mode 100644 index 0000000..3f301b0 --- /dev/null +++ b/test/test_downloader.py @@ -0,0 +1,235 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import re +import base64 +import os.path +import tempfile +import unittest +import threading +import http.server + +import gallery_dl.downloader as downloader +import gallery_dl.extractor as extractor +import gallery_dl.config as config +from gallery_dl.downloader.common import DownloaderBase +from gallery_dl.output import NullOutput +from gallery_dl.util import PathFormat + + +class TestDownloaderBase(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.extractor = extractor.find("test:") + cls.dir = tempfile.TemporaryDirectory() + cls.fnum = 0 + config.set(("base-directory",), cls.dir.name) + + @classmethod + def tearDownClass(cls): + cls.dir.cleanup() + config.clear() + + @classmethod + def _prepare_destination(cls, content=None, part=True, extension=None): + name = "file-{}".format(cls.fnum) + cls.fnum += 1 + + kwdict = { + "category": "test", + "subcategory": "test", + "filename": name, + "extension": extension, + } + pathfmt = PathFormat(cls.extractor) + pathfmt.set_directory(kwdict) + pathfmt.set_keywords(kwdict) + + if content: + mode = "w" + ("b" if isinstance(content, bytes) else "") + with pathfmt.open(mode) as file: + file.write(content) + + return pathfmt + + def _run_test(self, url, input, output, + extension, expected_extension=None): + pathfmt = self._prepare_destination(input, extension=extension) + success = self.downloader.download(url, pathfmt) + + # test successful download + self.assertTrue(success, "downloading '{}' failed".format(url)) + + # test content + mode = "r" + ("b" if isinstance(output, bytes) else "") + with pathfmt.open(mode) as file: + content = file.read() + self.assertEqual(content, output) + + # test filename extension + self.assertEqual( + pathfmt.keywords["extension"], + expected_extension, + ) + self.assertEqual( + os.path.splitext(pathfmt.realpath)[1][1:], + expected_extension, + ) + + +class TestHTTPDownloader(TestDownloaderBase): + + @classmethod + def setUpClass(cls): + TestDownloaderBase.setUpClass() + cls.downloader = downloader.find("http")(cls.extractor, NullOutput()) + + port = 8088 + cls.address = "http://127.0.0.1:{}".format(port) + cls._jpg = cls.address + "/image.jpg" + cls._png = cls.address + "/image.png" + cls._gif = cls.address + "/image.gif" + + server = http.server.HTTPServer(("", port), HttpRequestHandler) + threading.Thread(target=server.serve_forever, daemon=True).start() + + def test_http_download(self): + self._run_test(self._jpg, None, DATA_JPG, "jpg", "jpg") + self._run_test(self._png, None, DATA_PNG, "png", "png") + self._run_test(self._gif, None, DATA_GIF, "gif", "gif") + + def test_http_offset(self): + self._run_test(self._jpg, DATA_JPG[:123], DATA_JPG, "jpg", "jpg") + self._run_test(self._png, DATA_PNG[:12] , DATA_PNG, "png", "png") + self._run_test(self._gif, DATA_GIF[:1] , DATA_GIF, "gif", "gif") + + def test_http_extension(self): + self._run_test(self._jpg, None, DATA_JPG, None, "jpg") + self._run_test(self._png, None, 
DATA_PNG, None, "png") + self._run_test(self._gif, None, DATA_GIF, None, "gif") + + def test_http_adjust_extension(self): + self._run_test(self._jpg, None, DATA_JPG, "png", "jpg") + self._run_test(self._png, None, DATA_PNG, "gif", "png") + self._run_test(self._gif, None, DATA_GIF, "jpg", "gif") + + +class TestTextDownloader(TestDownloaderBase): + + @classmethod + def setUpClass(cls): + TestDownloaderBase.setUpClass() + cls.downloader = downloader.find("text")(cls.extractor, NullOutput()) + + def test_text_download(self): + self._run_test("text:foobar", None, "foobar", "txt", "txt") + + def test_text_offset(self): + self._run_test("text:foobar", "foo", "foobar", "txt", "txt") + + def test_text_extension(self): + self._run_test("text:foobar", None, "foobar", None, "txt") + + def test_text_empty(self): + self._run_test("text:", None, "", "txt", "txt") + + +class FakeDownloader(DownloaderBase): + scheme = "fake" + + def __init__(self, extractor, output): + DownloaderBase.__init__(self, extractor, output) + + def connect(self, url, offset): + pass + + def receive(self, file): + pass + + def reset(self): + pass + + def get_extension(self): + pass + + @staticmethod + def _check_extension(file, pathfmt): + pass + + +class HttpRequestHandler(http.server.BaseHTTPRequestHandler): + + def do_GET(self): + if self.path == "/image.jpg": + content_type = "image/jpeg" + output = DATA_JPG + elif self.path == "/image.png": + content_type = "image/png" + output = DATA_PNG + elif self.path == "/image.gif": + content_type = "image/gif" + output = DATA_GIF + else: + self.send_response(404) + self.wfile.write(self.path.encode()) + return + + headers = { + "Content-Type": content_type, + "Content-Length": len(output), + } + + if "Range" in self.headers: + status = 206 + + match = re.match(r"bytes=(\d+)-", self.headers["Range"]) + start = int(match.group(1)) + + headers["Content-Range"] = "bytes {}-{}/{}".format( + start, len(output)-1, len(output)) + output = output[start:] + else: + status = 200 + + self.send_response(status) + for key, value in headers.items(): + self.send_header(key, value) + self.end_headers() + self.wfile.write(output) + + +DATA_JPG = base64.standard_b64decode(""" +/9j/4AAQSkZJRgABAQEASABIAAD/2wBD +AAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB +AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB +AQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEB +AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB +AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB +AQEBAQEBAQEBAQEBAQH/wAARCAABAAED +AREAAhEBAxEB/8QAFAABAAAAAAAAAAAA +AAAAAAAACv/EABQQAQAAAAAAAAAAAAAA +AAAAAAD/xAAUAQEAAAAAAAAAAAAAAAAA +AAAA/8QAFBEBAAAAAAAAAAAAAAAAAAAA +AP/aAAwDAQACEQMRAD8AfwD/2Q==""") + + +DATA_PNG = base64.standard_b64decode(""" +iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB +CAAAAAA6fptVAAAACklEQVQIHWP4DwAB +AQEANl9ngAAAAABJRU5ErkJggg==""") + + +DATA_GIF = base64.standard_b64decode(""" +R0lGODdhAQABAIAAAP///////ywAAAAA +AQABAAACAkQBADs=""") + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_extractor.py b/test/test_extractor.py new file mode 100644 index 0000000..fa0709b --- /dev/null +++ b/test/test_extractor.py @@ -0,0 +1,186 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
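# The tests below exercise gallery-dl's URL dispatch: every extractor class
# carries a compiled regex in its 'pattern' attribute, and extractor.find()
# returns an instance of the first registered class whose pattern matches a
# given URI. A minimal, self-contained sketch of that lookup; 'SketchClass'
# and 'find_sketch' are illustrative names, not part of the module under test.
import re

class SketchClass:
    pattern = re.compile(r"sketch:")

    def __init__(self, match):
        self.match = match

def find_sketch(url, classes=(SketchClass,)):
    # the first class whose pattern matches wins, mirroring extractor.find()
    for cls in classes:
        match = cls.pattern.match(url)
        if match:
            return cls(match)
    return None

assert isinstance(find_sketch("sketch:foobar"), SketchClass)
assert find_sketch("other:") is None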
+ +import sys +import unittest +import string + +from gallery_dl import extractor +from gallery_dl.extractor.common import Extractor, Message +from gallery_dl.extractor.directlink import DirectlinkExtractor as DLExtractor + + +class FakeExtractor(Extractor): + category = "fake" + subcategory = "test" + pattern = "fake:" + + def items(self): + yield Message.Version, 1 + yield Message.Url, "text:foobar", {} + + +class TestExtractor(unittest.TestCase): + VALID_URIS = ( + "https://example.org/file.jpg", + "tumblr:foobar", + "oauth:flickr", + "test:pixiv:", + "recursive:https://example.org/document.html", + ) + + def setUp(self): + extractor._cache.clear() + extractor._module_iter = iter(extractor.modules) + + def test_find(self): + for uri in self.VALID_URIS: + result = extractor.find(uri) + self.assertIsInstance(result, Extractor, uri) + + for not_found in ("", "/tmp/file.ext"): + self.assertIsNone(extractor.find(not_found)) + + for invalid in (None, [], {}, 123, b"test:"): + with self.assertRaises(TypeError): + extractor.find(invalid) + + def test_add(self): + uri = "fake:foobar" + self.assertIsNone(extractor.find(uri)) + + extractor.add(FakeExtractor) + self.assertIsInstance(extractor.find(uri), FakeExtractor) + + def test_add_module(self): + uri = "fake:foobar" + self.assertIsNone(extractor.find(uri)) + + classes = extractor.add_module(sys.modules[__name__]) + self.assertEqual(len(classes), 1) + self.assertEqual(classes[0].pattern, FakeExtractor.pattern) + self.assertEqual(classes[0], FakeExtractor) + self.assertIsInstance(extractor.find(uri), FakeExtractor) + + def test_blacklist(self): + link_uri = "https://example.org/file.jpg" + test_uri = "test:" + fake_uri = "fake:" + + self.assertIsInstance(extractor.find(link_uri), DLExtractor) + self.assertIsInstance(extractor.find(test_uri), Extractor) + self.assertIsNone(extractor.find(fake_uri)) + + with extractor.blacklist(["directlink"]): + self.assertIsNone(extractor.find(link_uri)) + self.assertIsInstance(extractor.find(test_uri), Extractor) + self.assertIsNone(extractor.find(fake_uri)) + + with extractor.blacklist([], [DLExtractor, FakeExtractor]): + self.assertIsNone(extractor.find(link_uri)) + self.assertIsInstance(extractor.find(test_uri), Extractor) + self.assertIsNone(extractor.find(fake_uri)) + + with extractor.blacklist(["test"], [DLExtractor]): + self.assertIsNone(extractor.find(link_uri)) + self.assertIsNone(extractor.find(test_uri)) + self.assertIsNone(extractor.find(fake_uri)) + + def test_from_url(self): + for uri in self.VALID_URIS: + cls = extractor.find(uri).__class__ + extr = cls.from_url(uri) + self.assertIs(type(extr), cls) + self.assertIsInstance(extr, Extractor) + + for not_found in ("", "/tmp/file.ext"): + self.assertIsNone(FakeExtractor.from_url(not_found)) + + for invalid in (None, [], {}, 123, b"test:"): + with self.assertRaises(TypeError): + FakeExtractor.from_url(invalid) + + def test_unique_pattern_matches(self): + test_urls = [] + + # collect testcase URLs + for extr in extractor.extractors(): + for testcase in extr._get_tests(): + test_urls.append((testcase[0], extr)) + + # iterate over all testcase URLs + for url, extr1 in test_urls: + matches = [] + + # ... 
and apply all regex patterns to each one + for extr2 in extractor._cache: + + # skip DirectlinkExtractor pattern if it isn't tested + if extr1 != DLExtractor and extr2 == DLExtractor: + continue + + match = extr2.pattern.match(url) + if match: + matches.append(match) + + # fail if more or less than 1 match happened + if len(matches) > 1: + msg = "'{}' gets matched by more than one pattern:".format(url) + for match in matches: + msg += "\n- " + msg += match.re.pattern + self.fail(msg) + + if len(matches) < 1: + msg = "'{}' isn't matched by any pattern".format(url) + self.fail(msg) + + def test_docstrings(self): + """ensure docstring uniqueness""" + for extr1 in extractor.extractors(): + for extr2 in extractor.extractors(): + if extr1 != extr2 and extr1.__doc__ and extr2.__doc__: + self.assertNotEqual( + extr1.__doc__, + extr2.__doc__, + "{} <-> {}".format(extr1, extr2), + ) + + def test_names(self): + """Ensure extractor classes are named CategorySubcategoryExtractor""" + def capitalize(c): + if "-" in c: + return string.capwords(c.replace("-", " ")).replace(" ", "") + if "." in c: + c = c.replace(".", "") + return c.capitalize() + + mapping = { + "2chan" : "futaba", + "3dbooru": "threedeebooru", + "4chan" : "fourchan", + "4plebs" : "fourplebs", + "8chan" : "infinitychan", + "oauth" : None, + } + + for extr in extractor.extractors(): + category = mapping.get(extr.category, extr.category) + if category: + expected = "{}{}Extractor".format( + capitalize(category), + capitalize(extr.subcategory), + ) + if expected[0].isdigit(): + expected = "_" + expected + self.assertEqual(expected, extr.__name__) + + +if __name__ == "__main__": + unittest.main() diff --git a/test/test_oauth.py b/test/test_oauth.py new file mode 100644 index 0000000..2ce5b43 --- /dev/null +++ b/test/test_oauth.py @@ -0,0 +1,104 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
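# oauth.concat() is expected to build the RFC 5849 "signature base string":
# each component is percent-encoded with the unreserved set A-Za-z0-9-._~ and
# the encoded parts are joined with "&". A standard-library sketch of the same
# behavior; 'concat_sketch' is an illustrative name, not the module's own API.
from urllib.parse import quote

def concat_sketch(*parts):
    # safe="" forces "/" and ":" to be escaped as well
    return "&".join(quote(str(part), safe="") for part in parts)

assert concat_sketch() == ""
assert (concat_sketch("GET", "http://example.org/", "foo=bar&baz=a")
        == "GET&http%3A%2F%2Fexample.org%2F&foo%3Dbar%26baz%3Da")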
+ +import unittest + +from gallery_dl import oauth, text + +TESTSERVER = "http://term.ie/oauth/example" +CONSUMER_KEY = "key" +CONSUMER_SECRET = "secret" +REQUEST_TOKEN = "requestkey" +REQUEST_TOKEN_SECRET = "requestsecret" +ACCESS_TOKEN = "accesskey" +ACCESS_TOKEN_SECRET = "accesssecret" + + +class TestOAuthSession(unittest.TestCase): + + def test_concat(self): + concat = oauth.concat + + self.assertEqual(concat(), "") + self.assertEqual(concat("str"), "str") + self.assertEqual(concat("str1", "str2"), "str1&str2") + + self.assertEqual(concat("&", "?/"), "%26&%3F%2F") + self.assertEqual( + concat("GET", "http://example.org/", "foo=bar&baz=a"), + "GET&http%3A%2F%2Fexample.org%2F&foo%3Dbar%26baz%3Da" + ) + + def test_nonce(self, size=16): + nonce_values = set(oauth.nonce(size) for _ in range(size)) + + # uniqueness + self.assertEqual(len(nonce_values), size) + + # length + for nonce in nonce_values: + self.assertEqual(len(nonce), size) + + def test_quote(self): + quote = oauth.quote + + reserved = ",;:!\"§$%&/(){}[]=?`´+*'äöü" + unreserved = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789-._~") + + for char in unreserved: + self.assertEqual(quote(char), char) + + for char in reserved: + quoted = quote(char) + quoted_hex = quoted.replace("%", "") + self.assertTrue(quoted.startswith("%")) + self.assertTrue(len(quoted) >= 3) + self.assertEqual(quoted_hex.upper(), quoted_hex) + + def test_request_token(self): + response = self._oauth_request( + "/request_token.php", {}) + expected = "oauth_token=requestkey&oauth_token_secret=requestsecret" + self.assertEqual(response, expected, msg=response) + + data = text.parse_query(response) + self.assertEqual(data["oauth_token"], REQUEST_TOKEN) + self.assertEqual(data["oauth_token_secret"], REQUEST_TOKEN_SECRET) + + def test_access_token(self): + response = self._oauth_request( + "/access_token.php", {}, REQUEST_TOKEN, REQUEST_TOKEN_SECRET) + expected = "oauth_token=accesskey&oauth_token_secret=accesssecret" + self.assertEqual(response, expected, msg=response) + + data = text.parse_query(response) + self.assertEqual(data["oauth_token"], ACCESS_TOKEN) + self.assertEqual(data["oauth_token_secret"], ACCESS_TOKEN_SECRET) + + def test_authenticated_call(self): + params = {"method": "foo", "a": "äöüß/?&#", "äöüß/?&#": "a"} + response = self._oauth_request( + "/echo_api.php", params, ACCESS_TOKEN, ACCESS_TOKEN_SECRET) + + self.assertEqual(text.parse_query(response), params) + + def _oauth_request(self, endpoint, params=None, + oauth_token=None, oauth_token_secret=None): + session = oauth.OAuth1Session( + CONSUMER_KEY, CONSUMER_SECRET, + oauth_token, oauth_token_secret, + ) + url = TESTSERVER + endpoint + return session.get(url, params=params).text + + +if __name__ == "__main__": + unittest.main(warnings="ignore") diff --git a/test/test_results.py b/test/test_results.py new file mode 100644 index 0000000..8f03f03 --- /dev/null +++ b/test/test_results.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
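# test_results.py carries almost no hand-written test methods: each extractor
# class ships its own (URL, expected-result) pairs, and generate_tests() at
# the bottom of this file attaches one 'test_...' method per pair to the
# TestCase. The core of that pattern in miniature (all names illustrative);
# the factory function pins the loop variables for each generated test and
# avoids Python's late-binding closure pitfall:
import unittest

def _make_test(value, expected):
    def test(self):
        self.assertEqual(value * 2, expected)
    return test

class SketchTests(unittest.TestCase):
    pass

for _num, (_value, _expected) in enumerate([(1, 2), (3, 6)], 1):
    _test = _make_test(_value, _expected)
    _test.__name__ = "test_double_" + str(_num)
    setattr(SketchTests, _test.__name__, _test)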
+ +import os +import sys +import re +import json +import hashlib +import unittest +from gallery_dl import extractor, util, job, config, exception + + +# these don't work on Travis CI +TRAVIS_SKIP = { + "exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie", "bobx", + "archivedmoe", "archiveofsins", "thebarchive", "fireden", "4plebs", + "sankaku", "idolcomplex", "mangahere", "readcomiconline", "mangadex", + "sankakucomplex", +} + +# temporary issues, etc. +BROKEN = { + "komikcast", + "mangapark", +} + + +class TestExtractorResults(unittest.TestCase): + + def setUp(self): + setup_test_config() + + def tearDown(self): + config.clear() + + @classmethod + def setUpClass(cls): + cls._skipped = [] + + @classmethod + def tearDownClass(cls): + if cls._skipped: + print("\n\nSkipped tests:") + for url, exc in cls._skipped: + print('- {} ("{}")'.format(url, exc)) + + def _run_test(self, extr, url, result): + if result: + if "options" in result: + for key, value in result["options"]: + config.set(key.split("."), value) + if "range" in result: + config.set(("image-range",), result["range"]) + config.set(("chapter-range",), result["range"]) + content = "content" in result + else: + content = False + + tjob = ResultJob(url, content=content) + self.assertEqual(extr, tjob.extractor.__class__) + + if not result: + return + if "exception" in result: + with self.assertRaises(result["exception"]): + tjob.run() + return + try: + tjob.run() + except exception.StopExtraction: + pass + except exception.HttpError as exc: + exc = str(exc) + if re.match(r"5\d\d: ", exc) or \ + re.search(r"\bRead timed out\b", exc): + self._skipped.append((url, exc)) + self.skipTest(exc) + raise + + # test archive-id uniqueness + self.assertEqual(len(set(tjob.list_archive)), len(tjob.list_archive)) + + # test '_extractor' entries + if tjob.queue: + for url, kwdict in zip(tjob.list_url, tjob.list_keyword): + if "_extractor" in kwdict: + extr = kwdict["_extractor"].from_url(url) + self.assertIsInstance(extr, kwdict["_extractor"]) + self.assertEqual(extr.url, url) + + # test extraction results + if "url" in result: + self.assertEqual(result["url"], tjob.hash_url.hexdigest()) + + if "content" in result: + self.assertEqual(result["content"], tjob.hash_content.hexdigest()) + + if "keyword" in result: + keyword = result["keyword"] + if isinstance(keyword, dict): + for kwdict in tjob.list_keyword: + self._test_kwdict(kwdict, keyword) + else: # assume SHA1 hash + self.assertEqual(keyword, tjob.hash_keyword.hexdigest()) + + if "count" in result: + count = result["count"] + if isinstance(count, str): + self.assertRegex(count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$") + expr = "{} {}".format(len(tjob.list_url), count) + self.assertTrue(eval(expr), msg=expr) + else: # assume integer + self.assertEqual(len(tjob.list_url), count) + + if "pattern" in result: + self.assertGreater(len(tjob.list_url), 0) + for url in tjob.list_url: + self.assertRegex(url, result["pattern"]) + + def _test_kwdict(self, kwdict, tests): + for key, test in tests.items(): + if key.startswith("?"): + key = key[1:] + if key not in kwdict: + continue + self.assertIn(key, kwdict) + value = kwdict[key] + + if isinstance(test, dict): + self._test_kwdict(value, test) + elif isinstance(test, type): + self.assertIsInstance(value, test, msg=key) + elif isinstance(test, str): + if test.startswith("re:"): + self.assertRegex(value, test[3:], msg=key) + elif test.startswith("type:"): + self.assertEqual(type(value).__name__, test[5:], msg=key) + else: + self.assertEqual(value, test, msg=key) + 
else: + self.assertEqual(value, test, msg=key) + + +class ResultJob(job.DownloadJob): + """Generate test-results for extractor runs""" + + def __init__(self, url, parent=None, content=False): + job.DownloadJob.__init__(self, url, parent) + self.queue = False + self.content = content + self.list_url = [] + self.list_keyword = [] + self.list_archive = [] + self.hash_url = hashlib.sha1() + self.hash_keyword = hashlib.sha1() + self.hash_archive = hashlib.sha1() + self.hash_content = hashlib.sha1() + if content: + self.fileobj = TestPathfmt(self.hash_content) + self.get_downloader("http")._check_extension = lambda a, b: None + + self.format_directory = TestFormatter( + "".join(self.extractor.directory_fmt)) + self.format_filename = TestFormatter(self.extractor.filename_fmt) + + def run(self): + for msg in self.extractor: + self.dispatch(msg) + + def handle_url(self, url, keywords, fallback=None): + self.update_url(url) + self.update_keyword(keywords) + self.update_archive(keywords) + self.update_content(url) + self.format_filename.format_map(keywords) + + def handle_directory(self, keywords): + self.update_keyword(keywords, False) + self.format_directory.format_map(keywords) + + def handle_queue(self, url, keywords): + self.queue = True + self.update_url(url) + self.update_keyword(keywords) + + def update_url(self, url): + self.list_url.append(url) + self.hash_url.update(url.encode()) + + def update_keyword(self, kwdict, to_list=True): + if to_list: + self.list_keyword.append(kwdict) + kwdict = self._filter(kwdict) + self.hash_keyword.update( + json.dumps(kwdict, sort_keys=True, default=str).encode()) + + def update_archive(self, kwdict): + archive_id = self.extractor.archive_fmt.format_map(kwdict) + self.list_archive.append(archive_id) + self.hash_archive.update(archive_id.encode()) + + def update_content(self, url): + if self.content: + scheme = url.partition(":")[0] + self.get_downloader(scheme).download(url, self.fileobj) + + +class TestPathfmt(): + + def __init__(self, hashobj): + self.hashobj = hashobj + self.path = "" + self.size = 0 + self.has_extension = True + + def __enter__(self): + return self + + def __exit__(self, *args): + pass + + def open(self, mode): + self.size = 0 + return self + + def write(self, content): + """Update SHA1 hash""" + self.size += len(content) + self.hashobj.update(content) + + def tell(self): + return self.size + + def part_size(self): + return 0 + + +class TestFormatter(util.Formatter): + + @staticmethod + def _noop(_): + return "" + + def _apply_simple(self, key, fmt): + if key == "extension" or "._format_optional." in repr(fmt): + return self._noop + + def wrap(obj): + return fmt(obj[key]) + return wrap + + def _apply(self, key, funcs, fmt): + if key == "extension" or "._format_optional." 
in repr(fmt): + return self._noop + + def wrap(obj): + obj = obj[key] + for func in funcs: + obj = func(obj) + return fmt(obj) + return wrap + + +def setup_test_config(): + name = "gallerydl" + email = "gallerydl@openaliasbox.org" + + config.clear() + config.set(("cache", "file"), ":memory:") + config.set(("downloader", "part"), False) + config.set(("extractor", "timeout"), 60) + config.set(("extractor", "username"), name) + config.set(("extractor", "password"), name) + config.set(("extractor", "nijie", "username"), email) + config.set(("extractor", "seiga", "username"), email) + config.set(("extractor", "danbooru", "username"), None) + config.set(("extractor", "twitter" , "username"), None) + config.set(("extractor", "mangoxo" , "password"), "VZ8DL3983u") + + config.set(("extractor", "deviantart", "client-id"), "7777") + config.set(("extractor", "deviantart", "client-secret"), + "ff14994c744d9208e5caeec7aab4a026") + + config.set(("extractor", "tumblr", "api-key"), + "0cXoHfIqVzMQcc3HESZSNsVlulGxEXGDTTZCDrRrjaa0jmuTc6") + config.set(("extractor", "tumblr", "api-secret"), + "6wxAK2HwrXdedn7VIoZWxGqVhZ8JdYKDLjiQjL46MLqGuEtyVj") + config.set(("extractor", "tumblr", "access-token"), + "N613fPV6tOZQnyn0ERTuoEZn0mEqG8m2K8M3ClSJdEHZJuqFdG") + config.set(("extractor", "tumblr", "access-token-secret"), + "sgOA7ZTT4FBXdOGGVV331sSp0jHYp4yMDRslbhaQf7CaS71i4O") + + +def generate_tests(): + """Dynamically generate extractor unittests""" + def _generate_test(extr, tcase): + def test(self): + url, result = tcase + print("\n", url, sep="") + self._run_test(extr, url, result) + return test + + # enable selective testing for direct calls + if __name__ == '__main__' and len(sys.argv) > 1: + if sys.argv[1].lower() == "all": + fltr = lambda c, bc: True # noqa: E731 + elif sys.argv[1].lower() == "broken": + fltr = lambda c, bc: c in BROKEN # noqa: E731 + else: + argv = sys.argv[1:] + fltr = lambda c, bc: c in argv or bc in argv # noqa: E731 + del sys.argv[1:] + else: + skip = set(BROKEN) + if "CI" in os.environ and "TRAVIS" in os.environ: + skip |= set(TRAVIS_SKIP) + if skip: + print("skipping:", ", ".join(skip)) + fltr = lambda c, bc: c not in skip # noqa: E731 + + # filter available extractor classes + extractors = [ + extr for extr in extractor.extractors() + if fltr(extr.category, getattr(extr, "basecategory", None)) + ] + + # add 'test_...' methods + for extr in extractors: + name = "test_" + extr.__name__ + "_" + for num, tcase in enumerate(extr._get_tests(), 1): + test = _generate_test(extr, tcase) + test.__name__ = name + str(num) + setattr(TestExtractorResults, test.__name__, test) + + +generate_tests() +if __name__ == '__main__': + unittest.main(warnings='ignore') diff --git a/test/test_text.py b/test/test_text.py new file mode 100644 index 0000000..405acd3 --- /dev/null +++ b/test/test_text.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2015-2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
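# The HTML helpers tested below treat anything matching '<[^>]+>' as a tag:
# text.remove_html() substitutes tags and collapses runs of whitespace, while
# text.split_html() splits on tags and drops empty fragments. A rough
# stand-in for both, assuming that regex is the entire tag model (the
# '_sketch' names are illustrative, not gallery-dl's API):
import re

_TAG = re.compile(r"<[^>]+>")

def remove_html_sketch(txt):
    # replace each tag with a space, then collapse all whitespace runs
    return " ".join(_TAG.sub(" ", txt).split())

def split_html_sketch(txt):
    # keep only the stripped, non-empty text between tags
    return [piece.strip() for piece in _TAG.split(txt)
            if piece and not piece.isspace()]

assert remove_html_sketch("Hello<br/>World.") == "Hello World."
assert split_html_sketch("Hello<br/>World.") == ["Hello", "World."]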
+ +import unittest +import datetime + +from gallery_dl import text + + +INVALID = ((), [], {}, None, 1, 2.3) +INVALID_ALT = ((), [], {}, None, "") + + +class TestText(unittest.TestCase): + + def test_clean_xml(self, f=text.clean_xml): + # standard usage + self.assertEqual(f(""), "") + self.assertEqual(f("foo"), "foo") + self.assertEqual(f("\tfoo\nbar\r"), "\tfoo\nbar\r") + self.assertEqual(f("\ab\ba\fr\v"), "bar") + + # 'repl' argument + repl = "#" + self.assertEqual(f("", repl), "") + self.assertEqual(f("foo", repl), "foo") + self.assertEqual(f("\tfoo\nbar\r", repl), "\tfoo\nbar\r") + self.assertEqual( + f("\ab\ba\fr\v", repl), "#b#a#r#") + + # removal of all illegal control characters + value = "".join(chr(x) for x in range(32)) + self.assertEqual(f(value), "\t\n\r") + + # 'invalid' arguments + for value in INVALID: + self.assertEqual(f(value), "") + + def test_remove_html(self, f=text.remove_html): + result = "Hello World." + + # standard usage + self.assertEqual(f(""), "") + self.assertEqual(f("Hello World."), result) + self.assertEqual(f(" Hello World. "), result) + self.assertEqual(f("Hello
    World."), result) + self.assertEqual( + f("
    HelloWorld.
    "), result) + + # empty HTML + self.assertEqual(f("
    "), "") + self.assertEqual(f("
    "), "") + + # malformed HTML + self.assertEqual(f(""), "") + self.assertEqual(f(""), "") + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), "") + + def test_split_html(self, f=text.split_html): + result = ["Hello", "World."] + empty = [] + + # standard usage + self.assertEqual(f(""), empty) + self.assertEqual(f("Hello World."), ["Hello World."]) + self.assertEqual(f(" Hello World. "), ["Hello World."]) + self.assertEqual(f("Hello
    World."), result) + self.assertEqual(f(" Hello
    World. "), result) + self.assertEqual( + f("
    HelloWorld.
    "), result) + + # empty HTML + self.assertEqual(f("
    "), empty) + self.assertEqual(f("
    "), empty) + + # malformed HTML + self.assertEqual(f(""), empty) + self.assertEqual(f(""), empty) + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), empty) + + def test_filename_from_url(self, f=text.filename_from_url): + result = "filename.ext" + + # standard usage + self.assertEqual(f(""), "") + self.assertEqual(f("filename.ext"), result) + self.assertEqual(f("/filename.ext"), result) + self.assertEqual(f("example.org/filename.ext"), result) + self.assertEqual(f("http://example.org/v2/filename.ext"), result) + self.assertEqual( + f("http://example.org/v2/filename.ext?param=value#frag"), result) + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), "") + + def test_ext_from_url(self, f=text.ext_from_url): + result = "ext" + + # standard usage + self.assertEqual(f(""), "") + self.assertEqual(f("filename.ext"), result) + self.assertEqual(f("/filename.ext"), result) + self.assertEqual(f("example.org/filename.ext"), result) + self.assertEqual(f("http://example.org/v2/filename.ext"), result) + self.assertEqual( + f("http://example.org/v2/filename.ext?param=value#frag"), result) + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), "") + + def test_nameext_from_url(self, f=text.nameext_from_url): + empty = {"filename": "", "extension": ""} + result = {"filename": "filename", "extension": "ext"} + + # standard usage + self.assertEqual(f(""), empty) + self.assertEqual(f("filename.ext"), result) + self.assertEqual(f("/filename.ext"), result) + self.assertEqual(f("example.org/filename.ext"), result) + self.assertEqual(f("http://example.org/v2/filename.ext"), result) + self.assertEqual( + f("http://example.org/v2/filename.ext?param=value#frag"), result) + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), empty) + + def test_clean_path_windows(self, f=text.clean_path_windows): + self.assertEqual(f(""), "") + self.assertEqual(f("foo"), "foo") + self.assertEqual(f("foo/bar"), "foo_bar") + self.assertEqual(f("foo<>:\"\\/|?*bar"), "foo_________bar") + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), "") + + def test_clean_path_posix(self, f=text.clean_path_posix): + self.assertEqual(f(""), "") + self.assertEqual(f("foo"), "foo") + self.assertEqual(f("foo/bar"), "foo_bar") + self.assertEqual(f("foo<>:\"\\/|?*bar"), "foo<>:\"\\_|?*bar") + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), "") + + def test_extract(self, f=text.extract): + txt = "
    " + self.assertEqual(f(txt, "<", ">"), ("a" , 3)) + self.assertEqual(f(txt, "X", ">"), (None, 0)) + self.assertEqual(f(txt, "<", "X"), (None, 0)) + + # 'pos' argument + for i in range(1, 4): + self.assertEqual(f(txt, "<", ">", i), ("b", 6)) + for i in range(4, 10): + self.assertEqual(f(txt, "<", ">", i), (None, i)) + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value, "<" , ">") , (None, 0)) + self.assertEqual(f(txt , value, ">") , (None, 0)) + self.assertEqual(f(txt , "<" , value), (None, 0)) + + def test_rextract(self, f=text.rextract): + txt = "" + self.assertEqual(f(txt, "<", ">"), ("b" , 3)) + self.assertEqual(f(txt, "X", ">"), (None, -1)) + self.assertEqual(f(txt, "<", "X"), (None, -1)) + + # 'pos' argument + for i in range(10, 3, -1): + self.assertEqual(f(txt, "<", ">", i), ("b", 3)) + for i in range(3, 0, -1): + self.assertEqual(f(txt, "<", ">", i), ("a", 0)) + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value, "<" , ">") , (None, -1)) + self.assertEqual(f(txt , value, ">") , (None, -1)) + self.assertEqual(f(txt , "<" , value), (None, -1)) + + def test_extract_all(self, f=text.extract_all): + txt = "[c][b][a]: xyz! [d][e" + + self.assertEqual( + f(txt, ()), ({}, 0)) + self.assertEqual( + f(txt, (("C", "[", "]"), ("B", "[", "]"), ("A", "[", "]"))), + ({"A": "a", "B": "b", "C": "c"}, 9), + ) + + # 'None' as field name + self.assertEqual( + f(txt, ((None, "[", "]"), (None, "[", "]"), ("A", "[", "]"))), + ({"A": "a"}, 9), + ) + self.assertEqual( + f(txt, ((None, "[", "]"), (None, "[", "]"), (None, "[", "]"))), + ({}, 9), + ) + + # failed matches + self.assertEqual( + f(txt, (("C", "[", "]"), ("X", "X", "X"), ("B", "[", "]"))), + ({"B": "b", "C": "c", "X": None}, 6), + ) + + # 'pos' argument + self.assertEqual( + f(txt, (("B", "[", "]"), ("A", "[", "]")), pos=1), + ({"A": "a", "B": "b"}, 9), + ) + + # 'values' argument + self.assertEqual( + f(txt, (("C", "[", "]"),), values={"A": "a", "B": "b"}), + ({"A": "a", "B": "b", "C": "c"}, 3), + ) + + vdict = {} + rdict, pos = f(txt, (), values=vdict) + self.assertIs(vdict, rdict) + + def test_extract_iter(self, f=text.extract_iter): + txt = "[c][b][a]: xyz! [d][e" + + def g(*args): + return list(f(*args)) + + self.assertEqual( + g("", "[", "]"), []) + self.assertEqual( + g("[a]", "[", "]"), ["a"]) + self.assertEqual( + g(txt, "[", "]"), ["c", "b", "a", "d"]) + self.assertEqual( + g(txt, "X", "X"), []) + self.assertEqual( + g(txt, "[", "]", 6), ["a", "d"]) + + def test_extract_from(self, f=text.extract_from): + txt = "[c][b][a]: xyz! 
[d][e" + + e = f(txt) + self.assertEqual(e("[", "]"), "c") + self.assertEqual(e("[", "]"), "b") + self.assertEqual(e("[", "]"), "a") + self.assertEqual(e("[", "]"), "d") + self.assertEqual(e("[", "]"), "") + self.assertEqual(e("[", "]"), "") + + e = f(txt, pos=6, default="END") + self.assertEqual(e("[", "]"), "a") + self.assertEqual(e("[", "]"), "d") + self.assertEqual(e("[", "]"), "END") + self.assertEqual(e("[", "]"), "END") + + def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes): + self.assertEqual(f(""), "") + self.assertEqual(f("foobar"), "foobar") + self.assertEqual(f("foo’bar"), "foo’bar") + self.assertEqual(f("foo\\u2019bar"), "foo’bar") + self.assertEqual(f("foo\\u201bar"), "foo‛ar") + self.assertEqual(f("foo\\u201zar"), "foo\\u201zar") + self.assertEqual( + f("\\u2018foo\\u2019\\u2020bar\\u00ff"), + "‘foo’†barÿ", + ) + + def test_parse_bytes(self, f=text.parse_bytes): + self.assertEqual(f("0"), 0) + self.assertEqual(f("50"), 50) + self.assertEqual(f("50k"), 50 * 1024**1) + self.assertEqual(f("50m"), 50 * 1024**2) + self.assertEqual(f("50g"), 50 * 1024**3) + self.assertEqual(f("50t"), 50 * 1024**4) + self.assertEqual(f("50p"), 50 * 1024**5) + + # fractions + self.assertEqual(f("123.456"), 123) + self.assertEqual(f("123.567"), 124) + self.assertEqual(f("0.5M"), round(0.5 * 1024**2)) + + # invalid arguments + for value in INVALID_ALT: + self.assertEqual(f(value), 0) + self.assertEqual(f("NaN"), 0) + self.assertEqual(f("invalid"), 0) + self.assertEqual(f(" 123 kb "), 0) + + def test_parse_int(self, f=text.parse_int): + self.assertEqual(f(0), 0) + self.assertEqual(f("0"), 0) + self.assertEqual(f(123), 123) + self.assertEqual(f("123"), 123) + + # invalid arguments + for value in INVALID_ALT: + self.assertEqual(f(value), 0) + self.assertEqual(f("123.456"), 0) + self.assertEqual(f("zzz"), 0) + self.assertEqual(f([1, 2, 3]), 0) + self.assertEqual(f({1: 2, 3: 4}), 0) + + # 'default' argument + default = "default" + for value in INVALID_ALT: + self.assertEqual(f(value, default), default) + self.assertEqual(f("zzz", default), default) + + def test_parse_float(self, f=text.parse_float): + self.assertEqual(f(0), 0.0) + self.assertEqual(f("0"), 0.0) + self.assertEqual(f(123), 123.0) + self.assertEqual(f("123"), 123.0) + self.assertEqual(f(123.456), 123.456) + self.assertEqual(f("123.456"), 123.456) + + # invalid arguments + for value in INVALID_ALT: + self.assertEqual(f(value), 0.0) + self.assertEqual(f("zzz"), 0.0) + self.assertEqual(f([1, 2, 3]), 0.0) + self.assertEqual(f({1: 2, 3: 4}), 0.0) + + # 'default' argument + default = "default" + for value in INVALID_ALT: + self.assertEqual(f(value, default), default) + self.assertEqual(f("zzz", default), default) + + def test_parse_query(self, f=text.parse_query): + # standard usage + self.assertEqual(f(""), {}) + self.assertEqual(f("foo=1"), {"foo": "1"}) + self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"}) + + # missing value + self.assertEqual(f("bar"), {}) + self.assertEqual(f("foo=1&bar"), {"foo": "1"}) + self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"}) + + # keys with identical names + self.assertEqual(f("foo=1&foo=2"), {"foo": "1"}) + self.assertEqual( + f("foo=1&bar=2&foo=3&bar=4"), + {"foo": "1", "bar": "2"}, + ) + + # invalid arguments + for value in INVALID: + self.assertEqual(f(value), {}) + + def test_parse_timestamp(self, f=text.parse_timestamp): + null = datetime.datetime.utcfromtimestamp(0) + value = datetime.datetime.utcfromtimestamp(1555816235) + + self.assertEqual(f(0) , null) + 
self.assertEqual(f("0") , null) + self.assertEqual(f(1555816235) , value) + self.assertEqual(f("1555816235"), value) + + for value in INVALID_ALT: + self.assertEqual(f(value), None) + self.assertEqual(f(value, "foo"), "foo") + + def test_parse_datetime(self, f=text.parse_datetime): + null = datetime.datetime.utcfromtimestamp(0) + + self.assertEqual(f("1970-01-01T00:00:00+00:00"), null) + self.assertEqual(f("1970-01-01T00:00:00+0000") , null) + self.assertEqual(f("1970.01.01", "%Y.%m.%d") , null) + + self.assertEqual( + f("2019-05-07T21:25:02+09:00"), + datetime.datetime(2019, 5, 7, 12, 25, 2), + ) + self.assertEqual( + f("2019-05-07T21:25:02+0900"), + datetime.datetime(2019, 5, 7, 12, 25, 2), + ) + self.assertEqual( + f("2019-05-07 21:25:02"), + "2019-05-07 21:25:02", + ) + + for value in INVALID: + self.assertEqual(f(value), None) + self.assertEqual(f("1970.01.01"), "1970.01.01") + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_util.py b/test/test_util.py new file mode 100644 index 0000000..815b2d8 --- /dev/null +++ b/test/test_util.py @@ -0,0 +1,395 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +import unittest +import sys +import random +import string + +from gallery_dl import util, text, exception + + +class TestRange(unittest.TestCase): + + def test_parse_range(self, f=util.RangePredicate.parse_range): + self.assertEqual( + f(""), + []) + self.assertEqual( + f("1-2"), + [(1, 2)]) + self.assertEqual( + f("-"), + [(1, sys.maxsize)]) + self.assertEqual( + f("-2,4,6-8,10-"), + [(1, 2), (4, 4), (6, 8), (10, sys.maxsize)]) + self.assertEqual( + f(" - 3 , 4- 4, 2-6"), + [(1, 3), (4, 4), (2, 6)]) + + def test_optimize_range(self, f=util.RangePredicate.optimize_range): + self.assertEqual( + f([]), + []) + self.assertEqual( + f([(2, 4)]), + [(2, 4)]) + self.assertEqual( + f([(2, 4), (6, 8), (10, 12)]), + [(2, 4), (6, 8), (10, 12)]) + self.assertEqual( + f([(2, 4), (4, 6), (5, 8)]), + [(2, 8)]) + self.assertEqual( + f([(1, 1), (2, 2), (3, 6), (8, 9)]), + [(1, 6), (8, 9)]) + + +class TestPredicate(unittest.TestCase): + + def test_range_predicate(self): + dummy = None + + pred = util.RangePredicate(" - 3 , 4- 4, 2-6") + for i in range(6): + self.assertTrue(pred(dummy, dummy)) + with self.assertRaises(exception.StopExtraction): + bool(pred(dummy, dummy)) + + pred = util.RangePredicate("1, 3, 5") + self.assertTrue(pred(dummy, dummy)) + self.assertFalse(pred(dummy, dummy)) + self.assertTrue(pred(dummy, dummy)) + self.assertFalse(pred(dummy, dummy)) + self.assertTrue(pred(dummy, dummy)) + with self.assertRaises(exception.StopExtraction): + bool(pred(dummy, dummy)) + + pred = util.RangePredicate("") + with self.assertRaises(exception.StopExtraction): + bool(pred(dummy, dummy)) + + def test_unique_predicate(self): + dummy = None + pred = util.UniquePredicate() + + # no duplicates + self.assertTrue(pred("1", dummy)) + self.assertTrue(pred("2", dummy)) + self.assertFalse(pred("1", dummy)) + self.assertFalse(pred("2", dummy)) + self.assertTrue(pred("3", dummy)) + self.assertFalse(pred("3", dummy)) + + # duplicates for "text:" + self.assertTrue(pred("text:123", dummy)) + self.assertTrue(pred("text:123", dummy)) + self.assertTrue(pred("text:123", dummy)) + + def test_filter_predicate(self): + url = "" + + pred = util.FilterPredicate("a < 3") + 
self.assertTrue(pred(url, {"a": 2})) + self.assertFalse(pred(url, {"a": 3})) + + with self.assertRaises(SyntaxError): + util.FilterPredicate("(") + + with self.assertRaises(exception.FilterError): + util.FilterPredicate("a > 1")(url, {"a": None}) + + with self.assertRaises(exception.FilterError): + util.FilterPredicate("b > 1")(url, {"a": 2}) + + def test_build_predicate(self): + pred = util.build_predicate([]) + self.assertIsInstance(pred, type(lambda: True)) + + pred = util.build_predicate([util.UniquePredicate()]) + self.assertIsInstance(pred, util.UniquePredicate) + + pred = util.build_predicate([util.UniquePredicate(), + util.UniquePredicate()]) + self.assertIsInstance(pred, util.ChainPredicate) + + +class TestISO639_1(unittest.TestCase): + + def test_code_to_language(self): + d = "default" + self._run_test(util.code_to_language, { + ("en",): "English", + ("FR",): "French", + ("xx",): None, + ("" ,): None, + (None,): None, + ("en", d): "English", + ("FR", d): "French", + ("xx", d): d, + ("" , d): d, + (None, d): d, + }) + + def test_language_to_code(self): + d = "default" + self._run_test(util.language_to_code, { + ("English",): "en", + ("fRENch",): "fr", + ("xx",): None, + ("" ,): None, + (None,): None, + ("English", d): "en", + ("fRENch", d): "fr", + ("xx", d): d, + ("" , d): d, + (None, d): d, + }) + + def _run_test(self, func, tests): + for args, result in tests.items(): + self.assertEqual(func(*args), result) + + +class TestFormatter(unittest.TestCase): + + kwdict = { + "a": "hElLo wOrLd", + "b": "äöü", + "l": ["a", "b", "c"], + "n": None, + "u": "%27%3C%20/%20%3E%27", + "name": "Name", + "title1": "Title", + "title2": "", + "title3": None, + "title4": 0, + } + + def test_conversions(self): + self._run_test("{a!l}", "hello world") + self._run_test("{a!u}", "HELLO WORLD") + self._run_test("{a!c}", "Hello world") + self._run_test("{a!C}", "Hello World") + self._run_test("{a!U}", self.kwdict["a"]) + self._run_test("{u!U}", "'< / >'") + self._run_test("{a!s}", self.kwdict["a"]) + self._run_test("{a!r}", "'" + self.kwdict["a"] + "'") + self._run_test("{a!a}", "'" + self.kwdict["a"] + "'") + self._run_test("{b!a}", "'\\xe4\\xf6\\xfc'") + self._run_test("{a!S}", self.kwdict["a"]) + self._run_test("{l!S}", "a, b, c") + self._run_test("{n!S}", "") + with self.assertRaises(KeyError): + self._run_test("{a!q}", "hello world") + + def test_optional(self): + self._run_test("{name}{title1}", "NameTitle") + self._run_test("{name}{title1:?//}", "NameTitle") + self._run_test("{name}{title1:? **/''/}", "Name **Title''") + + self._run_test("{name}{title2}", "Name") + self._run_test("{name}{title2:?//}", "Name") + self._run_test("{name}{title2:? **/''/}", "Name") + + self._run_test("{name}{title3}", "NameNone") + self._run_test("{name}{title3:?//}", "Name") + self._run_test("{name}{title3:? **/''/}", "Name") + + self._run_test("{name}{title4}", "Name0") + self._run_test("{name}{title4:?//}", "Name") + self._run_test("{name}{title4:? 
**/''/}", "Name") + + def test_missing(self): + replacement = "None" + + self._run_test("{missing}", replacement) + self._run_test("{missing.attr}", replacement) + self._run_test("{missing[key]}", replacement) + self._run_test("{missing:?a//}", "") + + self._run_test("{name[missing]}", replacement) + self._run_test("{name[missing].attr}", replacement) + self._run_test("{name[missing][key]}", replacement) + self._run_test("{name[missing]:?a//}", "") + + def test_missing_custom_default(self): + replacement = default = "foobar" + self._run_test("{missing}" , replacement, default) + self._run_test("{missing.attr}", replacement, default) + self._run_test("{missing[key]}", replacement, default) + self._run_test("{missing:?a//}", "a" + default, default) + + def test_slicing(self): + v = self.kwdict["a"] + self._run_test("{a[1:10]}" , v[1:10]) + self._run_test("{a[-10:-1]}", v[-10:-1]) + self._run_test("{a[5:]}" , v[5:]) + self._run_test("{a[50:]}", v[50:]) + self._run_test("{a[:5]}" , v[:5]) + self._run_test("{a[:50]}", v[:50]) + self._run_test("{a[:]}" , v) + self._run_test("{a[1:10:2]}" , v[1:10:2]) + self._run_test("{a[-10:-1:2]}", v[-10:-1:2]) + self._run_test("{a[5::2]}" , v[5::2]) + self._run_test("{a[50::2]}", v[50::2]) + self._run_test("{a[:5:2]}" , v[:5:2]) + self._run_test("{a[:50:2]}", v[:50:2]) + self._run_test("{a[::]}" , v) + + def test_maxlen(self): + v = self.kwdict["a"] + self._run_test("{a:L5/foo/}" , "foo") + self._run_test("{a:L50/foo/}", v) + self._run_test("{a:L50/foo/>50}", " " * 39 + v) + self._run_test("{a:L50/foo/>51}", "foo") + self._run_test("{a:Lab/foo/}", "foo") + + def test_join(self): + self._run_test("{l:J}" , "abc") + self._run_test("{l:J,}" , "a,b,c") + self._run_test("{l:J,/}" , "a,b,c") + self._run_test("{l:J,/>20}" , " a,b,c") + self._run_test("{l:J - }" , "a - b - c") + self._run_test("{l:J - /}" , "a - b - c") + self._run_test("{l:J - />20}", " a - b - c") + + self._run_test("{a:J/}" , self.kwdict["a"]) + self._run_test("{a:J, /}" , ", ".join(self.kwdict["a"])) + + def test_replace(self): + self._run_test("{a:Rh/C/}" , "CElLo wOrLd") + self._run_test("{a!l:Rh/C/}", "Cello world") + self._run_test("{a!u:Rh/C/}", "HELLO WORLD") + + self._run_test("{a!l:Rl/_/}", "he__o wor_d") + self._run_test("{a!l:Rl//}" , "heo word") + self._run_test("{name:Rame/othing/}", "Nothing") + + def _run_test(self, format_string, result, default=None): + formatter = util.Formatter(format_string, default) + output = formatter.format_map(self.kwdict) + self.assertEqual(output, result, format_string) + + +class TestOther(unittest.TestCase): + + def test_bencode(self): + self.assertEqual(util.bencode(0), "") + self.assertEqual(util.bencode(123), "123") + self.assertEqual(util.bencode(123, "01"), "1111011") + self.assertEqual(util.bencode(123, "BA"), "AAAABAA") + + def test_bdecode(self): + self.assertEqual(util.bdecode(""), 0) + self.assertEqual(util.bdecode("123"), 123) + self.assertEqual(util.bdecode("1111011", "01"), 123) + self.assertEqual(util.bdecode("AAAABAA", "BA"), 123) + + def test_bencode_bdecode(self): + for _ in range(100): + value = random.randint(0, 1000000) + for alphabet in ("01", "0123456789", string.ascii_letters): + result = util.bdecode(util.bencode(value, alphabet), alphabet) + self.assertEqual(result, value) + + def test_advance(self): + items = range(5) + + self.assertCountEqual( + util.advance(items, 0), items) + self.assertCountEqual( + util.advance(items, 3), range(3, 5)) + self.assertCountEqual( + util.advance(items, 9), []) + self.assertCountEqual( + 
util.advance(util.advance(items, 1), 2), range(3, 5)) + + def test_raises(self): + func = util.raises(Exception()) + with self.assertRaises(Exception): + func() + + func = util.raises(ValueError(1)) + with self.assertRaises(ValueError): + func() + with self.assertRaises(ValueError): + func() + with self.assertRaises(ValueError): + func() + + def test_combine_dict(self): + self.assertEqual( + util.combine_dict({}, {}), + {}) + self.assertEqual( + util.combine_dict({1: 1, 2: 2}, {2: 4, 4: 8}), + {1: 1, 2: 4, 4: 8}) + self.assertEqual( + util.combine_dict( + {1: {11: 22, 12: 24}, 2: {13: 26, 14: 28}}, + {1: {11: 33, 13: 39}, 2: "str"}), + {1: {11: 33, 12: 24, 13: 39}, 2: "str"}) + self.assertEqual( + util.combine_dict( + {1: {2: {3: {4: {"1": "a", "2": "b"}}}}}, + {1: {2: {3: {4: {"1": "A", "3": "C"}}}}}), + {1: {2: {3: {4: {"1": "A", "2": "b", "3": "C"}}}}}) + + def test_transform_dict(self): + d = {} + util.transform_dict(d, str) + self.assertEqual(d, {}) + + d = {1: 123, 2: "123", 3: True, 4: None} + util.transform_dict(d, str) + self.assertEqual( + d, {1: "123", 2: "123", 3: "True", 4: "None"}) + + d = {1: 123, 2: "123", 3: "foo", 4: {11: 321, 12: "321", 13: "bar"}} + util.transform_dict(d, text.parse_int) + self.assertEqual( + d, {1: 123, 2: 123, 3: 0, 4: {11: 321, 12: 321, 13: 0}}) + + def test_number_to_string(self, f=util.number_to_string): + self.assertEqual(f(1) , "1") + self.assertEqual(f(1.0) , "1.0") + self.assertEqual(f("1.0") , "1.0") + self.assertEqual(f([1]) , [1]) + self.assertEqual(f({1: 2}), {1: 2}) + self.assertEqual(f(True) , True) + self.assertEqual(f(None) , None) + + def test_to_string(self, f=util.to_string): + self.assertEqual(f(1) , "1") + self.assertEqual(f(1.0) , "1.0") + self.assertEqual(f("1.0"), "1.0") + + self.assertEqual(f("") , "") + self.assertEqual(f(None) , "") + self.assertEqual(f(0) , "") + + self.assertEqual(f(["a"]), "a") + self.assertEqual(f([1]) , "1") + self.assertEqual(f(["a", "b", "c"]), "a, b, c") + self.assertEqual(f([1, 2, 3]), "1, 2, 3") + + def test_universal_none(self): + obj = util.NONE + + self.assertFalse(obj) + self.assertEqual(str(obj), str(None)) + self.assertEqual(repr(obj), repr(None)) + self.assertIs(obj.attr, obj) + self.assertIs(obj["key"], obj) + + +if __name__ == '__main__': + unittest.main() -- cgit v1.2.3