author     Unit 193 <unit193@ubuntu.com>  2019-07-02 04:33:45 -0400
committer  Unit 193 <unit193@ubuntu.com>  2019-07-02 04:33:45 -0400
commit     195c45911e79c33cf0bb986721365fb06df5a153 (patch)
tree       ac0c9b6ef40bea7aa7ab0c5c3cb500eb510668fa
Import Upstream version 1.8.7 (upstream/1.8.7)
-rw-r--r--  .gitignore | 75
-rw-r--r--  .travis.yml | 36
-rw-r--r--  CHANGELOG.md | 630
-rw-r--r--  LICENSE | 339
-rw-r--r--  Makefile | 45
-rw-r--r--  README.rst | 244
-rwxr-xr-x  bin/gallery-dl | 7
-rw-r--r--  docs/configuration.rst | 1615
-rw-r--r--  docs/gallery-dl-example.conf | 172
-rw-r--r--  docs/gallery-dl.conf | 172
-rw-r--r--  docs/supportedsites.rst | 137
-rw-r--r--  gallery_dl/__init__.py | 255
-rw-r--r--  gallery_dl/__main__.py | 20
-rw-r--r--  gallery_dl/aes.py | 337
-rw-r--r--  gallery_dl/cache.py | 204
-rw-r--r--  gallery_dl/cloudflare.py | 176
-rw-r--r--  gallery_dl/config.py | 155
-rw-r--r--  gallery_dl/downloader/__init__.py | 39
-rw-r--r--  gallery_dl/downloader/common.py | 170
-rw-r--r--  gallery_dl/downloader/http.py | 128
-rw-r--r--  gallery_dl/downloader/text.py | 37
-rw-r--r--  gallery_dl/downloader/ytdl.py | 81
-rw-r--r--  gallery_dl/exception.py | 79
-rw-r--r--  gallery_dl/extractor/2chan.py | 95
-rw-r--r--  gallery_dl/extractor/35photo.py | 205
-rw-r--r--  gallery_dl/extractor/3dbooru.py | 81
-rw-r--r--  gallery_dl/extractor/4chan.py | 36
-rw-r--r--  gallery_dl/extractor/500px.py | 238
-rw-r--r--  gallery_dl/extractor/8chan.py | 29
-rw-r--r--  gallery_dl/extractor/8muses.py | 129
-rw-r--r--  gallery_dl/extractor/__init__.py | 189
-rw-r--r--  gallery_dl/extractor/artstation.py | 369
-rw-r--r--  gallery_dl/extractor/behance.py | 179
-rw-r--r--  gallery_dl/extractor/bobx.py | 112
-rw-r--r--  gallery_dl/extractor/booru.py | 265
-rw-r--r--  gallery_dl/extractor/chan.py | 61
-rw-r--r--  gallery_dl/extractor/common.py | 432
-rw-r--r--  gallery_dl/extractor/danbooru.py | 86
-rw-r--r--  gallery_dl/extractor/deviantart.py | 992
-rw-r--r--  gallery_dl/extractor/directlink.py | 56
-rw-r--r--  gallery_dl/extractor/dynastyscans.py | 145
-rw-r--r--  gallery_dl/extractor/e621.py | 71
-rw-r--r--  gallery_dl/extractor/exhentai.py | 382
-rw-r--r--  gallery_dl/extractor/fallenangels.py | 105
-rw-r--r--  gallery_dl/extractor/flickr.py | 503
-rw-r--r--  gallery_dl/extractor/foolfuuka.py | 157
-rw-r--r--  gallery_dl/extractor/foolslide.py | 240
-rw-r--r--  gallery_dl/extractor/gelbooru.py | 130
-rw-r--r--  gallery_dl/extractor/gfycat.py | 83
-rw-r--r--  gallery_dl/extractor/hbrowse.py | 101
-rw-r--r--  gallery_dl/extractor/hentai2read.py | 101
-rw-r--r--  gallery_dl/extractor/hentaicafe.py | 88
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py | 264
-rw-r--r--  gallery_dl/extractor/hentaifox.py | 117
-rw-r--r--  gallery_dl/extractor/hentaihere.py | 101
-rw-r--r--  gallery_dl/extractor/hentainexus.py | 96
-rw-r--r--  gallery_dl/extractor/hitomi.py | 103
-rw-r--r--  gallery_dl/extractor/hypnohub.py | 68
-rw-r--r--  gallery_dl/extractor/idolcomplex.py | 59
-rw-r--r--  gallery_dl/extractor/imagebam.py | 128
-rw-r--r--  gallery_dl/extractor/imagefap.py | 195
-rw-r--r--  gallery_dl/extractor/imagehosts.py | 251
-rw-r--r--  gallery_dl/extractor/imgbox.py | 134
-rw-r--r--  gallery_dl/extractor/imgth.py | 61
-rw-r--r--  gallery_dl/extractor/imgur.py | 183
-rw-r--r--  gallery_dl/extractor/instagram.py | 277
-rw-r--r--  gallery_dl/extractor/keenspot.py | 157
-rw-r--r--  gallery_dl/extractor/khinsider.py | 69
-rw-r--r--  gallery_dl/extractor/kissmanga.py | 223
-rw-r--r--  gallery_dl/extractor/komikcast.py | 117
-rw-r--r--  gallery_dl/extractor/konachan.py | 85
-rw-r--r--  gallery_dl/extractor/livedoor.py | 156
-rw-r--r--  gallery_dl/extractor/luscious.py | 208
-rw-r--r--  gallery_dl/extractor/mangadex.py | 180
-rw-r--r--  gallery_dl/extractor/mangafox.py | 61
-rw-r--r--  gallery_dl/extractor/mangahere.py | 138
-rw-r--r--  gallery_dl/extractor/mangapanda.py | 36
-rw-r--r--  gallery_dl/extractor/mangapark.py | 140
-rw-r--r--  gallery_dl/extractor/mangareader.py | 119
-rw-r--r--  gallery_dl/extractor/mangastream.py | 54
-rw-r--r--  gallery_dl/extractor/mangoxo.py | 176
-rw-r--r--  gallery_dl/extractor/mastodon.py | 203
-rw-r--r--  gallery_dl/extractor/message.py | 54
-rw-r--r--  gallery_dl/extractor/myportfolio.py | 95
-rw-r--r--  gallery_dl/extractor/newgrounds.py | 155
-rw-r--r--  gallery_dl/extractor/ngomik.py | 51
-rw-r--r--  gallery_dl/extractor/nhentai.py | 135
-rw-r--r--  gallery_dl/extractor/nijie.py | 205
-rw-r--r--  gallery_dl/extractor/nsfwalbum.py | 62
-rw-r--r--  gallery_dl/extractor/oauth.py | 375
-rw-r--r--  gallery_dl/extractor/paheal.py | 120
-rw-r--r--  gallery_dl/extractor/patreon.py | 183
-rw-r--r--  gallery_dl/extractor/photobucket.py | 178
-rw-r--r--  gallery_dl/extractor/piczel.py | 118
-rw-r--r--  gallery_dl/extractor/pinterest.py | 260
-rw-r--r--  gallery_dl/extractor/pixiv.py | 517
-rw-r--r--  gallery_dl/extractor/pixnet.py | 179
-rw-r--r--  gallery_dl/extractor/plurk.py | 125
-rw-r--r--  gallery_dl/extractor/pornhub.py | 157
-rw-r--r--  gallery_dl/extractor/pururin.py | 102
-rw-r--r--  gallery_dl/extractor/reactor.py | 338
-rw-r--r--  gallery_dl/extractor/readcomiconline.py | 97
-rw-r--r--  gallery_dl/extractor/recursive.py | 55
-rw-r--r--  gallery_dl/extractor/reddit.py | 313
-rw-r--r--  gallery_dl/extractor/rule34.py | 63
-rw-r--r--  gallery_dl/extractor/safebooru.py | 61
-rw-r--r--  gallery_dl/extractor/sankaku.py | 299
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py | 120
-rw-r--r--  gallery_dl/extractor/seiga.py | 198
-rw-r--r--  gallery_dl/extractor/senmanga.py | 65
-rw-r--r--  gallery_dl/extractor/sexcom.py | 194
-rw-r--r--  gallery_dl/extractor/shopify.py | 136
-rw-r--r--  gallery_dl/extractor/simplyhentai.py | 187
-rw-r--r--  gallery_dl/extractor/slickpic.py | 140
-rw-r--r--  gallery_dl/extractor/slideshare.py | 86
-rw-r--r--  gallery_dl/extractor/smugmug.py | 316
-rw-r--r--  gallery_dl/extractor/test.py | 86
-rw-r--r--  gallery_dl/extractor/tsumino.py | 343
-rw-r--r--  gallery_dl/extractor/tumblr.py | 425
-rw-r--r--  gallery_dl/extractor/twitter.py | 202
-rw-r--r--  gallery_dl/extractor/vanillarock.py | 95
-rw-r--r--  gallery_dl/extractor/wallhaven.py | 148
-rw-r--r--  gallery_dl/extractor/warosu.py | 108
-rw-r--r--  gallery_dl/extractor/weibo.py | 137
-rw-r--r--  gallery_dl/extractor/wikiart.py | 134
-rw-r--r--  gallery_dl/extractor/xhamster.py | 171
-rw-r--r--  gallery_dl/extractor/xvideos.py | 140
-rw-r--r--  gallery_dl/extractor/yandere.py | 68
-rw-r--r--  gallery_dl/extractor/yaplog.py | 109
-rw-r--r--  gallery_dl/extractor/yuki.py | 125
-rw-r--r--  gallery_dl/job.py | 492
-rw-r--r--  gallery_dl/oauth.py | 132
-rw-r--r--  gallery_dl/option.py | 304
-rw-r--r--  gallery_dl/output.py | 221
-rw-r--r--  gallery_dl/postprocessor/__init__.py | 44
-rw-r--r--  gallery_dl/postprocessor/classify.py | 49
-rw-r--r--  gallery_dl/postprocessor/common.py | 25
-rw-r--r--  gallery_dl/postprocessor/exec.py | 43
-rw-r--r--  gallery_dl/postprocessor/metadata.py | 65
-rw-r--r--  gallery_dl/postprocessor/ugoira.py | 132
-rw-r--r--  gallery_dl/postprocessor/zip.py | 65
-rw-r--r--  gallery_dl/text.py | 278
-rw-r--r--  gallery_dl/util.py | 673
-rw-r--r--  gallery_dl/version.py | 9
-rw-r--r--  requirements.txt | 1
-rwxr-xr-x  scripts/bash_completion.py | 56
-rwxr-xr-x  scripts/build_testresult_db.py | 56
-rwxr-xr-x  scripts/create_test_data.py | 69
-rw-r--r--  scripts/hook-gallery_dl.py | 9
-rwxr-xr-x  scripts/man.py | 304
-rwxr-xr-x  scripts/pyinstaller.py | 18
-rwxr-xr-x  scripts/release.sh | 167
-rwxr-xr-x  scripts/run_tests.sh | 24
-rwxr-xr-x  scripts/supportedsites.py | 264
-rw-r--r--  scripts/util.py | 11
-rw-r--r--  setup.cfg | 3
-rw-r--r--  setup.py | 134
-rwxr-xr-x  snap/local/launchers/gallery-dl-launch | 32
-rw-r--r--  snap/snapcraft.yaml | 110
-rw-r--r--  test/__init__.py | 0
-rw-r--r--  test/test_config.py | 81
-rw-r--r--  test/test_cookies.py | 130
-rw-r--r--  test/test_downloader.py | 235
-rw-r--r--  test/test_extractor.py | 186
-rw-r--r--  test/test_oauth.py | 104
-rw-r--r--  test/test_results.py | 344
-rw-r--r--  test/test_text.py | 409
-rw-r--r--  test/test_util.py | 395
168 files changed, 28676 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2e257a8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,75 @@
+archive/
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Manpages
+gallery-dl.1
+gallery-dl.conf.5
+
+# Bash completion
+gallery-dl.bash_completion
+
+# Snap packaging specific
+/snap/.snapcraft/
+/parts/
+/stage/
+/prime/
+
+/*.snap
+/*_source.tar.bz2
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..6158941
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,36 @@
+language: python
+python:
+ - "3.4"
+ - "3.5"
+ - "3.6"
+ - "pypy3"
+env:
+ - GALLERYDL_TESTS=core
+matrix:
+ include:
+ - python: "3.7"
+ dist: xenial
+ - python: "3.8-dev"
+ dist: xenial
+ - python: "3.6"
+ env: GALLERYDL_TESTS=results
+ - language: minimal
+ dist: xenial
+ addons:
+ snaps:
+ - name: snapcraft
+ classic: true
+ env: SNAP_TESTS=true
+
+git:
+ depth: 3
+ quiet: true
+branches:
+ only:
+ - master
+ - /^v\d+\.\d+\.\d+(-\S*)?$/
+ - /^test(-\w+)+$/
+
+script:
+ - 'if test "${SNAP_TESTS}" != true; then ./scripts/run_tests.sh; else true; fi'
+ - 'if test "${SNAP_TESTS}" = true; then sudo apt update && snapcraft --destructive-mode && sudo snap try && snap run gallery-dl --verbose https://twitter.com/ubuntu/status/1121001597092364288; else true; fi'
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..cd74a9f
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,630 @@
+# Changelog
+
+## 1.8.7 - 2019-06-28
+### Additions
+- Support for
+ - `vanillarock` - https://vanilla-rock.com/ ([#254](https://github.com/mikf/gallery-dl/issues/254))
+ - `nsfwalbum` - https://nsfwalbum.com/ ([#287](https://github.com/mikf/gallery-dl/issues/287))
+- `artist` and `tags` metadata for `hentaicafe` ([#238](https://github.com/mikf/gallery-dl/issues/238))
+- `description` metadata for `instagram` ([#310](https://github.com/mikf/gallery-dl/issues/310))
+- Format string option to replace a substring with another - `R<old>/<new>/` ([#318](https://github.com/mikf/gallery-dl/issues/318))
+### Changes
+- Delete empty archives created by the `zip` post-processor ([#316](https://github.com/mikf/gallery-dl/issues/316))
+### Fixes
+- Handle `hitomi` Game CG galleries correctly ([#321](https://github.com/mikf/gallery-dl/issues/321))
+- Miscellaneous fixes for `deviantart`, `hitomi`, `pururin`, `kissmanga`, `keenspot`, `mangoxo`, `imagefap`
+
+## 1.8.6 - 2019-06-14
+### Additions
+- Support for
+ - `slickpic` - https://www.slickpic.com/ ([#249](https://github.com/mikf/gallery-dl/issues/249))
+ - `xhamster` - https://xhamster.com/ ([#281](https://github.com/mikf/gallery-dl/issues/281))
+ - `pornhub` - https://www.pornhub.com/ ([#282](https://github.com/mikf/gallery-dl/issues/282))
+ - `8muses` - https://www.8muses.com/ ([#305](https://github.com/mikf/gallery-dl/issues/305))
+- `extra` option for `deviantart` to download Sta.sh content linked in description texts ([#302](https://github.com/mikf/gallery-dl/issues/302))
+### Changes
+- Detect `directlink` URLs with upper case filename extensions ([#296](https://github.com/mikf/gallery-dl/issues/296))
+### Fixes
+- Improved error handling for `tumblr` API calls ([#297](https://github.com/mikf/gallery-dl/issues/297))
+- Fixed extraction of `livedoor` blogs ([#301](https://github.com/mikf/gallery-dl/issues/301))
+- Fixed extraction of special `deviantart` Sta.sh items ([#307](https://github.com/mikf/gallery-dl/issues/307))
+- Fixed pagination for specific `keenspot` comics
+
+## 1.8.5 - 2019-06-01
+### Additions
+- Support for
+ - `keenspot` - http://keenspot.com/ ([#223](https://github.com/mikf/gallery-dl/issues/223))
+ - `sankakucomplex` - https://www.sankakucomplex.com ([#258](https://github.com/mikf/gallery-dl/issues/258))
+- `folders` option for `deviantart` to add a list of containing folders to each file ([#276](https://github.com/mikf/gallery-dl/issues/276))
+- `captcha` option for `kissmanga` and `readcomiconline` to control CAPTCHA handling ([#279](https://github.com/mikf/gallery-dl/issues/279))
+- `filename` metadata for files downloaded with youtube-dl ([#291](https://github.com/mikf/gallery-dl/issues/291))
+### Changes
+- Adjust `wallhaven` extractors to new page layout:
+ - use API and add `api-key` option
+ - removed traditional login support
+- Provide original filenames for `patreon` downloads ([#268](https://github.com/mikf/gallery-dl/issues/268))
+- Use e-hentai.org or exhentai.org depending on input URL ([#278](https://github.com/mikf/gallery-dl/issues/278))
+### Fixes
+- Fix pagination over `sankaku` popular listings ([#265](https://github.com/mikf/gallery-dl/issues/265))
+- Fix folder and collection extraction on `deviantart` ([#271](https://github.com/mikf/gallery-dl/issues/271))
+- Detect "AreYouHuman" redirects on `readcomiconline` ([#279](https://github.com/mikf/gallery-dl/issues/279))
+- Miscellaneous fixes for `hentainexus`, `livedoor`, `ngomik`
+
+## 1.8.4 - 2019-05-17
+### Additions
+- Support for
+ - `patreon` - https://www.patreon.com/ ([#226](https://github.com/mikf/gallery-dl/issues/226))
+ - `hentainexus` - https://hentainexus.com/ ([#256](https://github.com/mikf/gallery-dl/issues/256))
+- `date` metadata fields for `pixiv` ([#248](https://github.com/mikf/gallery-dl/issues/248)), `instagram` ([#250](https://github.com/mikf/gallery-dl/issues/250)), `exhentai`, and `newgrounds`
+### Changes
+- Improved `flickr` metadata and video extraction ([#246](https://github.com/mikf/gallery-dl/issues/246))
+### Fixes
+- Download original GIF animations from `deviantart` ([#242](https://github.com/mikf/gallery-dl/issues/242))
+- Ignore missing `edge_media_to_comment` fields on `instagram` ([#250](https://github.com/mikf/gallery-dl/issues/250))
+- Fix serialization of `datetime` objects for `--write-metadata` ([#251](https://github.com/mikf/gallery-dl/issues/251), [#252](https://github.com/mikf/gallery-dl/issues/252))
+- Allow multiple post-processor command-line options at once ([#253](https://github.com/mikf/gallery-dl/issues/253))
+- Prevent crash on `booru` sites when no tags are available ([#259](https://github.com/mikf/gallery-dl/issues/259))
+- Fix extraction on `instagram` after `rhx_gis` field removal ([#266](https://github.com/mikf/gallery-dl/issues/266))
+- Avoid Cloudflare CAPTCHAs for Python interpreters built against OpenSSL < 1.1.1
+- Miscellaneous fixes for `luscious`
+
+## 1.8.3 - 2019-05-04
+### Additions
+- Support for
+ - `plurk` - https://www.plurk.com/ ([#212](https://github.com/mikf/gallery-dl/issues/212))
+ - `sexcom` - https://www.sex.com/ ([#147](https://github.com/mikf/gallery-dl/issues/147))
+- `--clear-cache`
+- `date` metadata fields for `deviantart`, `twitter`, and `tumblr` ([#224](https://github.com/mikf/gallery-dl/issues/224), [#232](https://github.com/mikf/gallery-dl/issues/232))
+### Changes
+- Standalone executables are now built using PyInstaller:
+ - uses the latest CPython interpreter (Python 3.7.3)
+ - available on several platforms (Windows, Linux, macOS)
+ - includes the `certifi` CA bundle, `youtube-dl`, and `pyOpenSSL` on Windows
+### Fixes
+- Patch `urllib3`'s default list of SSL/TLS ciphers to prevent Cloudflare CAPTCHAs ([#227](https://github.com/mikf/gallery-dl/issues/227))
+ (Windows users need to install `pyOpenSSL` for this to take effect)
+- Provide fallback URLs for `twitter` images ([#237](https://github.com/mikf/gallery-dl/issues/237))
+- Send `Referer` headers when downloading from `hitomi` ([#239](https://github.com/mikf/gallery-dl/issues/239))
+- Updated login procedure on `mangoxo`
+
+## 1.8.2 - 2019-04-12
+### Additions
+- Support for
+ - `pixnet` - https://www.pixnet.net/ ([#177](https://github.com/mikf/gallery-dl/issues/177))
+ - `wikiart` - https://www.wikiart.org/ ([#179](https://github.com/mikf/gallery-dl/issues/179))
+ - `mangoxo` - https://www.mangoxo.com/ ([#184](https://github.com/mikf/gallery-dl/issues/184))
+ - `yaplog` - https://yaplog.jp/ ([#190](https://github.com/mikf/gallery-dl/issues/190))
+ - `livedoor` - http://blog.livedoor.jp/ ([#190](https://github.com/mikf/gallery-dl/issues/190))
+- Login support for `mangoxo` ([#184](https://github.com/mikf/gallery-dl/issues/184)) and `twitter` ([#214](https://github.com/mikf/gallery-dl/issues/214))
+### Changes
+- Increased required `Requests` version to 2.11.0
+### Fixes
+- Improved image quality on `reactor` sites ([#210](https://github.com/mikf/gallery-dl/issues/210))
+- Support `imagebam` galleries with more than 100 images ([#219](https://github.com/mikf/gallery-dl/issues/219))
+- Updated Cloudflare bypass code
+
+## 1.8.1 - 2019-03-29
+### Additions
+- Support for:
+ - `35photo` - https://35photo.pro/ ([#162](https://github.com/mikf/gallery-dl/issues/162))
+ - `500px` - https://500px.com/ ([#185](https://github.com/mikf/gallery-dl/issues/185))
+- `instagram` extractor for hashtags ([#202](https://github.com/mikf/gallery-dl/issues/202))
+- Option to get more metadata on `deviantart` ([#189](https://github.com/mikf/gallery-dl/issues/189))
+- Man pages and bash completion ([#150](https://github.com/mikf/gallery-dl/issues/150))
+- Snap improvements ([#197](https://github.com/mikf/gallery-dl/issues/197), [#199](https://github.com/mikf/gallery-dl/issues/199), [#207](https://github.com/mikf/gallery-dl/issues/207))
+### Changes
+- Better FFmpeg arguments for `--ugoira-conv`
+- Adjusted metadata for `luscious` albums
+### Fixes
+- Proper handling of `instagram` multi-image posts ([#178](https://github.com/mikf/gallery-dl/issues/178), [#201](https://github.com/mikf/gallery-dl/issues/201))
+- Fixed `tumblr` avatar URLs when not using OAuth1.0 ([#193](https://github.com/mikf/gallery-dl/issues/193))
+- Miscellaneous fixes for `exhentai`, `komikcast`
+
+## 1.8.0 - 2019-03-15
+### Additions
+- Support for:
+ - `weibo` - https://www.weibo.com/
+ - `pururin` - https://pururin.io/ ([#174](https://github.com/mikf/gallery-dl/issues/174))
+ - `fashionnova` - https://www.fashionnova.com/ ([#175](https://github.com/mikf/gallery-dl/issues/175))
+ - `shopify` sites in general ([#175](https://github.com/mikf/gallery-dl/issues/175))
+- Snap packaging ([#169](https://github.com/mikf/gallery-dl/issues/169), [#170](https://github.com/mikf/gallery-dl/issues/170), [#187](https://github.com/mikf/gallery-dl/issues/187), [#188](https://github.com/mikf/gallery-dl/issues/188))
+- Automatic Cloudflare DDoS protection bypass
+- Extractor and Job information for logging format strings
+- `dynastyscans` image and search extractors ([#163](https://github.com/mikf/gallery-dl/issues/163))
+- `deviantart` scraps extractor ([#168](https://github.com/mikf/gallery-dl/issues/168))
+- `artstation` extractor for artwork listings ([#172](https://github.com/mikf/gallery-dl/issues/172))
+- `smugmug` video support and improved image format selection ([#183](https://github.com/mikf/gallery-dl/issues/183))
+### Changes
+- More metadata for `nhentai` galleries
+- Combined `myportfolio` extractors into one
+- Renamed `name` metadata field to `filename` and removed the original `filename` field
+- Simplified and improved internal data structures
+- Optimized creation of child extractors
+### Fixes
+- Filter empty `tumblr` URLs ([#165](https://github.com/mikf/gallery-dl/issues/165))
+- Filter ads and improve connection speed on `hentaifoundry`
+- Show proper error messages if `luscious` galleries are unavailable
+- Miscellaneous fixes for `mangahere`, `ngomik`, `simplyhentai`, `imgspice`
+### Removals
+- `seaotterscans`
+
+## 1.7.0 - 2019-02-05
+- Added support for:
+ - `photobucket` - http://photobucket.com/ ([#117](https://github.com/mikf/gallery-dl/issues/117))
+ - `hentaifox` - https://hentaifox.com/ ([#160](https://github.com/mikf/gallery-dl/issues/160))
+ - `tsumino` - https://www.tsumino.com/ ([#161](https://github.com/mikf/gallery-dl/issues/161))
+- Added the ability to dynamically generate extractors based on a user's config file for
+ - [`mastodon`](https://github.com/tootsuite/mastodon) instances ([#144](https://github.com/mikf/gallery-dl/issues/144))
+ - [`foolslide`](https://github.com/FoolCode/FoOlSlide) based sites
+ - [`foolfuuka`](https://github.com/FoolCode/FoolFuuka) based archives
+- Added an extractor for `behance` collections ([#157](https://github.com/mikf/gallery-dl/issues/157))
+- Added login support for `luscious` ([#159](https://github.com/mikf/gallery-dl/issues/159)) and `tsumino` ([#161](https://github.com/mikf/gallery-dl/issues/161))
+- Added an option to stop downloading if the `exhentai` image limit is exceeded ([#141](https://github.com/mikf/gallery-dl/issues/141))
+- Fixed extraction issues for `behance` and `mangapark`
+
+## 1.6.3 - 2019-01-18
+- Added `metadata` post-processor to write image metadata to an external file ([#135](https://github.com/mikf/gallery-dl/issues/135))
+- Added option to reverse chapter order of manga extractors ([#149](https://github.com/mikf/gallery-dl/issues/149))
+- Added authentication support for `danbooru` ([#151](https://github.com/mikf/gallery-dl/issues/151))
+- Added tag metadata for `exhentai` and `hbrowse` galleries
+- Improved `*reactor` extractors ([#148](https://github.com/mikf/gallery-dl/issues/148))
+- Fixed extraction issues for `nhentai` ([#156](https://github.com/mikf/gallery-dl/issues/156)), `pinterest`, `mangapark`
+
+## 1.6.2 - 2019-01-01
+- Added support for:
+ - `instagram` - https://www.instagram.com/ ([#134](https://github.com/mikf/gallery-dl/issues/134))
+- Added support for multiple items on sta.sh pages ([#113](https://github.com/mikf/gallery-dl/issues/113))
+- Added option to download `tumblr` avatars ([#137](https://github.com/mikf/gallery-dl/issues/137))
+- Changed defaults for visited post types and inline media on `tumblr`
+- Improved inline extraction of `tumblr` posts ([#133](https://github.com/mikf/gallery-dl/issues/133), [#137](https://github.com/mikf/gallery-dl/issues/137))
+- Improved error handling and retry behavior of all API calls
+- Improved handling of missing fields in format strings ([#136](https://github.com/mikf/gallery-dl/issues/136))
+- Fixed hash extraction for unusual `tumblr` URLs ([#129](https://github.com/mikf/gallery-dl/issues/129))
+- Fixed image subdomains for `hitomi` galleries ([#142](https://github.com/mikf/gallery-dl/issues/142))
+- Fixed and improved miscellaneous issues for `kissmanga` ([#20](https://github.com/mikf/gallery-dl/issues/20)), `luscious`, `mangapark`, `readcomiconline`
+
+## 1.6.1 - 2018-11-28
+- Added support for:
+ - `joyreactor` - http://joyreactor.cc/ ([#114](https://github.com/mikf/gallery-dl/issues/114))
+ - `pornreactor` - http://pornreactor.cc/ ([#114](https://github.com/mikf/gallery-dl/issues/114))
+ - `newgrounds` - https://www.newgrounds.com/ ([#119](https://github.com/mikf/gallery-dl/issues/119))
+- Added extractor for search results on `luscious` ([#127](https://github.com/mikf/gallery-dl/issues/127))
+- Fixed filenames of ZIP archives ([#126](https://github.com/mikf/gallery-dl/issues/126))
+- Fixed extraction issues for `gfycat`, `hentaifoundry` ([#125](https://github.com/mikf/gallery-dl/issues/125)), `mangafox`
+
+## 1.6.0 - 2018-11-17
+- Added support for:
+ - `wallhaven` - https://alpha.wallhaven.cc/
+ - `yuki` - https://yuki.la/
+- Added youtube-dl integration and video downloads for `twitter` ([#99](https://github.com/mikf/gallery-dl/issues/99)), `behance`, `artstation`
+- Added per-extractor options for network connections (`retries`, `timeout`, `verify`)
+- Added a `--no-check-certificate` command-line option
+- Added ability to specify the number of skipped downloads before aborting/exiting ([#115](https://github.com/mikf/gallery-dl/issues/115))
+- Added extractors for scraps, favorites, popular and recent images on `hentaifoundry` ([#110](https://github.com/mikf/gallery-dl/issues/110))
+- Improved login procedure for `pixiv` to avoid unwanted emails on each new login
+- Improved album metadata and error handling for `flickr` ([#109](https://github.com/mikf/gallery-dl/issues/109))
+- Updated default User-Agent string to Firefox 62 ([#122](https://github.com/mikf/gallery-dl/issues/122))
+- Fixed `twitter` API response handling when logged in ([#123](https://github.com/mikf/gallery-dl/issues/123))
+- Fixed issue when converting Ugoira using H.264
+- Fixed miscellaneous issues for `2chan`, `deviantart`, `fallenangels`, `flickr`, `imagefap`, `pinterest`, `turboimagehost`, `warosu`, `yuki` ([#112](https://github.com/mikf/gallery-dl/issues/112))
+
+## 1.5.3 - 2018-09-14
+- Added support for:
+ - `hentaicafe` - https://hentai.cafe/ ([#101](https://github.com/mikf/gallery-dl/issues/101))
+ - `bobx` - http://www.bobx.com/dark/
+- Added black-/whitelist options for post-processor modules
+- Added support for `tumblr` inline videos ([#102](https://github.com/mikf/gallery-dl/issues/102))
+- Fixed extraction of `smugmug` albums without owner ([#100](https://github.com/mikf/gallery-dl/issues/100))
+- Fixed issues when using default config values with `reddit` extractors ([#104](https://github.com/mikf/gallery-dl/issues/104))
+- Fixed pagination for user favorites on `sankaku` ([#106](https://github.com/mikf/gallery-dl/issues/106))
+- Fixed a crash when processing `deviantart` journals ([#108](https://github.com/mikf/gallery-dl/issues/108))
+
+## 1.5.2 - 2018-08-31
+- Added support for `twitter` timelines ([#96](https://github.com/mikf/gallery-dl/issues/96))
+- Added option to suppress FFmpeg output during ugoira conversions
+- Improved filename formatter performance
+- Improved inline image quality on `tumblr` ([#98](https://github.com/mikf/gallery-dl/issues/98))
+- Fixed image URLs for newly released `mangadex` chapters
+- Fixed a smaller issue with `deviantart` journals
+- Replaced `subapics` with `ngomik`
+
+## 1.5.1 - 2018-08-17
+- Added support for:
+ - `piczel` - https://piczel.tv/
+- Added support for related pins on `pinterest`
+- Fixed accessing "offensive" galleries on `exhentai` ([#97](https://github.com/mikf/gallery-dl/issues/97))
+- Fixed extraction issues for `mangadex`, `komikcast` and `behance`
+- Removed original-image functionality from `tumblr`, since "raw" images are no longer accessible
+
+## 1.5.0 - 2018-08-03
+- Added support for:
+ - `behance` - https://www.behance.net/
+ - `myportfolio` - https://www.myportfolio.com/ ([#95](https://github.com/mikf/gallery-dl/issues/95))
+- Added custom format string options to handle long strings ([#92](https://github.com/mikf/gallery-dl/issues/92), [#94](https://github.com/mikf/gallery-dl/issues/94))
+ - Slicing: `"{field[10:40]}"`
+ - Replacement: `"{field:L40/too long/}"`
+- Improved frame rate handling for ugoira conversions
+- Improved private access token usage on `deviantart`
+- Fixed metadata extraction for some images on `nijie`
+- Fixed chapter extraction on `mangahere`
+- Removed `whatisthisimnotgoodwithcomputers`
+- Removed support for Python 3.3
+
+## 1.4.2 - 2018-07-06
+- Added image-pool extractors for `safebooru` and `rule34`
+- Added option for extended tag information on `booru` sites ([#92](https://github.com/mikf/gallery-dl/issues/92))
+- Added support for DeviantArt's new URL format
+- Added support for `mangapark` mirrors
+- Changed `imagefap` extractors to use HTTPS
+- Fixed crash when skipping downloads for files without known extension
+
+## 1.4.1 - 2018-06-22
+- Added an `ugoira` post-processor to convert `pixiv` animations to WebM
+- Added `--zip` and `--ugoira-conv` command-line options
+- Changed how ugoira frame information is handled
+ - instead of being written to a separate file, it is now made available as a metadata field of the ZIP archive
+- Fixed manga and chapter titles for `mangadex`
+- Fixed file deletion by post-processors
+
+## 1.4.0 - 2018-06-08
+- Added support for:
+ - `simplyhentai` - https://www.simply-hentai.com/ ([#89](https://github.com/mikf/gallery-dl/issues/89))
+- Added extractors for
+ - `pixiv` search results and followed users
+ - `deviantart` search results and popular listings
+- Added post-processors to perform actions on downloaded files
+- Added options to configure logging behavior
+- Added OAuth support for `smugmug`
+- Changed `pixiv` extractors to use the AppAPI
+ - this breaks `favorite` archive IDs and changes some metadata fields
+- Changed the default filename format for `tumblr` and renamed `offset` to `num`
+- Fixed a possible UnicodeDecodeError during installation ([#86](https://github.com/mikf/gallery-dl/issues/86))
+- Fixed extraction of `mangadex` manga with more than 100 chapters ([#84](https://github.com/mikf/gallery-dl/issues/84))
+- Fixed miscellaneous issues for `imgur`, `reddit`, `komikcast`, `mangafox` and `imagebam`
+
+## 1.3.5 - 2018-05-04
+- Added support for:
+ - `smugmug` - https://www.smugmug.com/
+- Added title information for `mangadex` chapters
+- Improved the `pinterest` API implementation ([#83](https://github.com/mikf/gallery-dl/issues/83))
+- Improved error handling for `deviantart` and `tumblr`
+- Removed `gomanga` and `puremashiro`
+
+## 1.3.4 - 2018-04-20
+- Added support for custom OAuth2 credentials for `pinterest`
+- Improved rate limit handling for `tumblr` extractors
+- Improved `hentaifoundry` extractors
+- Improved `imgur` URL patterns
+- Fixed miscellaneous extraction issues for `luscious` and `komikcast`
+- Removed `loveisover` and `spectrumnexus`
+
+## 1.3.3 - 2018-04-06
+- Added extractors for
+ - `nhentai` search results
+ - `exhentai` search results and favorites
+ - `nijie` doujins and favorites
+- Improved metadata extraction for `exhentai` and `nijie`
+- Improved `tumblr` extractors by avoiding unnecessary API calls
+- Fixed Cloudflare DDoS protection bypass
+- Fixed errors when trying to print unencodable characters
+
+## 1.3.2 - 2018-03-23
+- Added extractors for `artstation` albums, challenges and search results
+- Improved URL and metadata extraction for `hitomi` and `nhentai`
+- Fixed page transitions for `danbooru` API results ([#82](https://github.com/mikf/gallery-dl/issues/82))
+
+## 1.3.1 - 2018-03-16
+- Added support for:
+ - `mangadex` - https://mangadex.org/
+ - `artstation` - https://www.artstation.com/
+- Added Cloudflare DDoS protection bypass to `komikcast` extractors
+- Changed archive ID formats for `deviantart` folders and collections
+- Improved error handling for `deviantart` API calls
+- Removed `imgchili` and various smaller image hosts
+
+## 1.3.0 - 2018-03-02
+- Added `--proxy` to explicitly specify a proxy server ([#76](https://github.com/mikf/gallery-dl/issues/76))
+- Added options to customize [archive ID formats](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorarchive-format) and [undefined replacement fields](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorkeywords-default)
+- Changed various archive ID formats to improve their behavior for favorites / bookmarks / etc.
+ - Affected modules are `deviantart`, `flickr`, `tumblr`, `pixiv` and all …boorus
+- Improved `sankaku` and `idolcomplex` support by
+ - respecting `page` and `next` URL parameters ([#79](https://github.com/mikf/gallery-dl/issues/79))
+ - bypassing the page-limit for unauthenticated users
+- Improved `directlink` metadata by properly unquoting it
+- Fixed `pixiv` ugoira extraction ([#78](https://github.com/mikf/gallery-dl/issues/78))
+- Fixed miscellaneous extraction issues for `mangastream` and `tumblr`
+- Removed `yeet`, `chronos`, `coreimg`, `hosturimage`, `imageontime`, `img4ever`, `imgmaid`, `imgupload`
+
+## 1.2.0 - 2018-02-16
+- Added support for:
+ - `paheal` - https://rule34.paheal.net/ ([#69](https://github.com/mikf/gallery-dl/issues/69))
+ - `komikcast` - https://komikcast.com/ ([#70](https://github.com/mikf/gallery-dl/issues/70))
+ - `subapics` - http://subapics.com/ ([#70](https://github.com/mikf/gallery-dl/issues/70))
+- Added `--download-archive` to record downloaded files in an archive file
+- Added `--write-log` to write logging output to a file
+- Added a filetype check on download completion to fix incorrectly assigned filename extensions ([#63](https://github.com/mikf/gallery-dl/issues/63))
+- Added the `tumblr:...` pseudo URI scheme to support custom domains for Tumblr blogs ([#71](https://github.com/mikf/gallery-dl/issues/71))
+- Added fallback URLs for `tumblr` images ([#64](https://github.com/mikf/gallery-dl/issues/64))
+- Added support for `reddit`-hosted images ([#68](https://github.com/mikf/gallery-dl/issues/68))
+- Improved the input file format by allowing comments and per-URL options
+- Fixed OAuth 1.0 signature generation for Python 3.3 and 3.4 ([#75](https://github.com/mikf/gallery-dl/issues/75))
+- Fixed smaller issues for `luscious`, `hentai2read`, `hentaihere` and `imgur`
+- Removed the `batoto` module
+
+## 1.1.2 - 2018-01-12
+- Added support for:
+ - `puremashiro` - http://reader.puremashiro.moe/ ([#66](https://github.com/mikf/gallery-dl/issues/66))
+ - `idolcomplex` - https://idol.sankakucomplex.com/
+- Added an option to filter reblogs on `tumblr` ([#61](https://github.com/mikf/gallery-dl/issues/61))
+- Added OAuth user authentication for `tumblr` ([#65](https://github.com/mikf/gallery-dl/issues/65))
+- Added support for `slideshare` mobile URLs ([#67](https://github.com/mikf/gallery-dl/issues/67))
+- Improved pagination for various …booru sites to work around page limits
+- Fixed chapter information parsing for certain manga on `kissmanga` ([#58](https://github.com/mikf/gallery-dl/issues/58)) and `batoto` ([#60](https://github.com/mikf/gallery-dl/issues/60))
+
+## 1.1.1 - 2017-12-22
+- Added support for:
+ - `slideshare` - https://www.slideshare.net/ ([#54](https://github.com/mikf/gallery-dl/issues/54))
+- Added pool- and post-extractors for `sankaku`
+- Added OAuth user authentication for `deviantart`
+- Updated `luscious` to support `members.luscious.net` URLs ([#55](https://github.com/mikf/gallery-dl/issues/55))
+- Updated `mangahere` to use their new domain name (mangahere.cc) and support mobile URLs
+- Updated `gelbooru` to not be restricted to the first 20,000 images ([#56](https://github.com/mikf/gallery-dl/issues/56))
+- Fixed extraction issues for `nhentai` and `khinsider`
+
+## 1.1.0 - 2017-12-08
+- Added the ``-r/--limit-rate`` command-line option to set a maximum download rate
+- Added the ``--sleep`` command-line option to specify the number of seconds to sleep before each download
+- Updated `gelbooru` to no longer use their now disabled API
+- Fixed SWF extraction for `sankaku` ([#52](https://github.com/mikf/gallery-dl/issues/52))
+- Fixed extraction issues for `hentai2read` and `khinsider`
+- Removed the deprecated `--images` and `--chapters` options
+- Removed the ``mangazuki`` module
+
+## 1.0.2 - 2017-11-24
+- Added an option to set a [custom user-agent string](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractoruser-agent)
+- Improved retry behavior for failed HTTP requests
+- Improved `seiga` by providing better metadata and getting more than the latest 200 images
+- Improved `tumblr` by adding support for [all post types](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractortumblrposts), scanning for [inline images](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractortumblrinline) and following [external links](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractortumblrexternal) ([#48](https://github.com/mikf/gallery-dl/issues/48))
+- Fixed extraction issues for `hbrowse`, `khinsider` and `senmanga`
+
+## 1.0.1 - 2017-11-10
+- Added support for:
+ - `xvideos` - https://www.xvideos.com/ ([#45](https://github.com/mikf/gallery-dl/issues/45))
+- Fixed exception handling during file downloads which could lead to a premature exit
+- Fixed an issue with `tumblr` where not all images would be downloaded when using tags ([#48](https://github.com/mikf/gallery-dl/issues/48))
+- Fixed extraction issues for `imgbox` ([#47](https://github.com/mikf/gallery-dl/issues/47)), `mangastream` ([#49](https://github.com/mikf/gallery-dl/issues/49)) and `mangahere`
+
+## 1.0.0 - 2017-10-27
+- Added support for:
+ - `warosu` - https://warosu.org/
+ - `b4k` - https://arch.b4k.co/
+- Added support for `pixiv` ranking lists
+- Added support for `booru` popular lists (`danbooru`, `e621`, `konachan`, `yandere`, `3dbooru`)
+- Added the `--cookies` command-line and [`cookies`](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#extractorcookies) config option to load additional cookies
+- Added the `--filter` and `--chapter-filter` command-line options to select individual images or manga-chapters by their metadata using simple Python expressions ([#43](https://github.com/mikf/gallery-dl/issues/43))
+- Added the [`verify`](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#downloaderhttpverify) config option to control certificate verification during file downloads
+- Added config options to overwrite internally used API credentials ([API Tokens & IDs](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#api-tokens-ids))
+- Added `-K` as a shortcut for `--list-keywords`
+- Changed the `--images` and `--chapters` command-line options to `--range` and `--chapter-range`
+- Changed keyword names for various modules to make them accessible by `--filter`. In general minus signs have been replaced with underscores (e.g. `gallery-id` -> `gallery_id`).
+- Changed default filename formats for manga extractors to optionally use volume and title information
+- Improved the downloader modules to use [`.part` files](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst#downloaderpart) and support resuming incomplete downloads ([#29](https://github.com/mikf/gallery-dl/issues/29))
+- Improved `deviantart` by distinguishing between users and groups ([#26](https://github.com/mikf/gallery-dl/issues/26)), always using HTTPS, and always downloading full-sized original images
+- Improved `sankaku` by adding authentication support and fixing various other issues ([#44](https://github.com/mikf/gallery-dl/issues/44))
+- Improved URL pattern for direct image links ([#30](https://github.com/mikf/gallery-dl/issues/30))
+- Fixed an issue with `luscious` not getting original image URLs ([#33](https://github.com/mikf/gallery-dl/issues/33))
+- Fixed various smaller issues for `batoto`, `hentai2read` ([#38](https://github.com/mikf/gallery-dl/issues/38)), `jaiminisbox`, `khinsider`, `kissmanga` ([#28](https://github.com/mikf/gallery-dl/issues/28), [#46](https://github.com/mikf/gallery-dl/issues/46)), `mangahere`, `pawoo`, `twitter`
+- Removed `kisscomic` and `yonkouprod` modules
+
+## 0.9.1 - 2017-07-24
+- Added support for:
+ - `2chan` - https://www.2chan.net/
+ - `4plebs` - https://archive.4plebs.org/
+ - `archivedmoe` - https://archived.moe/
+ - `archiveofsins` - https://archiveofsins.com/
+ - `desuarchive` - https://desuarchive.org/
+ - `fireden` - https://boards.fireden.net/
+ - `loveisover` - https://archive.loveisover.me/
+ - `nyafuu` - https://archive.nyafuu.org/
+ - `rbt` - https://rbt.asia/
+ - `thebarchive` - https://thebarchive.com/
+ - `mangazuki` - https://mangazuki.co/
+- Improved `reddit` to allow submission filtering by ID and human-readable dates
+- Improved `deviantart` to support group galleries and gallery folders ([#26](https://github.com/mikf/gallery-dl/issues/26))
+- Changed `deviantart` to use better default path formats
+- Fixed extraction of larger `imgur` albums
+- Fixed some smaller issues for `pixiv`, `batoto` and `fallenangels`
+
+## 0.9.0 - 2017-06-28
+- Added support for:
+ - `reddit` - https://www.reddit.com/ ([#15](https://github.com/mikf/gallery-dl/issues/15))
+ - `flickr` - https://www.flickr.com/ ([#16](https://github.com/mikf/gallery-dl/issues/16))
+ - `gfycat` - https://gfycat.com/
+- Added support for direct image links
+- Added user authentication via [OAuth](https://github.com/mikf/gallery-dl#52oauth) for `reddit` and `flickr`
+- Added support for user authentication data from [`.netrc`](https://stackoverflow.com/tags/.netrc/info) files ([#22](https://github.com/mikf/gallery-dl/issues/22))
+- Added a simple progress indicator for multiple URLs ([#19](https://github.com/mikf/gallery-dl/issues/19))
+- Added the `--write-unsupported` command-line option to write unsupported URLs to a file
+- Added documentation for all available config options ([configuration.rst](https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst))
+- Improved `pixiv` to support tags for user downloads ([#17](https://github.com/mikf/gallery-dl/issues/17))
+- Improved `pixiv` to support shortened and http://pixiv.me/... URLs ([#23](https://github.com/mikf/gallery-dl/issues/23))
+- Improved `imgur` to properly handle `.gifv` images and provide better metadata
+- Fixed an issue with `kissmanga` where metadata parsing for some series failed ([#20](https://github.com/mikf/gallery-dl/issues/20))
+- Fixed an issue with getting filename extensions from `Content-Type` response headers
+
+## 0.8.4 - 2017-05-21
+- Added the `--abort-on-skip` option to stop extraction if a download would be skipped
+- Improved the output format of the `--list-keywords` option
+- Updated `deviantart` to support all media types and journals
+- Updated `fallenangels` to support their [Vietnamese version](https://truyen.fascans.com/)
+- Fixed an issue with multiple tags on ...booru sites
+- Removed the `yomanga` module
+
+## 0.8.3 - 2017-05-01
+- Added support for https://pawoo.net/
+- Added manga extractors for all [FoOlSlide](https://foolcode.github.io/FoOlSlide/)-based modules
+- Added the `-q/--quiet` and `-v/--verbose` options to control output verbosity
+- Added the `-j/--dump-json` option to dump extractor results in JSON format
+- Added the `--ignore-config` option
+- Updated the `exhentai` extractor to fall back to using the e-hentai version if no username is given
+- Updated `deviantart` to support sta.sh URLs
+- Fixed an issue with `kissmanga` which prevented image URLs from being decrypted properly (again)
+- Fixed an issue with `pixhost` where for an image inside an album it would always download the first image of that album ([#13](https://github.com/mikf/gallery-dl/issues/13))
+- Removed the `mangashare` and `readcomics` modules
+
+## 0.8.2 - 2017-04-10
+- Fixed an issue in `kissmanga` which prevented image URLs from being decrypted properly
+
+## 0.8.1 - 2017-04-09
+- Added new extractors:
+ - `kireicake` - https://reader.kireicake.com/
+ - `seaotterscans` - https://reader.seaotterscans.com/
+- Added a favourites extractor for `deviantart`
+- Re-enabled the `kissmanga` module
+- Updated `nijie` to support multi-page image listings
+- Updated `mangastream` to support readms.net URLs
+- Updated `exhentai` to support e-hentai.org URLs
+- Updated `fallenangels` to support their new domain and site layout
+
+## 0.8.0 - 2017-03-28
+- Added logging support
+- Added the `-R/--retries` option to specify how often a download should be retried before giving up
+- Added the `--http-timeout` option to set a timeout for HTTP connections
+- Improved error handling/tolerance during HTTP file downloads ([#10](https://github.com/mikf/gallery-dl/issues/10))
+- Improved option parsing and the help message from `-h/--help`
+- Changed the way configuration values are used by prioritizing top-level values
+ - This allows for cmdline options like `-u/--username` to overwrite values set in configuration files
+- Fixed an issue with `imagefap.com` where incorrectly reported gallery sizes would cause the extractor to fail ([#9](https://github.com/mikf/gallery-dl/issues/9))
+- Fixed an issue with `seiga.nicovideo.jp` where invalid characters in an API response caused the XML parser to fail
+- Fixed an issue with `seiga.nicovideo.jp` where the filename extension for the first image would be used for all others
+- Removed support for old configuration paths on Windows
+- Removed several modules:
+ - `mangamint`: site is down
+ - `whentai`: now requires account with VIP status for original images
+ - `kissmanga`: encrypted image URLs (will be re-added later)
+
+## 0.7.0 - 2017-03-06
+- Added `--images` and `--chapters` options
+ - Specifies which images (or chapters) to download through a comma-separated list of indices or index-ranges
+ - Example: `--images -2,4,6-8,10-` will select images with index 1, 2, 4, 6, 7, 8 and 10 up to the last one
+- Changed the `-g`/`--get-urls` option
+ - The number of times the -g option is given now determines up to which level URLs are resolved.
+ - See 3bca86618505c21628cd9c7179ce933a78d00ca2
+- Changed several option keys:
+ - `directory_fmt` -> `directory`
+ - `filename_fmt` -> `filename`
+ - `download-original` -> `original`
+- Improved [FoOlSlide](https://foolcode.github.io/FoOlSlide/)-based extractors
+- Fixed URL extraction for hentai2read
+- Fixed an issue with deviantart, where the API access token wouldn't get refreshed
+
+## 0.6.4 - 2017-02-13
+- Added new extractors:
+ - fallenangels (famatg.com)
+- Fixed URL and data extraction for:
+ - nhentai
+ - mangamint
+ - twitter
+ - imagetwist
+- Disabled InsecureConnectionWarning when no certificates are available
+
+## 0.6.3 - 2017-01-25
+- Added new extractors:
+ - gomanga
+ - yomanga
+ - mangafox
+- Fixed deviantart extractor failing - switched to using their API
+- Fixed an issue with SQLite on Python 3.6
+- Automated test builds via Travis CI
+- Standalone executables for Windows
+
+## 0.6.2 - 2017-01-05
+- Added new extractors:
+ - kisscomic
+ - readcomics
+ - yonkouprod
+ - jaiminisbox
+- Added manga extractor to batoto-module
+- Added user extractor to seiga-module
+- Added `-i`/`--input-file` argument to allow local files and stdin as input (like wget)
+- Added basic support for `file://` URLs
+ - this allows for the recursive extractor to be applied to local files:
+ - `$ gallery-dl r:file://[path to file]`
+- Added a utility extractor to run unit test URLs
+- Updated luscious to deal with API changes
+- Fixed twitter to provide the original image URL
+- Minor fixes to hentaifoundry
+- Removed imgclick extractor
+
+## 0.6.1 - 2016-11-30
+- Added new extractors:
+ - whentai
+ - readcomiconline
+ - sensescans, worldthree
+ - imgmaid, imagevenue, img4ever, imgspot, imgtrial, pixhost
+- Added base class for extractors of [FoOlSlide](https://foolcode.github.io/FoOlSlide/)-based sites
+- Changed default paths for configuration files on Windows
+ - old paths are still supported, but that will change in future versions
+- Fixed aborting downloads if a single one failed ([#5](https://github.com/mikf/gallery-dl/issues/5))
+- Fixed cloudflare-bypass cache containing outdated cookies
+- Fixed image URLs for hitomi and 8chan
+- Updated deviantart to always provide the highest quality image
+- Updated README.rst
+- Removed doujinmode extractor
+
+## 0.6.0 - 2016-10-08
+- Added new extractors:
+ - hentaihere
+ - dokireader
+ - twitter
+ - rapidimg, picmaniac
+- Added support to find filename extensions by Content-Type response header
+- Fixed filename/path issues on Windows ([#4](https://github.com/mikf/gallery-dl/issues/4)):
+ - Enable path names with more than 260 characters
+ - Remove trailing spaces in path segments
+- Updated Job class to automatically set category/subcategory keywords
+
+## 0.5.2 - 2016-09-23
+- Added new extractors:
+ - pinterest
+ - rule34
+ - dynastyscans
+ - imagebam, coreimg, imgcandy, imgtrex
+- Added login capabilities for batoto
+- Added `--version` cmdline argument to print the current program version and exit
+- Added `--list-extractors` cmdline argument to print names of all extractor classes together with descriptions and example URLs
+- Added proper error messages if an image/user does not exist
+- Added unittests for every extractor
+
+## 0.5.1 - 2016-08-22
+- Added new extractors:
+ - luscious
+ - doujinmode
+ - hentaibox
+ - seiga
+ - imagefap
+- Changed error output to use stderr instead of stdout
+- Fixed broken pipes causing an exception-dump by catching BrokenPipeErrors
+
+## 0.5.0 - 2016-07-25
+
+## 0.4.1 - 2015-12-03
+- New modules (imagetwist, turboimagehost)
+- Manga-extractors: Download entire manga and not just single chapters
+- Generic extractor (provisional)
+- Better and configurable console output
+- Windows support
+
+## 0.4.0 - 2015-11-26
+
+## 0.3.3 - 2015-11-10
+
+## 0.3.2 - 2015-11-04
+
+## 0.3.1 - 2015-10-30
+
+## 0.3.0 - 2015-10-05
+
+## 0.2.0 - 2015-06-28
+
+## 0.1.0 - 2015-05-27
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5a98fcd
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,45 @@
+
+PREFIX ?= /usr/local
+BINDIR ?= $(PREFIX)/bin
+MANDIR ?= $(PREFIX)/man
+SHAREDIR ?= $(PREFIX)/share
+PYTHON ?= /usr/bin/env python3
+
+# set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
+SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi)
+
+all: man completion docs/supportedsites.rst
+
+clean:
+ $(RM) gallery-dl.1 gallery-dl.conf.5 gallery-dl.bash_completion
+ $(RM) -r build/
+
+install: man completion
+ $(PYTHON) setup.py install
+
+release: man completion docs/supportedsites.rst
+ scripts/release.sh
+
+test:
+ scripts/run_tests.sh
+
+executable:
+ scripts/pyinstaller.py
+
+completion: gallery-dl.bash_completion
+
+man: gallery-dl.1 gallery-dl.conf.5
+
+.PHONY: all clean install release test executable completion man
+
+docs/supportedsites.rst: gallery_dl/*/*.py scripts/supportedsites.py
+ $(PYTHON) scripts/supportedsites.py
+
+gallery-dl.1: gallery_dl/option.py scripts/man.py
+ $(PYTHON) scripts/man.py
+
+gallery-dl.conf.5: docs/configuration.rst scripts/man.py
+ $(PYTHON) scripts/man.py
+
+gallery-dl.bash_completion: gallery_dl/option.py scripts/bash_completion.py
+ $(PYTHON) scripts/bash_completion.py
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..873c034
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,244 @@
+==========
+gallery-dl
+==========
+
+*gallery-dl* is a command-line program to download image galleries and
+collections from several image hosting sites (see `Supported Sites`_).
+It is a cross-platform tool with many configuration options
+and powerful filenaming capabilities.
+
+
+|pypi| |build| |gitter|
+
+
+Dependencies
+============
+
+- Python_ 3.4+
+- Requests_
+
+Optional
+--------
+
+- FFmpeg_: Pixiv Ugoira to WebM conversion
+- youtube-dl_: Video downloads
+
+
+Installation
+============
+
+Pip
+---
+
+The stable releases of *gallery-dl* are distributed on PyPI_ and can be
+easily installed or upgraded using pip_:
+
+.. code:: bash
+
+ $ pip install --upgrade gallery-dl
+
+Installing the latest dev-version directly from GitHub can be done with
+pip_ as well:
+
+.. code:: bash
+
+ $ pip install --upgrade https://github.com/mikf/gallery-dl/archive/master.zip
+
+Be sure the Python interpreter used for pip_ is version 3.4 or higher.
+You might have to use :code:`pip3` or :code:`python3 -m pip`
+depending on your system's defaults.
+
+
+From Source
+-----------
+
+Get the code by either
+
+* Downloading a stable_ or dev_ archive and unpacking it
+* Or via :code:`git clone https://github.com/mikf/gallery-dl.git`
+
+Navigate into the respective directory and run the :code:`setup.py` file.
+
+.. code:: bash
+
+ $ wget https://github.com/mikf/gallery-dl/archive/master.zip
+ $ unzip master.zip
+ # or
+ $ git clone https://github.com/mikf/gallery-dl.git
+
+ $ cd gallery-dl
+ $ python setup.py install
+
+
+Standalone Executable
+---------------------
+
+Download a standalone executable file,
+put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
+and run it inside a command prompt (like ``cmd.exe``).
+
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.8.7/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.8.7/gallery-dl.bin>`__
+
+These executables include a Python 3.7 interpreter
+and all required Python packages.
+
+
+Snap
+----
+
+Linux users on a distribution supported by Snapd_ can install *gallery-dl* from the Snap Store:
+
+.. code:: bash
+
+ $ snap install gallery-dl
+
+
+Usage
+=====
+
+To use *gallery-dl*, simply call it with the URLs you wish to download images
+from:
+
+.. code:: bash
+
+ $ gallery-dl [OPTION]... URL...
+
+See also :code:`gallery-dl --help`.
+
+
+Examples
+--------
+
+Download images; in this case from danbooru via tag search for 'bonocho':
+
+.. code:: bash
+
+ $ gallery-dl http://danbooru.donmai.us/posts?tags=bonocho
+
+
+Get the direct URL of an image from a site that requires authentication:
+
+.. code:: bash
+
+ $ gallery-dl -g -u <username> -p <password> http://seiga.nicovideo.jp/seiga/im3211703
+
+
+| Search a remote resource for URLs and download images from them:
+| (URLs for which no extractor can be found will be silently ignored)
+
+.. code:: bash
+
+ $ gallery-dl r:https://pastebin.com/raw/FLwrCYsT
+
+
+Configuration
+=============
+
+Configuration files for *gallery-dl* use a JSON-based file format.
+
+| For a (more or less) complete example with options set to their default values,
+ see gallery-dl.conf_.
+| For a configuration file example with more involved settings and options,
+ see gallery-dl-example.conf_.
+| A list of all available configuration options and their
+ descriptions can be found in configuration.rst_.
+
+*gallery-dl* searches for configuration files in the following places:
+
++--------------------------------------------+------------------------------------------+
+| Linux                                      | Windows                                  |
++--------------------------------------------+------------------------------------------+
+|* ``/etc/gallery-dl.conf``                  |*                                         |
+|* ``${HOME}/.config/gallery-dl/config.json``|* ``%USERPROFILE%\gallery-dl\config.json``|
+|* ``${HOME}/.gallery-dl.conf``              |* ``%USERPROFILE%\gallery-dl.conf``       |
++--------------------------------------------+------------------------------------------+
+
+(``%USERPROFILE%`` usually refers to the user's home directory,
+i.e. ``C:\Users\<username>\``)
+
+Values in later configuration files will override previous ones.
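+
+As a starting point, a minimal configuration file could look like the sketch
+below (the directory and filename values are only placeholders; the available
+filename keys depend on the extractor used):
+
+.. code::
+
+    {
+        "extractor": {
+            "base-directory": "~/downloads/gallery-dl/",
+            "filename": "{category}_{filename}.{extension}"
+        }
+    }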
+
+
+Authentication
+==============
+
+Username & Password
+-------------------
+
+Some extractors require you to provide valid login credentials in the form of
+a username & password pair.
+This is necessary for ``pixiv``, ``nijie`` and ``seiga``
+and optional (but strongly recommended) for ``exhentai``, ``luscious``,
+``sankaku``, ``idolcomplex``, ``tsumino`` and ``wallhaven``.
+
+You can set the necessary information in your configuration file
+(cf. gallery-dl.conf_)
+
+.. code::
+
+ {
+ "extractor": {
+ ...
+ "pixiv": {
+ "username": "<username>",
+ "password": "<password>"
+ }
+ ...
+ }
+ }
+
+or you can provide them directly via the
+:code:`-u/--username` and :code:`-p/--password` or via the
+:code:`-o/--option` command-line options
+
+.. code:: bash
+
+ $ gallery-dl -u <username> -p <password> URL
+ $ gallery-dl -o username=<username> -o password=<password> URL
+
+OAuth
+-----
+
+*gallery-dl* supports user authentication via OAuth_ for
+``deviantart``, ``flickr``, ``reddit``, ``smugmug`` and ``tumblr``.
+This is entirely optional, but grants *gallery-dl* the ability
+to issue requests on your account's behalf and enables it to access resources
+which would otherwise be unavailable to a public user.
+
+To link your account to *gallery-dl*, start by invoking it with
+``oauth:<site-name>`` as an argument. For example:
+
+.. code:: bash
+
+ $ gallery-dl oauth:flickr
+
+You will be sent to the site's authorization page and asked to grant read
+access to *gallery-dl*. Authorize it and you will be shown one or more
+"tokens", which should be added to your configuration file.
+
+
+.. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
+.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
+.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
+.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.8.7.zip
+.. _dev: https://github.com/mikf/gallery-dl/archive/master.zip
+
+.. _Python: https://www.python.org/downloads/
+.. _PyPI: https://pypi.org/
+.. _pip: https://pip.pypa.io/en/stable/
+.. _Requests: http://docs.python-requests.org/en/master/
+.. _FFmpeg: https://www.ffmpeg.org/
+.. _youtube-dl: https://ytdl-org.github.io/youtube-dl/
+.. _Snapd: https://docs.snapcraft.io/installing-snapd
+.. _OAuth: https://en.wikipedia.org/wiki/OAuth
+
+.. |pypi| image:: https://img.shields.io/pypi/v/gallery-dl.svg
+ :target: https://pypi.org/project/gallery-dl/
+
+.. |build| image:: https://travis-ci.org/mikf/gallery-dl.svg?branch=master
+ :target: https://travis-ci.org/mikf/gallery-dl
+
+.. |gitter| image:: https://badges.gitter.im/gallery-dl/main.svg
+ :target: https://gitter.im/gallery-dl/main
diff --git a/bin/gallery-dl b/bin/gallery-dl
new file mode 100755
index 0000000..12da2fd
--- /dev/null
+++ b/bin/gallery-dl
@@ -0,0 +1,7 @@
+#!/usr/bin/env python3
+# -*- coding: UTF-8 -*-
+
+import gallery_dl
+
+if __name__ == '__main__':
+ gallery_dl.main()
diff --git a/docs/configuration.rst b/docs/configuration.rst
new file mode 100644
index 0000000..c606c6c
--- /dev/null
+++ b/docs/configuration.rst
@@ -0,0 +1,1615 @@
+Configuration
+#############
+
+Contents
+========
+
+1) `Extractor Options`_
+2) `Extractor-specific Options`_
+3) `Downloader Options`_
+4) `Output Options`_
+5) `Postprocessor Options`_
+6) `Miscellaneous Options`_
+7) `API Tokens & IDs`_
+
+
+
+Extractor Options
+=================
+
+
+Each extractor is identified by its ``category`` and ``subcategory``.
+The ``category`` is the lowercase site name without any spaces or special
+characters, which is usually just the module name
+(``pixiv``, ``danbooru``, ...).
+The ``subcategory`` is a lowercase word describing the general functionality
+of that extractor (``user``, ``favorite``, ``manga``, ...).
+
+Each one of the following options can be specified on multiple levels of the
+configuration tree:
+
+================== =====
+Base level: ``extractor.<option-name>``
+Category level: ``extractor.<category>.<option-name>``
+Subcategory level: ``extractor.<category>.<subcategory>.<option-name>``
+================== =====
+
+A value in a "deeper" level hereby overrides a value of the same name on a
+lower level. Setting the ``extractor.pixiv.filename`` value, for example, lets
+you specify a general filename pattern for all the different pixiv extractors.
+Using the ``extractor.pixiv.user.filename`` value lets you override this
+general pattern specifically for ``PixivUserExtractor`` instances.
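+
+As an illustration, the following sketch sets a default filename pattern for
+all extractors, overrides it for everything from pixiv, and overrides it once
+more for pixiv user galleries only (the patterns themselves are placeholders):
+
+.. code::
+
+    {
+        "extractor": {
+            "filename": "{category}_{filename}.{extension}",
+            "pixiv": {
+                "filename": "{id}_{title}.{extension}",
+                "user": {
+                    "filename": "{id}_p{num}.{extension}"
+                }
+            }
+        }
+    }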
+
+The ``category`` and ``subcategory`` of all extractors are included in the
+output of ``gallery-dl --list-extractors``. For a specific URL these values
+can also be determined by using the ``-K``/``--list-keywords`` command-line
+option (see the example below).
+
+extractor.*.filename
+--------------------
+=========== =====
+Type ``string``
+Example ``"{manga}_c{chapter}_{page:>03}.{extension}"``
+Description A `format string`_ to build the resulting filename
+ for a downloaded file.
+
+ The available replacement keys depend on the extractor used. A list
+ of keys for a specific one can be acquired by calling *gallery-dl*
+ with the ``-K``/``--list-keywords`` command-line option.
+ For example:
+
+ .. code::
+
+ $ gallery-dl -K http://seiga.nicovideo.jp/seiga/im5977527
+ Keywords for directory names:
+ -----------------------------
+ category
+ seiga
+ subcategory
+ image
+
+ Keywords for filenames:
+ -----------------------
+ category
+ seiga
+ extension
+ None
+ image-id
+ 5977527
+ subcategory
+ image
+
+ Note: Even if the value of the ``extension`` key is missing or
+ ``None``, it will be filled in later when the file download
+ starts. This key is therefore always available to provide
+ a valid filename extension.
+=========== =====
+
+
+extractor.*.directory
+---------------------
+=========== =====
+Type ``list`` of ``strings``
+Example ``["{category}", "{manga}", "c{chapter} - {title}"]``
+Description A list of `format strings`_ for the resulting target directory.
+
+ Each individual string in such a list represents a single path
+ segment. All segments are joined together and appended to the
+ base-directory_ to form the complete target directory path.
+=========== =====
+
+
+extractor.*.base-directory
+--------------------------
+=========== =====
+Type |Path|_
+Default ``"./gallery-dl/"``
+Description Directory path used as the base for all download destinations.
+=========== =====
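+
+Putting the three options above together, a manga-oriented setup could look
+like the following sketch, which reuses the example values from above:
+
+.. code::
+
+    {
+        "extractor": {
+            "base-directory": "./gallery-dl/",
+            "directory": ["{category}", "{manga}", "c{chapter} - {title}"],
+            "filename": "{manga}_c{chapter}_{page:>03}.{extension}"
+        }
+    }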
+
+
+extractor.*.skip
+----------------
+=========== =====
+Type ``bool`` or ``string``
+Default ``true``
+Description Controls the behavior when downloading files whose filename
+ already exists.
+
+ * ``true``: Skip downloads
+ * ``false``: Overwrite already existing files
+
+ * ``"abort"``: Abort the current extractor run
+ * ``"abort:N"``: Skip downloads and abort extractor run
+ after ``N`` consecutive skips
+
+ * ``"exit"``: Exit the program altogether
+ * ``"exit:N"``: Skip downloads and exit the program
+ after ``N`` consecutive skips
+=========== =====
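+
+For instance, to skip already existing files but give up on an extractor run
+after 3 consecutive skips (a sketch; the number is arbitrary):
+
+.. code::
+
+    {
+        "extractor": {
+            "skip": "abort:3"
+        }
+    }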
+
+
+extractor.*.sleep
+-----------------
+=========== =====
+Type ``float``
+Default ``0``
+Description Number of seconds to sleep before each download.
+=========== =====
+
+
+extractor.*.username & .password
+--------------------------------
+=========== =====
+Type ``string``
+Default ``null``
+Description The username and password to use when attempting to log in to
+ another site.
+
+ Specifying username and password is
+ required for the ``pixiv``, ``nijie`` and ``seiga`` modules and
+ optional (but strongly recommended) for ``danbooru``, ``exhentai``,
+ ``sankaku`` and ``idolcomplex``.
+
+ These values can also be set via the ``-u/--username`` and
+ ``-p/--password`` command-line options or by using a |.netrc|_ file.
+ (see Authentication_)
+
+ Note: The password for ``danbooru`` is the API key found in your
+ user profile, not the password for your account.
+=========== =====
+
+
+extractor.*.netrc
+-----------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Enable the use of |.netrc|_ authentication data.
+=========== =====
+
+
+extractor.*.cookies
+-------------------
+=========== =====
+Type |Path|_ or ``object``
+Default ``null``
+Description Source to read additional cookies from.
+
+ * If this is a |Path|_, it specifies a
+ Mozilla/Netscape format cookies.txt file.
+ * If this is an ``object``, its key-value pairs, which should both
+ be ``strings``, will be used as cookie-names and -values.
+=========== =====
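+
+Both forms as a sketch; the category names, file path, and cookie name/value
+are placeholders:
+
+.. code::
+
+    {
+        "extractor": {
+            "somesite": {
+                "cookies": "~/cookies.txt"
+            },
+            "othersite": {
+                "cookies": {
+                    "session": "<cookie value>"
+                }
+            }
+        }
+    }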
+
+
+extractor.*.proxy
+-----------------
+=========== =====
+Type ``string`` or ``object``
+Default ``null``
+Description Proxy (or proxies) to be used for remote connections.
+
+ * If this is a ``string``, it is the proxy URL for all
+ outgoing requests.
+ * If this is an ``object``, it is a scheme-to-proxy mapping to
+ specify different proxy URLs for each scheme.
+ It is also possible to set a proxy for a specific host by using
+ ``scheme://host`` as key.
+ See `Requests' proxy documentation`_ for more details.
+
+ Example:
+
+ .. code::
+
+ {
+ "http": "http://10.10.1.10:3128",
+ "https": "http://10.10.1.10:1080",
+ "http://10.20.1.128": "http://10.10.1.10:5323"
+ }
+
+ Note: All proxy URLs should include a scheme,
+ otherwise ``http://`` is assumed.
+=========== =====
+
+
+extractor.*.user-agent
+----------------------
+=========== =====
+Type ``string``
+Default ``"Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0"``
+Description User-Agent header value to be used for HTTP requests.
+
+ Note: This option has no effect on ``pixiv`` and
+ ``readcomiconline`` extractors, as these need specific values to
+ function correctly.
+=========== =====
+
+
+extractor.*.keywords
+--------------------
+=========== =====
+Type ``object``
+Example ``{"type": "Pixel Art", "type_id": 123}``
+Description Additional key-value pairs to be added to each metadata dictionary.
+=========== =====
+
+
+extractor.*.keywords-default
+----------------------------
+=========== =====
+Type any
+Default ``"None"``
+Description Default value used for missing or undefined keyword names in
+ format strings.
+=========== =====
+
+
+extractor.*.category-transfer
+-----------------------------
+=========== =====
+Type ``bool``
+Default Extractor-specific
+Description Transfer an extractor's (sub)category values to all child
+ extractors spawned by it, to let them inherit their parent's
+ config options.
+=========== =====
+
+
+extractor.*.archive
+-------------------
+=========== =====
+Type |Path|_
+Default ``null``
+Description File to store IDs of downloaded files in. Downloads of files
+ already recorded in this archive file will be skipped_.
+
+ The resulting archive file is not a plain text file but an SQLite3
+ database: compared to a plain text file, either lookup operations
+ are significantly faster or memory requirements are significantly
+ lower once the number of stored IDs gets reasonably large.
+=========== =====
+
+
+extractor.*.archive-format
+--------------------------
+=========== =====
+Type ``string``
+Example ``"{id}_{offset}"``
+Description An alternative `format string`_ to build archive IDs with.
+=========== =====
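+
+A sketch using both options together; the archive path is a placeholder and
+the format string is the example from above:
+
+.. code::
+
+    {
+        "extractor": {
+            "archive": "~/gallery-dl/archive.sqlite3",
+            "archive-format": "{id}_{offset}"
+        }
+    }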
+
+
+extractor.*.postprocessors
+--------------------------
+=========== =====
+Type ``list`` of |Postprocessor Configuration|_ objects
+Example .. code::
+
+ [
+ {"name": "zip", "compression": "zip"},
+ {"name": "exec", "command": ["/home/foobar/script", "{category}", "{image_id}"]}
+ ]
+
+Description A list of post-processors to be applied to each downloaded file
+ in the same order as they are specified.
+=========== =====
+
+
+extractor.*.retries
+-------------------
+=========== =====
+Type ``integer``
+Default ``5``
+Description Number of times a failed HTTP request is retried before giving up.
+=========== =====
+
+
+extractor.*.timeout
+-------------------
+=========== =====
+Type ``float`` or ``null``
+Default ``30``
+Description Amount of time (in seconds) to wait for a successful connection
+ and response from a remote server.
+
+ This value gets internally used as the |timeout|_ parameter for the
+ |requests.request()|_ method.
+=========== =====
+
+
+extractor.*.verify
+------------------
+=========== =====
+Type ``bool`` or ``string``
+Default ``true``
+Description Controls whether to verify SSL/TLS certificates for HTTPS requests.
+
+ If this is a ``string``, it must be the path to a CA bundle to use
+ instead of the default certificates.
+
+ This value gets internally used as the |verify|_ parameter for the
+ |requests.request()|_ method.
+=========== =====
+
+
+extractor.*.image-range
+-----------------------
+=========== =====
+Type ``string``
+Example | ``"10-20"``,
+ | ``"-5, 10, 30-50, 100-"``
+Description Index-range(s) specifying which images to download.
+
+ Note: The index of the first image is ``1``.
+=========== =====
+
+
+extractor.*.chapter-range
+-------------------------
+=========== =====
+Type ``string``
+Description Like `image-range`__, but applies to delegated URLs
+ like manga-chapters, etc.
+=========== =====
+
+__ `extractor.*.image-range`_
+
+
+extractor.*.image-filter
+------------------------
+=========== =====
+Type ``string``
+Example | ``"width >= 1200 and width/height > 1.2"``,
+ | ``"re.search(r'foo(bar)+', description)"``
+Description | Python expression controlling which images to download.
+ | Files for which the expression evaluates to ``False``
+ are ignored.
+ | Available keys are the filename-specific ones listed
+ by ``-K`` or ``-j``.
+=========== =====
+
+
+extractor.*.chapter-filter
+--------------------------
+=========== =====
+Type ``string``
+Description Like `image-filter`__, but applies to delegated URLs
+ like manga-chapters, etc.
+=========== =====
+
+__ `extractor.*.image-filter`_
+
+
+
+Extractor-specific Options
+==========================
+
+
+extractor.artstation.external
+-----------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Try to follow external URLs of embedded players.
+=========== =====
+
+
+extractor.deviantart.extra
+--------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Download extra Sta.sh resources from description texts.
+
+ Note: Enabling this option also enables deviantart.metadata_.
+=========== =====
+
+
+extractor.deviantart.flat
+-------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Select the directory structure created by the Gallery- and
+ Favorite-Extractors.
+
+ * ``true``: Use a flat directory structure.
+ * ``false``: Collect a list of all gallery-folders or
+ favorites-collections and transfer any further work to other
+ extractors (``folder`` or ``collection``), which will then
+ create individual subdirectories for each of them.
+=========== =====
+
+
+extractor.deviantart.folders
+----------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Provide a ``folders`` metadata field that contains the names of all
+ folders a deviation is present in.
+
+ Note: Gathering this information requires a lot of API calls.
+ Use with caution.
+=========== =====
+
+
+extractor.deviantart.journals
+-----------------------------
+=========== =====
+Type ``string``
+Default ``"html"``
+Description Selects the output format of journal entries.
+
+ * ``"html"``: HTML with (roughly) the same layout as on DeviantArt.
+ * ``"text"``: Plain text with image references and HTML tags removed.
+ * ``"none"``: Don't download journals.
+=========== =====
+
+
+extractor.deviantart.mature
+---------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Enable mature content.
+
+ This option simply sets the |mature_content|_ parameter for API
+ calls to either ``"true"`` or ``"false"`` and does not do any other
+ form of content filtering.
+=========== =====
+
+
+extractor.deviantart.metadata
+-----------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Request extended metadata for deviation objects to additionally
+ provide ``description``, ``tags``, ``license`` and ``is_watching``
+ fields.
+=========== =====
+
+
+extractor.deviantart.original
+-----------------------------
+=========== =====
+Type ``bool`` or ``string``
+Default ``true``
+Description Download original files if available.
+
+ Setting this option to ``"images"`` only downloads original
+ files if they are images and falls back to preview versions for
+ everything else (archives, etc.).
+=========== =====
+
+
+extractor.deviantart.refresh-token
+----------------------------------
+=========== =====
+Type ``string``
+Default ``null``
+Description The ``refresh_token`` value you get from linking your
+ DeviantArt account to *gallery-dl*.
+
+ Using a ``refresh_token`` allows you to access private or otherwise
+ not publicly available deviations.
+=========== =====
+
+
+extractor.deviantart.wait-min
+-----------------------------
+=========== =====
+Type ``integer``
+Default ``0``
+Description Minimum wait time in seconds before API requests.
+
+ Note: This value will internally be rounded up
+ to the next power of 2.
+=========== =====
+
+
+extractor.exhentai.limits
+-------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Check image download limits
+ and stop extraction when they are exceeded.
+=========== =====
+
+
+extractor.exhentai.original
+---------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Download full-sized original images if available.
+=========== =====
+
+
+extractor.exhentai.wait-min & .wait-max
+---------------------------------------
+=========== =====
+Type ``float``
+Default ``3.0`` and ``6.0``
+Description Minimum and maximum wait time in seconds between each image download.
+
+ ExHentai detects and blocks automated downloaders.
+ *gallery-dl* waits a randomly selected number of
+ seconds between ``wait-min`` and ``wait-max`` after
+ each image to prevent getting blocked.
+=========== =====
+
+
+extractor.flickr.access-token & .access-token-secret
+----------------------------------------------------
+=========== =====
+Type ``string``
+Default ``null``
+Description The ``access_token`` and ``access_token_secret`` values you get
+ from linking your Flickr account to *gallery-dl*.
+=========== =====
+
+
+extractor.flickr.videos
+-----------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Extract and download videos.
+=========== =====
+
+
+extractor.flickr.size-max
+--------------------------
+=========== =====
+Type ``integer`` or ``string``
+Default ``null``
+Description Sets the maximum allowed size for downloaded images.
+
+ * If this is an ``integer``, it specifies the maximum image dimension
+ (width and height) in pixels.
+ * If this is a ``string``, it should be one of Flickr's format specifiers
+ (``"Original"``, ``"Large"``, ... or ``"o"``, ``"k"``, ``"h"``,
+ ``"l"``, ...) to use as an upper limit.
+=========== =====
+
+
+extractor.gelbooru.api
+----------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Enable use of Gelbooru's API.
+
+ Set this value to ``false`` if the API has been disabled,
+ to switch to manual information extraction instead.
+=========== =====
+
+
+extractor.gfycat.format
+-----------------------
+=========== =====
+Type ``string``
+Default ``"mp4"``
+Description The name of the preferred animation format, which can be one of
+ ``"mp4"``, ``"webm"``, ``"gif"``, ``"webp"`` or ``"mjpg"``.
+
+ If the selected format is not available, ``"mp4"``, ``"webm"``
+ and ``"gif"`` (in that order) will be tried instead, until an
+ available format is found.
+=========== =====
+
+
+extractor.imgur.mp4
+-------------------
+=========== =====
+Type ``bool`` or ``string``
+Default ``true``
+Description Controls whether to choose the GIF or MP4 version of an animation.
+
+ * ``true``: Follow Imgur's advice and choose MP4 if the
+ ``prefer_video`` flag in an image's metadata is set.
+ * ``false``: Always choose GIF.
+ * ``"always"``: Always choose MP4.
+=========== =====
+
+
+extractor.kissmanga.captcha
+---------------------------
+=========== =====
+Type ``string``
+Default ``"stop"``
+Description Controls how to handle redirects to CAPTCHA pages.
+
+ * ``"stop``: Stop the current extractor run.
+ * ``"wait``: Ask the user to solve the CAPTCHA and wait.
+=========== =====
+
+
+extractor.oauth.browser
+-----------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Controls how a user is directed to an OAuth authorization site.
+
+ * ``true``: Use Python's |webbrowser.open()|_ method to automatically
+ open the URL in the user's browser.
+ * ``false``: Ask the user to copy & paste a URL from the terminal.
+=========== =====
+
+
+extractor.photobucket.subalbums
+-------------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Download subalbums.
+=========== =====
+
+
+extractor.pixiv.ugoira
+----------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Download Pixiv's Ugoira animations or ignore them.
+
+ These animations come as a ``.zip`` file containing all the single
+ animation frames in JPEG format.
+=========== =====
+
+
+extractor.plurk.comments
+------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Also search Plurk comments for URLs.
+=========== =====
+
+
+extractor.reactor.wait-min & .wait-max
+--------------------------------------
+=========== =====
+Type ``float``
+Default ``3.0`` and ``6.0``
+Description Minimum and maximum wait time in seconds between HTTP requests
+ during the extraction process.
+=========== =====
+
+
+extractor.readcomiconline.captcha
+---------------------------------
+=========== =====
+Type ``string``
+Default ``"stop"``
+Description Controls how to handle redirects to CAPTCHA pages.
+
+ * ``"stop``: Stop the current extractor run.
+ * ``"wait``: Ask the user to solve the CAPTCHA and wait.
+=========== =====
+
+
+extractor.recursive.blacklist
+-----------------------------
+=========== =====
+Type ``list`` of ``strings``
+Default ``["directlink", "oauth", "recursive", "test"]``
+Description A list of extractor categories which should be ignored when using
+ the ``recursive`` extractor.
+=========== =====
+
+
+extractor.reddit.comments
+-------------------------
+=========== =====
+Type ``integer`` or ``string``
+Default ``500``
+Description The value of the ``limit`` parameter when loading
+ a submission and its comments.
+ This number (roughly) specifies the total number of comments
+ being retrieved with the first API call.
+
+ Reddit's internal default and maximum values for this parameter
+ appear to be 200 and 500 respectively.
+
+ The value ``0`` ignores all comments and significantly reduces the
+ time required when scanning a subreddit.
+=========== =====
+
+
+extractor.reddit.morecomments
+-----------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Retrieve additional comments by resolving the ``more`` comment
+ stubs in the base comment tree.
+
+ This requires 1 additional API call for every 100 extra comments.
+=========== =====
+
+
+extractor.reddit.date-min & .date-max
+-------------------------------------
+=========== =====
+Type ``integer`` or ``string``
+Default ``0`` and ``253402210800`` (timestamp of |datetime.max|_)
+Description Ignore all submissions posted before/after this date.
+
+ * If this is an ``integer``, it represents the date as UTC timestamp.
+ * If this is a ``string``, it will get parsed according to date-format_.
+=========== =====
+
+
+extractor.reddit.date-format
+----------------------------
+=========== =====
+Type ``string``
+Default ``"%Y-%m-%dT%H:%M:%S"``
+Description An explicit format string used to parse the ``string`` values of
+ `date-min and date-max`_.
+
+ See |strptime|_ for a list of formatting directives.
+=========== =====
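+
+For example, to only consider submissions from 2018 onwards while keeping the
+default date format (the cut-off date is arbitrary):
+
+.. code::
+
+    {
+        "extractor": {
+            "reddit": {
+                "date-min": "2018-01-01T00:00:00",
+                "date-format": "%Y-%m-%dT%H:%M:%S"
+            }
+        }
+    }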
+
+
+extractor.reddit.id-min & .id-max
+---------------------------------
+=========== =====
+Type ``string``
+Example ``"6kmzv2"``
+Description Ignore all submissions posted before/after the submission with
+ this ID.
+=========== =====
+
+
+extractor.reddit.recursion
+--------------------------
+=========== =====
+Type ``integer``
+Default ``0``
+Description Reddit extractors can recursively visit other submissions
+ linked to in the initial set of submissions.
+ This value sets the maximum recursion depth.
+
+ Special values:
+
+ * ``0``: Recursion is disabled
+ * ``-1``: Infinite recursion (don't do this)
+=========== =====
+
+
+extractor.reddit.refresh-token
+------------------------------
+=========== =====
+Type ``string``
+Default ``null``
+Description The ``refresh_token`` value you get from linking your
+ Reddit account to *gallery-dl*.
+
+ Using a ``refresh_token`` allows you to access private or otherwise
+ not publicly available subreddits, given that your account is
+ authorized to do so,
+ but requests to the reddit API are rate-limited
+ to 600 requests every 10 minutes.
+=========== =====
+
+
+extractor.sankaku.wait-min & .wait-max
+--------------------------------------
+=========== =====
+Type ``float``
+Default ``3.0`` and ``6.0``
+Description Minimum and maximum wait time in seconds between each image download.
+
+ Sankaku Channel responds with ``429 Too Many Requests`` if it
+ receives too many HTTP requests in a certain amount of time.
+ Waiting a few seconds between each request tries to prevent that.
+=========== =====
+
+
+extractor.smugmug.videos
+------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Download video files.
+=========== =====
+
+
+extractor.tumblr.avatar
+-----------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Download blog avatars.
+=========== =====
+
+
+extractor.tumblr.external
+-------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Follow external URLs (e.g. from "Link" posts) and try to extract
+ images from them.
+=========== =====
+
+
+extractor.tumblr.inline
+-----------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Search posts for inline images and videos.
+=========== =====
+
+
+extractor.tumblr.reblogs
+------------------------
+=========== =====
+Type ``bool`` or ``string``
+Default ``true``
+Description * ``true``: Extract media from reblogged posts
+ * ``false``: Skip reblogged posts
+ * ``"same-blog"``: Skip reblogged posts unless the original post
+ is from the same blog
+=========== =====
+
+
+extractor.tumblr.posts
+----------------------
+=========== =====
+Type ``string`` or ``list`` of ``strings``
+Default ``"all"``
+Example ``"video,audio,link"`` or ``["video", "audio", "link"]``
+Description A (comma-separated) list of post types to extract images, etc. from.
+
+ Possible types are ``text``, ``quote``, ``link``, ``answer``,
+ ``video``, ``audio``, ``photo``, ``chat``.
+
+ You can use ``"all"`` instead of listing all types separately.
+=========== =====
+
+
+extractor.twitter.retweets
+--------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Extract images from retweets.
+=========== =====
+
+
+extractor.twitter.videos
+------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Use `youtube-dl`_ to download from video tweets.
+=========== =====
+
+
+extractor.wallhaven.api-key
+---------------------------
+=========== =====
+Type ``string``
+Default ``null``
+Description Your `API Key <https://wallhaven.cc/settings/account>`__ to use
+ your account's browsing settings and default filters when searching.
+
+ See https://wallhaven.cc/help/api for more information.
+=========== =====
+
+
+extractor.[booru].tags
+----------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Categorize tags by their respective types
+ and provide them as ``tags_<type>`` metadata fields.
+
+ Note: This requires 1 additional HTTP request for each post.
+=========== =====
+
+
+extractor.[manga-extractor].chapter-reverse
+-------------------------------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Reverse the order of chapter URLs extracted from manga pages.
+
+ * ``true``: Start with the latest chapter
+ * ``false``: Start with the first chapter
+=========== =====
+
+
+
+Downloader Options
+==================
+
+
+downloader.*.enabled
+--------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Enable/Disable this downloader module.
+=========== =====
+
+
+downloader.*.part
+-----------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Controls the use of ``.part`` files during file downloads.
+
+ * ``true``: Write downloaded data into ``.part`` files and rename
+ them upon download completion. This mode additionally supports
+ resuming incomplete downloads.
+ * ``false``: Do not use ``.part`` files and write data directly
+ into the actual output files.
+=========== =====
+
+
+downloader.*.part-directory
+---------------------------
+=========== =====
+Type |Path|_
+Default ``null``
+Description Alternate location for ``.part`` files.
+
+ Missing directories will be created as needed.
+ If this value is ``null``, ``.part`` files are going to be stored
+ alongside the actual output files.
+=========== =====
+
+
+downloader.*.rate
+-----------------
+=========== =====
+Type ``string``
+Default ``null``
+Examples ``"32000"``, ``"500k"``, ``"2.5M"``
+Description Maximum download rate in bytes per second.
+
+ Possible values are valid integer or floating-point numbers
+ optionally followed by one of ``k``, ``m``, ``g``, ``t`` or ``p``.
+ These suffixes are case-insensitive.
+=========== =====
+
+
+downloader.*.retries
+--------------------
+=========== =====
+Type ``integer``
+Default `extractor.*.retries`_
+Description Number of retries during file downloads.
+=========== =====
+
+
+downloader.*.timeout
+--------------------
+=========== =====
+Type ``float`` or ``null``
+Default `extractor.*.timeout`_
+Description Connection timeout during file downloads.
+=========== =====
+
+
+downloader.*.verify
+-------------------
+=========== =====
+Type ``bool`` or ``string``
+Default `extractor.*.verify`_
+Description Certificate validation during file downloads.
+=========== =====
+
+
+downloader.ytdl.format
+----------------------
+=========== =====
+Type ``string``
+Default youtube-dl's default, currently ``"bestvideo+bestaudio/best"``
+Description Video `format selection
+ <https://github.com/ytdl-org/youtube-dl#format-selection>`__
+ directly passed to youtube-dl.
+=========== =====
+
+
+downloader.ytdl.logging
+-----------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description | Route youtube-dl's output through gallery-dl's logging system.
+ | Otherwise youtube-dl will write its output directly to stdout/stderr.
+
+ Note: Set ``quiet`` and ``no_warnings`` in
+ `downloader.ytdl.raw-options`_ to ``true`` to suppress all output.
+=========== =====
+
+
+downloader.ytdl.raw-options
+---------------------------
+=========== =====
+Type ``object``
+Example .. code::
+
+ {
+ "quiet": true,
+ "writesubtitles": true,
+ "merge_output_format": "mkv"
+ }
+
+Description | Additional options passed directly to the ``YoutubeDL`` constructor.
+ | All available options can be found in `youtube-dl's docstrings
+ <https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L138-L318>`__.
+=========== =====
+
+
+
+Output Options
+==============
+
+
+output.mode
+-----------
+=========== =====
+Type ``string``
+Default ``"auto"``
+Description Controls the output string format and status indicators.
+
+ * ``"null"``: No output
+ * ``"pipe"``: Suitable for piping to other processes or files
+ * ``"terminal"``: Suitable for the standard Windows console
+ * ``"color"``: Suitable for terminals that understand ANSI escape codes and colors
+ * ``"auto"``: Automatically choose the best suitable output mode
+=========== =====
+
+
+output.shorten
+--------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Controls whether the output strings should be shortened to fit
+ on one console line.
+=========== =====
+
+
+output.progress
+---------------
+=========== =====
+Type ``bool`` or ``string``
+Default ``true``
+Description Controls the progress indicator when *gallery-dl* is run with
+ multiple URLs as arguments.
+
+ * ``true``: Show the default progress indicator
+ (``"[{current}/{total}] {url}"``)
+ * ``false``: Do not show any progress indicator
+ * Any ``string``: Show the progress indicator using this
+ as a custom `format string`_. Possible replacement keys are
+ ``current``, ``total`` and ``url``.
+=========== =====
+
+
+output.log
+----------
+=========== =====
+Type ``string`` or |Logging Configuration|_
+Default ``"[{name}][{levelname}] {message}"``
+Description Configuration for standard logging output to stderr.
+
+ If this is a simple ``string``, it specifies
+ the format string for logging messages.
+=========== =====
+
+
+output.logfile
+--------------
+=========== =====
+Type |Path|_ or |Logging Configuration|_
+Default ``null``
+Description File to write logging output to.
+=========== =====
+
+
+output.unsupportedfile
+----------------------
+=========== =====
+Type |Path|_ or |Logging Configuration|_
+Default ``null``
+Description File to write external URLs unsupported by *gallery-dl* to.
+
+ The default format string here is ``"{message}"``.
+=========== =====
+
+
+output.num-to-str
+-----------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Convert numeric values (``integer`` or ``float``) to ``string``
+ before outputting them as JSON.
+=========== =====
+
+
+
+Postprocessor Options
+=====================
+
+
+classify
+--------
+
+Categorize files by filename extension
+
+classify.mapping
+----------------
+=========== =====
+Type ``object``
+Default .. code::
+
+ {
+ "Pictures" : ["jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"],
+ "Video" : ["flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv", "webm", "vob", "wmv"],
+ "Music" : ["mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"],
+ "Archives" : ["zip", "rar", "7z", "tar", "gz", "bz2"]
+ }
+
+Description A mapping from directory names to filename extensions that should
+ be stored in them.
+
+ Files with an extension not listed will be ignored and stored
+ in their default location.
+=========== =====
+
+
+exec
+----
+
+Execute external commands.
+
+exec.async
+----------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Controls whether to wait for a subprocess to finish
+ or to let it run asynchronously.
+=========== =====
+
+exec.command
+------------
+=========== =====
+Type ``list`` of ``strings``
+Example ``["echo", "{user[account]}", "{id}"]``
+Description The command to run.
+
+ Each element of this list is treated as a `format string`_ using
+ the files' metadata.
+=========== =====
+
+
+metadata
+--------
+
+Write image metadata to separate files
+
+metadata.mode
+-------------
+=========== =====
+Type ``string``
+Default ``"json"``
+Description Select how to write metadata.
+
+ * ``"json"``: all metadata using `json.dump()
+ <https://docs.python.org/3/library/json.html#json.dump>`_
+ * ``"tags"``: ``tags`` separated by newlines
+ * ``"custom"``: result of applying `metadata.format`_ to a file's
+ metadata dictionary
+=========== =====
+
+metadata.extension
+------------------
+=========== =====
+Type ``string``
+Default ``"json"`` or ``"txt"``
+Description Filename extension for metadata files.
+=========== =====
+
+metadata.format
+---------------
+=========== =====
+Type ``string``
+Example ``"tags:\n\n{tags:J\n}\n"``
+Description Custom format string to build content of metadata files.
+
+ Note: Only applies for ``"mode": "custom"``.
+=========== =====
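+
+As a sketch, a post-processor entry (see `extractor.*.postprocessors`_)
+writing tag lists via a custom format string could look like this, reusing
+the format string example from above:
+
+.. code::
+
+    {
+        "name": "metadata",
+        "mode": "custom",
+        "extension": "txt",
+        "format": "tags:\n\n{tags:J\n}\n"
+    }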
+
+
+ugoira
+------
+
+Convert Pixiv Ugoira to WebM using `FFmpeg <https://www.ffmpeg.org/>`__.
+
+ugoira.extension
+----------------
+=========== =====
+Type ``string``
+Default ``"webm"``
+Description Filename extension for the resulting video files.
+=========== =====
+
+ugoira.ffmpeg-args
+------------------
+=========== =====
+Type ``list`` of ``strings``
+Default ``null``
+Example ``["-c:v", "libvpx-vp9", "-an", "-b:v", "2M"]``
+Description Additional FFmpeg command-line arguments.
+=========== =====
+
+ugoira.ffmpeg-location
+----------------------
+=========== =====
+Type |Path|_
+Default ``"ffmpeg"``
+Description Location of the ``ffmpeg`` (or ``avconv``) executable to use.
+=========== =====
+
+ugoira.ffmpeg-output
+--------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Show FFmpeg output.
+=========== =====
+
+ugoira.ffmpeg-twopass
+---------------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Enable Two-Pass encoding.
+=========== =====
+
+ugoira.framerate
+----------------
+=========== =====
+Type ``string``
+Default ``"auto"``
+Description Controls the frame rate argument (``-r``) for FFmpeg
+
+ * ``"auto"``: Automatically assign a fitting frame rate
+ based on delays between frames.
+ * any other ``string``: Use this value as argument for ``-r``.
+ * ``null`` or an empty ``string``: Don't set an explicit frame rate.
+=========== =====
+
+ugoira.keep-files
+-----------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Keep ZIP archives after conversion.
+=========== =====
+
+ugoira.libx264-prevent-odd
+--------------------------
+=========== =====
+Type ``bool``
+Default ``true``
+Description Prevent ``"width/height not divisible by 2"`` errors
+ when using ``libx264`` or ``libx265`` encoders
+ by applying a simple cropping filter. See this `Stack Overflow
+ thread <https://stackoverflow.com/questions/20847674>`__
+ for more information.
+
+ This option, when ``libx264/5`` is used, automatically
+ adds ``["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"]``
+ to the list of FFmpeg command-line arguments
+ to reduce an odd width/height by 1 pixel and make them even.
+=========== =====
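+
+Combining several of the options above, a complete ugoira post-processor
+entry might look like this sketch (the FFmpeg arguments are the example
+values from above):
+
+.. code::
+
+    {
+        "name": "ugoira",
+        "extension": "webm",
+        "ffmpeg-twopass": true,
+        "ffmpeg-args": ["-c:v", "libvpx-vp9", "-an", "-b:v", "2M"]
+    }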
+
+
+zip
+---
+
+Store files in a ZIP archive.
+
+zip.compression
+---------------
+=========== =====
+Type ``string``
+Default ``"store"``
+Description Compression method to use when writing the archive.
+
+ Possible values are ``"store"``, ``"zip"``, ``"bzip2"``, ``"lzma"``.
+=========== =====
+
+zip.extension
+-------------
+=========== =====
+Type ``string``
+Default ``"zip"``
+Description Filename extension for the created ZIP archive.
+=========== =====
+
+zip.keep-files
+--------------
+=========== =====
+Type ``bool``
+Default ``false``
+Description Keep the actual files after writing them to a ZIP archive.
+=========== =====
+
+
+
+Miscellaneous Options
+=====================
+
+
+cache.file
+----------
+=========== =====
+Type |Path|_
+Default |tempfile.gettempdir()|_ + ``".gallery-dl.cache"``
+Description Path of the SQLite3 database used to cache login sessions,
+ cookies and API tokens across *gallery-dl* invocations.
+
+ Set this option to ``null`` or an invalid path to disable
+ this cache.
+=========== =====
+
+
+
+API Tokens & IDs
+================
+
+
+All configuration keys listed in this section have fully functional default
+values embedded into *gallery-dl* itself, but if things unexpectedly break
+or you want to use your own personal client credentials, you can follow these
+instructions to get an alternative set of API tokens and IDs.
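+
+Whichever service you register your own application with, the resulting
+credentials simply go into the corresponding extractor section of your
+configuration file, for example (placeholder values):
+
+.. code::
+
+    {
+        "extractor": {
+            "deviantart": {
+                "client-id": "<client_id>",
+                "client-secret": "<client_secret>"
+            }
+        }
+    }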
+
+
+extractor.deviantart.client-id & .client-secret
+-----------------------------------------------
+=========== =====
+Type ``string``
+How To * login and visit DeviantArt's
+ `Applications & Keys <https://www.deviantart.com/developers/apps>`__
+ section
+ * click "Register your Application"
+ * scroll to "OAuth2 Redirect URI Whitelist (Required)"
+ and enter "https://mikf.github.io/gallery-dl/oauth-redirect.html"
+ * click "Save" (top right)
+ * copy ``client_id`` and ``client_secret`` of your new
+ application and put them in your configuration file
+=========== =====
+
+
+extractor.flickr.api-key & .api-secret
+--------------------------------------
+=========== =====
+Type ``string``
+How To * login and `Create an App <https://www.flickr.com/services/apps/create/apply/>`__
+ in Flickr's `App Garden <https://www.flickr.com/services/>`__
+ * click "APPLY FOR A NON-COMMERCIAL KEY"
+ * fill out the form with a random name and description
+ and click "SUBMIT"
+ * copy ``Key`` and ``Secret`` and put them in your configuration
+ file
+=========== =====
+
+
+extractor.pawoo.access-token
+----------------------------
+=========== =====
+Type ``string``
+How To
+=========== =====
+
+
+extractor.reddit.client-id & .user-agent
+----------------------------------------
+=========== =====
+Type ``string``
+How To * login and visit the `apps <https://www.reddit.com/prefs/apps/>`__
+ section of your account's preferences
+ * click the "are you a developer? create an app..." button
+ * fill out the form, choose "installed app", preferably set
+ "http://localhost:6414/" as "redirect uri" and finally click
+ "create app"
+ * copy the client id (third line, under your application's name and
+ "installed app") and put it in your configuration file
+ * use "``Python:<application name>:v1.0 (by /u/<username>)``" as
+ user-agent and replace ``<application name>`` and ``<username>``
+ accordingly (see Reddit's
+ `API access rules <https://github.com/reddit/reddit/wiki/API>`__)
+=========== =====
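+
+A hypothetical result, using placeholder values for the client id,
+application name and username:
+
+.. code::
+
+    {
+        "extractor": {
+            "reddit": {
+                "client-id": "1234567890abcd",
+                "user-agent": "Python:my-gallery-app:v1.0 (by /u/username)"
+            }
+        }
+    }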
+
+
+extractor.smugmug.api-key & .api-secret
+---------------------------------------
+=========== =====
+Type ``string``
+How To * login and `Apply for an API Key <https://api.smugmug.com/api/developer/apply>`__
+ * use a random name and description,
+ set "Type" to "Application", "Platform" to "All",
+ and "Use" to "Non-Commercial"
+ * fill out the two checkboxes at the bottom and click "Apply"
+ * copy ``API Key`` and ``API Secret``
+ and put them in your configuration file
+=========== =====
+
+
+extractor.tumblr.api-key & .api-secret
+--------------------------------------
+=========== =====
+Type ``string``
+How To * login and visit Tumblr's
+ `Applications <https://www.tumblr.com/oauth/apps>`__ section
+ * click "Register application"
+ * fill out the form: use a random name and description, set
+ https://example.org/ as "Application Website" and "Default
+ callback URL"
+ * solve Google's "I'm not a robot" challenge and click "Register"
+ * click "Show secret key" (below "OAuth Consumer Key")
+ * copy your ``OAuth Consumer Key`` and ``Secret Key``
+ and put them in your configuration file
+=========== =====
+
+
+
+Custom Types
+============
+
+
+Path
+----
+=========== =====
+Type ``string`` or ``list`` of ``strings``
+Examples * ``"file.ext"``
+ * ``"~/path/to/file.ext"``
+ * ``"$HOME/path/to/file.ext"``
+ * ``["$HOME", "path", "to", "file.ext"]``
+Description A |Path|_ is a ``string`` representing the location of a file
+ or directory.
+
+            Simple `tilde expansion <https://docs.python.org/3/library/os.path.html#os.path.expanduser>`__
+            and `environment variable expansion <https://docs.python.org/3/library/os.path.html#os.path.expandvars>`__
+            are supported.
+
+            In Windows environments, backslashes (``"\"``) can be used as
+            path separators in addition to forward slashes (``"/"``).
+            Because the backslash is JSON's escape character,
+            it has to be escaped itself.
+            The path ``C:\path\to\file.ext`` therefore has to be written as
+            ``"C:\\path\\to\\file.ext"`` if you want to use backslashes.
+=========== =====
+
+
+Logging Configuration
+---------------------
+=========== =====
+Type ``object``
+
+Example .. code::
+
+ {
+ "format": "{asctime} {name}: {message}",
+ "format-date": "%H:%M:%S",
+ "path": "~/log.txt",
+ "encoding": "ascii"
+ }
+
+Description Extended logging output configuration.
+
+ * format
+ * Format string for logging messages
+
+ In addition to the default
+ `LogRecord attributes <https://docs.python.org/3/library/logging.html#logrecord-attributes>`__,
+ it is also possible to access the current
+ `extractor <https://github.com/mikf/gallery-dl/blob/2e516a1e3e09cb8a9e36a8f6f7e41ce8d4402f5a/gallery_dl/extractor/common.py#L24>`__
+ and `job <https://github.com/mikf/gallery-dl/blob/2e516a1e3e09cb8a9e36a8f6f7e41ce8d4402f5a/gallery_dl/job.py#L19>`__
+ objects as well as their attributes
+ (e.g. ``"{extractor.url}"``)
+ * Default: ``"[{name}][{levelname}] {message}"``
+ * format-date
+ * Format string for ``{asctime}`` fields in logging messages
+ (see `strftime() directives <https://docs.python.org/3/library/time.html#time.strftime>`__)
+ * Default: ``"%Y-%m-%d %H:%M:%S"``
+ * level
+ * Minimum logging message level
+ (one of ``"debug"``, ``"info"``, ``"warning"``, ``"error"``, ``"exception"``)
+ * Default: ``"info"``
+ * path
+ * |Path|_ to the output file
+ * mode
+ * Mode in which the file is opened;
+ use ``"w"`` to truncate or ``"a"`` to append
+ (see `open() <https://docs.python.org/3/library/functions.html#open>`__)
+ * Default: ``"w"``
+ * encoding
+ * File encoding
+ * Default: ``"utf-8"``
+
+ Note: path, mode and encoding are only applied when configuring
+ logging output to a file.
+=========== =====
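+
+Such an object can, for example, be used as the value of ``output.logfile``,
+mirroring the bundled example configuration file:
+
+.. code::
+
+    {
+        "output": {
+            "logfile": {
+                "path": "~/gallery-dl/log.txt",
+                "mode": "w",
+                "level": "debug"
+            }
+        }
+    }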
+
+
+Postprocessor Configuration
+---------------------------
+=========== =====
+Type ``object``
+
+Example .. code::
+
+ {
+ "name": "zip",
+ "compression": "store",
+ "extension": "cbz",
+ "whitelist": ["mangadex", "exhentai", "nhentai"]
+ }
+
+Description An object with the ``name`` of a post-processor and its options.
+
+ See `Postprocessor Options`_ for a list of all available
+ post-processors and their respective options.
+
+            You can also set a ``whitelist`` or ``blacklist`` to enable or
+            disable a post-processor only for the specified extractor
+            categories.
+=========== =====
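+
+Such objects are listed in an extractor's ``postprocessors`` option.
+A sketch based on the bundled example configuration file:
+
+.. code::
+
+    {
+        "extractor": {
+            "mangadex": {
+                "postprocessors": [{
+                    "name": "zip",
+                    "keep-files": false,
+                    "compression": "zip"
+                }]
+            }
+        }
+    }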
+
+
+
+.. |.netrc| replace:: ``.netrc``
+.. |tempfile.gettempdir()| replace:: ``tempfile.gettempdir()``
+.. |requests.request()| replace:: ``requests.request()``
+.. |timeout| replace:: ``timeout``
+.. |verify| replace:: ``verify``
+.. |mature_content| replace:: ``mature_content``
+.. |webbrowser.open()| replace:: ``webbrowser.open()``
+.. |datetime.max| replace:: ``datetime.max``
+.. |Path| replace:: ``Path``
+.. |Logging Configuration| replace:: ``Logging Configuration``
+.. |Postprocessor Configuration| replace:: ``Postprocessor Configuration``
+.. |strptime| replace:: strftime() and strptime() Behavior
+
+.. _base-directory: `extractor.*.base-directory`_
+.. _skipped: `extractor.*.skip`_
+.. _`date-min and date-max`: `extractor.reddit.date-min & .date-max`_
+.. _date-format: extractor.reddit.date-format_
+.. _deviantart.metadata: extractor.deviantart.metadata_
+
+.. _.netrc: https://stackoverflow.com/tags/.netrc/info
+.. _tempfile.gettempdir(): https://docs.python.org/3/library/tempfile.html#tempfile.gettempdir
+.. _requests.request(): https://docs.python-requests.org/en/master/api/#requests.request
+.. _timeout: https://docs.python-requests.org/en/latest/user/advanced/#timeouts
+.. _verify: https://docs.python-requests.org/en/master/user/advanced/#ssl-cert-verification
+.. _`Requests' proxy documentation`: http://docs.python-requests.org/en/master/user/advanced/#proxies
+.. _format string: https://docs.python.org/3/library/string.html#formatstrings
+.. _format strings: https://docs.python.org/3/library/string.html#formatstrings
+.. _strptime: https://docs.python.org/3/library/datetime.html#strftime-strptime-behavior
+.. _mature_content: https://www.deviantart.com/developers/http/v1/20160316/object/deviation
+.. _webbrowser.open(): https://docs.python.org/3/library/webbrowser.html
+.. _datetime.max: https://docs.python.org/3/library/datetime.html#datetime.datetime.max
+.. _Authentication: https://github.com/mikf/gallery-dl#5authentication
+.. _youtube-dl: https://github.com/ytdl-org/youtube-dl
diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf
new file mode 100644
index 0000000..a5270d2
--- /dev/null
+++ b/docs/gallery-dl-example.conf
@@ -0,0 +1,172 @@
+{
+ "extractor":
+ {
+ "base-directory": "~/gallery-dl/",
+ "archive": "~/gallery-dl/archive.sqlite3",
+ "proxy": "http://10.10.1.10:3128",
+
+ "postprocessors": [
+ {
+ "name": "ugoira",
+ "whitelist": ["pixiv", "danbooru"],
+ "ffmpeg-twopass": true,
+ "ffmpeg-args": ["-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an"]
+ },
+ {
+ "name": "metadata",
+ "whitelist": ["danbooru", "yandere", "sankaku"],
+ "mode": "tags"
+ }
+ ],
+
+ "pixiv":
+ {
+ "archive": "~/gallery-dl/archive-pixiv.sqlite3",
+
+ "filename": "{id}{num}.{extension}",
+ "directory": ["Pixiv", "Works", "{user[id]}"],
+
+ "username": "foo",
+ "password": "bar",
+
+ "favorite":
+ {
+ "directory": ["Pixiv", "Favorites", "{user[id]}"]
+ },
+
+ "bookmark":
+ {
+ "directory": ["Pixiv", "My Bookmarks"],
+
+ "username": "foo123",
+ "password": "bar123"
+ }
+ },
+
+ "exhentai":
+ {
+ "cookies":
+ {
+ "ipb_member_id": "12345",
+ "ipb_pass_hash": "1234567890abcdef"
+ },
+
+ "proxy":
+ {
+ "http": "http://10.10.1.10:8080",
+ "https": "https://10.10.1.10:443"
+ },
+
+ "filename": "{num:>04}_{name}.{extension}",
+ "directory": ["{category!c}", "{title}"],
+
+ "wait-min": 1.0,
+ "wait-max": 5.0
+ },
+
+ "mangadex":
+ {
+ "postprocessors": [{
+ "name": "zip",
+ "keep-files": false,
+ "compression": "zip"
+ }]
+ },
+
+ "flickr":
+ {
+ "access-token": "1234567890-abcdef",
+ "access-token-secret": "1234567890abcdef",
+ "size-max": 1920
+ },
+
+ "reddit":
+ {
+ "morecomments": true,
+ "date-min": "2017-01",
+ "date-format": "%Y-%m",
+ "recursion": 1
+ },
+
+ "sankaku":
+ {
+ "sleep": 2,
+ "wait-min": 5.0,
+ "wait-max": 5.0,
+ "cookies": "~/gallery-dl/cookies-sankaku.txt"
+ },
+
+ "tumblr":
+ {
+ "posts": "all",
+ "external": false,
+ "reblogs": false,
+ "inline": true,
+
+ "likes":
+ {
+ "posts": "video,photo,link",
+ "external": true,
+ "reblogs": true
+ }
+ },
+
+ "mastodon":
+ {
+ "mastodon.xyz":
+ {
+ "access-token": "cab65529..."
+ },
+ "tabletop.social": {
+ "access-token": "513a36c6..."
+ },
+
+ "directory": ["mastodon", "{instance}", "{account[username]!l}"],
+ "filename": "{id}_{media[id]}.{extension}"
+ },
+
+ "foolslide": {
+ "otscans": {"root": "https://otscans.com/foolslide"},
+ "helvetica": {"root": "https://helveticascans.com/r" }
+ },
+
+ "foolfuuka": {
+ "fireden-onion": {"root": "http://ydt6jy2ng3s3xg2e.onion"},
+ "scalearchive": {"root": "https://archive.scaled.team" }
+ }
+ },
+
+ "downloader":
+ {
+ "part-directory": "/tmp/.download/",
+ "rate": "1M",
+ "retries": 3,
+ "timeout": 8.5
+ },
+
+ "output":
+ {
+ "mode": "terminal",
+ "log": {
+ "format": "{name}: {message}",
+ "level": "info"
+ },
+ "logfile": {
+ "path": "~/gallery-dl/log.txt",
+ "mode": "w",
+ "level": "debug"
+ },
+ "unsupportedfile": {
+ "path": "~/gallery-dl/unsupported.txt",
+ "mode": "a",
+ "format": "{asctime} {message}",
+ "format-date": "%Y-%m-%d-%H-%M-%S"
+ }
+ },
+
+ "cache": {
+ "file": "~/gallery-dl/cache.sqlite3"
+ },
+
+ "netrc": true
+}
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
new file mode 100644
index 0000000..c792e9e
--- /dev/null
+++ b/docs/gallery-dl.conf
@@ -0,0 +1,172 @@
+{
+ "extractor":
+ {
+ "base-directory": "./gallery-dl/",
+ "postprocessors": null,
+ "archive": null,
+ "cookies": null,
+ "proxy": null,
+ "skip": true,
+ "sleep": 0,
+ "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0",
+
+ "artstation":
+ {
+ "external": false
+ },
+ "danbooru":
+ {
+ "username": null,
+ "password": null
+ },
+ "deviantart":
+ {
+ "refresh-token": null,
+ "extra": false,
+ "flat": true,
+ "folders": false,
+ "journals": "html",
+ "mature": true,
+ "metadata": false,
+ "original": true,
+ "wait-min": 0
+ },
+ "exhentai":
+ {
+ "username": null,
+ "password": null,
+ "original": true,
+ "wait-min": 3.0,
+ "wait-max": 6.0
+ },
+ "flickr":
+ {
+ "access-token": null,
+ "access-token-secret": null,
+ "videos": true,
+ "size-max": null
+ },
+ "gelbooru":
+ {
+ "api": true
+ },
+ "gfycat":
+ {
+ "format": "mp4"
+ },
+ "idolcomplex":
+ {
+ "username": null,
+ "password": null,
+ "wait-min": 3.0,
+ "wait-max": 6.0
+ },
+ "imgur":
+ {
+ "mp4": true
+ },
+ "kissmanga":
+ {
+ "captcha": "stop"
+ },
+ "nijie":
+ {
+ "username": null,
+ "password": null
+ },
+ "oauth":
+ {
+ "browser": true
+ },
+ "pixiv":
+ {
+ "username": null,
+ "password": null,
+ "ugoira": true
+ },
+ "reactor":
+ {
+ "wait-min": 3.0,
+ "wait-max": 6.0
+ },
+ "readcomiconline":
+ {
+ "captcha": "stop"
+ },
+ "recursive":
+ {
+ "blacklist": ["directlink", "oauth", "recursive", "test"]
+ },
+ "reddit":
+ {
+ "refresh-token": null,
+ "comments": 500,
+ "morecomments": false,
+ "date-min": 0,
+ "date-max": 253402210800,
+ "date-format": "%Y-%m-%dT%H:%M:%S",
+ "id-min": "0",
+ "id-max": "zik0zj",
+ "recursion": 0,
+ "user-agent": "Python:gallery-dl:0.8.4 (by /u/mikf1)"
+ },
+ "sankaku":
+ {
+ "username": null,
+ "password": null,
+ "wait-min": 3.0,
+ "wait-max": 6.0
+ },
+ "seiga":
+ {
+ "username": null,
+ "password": null
+ },
+ "tumblr":
+ {
+ "avatar": false,
+ "external": false,
+ "inline": true,
+ "posts": "all",
+ "reblogs": true
+ },
+ "twitter":
+ {
+ "retweets": true,
+ "videos": false
+ },
+ "wallhaven":
+ {
+ "api-key": null
+ },
+ "booru":
+ {
+ "tags": false
+ }
+ },
+
+ "downloader":
+ {
+ "part": true,
+ "part-directory": null,
+
+ "http":
+ {
+ "rate": null,
+ "retries": 5,
+ "timeout": 30.0,
+ "verify": true
+ }
+ },
+
+ "output":
+ {
+ "mode": "auto",
+ "progress": true,
+ "shorten": true,
+ "logfile": null,
+ "unsupportedfile": null
+ },
+
+ "netrc": false
+}
diff --git a/docs/supportedsites.rst b/docs/supportedsites.rst
new file mode 100644
index 0000000..f47ed10
--- /dev/null
+++ b/docs/supportedsites.rst
@@ -0,0 +1,137 @@
+Supported Sites
+===============
+==================== =================================== ================================================== ================
+Site URL Capabilities Authentication
+==================== =================================== ================================================== ================
+35PHOTO https://35photo.pro/ Images from Users, Genres, individual Images
+3dbooru http://behoimi.org/ Pools, Popular Images, Posts, Tag-Searches
+4chan https://www.4chan.org/ Threads
+4plebs https://archive.4plebs.org/ Threads
+500px https://500px.com/ Images from Users, Galleries, individual Images
+8chan https://8ch.net/ Threads
+8muses https://www.8muses.com/ Albums
+Adobe Portfolio https://www.myportfolio.com/ Galleries
+arch.b4k.co https://arch.b4k.co/ Threads
+Archive of Sins https://archiveofsins.com/ Threads
+Archived.Moe https://archived.moe/ Threads
+ArtStation https://www.artstation.com/ |artstation-C|
+Behance https://www.behance.net/ Images from Users, Collections, Galleries
+BobX http://www.bobx.com/dark/ Galleries, Idols
+Danbooru https://danbooru.donmai.us/ Pools, Popular Images, Posts, Tag-Searches Optional
+Desuarchive https://desuarchive.org/ Threads
+DeviantArt https://www.deviantart.com/ |deviantart-C| Optional (OAuth)
+Doki Reader https://kobato.hologfx.com/reader/ Chapters, Manga
+Dynasty Reader https://dynasty-scans.com/ Chapters, individual Images, Search Results
+e621 https://e621.net/ Pools, Popular Images, Posts, Tag-Searches
+ExHentai https://exhentai.org/ Favorites, Galleries, Search Results Optional
+Fallen Angels Scans https://www.fascans.com/ Chapters, Manga
+Fashion Nova https://www.fashionnova.com/ Collections, Products
+Fireden https://boards.fireden.net/ Threads
+Flickr https://www.flickr.com/ |flickr-C| Optional (OAuth)
+Futaba Channel https://www.2chan.net/ Threads
+Gelbooru https://gelbooru.com/ Pools, Posts, Tag-Searches
+Gfycat https://gfycat.com/ individual Images
+HBrowse https://www.hbrowse.com/ Chapters, Manga
+Hentai Cafe https://hentai.cafe/ Chapters, Manga
+Hentai Foundry https://www.hentai-foundry.com/ |hentaifoundry-C|
+Hentai2Read https://hentai2read.com/ Chapters, Manga
+HentaiFox https://hentaifox.com/ Galleries, Search Results
+HentaiHere https://hentaihere.com/ Chapters, Manga
+Hentainexus https://hentainexus.com/ Galleries, Search Results
+Hitomi.la https://hitomi.la/ Galleries
+Hypnohub https://hypnohub.net/ Pools, Popular Images, Posts, Tag-Searches
+Idol Complex https://idol.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional
+ImageBam http://www.imagebam.com/ Galleries, individual Images
+ImageFap https://imagefap.com/ Images from Users, Galleries, individual Images
+imgbox https://imgbox.com/ Galleries, individual Images
+imgth https://imgth.com/ Galleries
+imgur https://imgur.com/ Albums, individual Images
+Instagram https://www.instagram.com/ Images from Users, individual Images, Tag-Searches
+Jaimini's Box https://jaiminisbox.com/reader/ Chapters, Manga
+Joyreactor http://joyreactor.cc/ |joyreactor-C|
+Keenspot http://www.keenspot.com/ Comics
+Khinsider https://downloads.khinsider.com/ Soundtracks
+Kirei Cake https://reader.kireicake.com/ Chapters, Manga
+KissManga https://kissmanga.com/ Chapters, Manga
+Komikcast https://komikcast.com/ Chapters, Manga
+Konachan https://konachan.com/ Pools, Popular Images, Posts, Tag-Searches
+livedoor Blog http://blog.livedoor.jp/ Blogs, Posts
+Luscious https://luscious.net/ Albums, Search Results Optional
+Manga Fox https://fanfox.net/ Chapters
+Manga Here https://www.mangahere.cc/ Chapters, Manga
+Manga Stream https://readms.net/ Chapters
+MangaDex https://mangadex.org/ Chapters, Manga
+Mangapanda https://www.mangapanda.com/ Chapters, Manga
+MangaPark https://mangapark.me/ Chapters, Manga
+Mangareader https://www.mangareader.net/ Chapters, Manga
+Mangoxo https://www.mangoxo.com/ Albums, Channels Optional
+Newgrounds https://www.newgrounds.com/ Images from Users, individual Images, Videos
+Ngomik http://ngomik.in/ Chapters
+nhentai https://nhentai.net/ Galleries, Search Results
+Niconico Seiga https://seiga.nicovideo.jp/ Images from Users, individual Images Required
+nijie https://nijie.info/ |nijie-C| Required
+NSFWalbum.com https://nsfwalbum.com/ Albums
+Nyafuu Archive https://archive.nyafuu.org/ Threads
+Patreon https://www.patreon.com/ Images from Users, Creators
+Pawoo https://pawoo.net/ Images from Users, Images from Statuses
+Photobucket https://photobucket.com/ Albums, individual Images
+Piczel https://piczel.tv/ Images from Users, Folders, individual Images
+Pinterest https://www.pinterest.com/ Boards, Pins, pin.it Links, related Pins
+Pixiv https://www.pixiv.net/ |pixiv-C| Required
+Pixnet https://www.pixnet.net/ |pixnet-C|
+Plurk https://www.plurk.com/ Posts, Timelines
+Pornhub https://www.pornhub.com/ Images from Users, Galleries
+Pornreactor http://pornreactor.cc/ |pornreactor-C|
+PowerManga https://read.powermanga.org/ Chapters, Manga
+Pururin https://pururin.io/ Galleries
+Read Comic Online https://readcomiconline.to/ Comic-Issues, Comics
+RebeccaBlackTech https://rbt.asia/ Threads
+Reddit https://www.reddit.com/ individual Images, Submissions, Subreddits Optional (OAuth)
+rule #34 https://rule34.paheal.net/ Posts, Tag-Searches
+Rule 34 https://rule34.xxx/ Pools, Posts, Tag-Searches
+Safebooru https://safebooru.org/ Pools, Posts, Tag-Searches
+Sankaku Channel https://chan.sankakucomplex.com/ Pools, Posts, Tag-Searches Optional
+Sankaku Complex https://www.sankakucomplex.com/ Articles, Tag-Searches
+Sen Manga https://raw.senmanga.com/ Chapters
+Sense-Scans http://sensescans.com/reader/ Chapters, Manga
+Sex.com https://www.sex.com/ Boards, Pins, Search Results
+Simply Hentai https://www.simply-hentai.com/ Galleries, individual Images, Videos
+SlickPic https://www.slickpic.com/ Images from Users, Albums
+SlideShare https://www.slideshare.net/ Presentations
+SmugMug https://www.smugmug.com/ |smugmug-C| Optional (OAuth)
+The /b/ Archive https://thebarchive.com/ Threads
+Tsumino https://www.tsumino.com/ Galleries, Search Results Optional
+Tumblr https://www.tumblr.com/ Images from Users, Likes, Posts, Tag-Searches Optional (OAuth)
+Twitter https://twitter.com/ Media Timelines, Timelines, Tweets Optional
+Wallhaven https://wallhaven.cc/ individual Images, Search Results
+Warosu https://warosu.org/ Threads
+Weibo https://www.weibo.com/ Images from Users, Images from Statuses
+WikiArt.org https://www.wikiart.org/ Artists, Artworks
+World Three http://www.slide.world-three.org/ Chapters, Manga
+xHamster https://xhamster.com/ Images from Users, Galleries
+XVideos https://www.xvideos.com/ Images from Users, Galleries
+Yandere https://yande.re/ Pools, Popular Images, Posts, Tag-Searches
+yaplog! https://yaplog.jp/ Blogs, Posts
+|yuki-S| https://yuki.la/ Threads
+Acidimg https://acidimg.cc/ individual Images
+Imagetwist https://imagetwist.com/ individual Images
+Imagevenue http://imagevenue.com/ individual Images
+Imgspice https://imgspice.com/ individual Images
+Imxto https://imx.to/ individual Images
+Pixhost https://pixhost.to/ individual Images
+Postimg https://postimages.org/ individual Images
+Turboimagehost https://www.turboimagehost.com/ individual Images
+もえぴりあ https://vanilla-rock.com/ Posts, Tag-Searches
+==================== =================================== ================================================== ================
+
+.. |artstation-C| replace:: Images from Users, Albums, Artwork Listings, Challenges, individual Images, Likes, Search Results
+.. |deviantart-C| replace:: Collections, Deviations, Favorites, Folders, Galleries, Journals, Popular Images, Scraps, Sta.sh
+.. |flickr-C| replace:: Images from Users, Albums, Favorites, Galleries, Groups, individual Images, Search Results
+.. |hentaifoundry-C| replace:: Images from Users, Favorites, individual Images, Popular Images, Recent Images, Scraps
+.. |joyreactor-C| replace:: Images from Users, Posts, Search Results, Tag-Searches
+.. |nijie-C| replace:: Images from Users, Doujin, Favorites, individual Images
+.. |pixiv-C| replace:: Images from Users, Favorites, Follows, pixiv.me Links, Rankings, Search Results, Individual Images
+.. |pixnet-C| replace:: Images from Users, Folders, individual Images, Sets
+.. |pornreactor-C| replace:: Images from Users, Posts, Search Results, Tag-Searches
+.. |smugmug-C| replace:: Albums, individual Images, Images from Users and Folders
+.. |yuki-S| replace:: yuki.la 4chan archive
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
new file mode 100644
index 0000000..3643a5c
--- /dev/null
+++ b/gallery_dl/__init__.py
@@ -0,0 +1,255 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from __future__ import unicode_literals, print_function
+
+__author__ = "Mike Fährmann"
+__copyright__ = "Copyright 2014-2018 Mike Fährmann"
+__license__ = "GPLv2"
+__maintainer__ = "Mike Fährmann"
+__email__ = "mike_faehrmann@web.de"
+
+import sys
+
+if sys.hexversion < 0x3040000:
+ sys.exit("Python 3.4+ required")
+
+import json
+import logging
+from . import version, config, option, output, extractor, job, util, exception
+
+__version__ = version.__version__
+
+
+def progress(urls, pformat):
+ """Wrapper around urls to output a simple progress indicator"""
+ if pformat is True:
+ pformat = "[{current}/{total}] {url}"
+ pinfo = {"total": len(urls)}
+ for pinfo["current"], pinfo["url"] in enumerate(urls, 1):
+ print(pformat.format_map(pinfo), file=sys.stderr)
+ yield pinfo["url"]
+
+
+def parse_inputfile(file, log):
+ """Filter and process strings from an input file.
+
+ Lines starting with '#' and empty lines will be ignored.
+ Lines starting with '-' will be interpreted as a key-value pair separated
+    by an '=', where 'key' is a dot-separated option name and 'value' is a
+ JSON-parsable value for it. These config options will be applied while
+ processing the next URL.
+ Lines starting with '-G' are the same as above, except these options will
+ be valid for all following URLs, i.e. they are Global.
+ Everything else will be used as potential URL.
+
+ Example input file:
+
+        # setting global options
+ -G base-directory = "/tmp/"
+ -G skip = false
+
+ # setting local options for the next URL
+ -filename="spaces_are_optional.jpg"
+ -skip = true
+
+ https://example.org/
+
+ # next URL uses default filename and 'skip' is false.
+ https://example.com/index.htm
+ """
+ gconf = []
+ lconf = []
+
+ for line in file:
+ line = line.strip()
+
+ if not line or line[0] == "#":
+ # empty line or comment
+ continue
+
+ elif line[0] == "-":
+ # config spec
+ if len(line) >= 2 and line[1] == "G":
+ conf = gconf
+ line = line[2:]
+ else:
+ conf = lconf
+ line = line[1:]
+
+ key, sep, value = line.partition("=")
+ if not sep:
+ log.warning("input file: invalid <key>=<value> pair: %s", line)
+ continue
+
+ try:
+ value = json.loads(value.strip())
+ except ValueError as exc:
+ log.warning("input file: unable to parse '%s': %s", value, exc)
+ continue
+
+ conf.append((key.strip().split("."), value))
+
+ else:
+ # url
+ if gconf or lconf:
+ yield util.ExtendedUrl(line, gconf, lconf)
+ gconf = []
+ lconf = []
+ else:
+ yield line
+
+
+def main():
+ try:
+ if sys.stdout.encoding.lower() != "utf-8":
+ output.replace_std_streams()
+
+ parser = option.build_parser()
+ args = parser.parse_args()
+ log = output.initialize_logging(args.loglevel)
+
+ # configuration
+ if args.load_config:
+ config.load()
+ if args.cfgfiles:
+ config.load(args.cfgfiles, strict=True)
+ if args.yamlfiles:
+ config.load(args.yamlfiles, strict=True, fmt="yaml")
+ if args.postprocessors:
+ config.set(("postprocessors", ), args.postprocessors)
+ for key, value in args.options:
+ config.set(key, value)
+
+ # stream logging handler
+ output.configure_logging_handler(
+ "log", logging.getLogger().handlers[0])
+
+ # file logging handler
+ handler = output.setup_logging_handler(
+ "logfile", lvl=args.loglevel)
+ if handler:
+ logging.getLogger().addHandler(handler)
+
+ # loglevels
+ if args.loglevel >= logging.ERROR:
+ config.set(("output", "mode"), "null")
+ elif args.loglevel <= logging.DEBUG:
+ import platform
+ import subprocess
+ import os.path
+ import requests
+
+ head = ""
+ try:
+ out, err = subprocess.Popen(
+ ("git", "rev-parse", "--short", "HEAD"),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ cwd=os.path.dirname(os.path.abspath(__file__)),
+ ).communicate()
+ if out and not err:
+ head = " - Git HEAD: " + out.decode().rstrip()
+ except (OSError, subprocess.SubprocessError):
+ pass
+
+ log.debug("Version %s%s", __version__, head)
+ log.debug("Python %s - %s",
+ platform.python_version(), platform.platform())
+ try:
+ log.debug("requests %s - urllib3 %s",
+ requests.__version__,
+ requests.packages.urllib3.__version__)
+ except AttributeError:
+ pass
+
+ if args.list_modules:
+ for module_name in extractor.modules:
+ print(module_name)
+ elif args.list_extractors:
+ for extr in extractor.extractors():
+ if not extr.__doc__:
+ continue
+ print(extr.__name__)
+ print(extr.__doc__)
+ print("Category:", extr.category,
+ "- Subcategory:", extr.subcategory)
+ test = next(extr._get_tests(), None)
+ if test:
+ print("Example :", test[0])
+ print()
+ elif args.clear_cache:
+ from . import cache
+ log = logging.getLogger("cache")
+ cnt = cache.clear()
+
+ if cnt is None:
+ log.error("Database file not available")
+ else:
+ log.info(
+ "Deleted %d %s from '%s'",
+ cnt, "entry" if cnt == 1 else "entries", cache._path(),
+ )
+ else:
+ if not args.urls and not args.inputfile:
+ parser.error(
+ "The following arguments are required: URL\n"
+ "Use 'gallery-dl --help' to get a list of all options.")
+
+ if args.list_urls:
+ jobtype = job.UrlJob
+ jobtype.maxdepth = args.list_urls
+ else:
+ jobtype = args.jobtype or job.DownloadJob
+
+ urls = args.urls
+ if args.inputfile:
+ try:
+ if args.inputfile == "-":
+ file = sys.stdin
+ else:
+ file = open(args.inputfile, encoding="utf-8")
+ urls += parse_inputfile(file, log)
+ file.close()
+ except OSError as exc:
+ log.warning("input file: %s", exc)
+
+ # unsupported file logging handler
+ handler = output.setup_logging_handler(
+ "unsupportedfile", fmt="{message}")
+ if handler:
+ ulog = logging.getLogger("unsupported")
+ ulog.addHandler(handler)
+ ulog.propagate = False
+ job.Job.ulog = ulog
+
+ pformat = config.get(("output", "progress"), True)
+ if pformat and len(urls) > 1 and args.loglevel < logging.ERROR:
+ urls = progress(urls, pformat)
+
+ for url in urls:
+ try:
+ log.debug("Starting %s for '%s'", jobtype.__name__, url)
+ if isinstance(url, util.ExtendedUrl):
+ for key, value in url.gconfig:
+ config.set(key, value)
+ with config.apply(url.lconfig):
+ jobtype(url.value).run()
+ else:
+ jobtype(url).run()
+ except exception.NoExtractorError:
+ log.error("No suitable extractor found for '%s'", url)
+
+ except KeyboardInterrupt:
+ print("\nKeyboardInterrupt", file=sys.stderr)
+ except BrokenPipeError:
+ pass
+ except IOError as exc:
+ import errno
+ if exc.errno != errno.EPIPE:
+ raise
diff --git a/gallery_dl/__main__.py b/gallery_dl/__main__.py
new file mode 100644
index 0000000..04ea9fe
--- /dev/null
+++ b/gallery_dl/__main__.py
@@ -0,0 +1,20 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2017 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import sys
+
+if __package__ is None and not hasattr(sys, "frozen"):
+ import os.path
+ path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+ sys.path.insert(0, os.path.realpath(path))
+
+import gallery_dl
+
+if __name__ == "__main__":
+ gallery_dl.main()
diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py
new file mode 100644
index 0000000..a45f50e
--- /dev/null
+++ b/gallery_dl/aes.py
@@ -0,0 +1,337 @@
+# -*- coding: utf-8 -*-
+
+# This is a stripped down version of youtube-dl's aes module.
+# All credit for this code goes to the authors of the youtube-dl project.
+# https://ytdl-org.github.io/youtube-dl/
+# https://github.com/ytdl-org/youtube-dl/
+
+import base64
+from math import ceil
+
+BLOCK_SIZE_BYTES = 16
+
+
+def aes_cbc_decrypt(data, key, iv):
+ """
+ Decrypt with aes in CBC mode
+
+ @param {int[]} data cipher
+ @param {int[]} key 16/24/32-Byte cipher key
+ @param {int[]} iv 16-Byte IV
+ @returns {int[]} decrypted data
+ """
+ expanded_key = key_expansion(key)
+ block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+
+ decrypted_data = []
+ previous_cipher_block = iv
+ for i in range(block_count):
+ block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ block += [0] * (BLOCK_SIZE_BYTES - len(block))
+
+ decrypted_block = aes_decrypt(block, expanded_key)
+ decrypted_data += xor(decrypted_block, previous_cipher_block)
+ previous_cipher_block = block
+ decrypted_data = decrypted_data[:len(data)]
+
+ return decrypted_data
+
+
+def aes_cbc_decrypt_text(data, key, iv):
+ """
+ Decrypt with aes in CBC mode
+
+ @param {string} data base64 encoded cipher
+ @param {int[]} key 16/24/32-Byte cipher key
+ @param {int[]} iv 16-Byte IV
+ @returns {string} decrypted data as utf8 encoded string
+ """
+ data = base64.standard_b64decode(bytes(data, "ascii"))
+ charcodes = aes_cbc_decrypt(list(data), key, iv)
+ last = charcodes[-1]
+ if last <= 16:
+ charcodes = charcodes[:-last]
+ return bytes(charcodes).decode()
+
+
+def key_expansion(data):
+ """
+ Generate key schedule
+
+ @param {int[]} data 16/24/32-Byte cipher key
+ @returns {int[]} 176/208/240-Byte expanded key
+ """
+ data = data[:] # copy
+ rcon_iteration = 1
+ key_size_bytes = len(data)
+ expanded_key_size_bytes = (key_size_bytes // 4 + 7) * BLOCK_SIZE_BYTES
+
+ while len(data) < expanded_key_size_bytes:
+ temp = data[-4:]
+ temp = key_schedule_core(temp, rcon_iteration)
+ rcon_iteration += 1
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+ for _ in range(3):
+ temp = data[-4:]
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+ if key_size_bytes == 32:
+ temp = data[-4:]
+ temp = sub_bytes(temp)
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+
+ if key_size_bytes == 32:
+ rounds = 3
+ elif key_size_bytes == 24:
+ rounds = 2
+ else:
+ rounds = 0
+ for _ in range(rounds):
+ temp = data[-4:]
+ data += xor(temp, data[-key_size_bytes: 4 - key_size_bytes])
+ data = data[:expanded_key_size_bytes]
+
+ return data
+
+
+def aes_decrypt(data, expanded_key):
+ """
+ Decrypt one block with aes
+
+ @param {int[]} data 16-Byte cipher
+ @param {int[]} expanded_key 176/208/240-Byte expanded key
+ @returns {int[]} 16-Byte state
+ """
+ rounds = len(expanded_key) // BLOCK_SIZE_BYTES - 1
+
+ for i in range(rounds, 0, -1):
+ data = xor(
+ data,
+ expanded_key[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES]
+ )
+ if i != rounds:
+ data = mix_columns_inv(data)
+ data = shift_rows_inv(data)
+ data = sub_bytes_inv(data)
+ data = xor(data, expanded_key[:BLOCK_SIZE_BYTES])
+
+ return data
+
+
+RCON = (
+ 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36,
+)
+SBOX = (
+ 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5,
+ 0x30, 0x01, 0x67, 0x2B, 0xFE, 0xD7, 0xAB, 0x76,
+ 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0,
+ 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0,
+ 0xB7, 0xFD, 0x93, 0x26, 0x36, 0x3F, 0xF7, 0xCC,
+ 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15,
+ 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A,
+ 0x07, 0x12, 0x80, 0xE2, 0xEB, 0x27, 0xB2, 0x75,
+ 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0,
+ 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84,
+ 0x53, 0xD1, 0x00, 0xED, 0x20, 0xFC, 0xB1, 0x5B,
+ 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF,
+ 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85,
+ 0x45, 0xF9, 0x02, 0x7F, 0x50, 0x3C, 0x9F, 0xA8,
+ 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5,
+ 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2,
+ 0xCD, 0x0C, 0x13, 0xEC, 0x5F, 0x97, 0x44, 0x17,
+ 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73,
+ 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88,
+ 0x46, 0xEE, 0xB8, 0x14, 0xDE, 0x5E, 0x0B, 0xDB,
+ 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C,
+ 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79,
+ 0xE7, 0xC8, 0x37, 0x6D, 0x8D, 0xD5, 0x4E, 0xA9,
+ 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08,
+ 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6,
+ 0xE8, 0xDD, 0x74, 0x1F, 0x4B, 0xBD, 0x8B, 0x8A,
+ 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E,
+ 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E,
+ 0xE1, 0xF8, 0x98, 0x11, 0x69, 0xD9, 0x8E, 0x94,
+ 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF,
+ 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68,
+ 0x41, 0x99, 0x2D, 0x0F, 0xB0, 0x54, 0xBB, 0x16,
+)
+SBOX_INV = (
+ 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+ 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
+ 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+ 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
+ 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+ 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
+ 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+ 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
+ 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+ 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
+ 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+ 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
+ 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+ 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
+ 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+ 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
+ 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+ 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
+ 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+ 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
+ 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+ 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
+ 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+ 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
+ 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+ 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
+ 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+ 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
+ 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+ 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
+ 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+ 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
+)
+MIX_COLUMN_MATRIX = (
+ (0x2, 0x3, 0x1, 0x1),
+ (0x1, 0x2, 0x3, 0x1),
+ (0x1, 0x1, 0x2, 0x3),
+ (0x3, 0x1, 0x1, 0x2),
+)
+MIX_COLUMN_MATRIX_INV = (
+ (0xE, 0xB, 0xD, 0x9),
+ (0x9, 0xE, 0xB, 0xD),
+ (0xD, 0x9, 0xE, 0xB),
+ (0xB, 0xD, 0x9, 0xE),
+)
+RIJNDAEL_EXP_TABLE = (
+ 0x01, 0x03, 0x05, 0x0F, 0x11, 0x33, 0x55, 0xFF,
+ 0x1A, 0x2E, 0x72, 0x96, 0xA1, 0xF8, 0x13, 0x35,
+ 0x5F, 0xE1, 0x38, 0x48, 0xD8, 0x73, 0x95, 0xA4,
+ 0xF7, 0x02, 0x06, 0x0A, 0x1E, 0x22, 0x66, 0xAA,
+ 0xE5, 0x34, 0x5C, 0xE4, 0x37, 0x59, 0xEB, 0x26,
+ 0x6A, 0xBE, 0xD9, 0x70, 0x90, 0xAB, 0xE6, 0x31,
+ 0x53, 0xF5, 0x04, 0x0C, 0x14, 0x3C, 0x44, 0xCC,
+ 0x4F, 0xD1, 0x68, 0xB8, 0xD3, 0x6E, 0xB2, 0xCD,
+ 0x4C, 0xD4, 0x67, 0xA9, 0xE0, 0x3B, 0x4D, 0xD7,
+ 0x62, 0xA6, 0xF1, 0x08, 0x18, 0x28, 0x78, 0x88,
+ 0x83, 0x9E, 0xB9, 0xD0, 0x6B, 0xBD, 0xDC, 0x7F,
+ 0x81, 0x98, 0xB3, 0xCE, 0x49, 0xDB, 0x76, 0x9A,
+ 0xB5, 0xC4, 0x57, 0xF9, 0x10, 0x30, 0x50, 0xF0,
+ 0x0B, 0x1D, 0x27, 0x69, 0xBB, 0xD6, 0x61, 0xA3,
+ 0xFE, 0x19, 0x2B, 0x7D, 0x87, 0x92, 0xAD, 0xEC,
+ 0x2F, 0x71, 0x93, 0xAE, 0xE9, 0x20, 0x60, 0xA0,
+ 0xFB, 0x16, 0x3A, 0x4E, 0xD2, 0x6D, 0xB7, 0xC2,
+ 0x5D, 0xE7, 0x32, 0x56, 0xFA, 0x15, 0x3F, 0x41,
+ 0xC3, 0x5E, 0xE2, 0x3D, 0x47, 0xC9, 0x40, 0xC0,
+ 0x5B, 0xED, 0x2C, 0x74, 0x9C, 0xBF, 0xDA, 0x75,
+ 0x9F, 0xBA, 0xD5, 0x64, 0xAC, 0xEF, 0x2A, 0x7E,
+ 0x82, 0x9D, 0xBC, 0xDF, 0x7A, 0x8E, 0x89, 0x80,
+ 0x9B, 0xB6, 0xC1, 0x58, 0xE8, 0x23, 0x65, 0xAF,
+ 0xEA, 0x25, 0x6F, 0xB1, 0xC8, 0x43, 0xC5, 0x54,
+ 0xFC, 0x1F, 0x21, 0x63, 0xA5, 0xF4, 0x07, 0x09,
+ 0x1B, 0x2D, 0x77, 0x99, 0xB0, 0xCB, 0x46, 0xCA,
+ 0x45, 0xCF, 0x4A, 0xDE, 0x79, 0x8B, 0x86, 0x91,
+ 0xA8, 0xE3, 0x3E, 0x42, 0xC6, 0x51, 0xF3, 0x0E,
+ 0x12, 0x36, 0x5A, 0xEE, 0x29, 0x7B, 0x8D, 0x8C,
+ 0x8F, 0x8A, 0x85, 0x94, 0xA7, 0xF2, 0x0D, 0x17,
+ 0x39, 0x4B, 0xDD, 0x7C, 0x84, 0x97, 0xA2, 0xFD,
+ 0x1C, 0x24, 0x6C, 0xB4, 0xC7, 0x52, 0xF6, 0x01,
+)
+RIJNDAEL_LOG_TABLE = (
+ 0x00, 0x00, 0x19, 0x01, 0x32, 0x02, 0x1a, 0xc6,
+ 0x4b, 0xc7, 0x1b, 0x68, 0x33, 0xee, 0xdf, 0x03,
+ 0x64, 0x04, 0xe0, 0x0e, 0x34, 0x8d, 0x81, 0xef,
+ 0x4c, 0x71, 0x08, 0xc8, 0xf8, 0x69, 0x1c, 0xc1,
+ 0x7d, 0xc2, 0x1d, 0xb5, 0xf9, 0xb9, 0x27, 0x6a,
+ 0x4d, 0xe4, 0xa6, 0x72, 0x9a, 0xc9, 0x09, 0x78,
+ 0x65, 0x2f, 0x8a, 0x05, 0x21, 0x0f, 0xe1, 0x24,
+ 0x12, 0xf0, 0x82, 0x45, 0x35, 0x93, 0xda, 0x8e,
+ 0x96, 0x8f, 0xdb, 0xbd, 0x36, 0xd0, 0xce, 0x94,
+ 0x13, 0x5c, 0xd2, 0xf1, 0x40, 0x46, 0x83, 0x38,
+ 0x66, 0xdd, 0xfd, 0x30, 0xbf, 0x06, 0x8b, 0x62,
+ 0xb3, 0x25, 0xe2, 0x98, 0x22, 0x88, 0x91, 0x10,
+ 0x7e, 0x6e, 0x48, 0xc3, 0xa3, 0xb6, 0x1e, 0x42,
+ 0x3a, 0x6b, 0x28, 0x54, 0xfa, 0x85, 0x3d, 0xba,
+ 0x2b, 0x79, 0x0a, 0x15, 0x9b, 0x9f, 0x5e, 0xca,
+ 0x4e, 0xd4, 0xac, 0xe5, 0xf3, 0x73, 0xa7, 0x57,
+ 0xaf, 0x58, 0xa8, 0x50, 0xf4, 0xea, 0xd6, 0x74,
+ 0x4f, 0xae, 0xe9, 0xd5, 0xe7, 0xe6, 0xad, 0xe8,
+ 0x2c, 0xd7, 0x75, 0x7a, 0xeb, 0x16, 0x0b, 0xf5,
+ 0x59, 0xcb, 0x5f, 0xb0, 0x9c, 0xa9, 0x51, 0xa0,
+ 0x7f, 0x0c, 0xf6, 0x6f, 0x17, 0xc4, 0x49, 0xec,
+ 0xd8, 0x43, 0x1f, 0x2d, 0xa4, 0x76, 0x7b, 0xb7,
+ 0xcc, 0xbb, 0x3e, 0x5a, 0xfb, 0x60, 0xb1, 0x86,
+ 0x3b, 0x52, 0xa1, 0x6c, 0xaa, 0x55, 0x29, 0x9d,
+ 0x97, 0xb2, 0x87, 0x90, 0x61, 0xbe, 0xdc, 0xfc,
+ 0xbc, 0x95, 0xcf, 0xcd, 0x37, 0x3f, 0x5b, 0xd1,
+ 0x53, 0x39, 0x84, 0x3c, 0x41, 0xa2, 0x6d, 0x47,
+ 0x14, 0x2a, 0x9e, 0x5d, 0x56, 0xf2, 0xd3, 0xab,
+ 0x44, 0x11, 0x92, 0xd9, 0x23, 0x20, 0x2e, 0x89,
+ 0xb4, 0x7c, 0xb8, 0x26, 0x77, 0x99, 0xe3, 0xa5,
+ 0x67, 0x4a, 0xed, 0xde, 0xc5, 0x31, 0xfe, 0x18,
+ 0x0d, 0x63, 0x8c, 0x80, 0xc0, 0xf7, 0x70, 0x07,
+)
+
+
+def sub_bytes(data):
+ return [SBOX[x] for x in data]
+
+
+def sub_bytes_inv(data):
+ return [SBOX_INV[x] for x in data]
+
+
+def rotate(data):
+ return data[1:] + [data[0]]
+
+
+def key_schedule_core(data, rcon_iteration):
+ data = rotate(data)
+ data = sub_bytes(data)
+ data[0] = data[0] ^ RCON[rcon_iteration]
+ return data
+
+
+def xor(data1, data2):
+ return [x ^ y for x, y in zip(data1, data2)]
+
+
+def rijndael_mul(a, b):
+ if a == 0 or b == 0:
+ return 0
+ return RIJNDAEL_EXP_TABLE[
+ (RIJNDAEL_LOG_TABLE[a] + RIJNDAEL_LOG_TABLE[b]) % 0xFF
+ ]
+
+
+def mix_column(data, matrix):
+ data_mixed = []
+ for row in range(4):
+ mixed = 0
+ for column in range(4):
+ # xor is (+) and (-)
+ mixed ^= rijndael_mul(data[column], matrix[row][column])
+ data_mixed.append(mixed)
+ return data_mixed
+
+
+def mix_columns(data, matrix=MIX_COLUMN_MATRIX):
+ data_mixed = []
+ for i in range(4):
+ column = data[i * 4: (i + 1) * 4]
+ data_mixed += mix_column(column, matrix)
+ return data_mixed
+
+
+def mix_columns_inv(data):
+ return mix_columns(data, MIX_COLUMN_MATRIX_INV)
+
+
+def shift_rows_inv(data):
+ data_shifted = []
+ for column in range(4):
+ for row in range(4):
+ data_shifted.append(data[((column - row) & 0b11) * 4 + row])
+ return data_shifted
+
+
+__all__ = ['key_expansion', 'aes_cbc_decrypt', 'aes_cbc_decrypt_text']
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py
new file mode 100644
index 0000000..e6ba61a
--- /dev/null
+++ b/gallery_dl/cache.py
@@ -0,0 +1,204 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Decorators to keep function results in an in-memory and database cache"""
+
+import sqlite3
+import pickle
+import time
+import functools
+from . import config, util
+
+
+class CacheDecorator():
+ """Simplified in-memory cache"""
+ def __init__(self, func, keyarg):
+ self.func = func
+ self.cache = {}
+ self.keyarg = keyarg
+
+ def __get__(self, instance, cls):
+ return functools.partial(self.__call__, instance)
+
+ def __call__(self, *args, **kwargs):
+ key = "" if self.keyarg is None else args[self.keyarg]
+ try:
+ value = self.cache[key]
+ except KeyError:
+ value = self.cache[key] = self.func(*args, **kwargs)
+ return value
+
+ def update(self, key, value):
+ self.cache[key] = value
+
+ def invalidate(self, key):
+ try:
+ del self.cache[key]
+ except KeyError:
+ pass
+
+
+class MemoryCacheDecorator(CacheDecorator):
+ """In-memory cache"""
+ def __init__(self, func, keyarg, maxage):
+ CacheDecorator.__init__(self, func, keyarg)
+ self.maxage = maxage
+
+ def __call__(self, *args, **kwargs):
+ key = "" if self.keyarg is None else args[self.keyarg]
+ timestamp = int(time.time())
+ try:
+ value, expires = self.cache[key]
+ except KeyError:
+ expires = 0
+ if expires < timestamp:
+ value = self.func(*args, **kwargs)
+ expires = timestamp + self.maxage
+ self.cache[key] = value, expires
+ return value
+
+ def update(self, key, value):
+ self.cache[key] = value, int(time.time()) + self.maxage
+
+
+class DatabaseCacheDecorator():
+ """Database cache"""
+ db = None
+ _init = True
+
+ def __init__(self, func, keyarg, maxage):
+ self.key = "%s.%s" % (func.__module__, func.__name__)
+ self.func = func
+ self.cache = {}
+ self.keyarg = keyarg
+ self.maxage = maxage
+
+ def __get__(self, obj, objtype):
+ return functools.partial(self.__call__, obj)
+
+ def __call__(self, *args, **kwargs):
+ key = "" if self.keyarg is None else args[self.keyarg]
+ timestamp = int(time.time())
+
+ # in-memory cache lookup
+ try:
+ value, expires = self.cache[key]
+ if expires > timestamp:
+ return value
+ except KeyError:
+ pass
+
+ # database lookup
+ fullkey = "%s-%s" % (self.key, key)
+ cursor = self.cursor()
+ try:
+ cursor.execute("BEGIN EXCLUSIVE")
+ except sqlite3.OperationalError:
+ pass # Silently swallow exception - workaround for Python 3.6
+ try:
+ cursor.execute(
+ "SELECT value, expires FROM data WHERE key=? LIMIT 1",
+ (fullkey,),
+ )
+ result = cursor.fetchone()
+
+ if result and result[1] > timestamp:
+ value, expires = result
+ value = pickle.loads(value)
+ else:
+ value = self.func(*args, **kwargs)
+ expires = timestamp + self.maxage
+ cursor.execute(
+ "INSERT OR REPLACE INTO data VALUES (?,?,?)",
+ (fullkey, pickle.dumps(value), expires),
+ )
+ finally:
+ self.db.commit()
+ self.cache[key] = value, expires
+ return value
+
+ def update(self, key, value):
+ expires = int(time.time()) + self.maxage
+ self.cache[key] = value, expires
+ self.cursor().execute(
+ "INSERT OR REPLACE INTO data VALUES (?,?,?)",
+ ("%s-%s" % (self.key, key), pickle.dumps(value), expires),
+ )
+
+ def invalidate(self, key):
+ try:
+ del self.cache[key]
+ except KeyError:
+ pass
+ self.cursor().execute(
+ "DELETE FROM data WHERE key=? LIMIT 1",
+ ("%s-%s" % (self.key, key),),
+ )
+
+ def cursor(self):
+ if self._init:
+ self.db.execute(
+ "CREATE TABLE IF NOT EXISTS data "
+ "(key TEXT PRIMARY KEY, value TEXT, expires INTEGER)"
+ )
+ DatabaseCacheDecorator._init = False
+ return self.db.cursor()
+
+
+def memcache(maxage=None, keyarg=None):
+ if maxage:
+ def wrap(func):
+ return MemoryCacheDecorator(func, keyarg, maxage)
+ else:
+ def wrap(func):
+ return CacheDecorator(func, keyarg)
+ return wrap
+
+
+def cache(maxage=3600, keyarg=None):
+ def wrap(func):
+ return DatabaseCacheDecorator(func, keyarg, maxage)
+ return wrap
+
+
+def clear():
+ """Delete all database entries"""
+ db = DatabaseCacheDecorator.db
+
+ if db:
+ rowcount = 0
+ cursor = db.cursor()
+ try:
+ cursor.execute("DELETE FROM data")
+ except sqlite3.OperationalError:
+ pass # database is not initialized, can't be modified, etc.
+ else:
+ rowcount = cursor.rowcount
+ db.commit()
+ cursor.execute("VACUUM")
+ return rowcount
+
+ return None
+
+
+def _path():
+ path = config.get(("cache", "file"), -1)
+
+ if path == -1:
+ import tempfile
+ import os.path
+ return os.path.join(tempfile.gettempdir(), ".gallery-dl.cache")
+
+ return util.expand_path(path)
+
+
+try:
+ DatabaseCacheDecorator.db = sqlite3.connect(
+ _path(), timeout=30, check_same_thread=False)
+except (TypeError, sqlite3.OperationalError):
+ cache = memcache # noqa: F811
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
new file mode 100644
index 0000000..b9bf32d
--- /dev/null
+++ b/gallery_dl/cloudflare.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Methods to access sites behind Cloudflare protection"""
+
+import re
+import time
+import operator
+import collections
+import urllib.parse
+from . import text, exception
+from .cache import memcache
+
+
+def is_challenge(response):
+ return (response.status_code == 503 and
+ response.headers.get("Server", "").startswith("cloudflare") and
+ b"jschl-answer" in response.content)
+
+
+def is_captcha(response):
+ return (response.status_code == 403 and
+ b'name="captcha-bypass"' in response.content)
+
+
+def solve_challenge(session, response, kwargs):
+    """Solve Cloudflare challenge and get cf_clearance cookie"""
+ parsed = urllib.parse.urlsplit(response.url)
+ root = parsed.scheme + "://" + parsed.netloc
+
+ cf_kwargs = {}
+ headers = cf_kwargs["headers"] = collections.OrderedDict()
+ params = cf_kwargs["params"] = collections.OrderedDict()
+
+ page = response.text
+ params["s"] = text.extract(page, 'name="s" value="', '"')[0]
+ params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
+ params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
+ params["jschl_answer"] = solve_js_challenge(page, parsed.netloc)
+ headers["Referer"] = response.url
+
+ time.sleep(4)
+
+ url = root + "/cdn-cgi/l/chk_jschl"
+ cf_kwargs["allow_redirects"] = False
+ cf_response = session.request("GET", url, **cf_kwargs)
+
+ location = cf_response.headers.get("Location")
+ if not location:
+ import logging
+ log = logging.getLogger("cloudflare")
+ rtype = "CAPTCHA" if is_captcha(cf_response) else "Unexpected"
+ log.error("%s response", rtype)
+ log.debug("Headers:\n%s", cf_response.headers)
+ log.debug("Content:\n%s", cf_response.text)
+ raise exception.StopExtraction()
+
+ if location[0] == "/":
+ location = root + location
+ else:
+ location = re.sub(r"(https?):/(?!/)", r"\1://", location)
+
+ for cookie in cf_response.cookies:
+ if cookie.name == "cf_clearance":
+ return location, cookie.domain, {
+ cookie.name: cookie.value,
+ "__cfduid" : response.cookies.get("__cfduid", ""),
+ }
+ return location, "", {}
+
+
+def solve_js_challenge(page, netloc):
+ """Evaluate JS challenge in 'page' to get 'jschl_answer' value"""
+
+ # build variable name
+ # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
+ data, pos = text.extract_all(page, (
+ ('var' , ',f, ', '='),
+ ('key' , '"' , '"'),
+ ('expr', ':' , '}'),
+ ))
+ variable = "{}.{}".format(data["var"], data["key"])
+ vlength = len(variable)
+
+ # evaluate the initial expression
+ solution = evaluate_expression(data["expr"], page, netloc)
+
+    # iterate over all remaining expressions
+ # and combine their values in 'solution'
+ expressions = text.extract(
+ page, "'challenge-form');", "f.submit();", pos)[0]
+ for expr in expressions.split(";")[1:]:
+
+ if expr.startswith(variable):
+            # select arithmetic function based on operator (+/-/*)
+ func = OPERATORS[expr[vlength]]
+ # evaluate the rest of the expression
+ value = evaluate_expression(expr[vlength+2:], page, netloc)
+ # combine expression value with our current solution
+ solution = func(solution, value)
+
+ elif expr.startswith("a.value"):
+ if "t.length)" in expr:
+ # add length of hostname
+ solution += len(netloc)
+ if ".toFixed(" in expr:
+ # trim solution to 10 decimal places
+ # and strip trailing zeros
+ solution = "{:.10f}".format(solution).rstrip("0")
+ return solution
+
+
+def evaluate_expression(expr, page, netloc, *,
+ split_re=re.compile(r"[(+]+([^)]*)\)")):
+ """Evaluate a single Javascript expression for the challenge"""
+
+ if expr.startswith("function(p)"):
+ # get HTML element with ID k and evaluate the expression inside
+ # 'eval(eval("document.getElementById(k).innerHTML"))'
+ k, pos = text.extract(page, "k = '", "'")
+ e, pos = text.extract(page, 'id="'+k+'"', '<')
+ return evaluate_expression(e.partition(">")[2], page, netloc)
+
+ if "/" in expr:
+ # split the expression in numerator and denominator subexpressions,
+ # evaluate them separately,
+ # and return their fraction-result
+ num, _, denom = expr.partition("/")
+ num = evaluate_expression(num, page, netloc)
+ denom = evaluate_expression(denom, page, netloc)
+ return num / denom
+
+ if "function(p)" in expr:
+ # split initial expression and function code
+ initial, _, func = expr.partition("function(p)")
+ # evaluate said expression
+ initial = evaluate_expression(initial, page, netloc)
+ # get function argument and use it as index into 'netloc'
+ index = evaluate_expression(func[func.index("}")+1:], page, netloc)
+ return initial + ord(netloc[int(index)])
+
+ # iterate over all subexpressions,
+ # evaluate them,
+ # and accumulate their values in 'result'
+ result = ""
+ for subexpr in split_re.findall(expr) or (expr,):
+ result += str(sum(
+ VALUES[part]
+ for part in subexpr.split("[]")
+ ))
+ return int(result)
+
+
+OPERATORS = {
+ "+": operator.add,
+ "-": operator.sub,
+ "*": operator.mul,
+}
+
+VALUES = {
+ "": 0,
+ "+": 0,
+ "!+": 1,
+ "!!": 1,
+ "+!!": 1,
+}
+
+
+@memcache(keyarg=0)
+def cookies(category):
+ return None
diff --git a/gallery_dl/config.py b/gallery_dl/config.py
new file mode 100644
index 0000000..da52f1e
--- /dev/null
+++ b/gallery_dl/config.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Global configuration module"""
+
+import sys
+import json
+import os.path
+import logging
+from . import util
+
+log = logging.getLogger("config")
+
+
+# --------------------------------------------------------------------
+# internals
+
+_config = {}
+
+if os.name == "nt":
+ _default_configs = [
+ r"%USERPROFILE%\gallery-dl\config.json",
+ r"%USERPROFILE%\gallery-dl.conf",
+ ]
+else:
+ _default_configs = [
+ "/etc/gallery-dl.conf",
+ "${HOME}/.config/gallery-dl/config.json",
+ "${HOME}/.gallery-dl.conf",
+ ]
+
+
+# --------------------------------------------------------------------
+# public interface
+
+def load(files=None, strict=False, fmt="json"):
+    """Load configuration files (JSON or YAML)"""
+ if fmt == "yaml":
+ try:
+ import yaml
+ parsefunc = yaml.safe_load
+ except ImportError:
+ log.error("Could not import 'yaml' module")
+ return
+ else:
+ parsefunc = json.load
+
+ for path in files or _default_configs:
+ path = util.expand_path(path)
+ try:
+ with open(path, encoding="utf-8") as file:
+ confdict = parsefunc(file)
+ except OSError as exc:
+ if strict:
+ log.error("%s", exc)
+ sys.exit(1)
+ except Exception as exc:
+ log.warning("Could not parse '%s': %s", path, exc)
+ if strict:
+ sys.exit(2)
+ else:
+ if not _config:
+ _config.update(confdict)
+ else:
+ util.combine_dict(_config, confdict)
+
+
+def clear():
+ """Reset configuration to an empty state"""
+ _config.clear()
+
+
+def get(keys, default=None, conf=_config):
+ """Get the value of property 'key' or a default value"""
+ try:
+ for k in keys:
+ conf = conf[k]
+ return conf
+ except (KeyError, AttributeError):
+ return default
+
+
+def interpolate(keys, default=None, conf=_config):
+ """Interpolate the value of 'key'"""
+ try:
+ lkey = keys[-1]
+ if lkey in conf:
+ return conf[lkey]
+ for k in keys:
+ if lkey in conf:
+ default = conf[lkey]
+ conf = conf[k]
+ return conf
+ except (KeyError, AttributeError):
+ return default
+
+
+def set(keys, value, conf=_config):
+ """Set the value of property 'key' for this session"""
+ for k in keys[:-1]:
+ try:
+ conf = conf[k]
+ except KeyError:
+ temp = {}
+ conf[k] = temp
+ conf = temp
+ conf[keys[-1]] = value
+
+
+def setdefault(keys, value, conf=_config):
+ """Set the value of property 'key' if it doesn't exist"""
+ for k in keys[:-1]:
+ try:
+ conf = conf[k]
+ except KeyError:
+ temp = {}
+ conf[k] = temp
+ conf = temp
+ return conf.setdefault(keys[-1], value)
+
+
+def unset(keys, conf=_config):
+ """Unset the value of property 'key'"""
+ try:
+ for k in keys[:-1]:
+ conf = conf[k]
+ del conf[keys[-1]]
+ except (KeyError, AttributeError):
+ pass
+
+
+class apply():
+ """Context Manager: apply a collection of key-value pairs"""
+ _sentinel = object()
+
+ def __init__(self, kvlist):
+ self.original = []
+ self.kvlist = kvlist
+
+ def __enter__(self):
+ for key, value in self.kvlist:
+ self.original.append((key, get(key, self._sentinel)))
+ set(key, value)
+
+ def __exit__(self, etype, value, traceback):
+ for key, value in self.original:
+ if value is self._sentinel:
+ unset(key)
+ else:
+ set(key, value)
diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py
new file mode 100644
index 0000000..97972cd
--- /dev/null
+++ b/gallery_dl/downloader/__init__.py
@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader modules"""
+
+import importlib
+
+modules = [
+ "http",
+ "text",
+ "ytdl",
+]
+
+
+def find(scheme):
+ """Return downloader class suitable for handling the given scheme"""
+ try:
+ return _cache[scheme]
+ except KeyError:
+ klass = None
+ try:
+ if scheme in modules: # prevent unwanted imports
+ module = importlib.import_module("." + scheme, __package__)
+ klass = module.__downloader__
+ except (ImportError, AttributeError, TypeError):
+ pass
+ _cache[scheme] = klass
+ return klass
+
+
+# --------------------------------------------------------------------
+# internals
+
+_cache = {}
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
new file mode 100644
index 0000000..4803c85
--- /dev/null
+++ b/gallery_dl/downloader/common.py
@@ -0,0 +1,170 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by downloader modules."""
+
+import os
+import time
+import logging
+from .. import config, util, exception
+from requests.exceptions import RequestException
+from ssl import SSLError
+
+
+class DownloaderBase():
+ """Base class for downloaders"""
+ scheme = ""
+ retries = 1
+
+ def __init__(self, extractor, output):
+ self.session = extractor.session
+ self.out = output
+ self.log = logging.getLogger("downloader." + self.scheme)
+ self.downloading = False
+ self.part = self.config("part", True)
+ self.partdir = self.config("part-directory")
+
+ if self.partdir:
+ self.partdir = util.expand_path(self.partdir)
+ os.makedirs(self.partdir, exist_ok=True)
+
+ def config(self, key, default=None):
+ """Interpolate config value for 'key'"""
+ return config.interpolate(("downloader", self.scheme, key), default)
+
+ def download(self, url, pathfmt):
+ """Download the resource at 'url' and write it to a file-like object"""
+ try:
+ return self.download_impl(url, pathfmt)
+ except Exception:
+ print()
+ raise
+ finally:
+ # remove file from incomplete downloads
+ if self.downloading and not self.part:
+ try:
+ os.remove(pathfmt.temppath)
+ except (OSError, AttributeError):
+ pass
+
+ def download_impl(self, url, pathfmt):
+        """Actual implementation of the download process"""
+ adj_ext = None
+ tries = 0
+ msg = ""
+
+ if self.part:
+ pathfmt.part_enable(self.partdir)
+
+ while True:
+ self.reset()
+ if tries:
+ self.log.warning("%s (%d/%d)", msg, tries, self.retries)
+ if tries >= self.retries:
+ return False
+ time.sleep(tries)
+ tries += 1
+
+ # check for .part file
+ filesize = pathfmt.part_size()
+
+ # connect to (remote) source
+ try:
+ offset, size = self.connect(url, filesize)
+ except exception.DownloadRetry as exc:
+ msg = exc
+ continue
+ except exception.DownloadComplete:
+ break
+ except Exception as exc:
+ self.log.warning(exc)
+ return False
+
+ # check response
+ if not offset:
+ mode = "w+b"
+ if filesize:
+ self.log.info("Unable to resume partial download")
+ else:
+ mode = "r+b"
+ self.log.info("Resuming download at byte %d", offset)
+
+ # set missing filename extension
+ if not pathfmt.has_extension:
+ pathfmt.set_extension(self.get_extension())
+ if pathfmt.exists():
+ pathfmt.temppath = ""
+ return True
+
+ self.out.start(pathfmt.path)
+ self.downloading = True
+ with pathfmt.open(mode) as file:
+ if offset:
+ file.seek(offset)
+
+ # download content
+ try:
+ self.receive(file)
+ except (RequestException, SSLError) as exc:
+ msg = exc
+ print()
+ continue
+
+ # check filesize
+ if size and file.tell() < size:
+ msg = "filesize mismatch ({} < {})".format(
+ file.tell(), size)
+ continue
+
+ # check filename extension
+ adj_ext = self._check_extension(file, pathfmt)
+
+ break
+
+ self.downloading = False
+ if adj_ext:
+ pathfmt.set_extension(adj_ext)
+ return True
+
+ def connect(self, url, offset):
+ """Connect to 'url' while respecting 'offset' if possible
+
+ Returns a 2-tuple containing the actual offset and expected filesize.
+ If the returned offset-value is greater than zero, all received data
+ will be appended to the existing .part file.
+ Return '0' as the second tuple field to indicate an unknown filesize.
+ """
+
+ def receive(self, file):
+ """Write data to 'file'"""
+
+ def reset(self):
+ """Reset internal state / cleanup"""
+
+ def get_extension(self):
+ """Return a filename extension appropriate for the current request"""
+
+ @staticmethod
+ def _check_extension(file, pathfmt):
+ """Check filename extension against fileheader"""
+ extension = pathfmt.keywords["extension"]
+ if extension in FILETYPE_CHECK:
+ file.seek(0)
+ header = file.read(8)
+ if len(header) >= 8 and not FILETYPE_CHECK[extension](header):
+ for ext, check in FILETYPE_CHECK.items():
+ if ext != extension and check(header):
+ return ext
+ return None
+
+
+FILETYPE_CHECK = {
+ "jpg": lambda h: h[0:2] == b"\xff\xd8",
+ "png": lambda h: h[0:8] == b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a",
+ "gif": lambda h: h[0:4] == b"GIF8" and h[5] == 97,
+}
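
The signature table above is what lets download_impl() swap in a correct extension when the declared one does not match the file contents; a hedged sketch of the same check:

    from gallery_dl.downloader.common import FILETYPE_CHECK

    header = b"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a"   # PNG signature (8 bytes)
    assert not FILETYPE_CHECK["jpg"](header)       # declared "jpg" fails its check
    assert FILETYPE_CHECK["png"](header)           # so _check_extension() returns "png"
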
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
new file mode 100644
index 0000000..961c1a2
--- /dev/null
+++ b/gallery_dl/downloader/http.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for http:// and https:// URLs"""
+
+import time
+import mimetypes
+from requests.exceptions import ConnectionError, Timeout
+from .common import DownloaderBase
+from .. import text, exception
+
+
+class HttpDownloader(DownloaderBase):
+ scheme = "http"
+
+ def __init__(self, extractor, output):
+ DownloaderBase.__init__(self, extractor, output)
+ self.response = None
+ self.retries = self.config("retries", extractor._retries)
+ self.timeout = self.config("timeout", extractor._timeout)
+ self.verify = self.config("verify", extractor._verify)
+ self.rate = self.config("rate")
+ self.chunk_size = 16384
+
+ if self.rate:
+ self.rate = text.parse_bytes(self.rate)
+ if not self.rate:
+ self.log.warning("Invalid rate limit specified")
+ elif self.rate < self.chunk_size:
+ self.chunk_size = self.rate
+
+ def connect(self, url, offset):
+ headers = {}
+ if offset:
+ headers["Range"] = "bytes={}-".format(offset)
+
+ try:
+ self.response = self.session.request(
+ "GET", url, stream=True, headers=headers, allow_redirects=True,
+ timeout=self.timeout, verify=self.verify)
+ except (ConnectionError, Timeout) as exc:
+ raise exception.DownloadRetry(exc)
+
+ code = self.response.status_code
+ if code == 200: # OK
+ offset = 0
+ size = self.response.headers.get("Content-Length")
+ elif code == 206: # Partial Content
+ size = self.response.headers["Content-Range"].rpartition("/")[2]
+ elif code == 416: # Requested Range Not Satisfiable
+ raise exception.DownloadComplete()
+ elif code == 429 or 500 <= code < 600:  # Too Many Requests or Server Error
+ raise exception.DownloadRetry(
+ "{} Server Error: {} for url: {}".format(
+ code, self.response.reason, url))
+ else:
+ self.response.raise_for_status()
+
+ return offset, text.parse_int(size)
+
+ def receive(self, file):
+ if self.rate:
+ total = 0  # total number of bytes received
+ start = time.time() # start time
+
+ for data in self.response.iter_content(self.chunk_size):
+ file.write(data)
+
+ if self.rate:
+ total += len(data)
+ expected = total / self.rate # expected elapsed time
+ delta = time.time() - start # actual elapsed time since start
+ if delta < expected:
+ # sleep if less time passed than expected
+ time.sleep(expected - delta)
+
+ def reset(self):
+ if self.response:
+ self.response.close()
+ self.response = None
+
+ def get_extension(self):
+ mtype = self.response.headers.get("Content-Type", "image/jpeg")
+ mtype = mtype.partition(";")[0]
+
+ if mtype in MIMETYPE_MAP:
+ return MIMETYPE_MAP[mtype]
+
+ exts = mimetypes.guess_all_extensions(mtype, strict=False)
+ if exts:
+ exts.sort()
+ return exts[-1][1:]
+
+ self.log.warning(
+ "No filename extension found for MIME type '%s'", mtype)
+ return "txt"
+
+
+MIMETYPE_MAP = {
+ "image/jpeg": "jpg",
+ "image/jpg": "jpg",
+ "image/png": "png",
+ "image/gif": "gif",
+ "image/bmp": "bmp",
+ "image/webp": "webp",
+ "image/svg+xml": "svg",
+
+ "video/webm": "webm",
+ "video/ogg": "ogg",
+ "video/mp4": "mp4",
+
+ "audio/wav": "wav",
+ "audio/x-wav": "wav",
+ "audio/webm": "webm",
+ "audio/ogg": "ogg",
+ "audio/mpeg": "mp3",
+
+ "application/ogg": "ogg",
+ "application/octet-stream": "bin",
+}
+
+
+__downloader__ = HttpDownloader
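
The throttling in receive() is plain bookkeeping: after each chunk it compares the time the transfer should have taken at the configured rate with the time actually elapsed and sleeps off the difference (chunk_size is clamped to the rate in __init__ so a single chunk never exceeds the per-second budget). A hedged sketch of the arithmetic with made-up numbers:

    rate = 100000            # bytes per second ("rate" option, already parsed)
    total = 350000           # bytes written so far
    expected = total / rate  # 3.5 s -- minimum duration at the configured rate
    delta = 2.1              # seconds actually elapsed since the download started
    if delta < expected:
        pause = expected - delta   # receive() sleeps 1.4 s before the next chunk
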
diff --git a/gallery_dl/downloader/text.py b/gallery_dl/downloader/text.py
new file mode 100644
index 0000000..ca33863
--- /dev/null
+++ b/gallery_dl/downloader/text.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for text: URLs"""
+
+from .common import DownloaderBase
+
+
+class TextDownloader(DownloaderBase):
+ scheme = "text"
+
+ def __init__(self, extractor, output):
+ DownloaderBase.__init__(self, extractor, output)
+ self.content = b""
+
+ def connect(self, url, offset):
+ data = url.encode()
+ self.content = data[offset + 5:]
+ return offset, len(data) - 5
+
+ def receive(self, file):
+ file.write(self.content)
+
+ def reset(self):
+ self.content = b""
+
+ @staticmethod
+ def get_extension():
+ return "txt"
+
+
+__downloader__ = TextDownloader
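
"text:" URLs carry their payload inline: connect() drops the 5-byte "text:" prefix (plus any resume offset) and reports the remaining length, so the whole "download" is a single write in receive(). A hedged sketch:

    url = "text:hello world"   # hypothetical text: URL
    data = url.encode()
    content = data[0 + 5:]     # b"hello world" -- prefix removed, offset 0
    size = len(data) - 5       # 11, the filesize reported back to download_impl()
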
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
new file mode 100644
index 0000000..57a84d0
--- /dev/null
+++ b/gallery_dl/downloader/ytdl.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Downloader module for URLs requiring youtube-dl support"""
+
+from youtube_dl import YoutubeDL
+from .common import DownloaderBase
+from .. import text
+import os
+
+
+class YoutubeDLDownloader(DownloaderBase):
+ scheme = "ytdl"
+
+ def __init__(self, extractor, output):
+ DownloaderBase.__init__(self, extractor, output)
+
+ options = {
+ "format": self.config("format") or None,
+ "ratelimit": text.parse_bytes(self.config("rate"), None),
+ "retries": self.config("retries", extractor._retries),
+ "socket_timeout": self.config("timeout", extractor._timeout),
+ "nocheckcertificate": not self.config("verify", extractor._verify),
+ "nopart": not self.part,
+ }
+ options.update(self.config("raw-options") or {})
+
+ if self.config("logging", True):
+ options["logger"] = self.log
+
+ self.ytdl = YoutubeDL(options)
+
+ def download(self, url, pathfmt):
+ try:
+ info_dict = self.ytdl.extract_info(url[5:], download=False)
+ except Exception:
+ return False
+
+ if "entries" in info_dict:
+ index = pathfmt.keywords.get("_ytdl_index")
+ if index is None:
+ return self._download_playlist(pathfmt, info_dict)
+ else:
+ info_dict = info_dict["entries"][index]
+ return self._download_video(pathfmt, info_dict)
+
+ def _download_video(self, pathfmt, info_dict):
+ if "url" in info_dict:
+ text.nameext_from_url(info_dict["url"], pathfmt.keywords)
+ pathfmt.set_extension(info_dict["ext"])
+ if pathfmt.exists():
+ pathfmt.temppath = ""
+ return True
+ if self.part and self.partdir:
+ pathfmt.temppath = os.path.join(
+ self.partdir, pathfmt.filename)
+ self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%")
+
+ self.out.start(pathfmt.path)
+ try:
+ self.ytdl.process_info(info_dict)
+ except Exception:
+ self.log.debug("Traceback", exc_info=True)
+ return False
+ return True
+
+ def _download_playlist(self, pathfmt, info_dict):
+ pathfmt.set_extension("%(playlist_index)s.%(ext)s")
+ self.ytdl.params["outtmpl"] = pathfmt.realpath
+
+ for entry in info_dict["entries"]:
+ self.ytdl.process_info(entry)
+ return True
+
+
+__downloader__ = YoutubeDLDownloader
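
One detail worth noting above: the output template handed to youtube-dl doubles every percent sign because YoutubeDL expands "%(field)s" placeholders in outtmpl, so a literal "%" in the target path must be escaped first. A hedged illustration with a hypothetical path:

    temppath = "/tmp/gallery-dl/clip 100%.mp4.part"
    outtmpl = temppath.replace("%", "%%")   # ".../clip 100%%.mp4.part"
    # youtube-dl's template expansion turns "%%" back into a literal "%"
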
diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py
new file mode 100644
index 0000000..3e86177
--- /dev/null
+++ b/gallery_dl/exception.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Exception classes used by gallery-dl
+
+Class Hierarchy:
+
+Exception
+ +-- GalleryDLException
+ +-- ExtractionError
+ | +-- AuthenticationError
+ | +-- AuthorizationError
+ | +-- NotFoundError
+ | +-- HttpError
+ +-- DownloadError
+ | +-- DownloadComplete
+ | +-- DownloadRetry
+ +-- NoExtractorError
+ +-- FormatError
+ +-- FilterError
+ +-- StopExtraction
+"""
+
+
+class GalleryDLException(Exception):
+ """Base class for GalleryDL exceptions"""
+
+
+class ExtractionError(GalleryDLException):
+ """Base class for exceptions during information extraction"""
+
+
+class AuthenticationError(ExtractionError):
+ """Invalid or missing login information"""
+
+
+class AuthorizationError(ExtractionError):
+ """Insufficient privileges to access a resource"""
+
+
+class NotFoundError(ExtractionError):
+ """Requested resource (gallery/image) does not exist"""
+
+
+class HttpError(ExtractionError):
+ """HTTP request during extraction failed"""
+
+
+class DownloadError(GalleryDLException):
+ """Base class for exceptions during file downloads"""
+
+
+class DownloadRetry(DownloadError):
+ """Download attempt failed and should be retried"""
+
+
+class DownloadComplete(DownloadError):
+ """Output file of attempted download is already complete"""
+
+
+class NoExtractorError(GalleryDLException):
+ """No extractor can handle the given URL"""
+
+
+class FormatError(GalleryDLException):
+ """Error while building output path"""
+
+
+class FilterError(GalleryDLException):
+ """Error while evaluating a filter expression"""
+
+
+class StopExtraction(GalleryDLException):
+ """Extraction should stop"""
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
new file mode 100644
index 0000000..8df8645
--- /dev/null
+++ b/gallery_dl/extractor/2chan.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.2chan.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class FutabaThreadExtractor(Extractor):
+ """Extractor for images from threads on www.2chan.net"""
+ category = "2chan"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board_name}", "{thread}")
+ filename_fmt = "{tim}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
+ pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
+ test = ("http://dec.2chan.net/70/res/947.htm", {
+ "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
+ "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.server, self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "https://{}.2chan.net/{}/res/{}.htm".format(
+ self.server, self.board, self.thread)
+ page = self.request(url).text
+ data = self.metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in self.posts(page):
+ if "filename" not in post:
+ continue
+ post.update(data)
+ url = self.url_fmt.format_map(post)
+ yield Message.Url, url, post
+
+ def metadata(self, page):
+ """Collect metadata for extractor-job"""
+ title = text.extract(page, "<title>", "</title>")[0]
+ title, _, boardname = title.rpartition(" - ")
+ return {
+ "server": self.server,
+ "title": title,
+ "board": self.board,
+ "board_name": boardname[:-4],
+ "thread": self.thread,
+ }
+
+ def posts(self, page):
+ """Build a list of all post-objects"""
+ page = text.extract(
+ page, '<div class="thre"', '<div style="clear:left"></div>')[0]
+ return [
+ self.parse(post)
+ for post in page.split('<table border=0>')
+ ]
+
+ def parse(self, post):
+ """Build post-object by extracting data from an HTML post"""
+ data = self._extract_post(post)
+ if '<a href="/' in post:
+ self._extract_image(post, data)
+ data["tim"], _, data["extension"] = data["filename"].partition(".")
+ data["time"] = data["tim"][:-3]
+ data["ext"] = "." + data["extension"]
+ return data
+
+ @staticmethod
+ def _extract_post(post):
+ return text.extract_all(post, (
+ ("no" , 'name="', '"'),
+ ("post", '<b>', '</b>'),
+ ("name", '<b>', ' </b>'),
+ ("now" , '</font> ', ' '),
+ (None , '<blockquote', ''),
+ ("com" , '>', '</blockquote>'),
+ ))[0]
+
+ @staticmethod
+ def _extract_image(post, data):
+ text.extract_all(post, (
+ (None , '_blank', ''),
+ ("filename", '>', '<'),
+ ("fsize" , '(', ' '),
+ ), 0, data)
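
items() above speaks the extractor/job protocol used throughout the package: one (Message.Version, 1) tuple, one (Message.Directory, metadata) tuple, then a (Message.Url, url, metadata) tuple per file. A hedged consumer sketch (the URL is the one from the test tuple above):

    from gallery_dl import extractor
    from gallery_dl.extractor.common import Message

    extr = extractor.find("http://dec.2chan.net/70/res/947.htm")
    for msg in extr.items():
        if msg[0] == Message.Url:
            url, metadata = msg[1], msg[2]   # one entry per post with a file
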
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
new file mode 100644
index 0000000..50dbfe8
--- /dev/null
+++ b/gallery_dl/extractor/35photo.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://35photo.pro/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _35photoExtractor(Extractor):
+ category = "35photo"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{id}{title:?_//}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ root = "https://35photo.pro"
+
+ def items(self):
+ first = True
+ data = self.metadata()
+ yield Message.Version, 1
+
+ for photo_id in self.photos():
+ for photo in self._photo_data(photo_id):
+ photo.update(data)
+ url = photo["url"]
+ if first:
+ first = False
+ yield Message.Directory, photo
+ yield Message.Url, url, text.nameext_from_url(url, photo)
+
+ def metadata(self):
+ """Returns general metadata"""
+ return {}
+
+ def photos(self):
+ """Returns an iterable containing all relevant photo IDs"""
+
+ def _pagination(self, params, extra_ids=None):
+ url = "https://35photo.pro/show_block.php"
+ headers = {"Referer": self.root, "X-Requested-With": "XMLHttpRequest"}
+ params["type"] = "getNextPageData"
+
+ if "lastId" not in params:
+ params["lastId"] = "999999999"
+ if extra_ids:
+ yield from extra_ids
+ while params["lastId"]:
+ data = self.request(url, headers=headers, params=params).json()
+ yield from self._photo_ids(data["data"])
+ params["lastId"] = data["lastId"]
+
+ def _photo_data(self, photo_id):
+ params = {"method": "photo.getData", "photoId": photo_id}
+ data = self.request(
+ "https://api.35photo.pro/", params=params).json()["data"][photo_id]
+ info = {
+ "url" : data["src"],
+ "id" : data["photo_id"],
+ "title" : data["photo_name"],
+ "description": data["photo_desc"],
+ "tags" : data["tags"] or [],
+ "views" : data["photo_see"],
+ "favorites" : data["photo_fav"],
+ "score" : data["photo_rating"],
+ "type" : data["photo_type"],
+ "date" : data["timeAdd"],
+ "user" : data["user_login"],
+ "user_id" : data["user_id"],
+ "user_name" : data["user_name"],
+ "other" : data["otherData"],
+ }
+
+ if "series" in data:
+ for info["num"], photo in enumerate(data["series"], 1):
+ info["url"] = photo["src"]
+ info["id_series"] = text.parse_int(photo["id"])
+ info["title_series"] = photo["title"] or ""
+ yield info.copy()
+ else:
+ info["num"] = 1
+ yield info
+
+ @staticmethod
+ def _photo_ids(page):
+ """Extract unique photo IDs and return them as sorted list"""
+ # searching for photo-id="..." doesn't always work (see unit tests)
+ return sorted(
+ set(text.extract_iter(page, "/photo_", "/")),
+ key=text.parse_int,
+ reverse=True,
+ )
+
+
+class _35photoUserExtractor(_35photoExtractor):
+ """Extractor for all images of a user on 35photo.pro"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro"
+ r"/(?!photo_|genre_)([^/?&#]+)")
+ test = (
+ ("https://35photo.pro/liya", {
+ "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg",
+ "count": 9,
+ }),
+ ("https://35photo.pro/suhoveev", {
+ # last photo ID (1267028) isn't given as 'photo-id="<id>"
+ # there are only 23 photos without the last one
+ "count": ">= 33",
+ }),
+ ("https://en.35photo.pro/liya"),
+ ("https://ru.35photo.pro/liya"),
+ )
+
+ def __init__(self, match):
+ _35photoExtractor.__init__(self, match)
+ self.user = match.group(1)
+ self.user_id = 0
+
+ def metadata(self):
+ url = "{}/{}/".format(self.root, self.user)
+ page = self.request(url).text
+ self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0])
+ return {
+ "user": self.user,
+ "user_id": self.user_id,
+ }
+
+ def photos(self):
+ return self._pagination({
+ "page": "photoUser",
+ "user_id": self.user_id,
+ })
+
+
+class _35photoGenreExtractor(_35photoExtractor):
+ """Extractor for images of a specific genre on 35photo.pro"""
+ subcategory = "genre"
+ directory_fmt = ("{category}", "Genre", "{genre}")
+ archive_fmt = "g{genre_id}_{id}_{num}"
+ pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/genre_(\d+)(/new/)?"
+ test = (
+ ("https://35photo.pro/genre_109/", {
+ "range": "1-30",
+ }),
+ ("https://35photo.pro/genre_109/new/"),
+ )
+
+ def __init__(self, match):
+ _35photoExtractor.__init__(self, match)
+ self.genre_id, self.new = match.groups()
+ self.photo_ids = None
+
+ def metadata(self):
+ url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/")
+ page = self.request(url).text
+ self.photo_ids = self._photo_ids(text.extract(
+ page, ' class="photo', '\n')[0])
+ return {
+ "genre": text.extract(page, " genre - ", ". ")[0],
+ "genre_id": text.parse_int(self.genre_id),
+ }
+
+ def photos(self):
+ return self._pagination({
+ "page": "genre",
+ "community_id": self.genre_id,
+ "photo_rating": "0" if self.new else "50",
+ "lastId": self.photo_ids[-1],
+ }, self.photo_ids)
+
+
+class _35photoImageExtractor(_35photoExtractor):
+ """Extractor for individual images from 35photo.pro"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/photo_(\d+)"
+ test = ("https://35photo.pro/photo_753340/", {
+ "count": 1,
+ "keyword": {
+ "url" : r"re:https://m\d+.35photo.pro/photos_main/.*.jpg",
+ "id" : 753340,
+ "title" : "Winter walk",
+ "description": str,
+ "tags" : list,
+ "views" : int,
+ "favorites" : int,
+ "score" : int,
+ "type" : 0,
+ "date" : "15 авг, 2014",
+ "user" : "liya",
+ "user_id" : 20415,
+ "user_name" : "Liya Mirzaeva",
+ "other" : str,
+ },
+ })
+
+ def __init__(self, match):
+ _35photoExtractor.__init__(self, match)
+ self.photo_id = match.group(1)
+
+ def photos(self):
+ return (self.photo_id,)
diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py
new file mode 100644
index 0000000..d0e59ad
--- /dev/null
+++ b/gallery_dl/extractor/3dbooru.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://behoimi.org/"""
+
+from . import booru
+
+
+class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for 3dbooru extractors"""
+ category = "3dbooru"
+ api_url = "http://behoimi.org/post/index.json"
+ post_url = "http://behoimi.org/post/show/{}"
+ page_limit = 1000
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.session.headers.update({
+ "Referer": "http://behoimi.org/post/show/",
+ "Accept-Encoding": "identity",
+ })
+
+
+class ThreedeebooruTagExtractor(booru.TagMixin,
+ ThreedeebooruExtractor):
+ """Extractor for images from behoimi.org based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org/post"
+ r"(?:/(?:index)?)?\?tags=(?P<tags>[^&#]+)")
+ test = ("http://behoimi.org/post?tags=himekawa_azuru+dress", {
+ "url": "ecb30c6aaaf8a6ff8f55255737a9840832a483c1",
+ "content": "11cbda40c287e026c1ce4ca430810f761f2d0b2a",
+ })
+
+
+class ThreedeebooruPoolExtractor(booru.PoolMixin,
+ ThreedeebooruExtractor):
+ """Extractor for image-pools from behoimi.org"""
+ pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P<pool>\d+)"
+ test = ("http://behoimi.org/pool/show/27", {
+ "url": "da75d2d1475449d5ef0c266cb612683b110a30f2",
+ "content": "fd5b37c5c6c2de4b4d6f1facffdefa1e28176554",
+ })
+
+
+class ThreedeebooruPostExtractor(booru.PostMixin,
+ ThreedeebooruExtractor):
+ """Extractor for single images from behoimi.org"""
+ pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P<post>\d+)"
+ test = ("http://behoimi.org/post/show/140852", {
+ "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
+ "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_character": "furude_rika",
+ "tags_copyright": "higurashi_no_naku_koro_ni",
+ "tags_model": "himekawa_azuru",
+ "tags_general": str,
+ },
+ })
+
+
+class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin,
+ ThreedeebooruExtractor):
+ """Extractor for popular images from behoimi.org"""
+ pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", {
+ "url": "c70268dce441a9ccc3383c244ec15edb059f494f",
+ "count": 20,
+ })
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "http://behoimi.org/post/popular_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py
new file mode 100644
index 0000000..e387b33
--- /dev/null
+++ b/gallery_dl/extractor/4chan.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images and videos from https://www.4chan.org/"""
+
+from . import chan
+from .. import text
+
+
+class FourchanThreadExtractor(chan.ChanThreadExtractor):
+ """Extractor for images from threads from 4chan.org"""
+ category = "4chan"
+ pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org"
+ r"/([^/]+)/thread/(\d+)")
+ test = (
+ ("https://boards.4chan.org/tg/thread/15396072/", {
+ "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
+ "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
+ "content": "20b7b51afa51c9c31a0020a0737b889532c8d7ec",
+ }),
+ ("https://boards.4channel.org/tg/thread/15396072/", {
+ "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
+ "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
+ }),
+ )
+ api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
+ file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
+
+ def update(self, post, data=None):
+ chan.ChanThreadExtractor.update(self, post, data)
+ post["filename"] = text.unescape(post["filename"])
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
new file mode 100644
index 0000000..00b8ab5
--- /dev/null
+++ b/gallery_dl/extractor/500px.py
@@ -0,0 +1,238 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://500px.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _500pxExtractor(Extractor):
+ """Base class for 500px extractors"""
+ category = "500px"
+ directory_fmt = ("{category}", "{user[username]}")
+ filename_fmt = "{id}_{name}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://500px.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.session.headers["Referer"] = self.root + "/"
+
+ def items(self):
+ first = True
+ data = self.metadata()
+ yield Message.Version, 1
+
+ for photo in self.photos():
+ url = photo["images"][-1]["url"]
+ fmt = photo["image_format"]
+ photo["extension"] = "jpg" if fmt == "jpeg" else fmt
+ if data:
+ photo.update(data)
+ if first:
+ first = False
+ yield Message.Directory, photo
+ yield Message.Url, url, photo
+
+ def metadata(self):
+ """Returns general metadata"""
+
+ def photos(self):
+ """Returns an iterable containing all relevant photo IDs"""
+
+ def _extend(self, photos):
+ """Extend photos with additional metadata and higher resolution URLs"""
+ url = "https://api.500px.com/v1/photos"
+ params = {
+ "expanded_user_info" : "true",
+ "include_tags" : "true",
+ "include_geo" : "true",
+ "include_equipment_info": "true",
+ "vendor_photos" : "true",
+ "include_licensing" : "true",
+ "include_releases" : "true",
+ "liked_by" : "1",
+ "following_sample" : "100",
+ "image_size" : "32768",
+ "ids" : ",".join(str(p["id"]) for p in photos),
+ }
+
+ data = self._api_call(url, params)["photos"]
+ for photo in photos:
+ pid = str(photo["id"])
+ photo.update(data[pid])
+ return photos
+
+ def _api_call(self, url, params, csrf_token=None):
+ headers = {"Origin": self.root, "X-CSRF-Token": csrf_token}
+ return self.request(url, headers=headers, params=params).json()
+
+ def _pagination(self, url, params, csrf):
+ params["page"] = 1
+ while True:
+ data = self._api_call(url, params, csrf)
+ yield from self._extend(data["photos"])
+
+ if params["page"] >= data["total_pages"]:
+ return
+ params["page"] += 1
+
+
+class _500pxUserExtractor(_500pxExtractor):
+ """Extractor for photos from a user's photostream on 500px.com"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?500px\.com"
+ r"/(?!photo/)([^/?&#]+)/?(?:$|\?|#)")
+ test = ("https://500px.com/light_expression_photography", {
+ "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D5000/v2",
+ "range": "1-99",
+ "count": 99,
+ })
+
+ def __init__(self, match):
+ _500pxExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def photos(self):
+ # get csrf token and user id from webpage
+ url = "{}/{}".format(self.root, self.user)
+ page = self.request(url).text
+ csrf_token, pos = text.extract(page, 'csrf-token" content="', '"')
+ user_id , pos = text.extract(page, '/user/', '"', pos)
+
+ # get user photos
+ url = "https://api.500px.com/v1/photos"
+ params = {
+ "feature" : "user",
+ "stream" : "photos",
+ "rpp" : "50",
+ "user_id" : user_id,
+ }
+ return self._pagination(url, params, csrf_token)
+
+
+class _500pxGalleryExtractor(_500pxExtractor):
+ """Extractor for photo galleries on 500px.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user[username]}", "{gallery[name]}")
+ pattern = (r"(?:https?://)?500px\.com"
+ r"/(?!photo/)([^/?&#]+)/galleries/([^/?&#]+)")
+ test = ("https://500px.com/fashvamp/galleries/lera", {
+ "url": "8a520272ece83278166b4f8556f9c9da43c43c45",
+ "count": 3,
+ "keyword": {
+ "gallery": dict,
+ "user": dict,
+ },
+ })
+
+ def __init__(self, match):
+ _500pxExtractor.__init__(self, match)
+ self.user_name, self.gallery_name = match.groups()
+ self.user_id = self.gallery_id = self.csrf_token = None
+
+ def metadata(self):
+ # get csrf token and user id from webpage
+ url = "{}/{}/galleries/{}".format(
+ self.root, self.user_name, self.gallery_name)
+ page = self.request(url).text
+ self.csrf_token, pos = text.extract(page, 'csrf-token" content="', '"')
+ self.user_id , pos = text.extract(page, 'App.CuratorId =', '\n', pos)
+ self.user_id = self.user_id.strip()
+
+ # get gallery metadata; transform gallery name into id
+ url = "https://api.500px.com/v1/users/{}/galleries/{}".format(
+ self.user_id, self.gallery_name)
+ params = {
+ # "include_user": "true",
+ "include_cover": "1",
+ "cover_size": "2048",
+ }
+ data = self._api_call(url, params, self.csrf_token)
+ self.gallery_id = data["gallery"]["id"]
+ return data
+
+ def photos(self):
+ url = "https://api.500px.com/v1/users/{}/galleries/{}/items".format(
+ self.user_id, self.gallery_id)
+ params = {
+ "sort" : "position",
+ "sort_direction" : "asc",
+ "rpp" : "50",
+ }
+ return self._pagination(url, params, self.csrf_token)
+
+
+class _500pxImageExtractor(_500pxExtractor):
+ """Extractor for individual images from 500px.com"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?500px\.com/photo/(\d+)"
+ test = ("https://500px.com/photo/222049255/queen-of-coasts", {
+ "url": "d1eda7afeaa589f71f05b9bb5c0694e3ffb357cd",
+ "count": 1,
+ "keyword": {
+ "camera": "Canon EOS 600D",
+ "camera_info": dict,
+ "collections_count": int,
+ "comments": list,
+ "comments_count": int,
+ "converted": False,
+ "converted_bits": int,
+ "created_at": "2017-08-01T04:40:05-04:00",
+ "crop_version": 0,
+ "description": str,
+ "editored_by": dict,
+ "editors_choice": False,
+ "extension": "jpg",
+ "favorites_count": int,
+ "feature": "popular",
+ "feature_date": "2017-08-01T09:58:28+00:00",
+ "focal_length": "208",
+ "height": 3111,
+ "id": 222049255,
+ "image_format": "jpeg",
+ "image_url": str,
+ "images": list,
+ "iso": "100",
+ "lens": "EF-S55-250mm f/4-5.6 IS II",
+ "lens_info": dict,
+ "license_type": 0,
+ "licensed_at": None,
+ "liked": False,
+ "location": None,
+ "location_details": dict,
+ "name": "Queen Of Coasts",
+ "nsfw": False,
+ "privacy": False,
+ "profile": True,
+ "rating": float,
+ "sales_count": int,
+ "status": 1,
+ "store_download": False,
+ "store_height": 3111,
+ "store_width": 4637,
+ "tags": list,
+ "taken_at": "2017-05-04T13:36:51-04:00",
+ "times_viewed": int,
+ "url": "/photo/222049255/queen-of-coasts-by-olesya-nabieva",
+ "user": dict,
+ "user_id": 12847235,
+ "votes_count": int,
+ "watermark": True,
+ "width": 4637,
+ },
+ })
+
+ def __init__(self, match):
+ _500pxExtractor.__init__(self, match)
+ self.photo_id = match.group(1)
+
+ def photos(self):
+ photos = ({"id": self.photo_id},)
+ return self._extend(photos)
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
new file mode 100644
index 0000000..e526da3
--- /dev/null
+++ b/gallery_dl/extractor/8chan.py
@@ -0,0 +1,29 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images and videos from https://8ch.net/"""
+
+from . import chan
+
+
+class InfinitychanThreadExtractor(chan.ChanThreadExtractor):
+ """Extractor for images from threads from 8ch.net"""
+ category = "8chan"
+ filename_fmt = "{time}-{filename}{ext}"
+ pattern = r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)"
+ test = ("https://8ch.net/builders/res/3.html", {
+ "url": "5d85c0509f907f217aea379f862b41bf3d01f645",
+ "keyword": "0c497190c0c0f826925fde09815351d01869c783",
+ })
+ api_url = "https://8ch.net/{board}/res/{thread}.json"
+ file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
+ file_url_v2 = "https://media.8ch.net/file_store/{tim}{ext}"
+
+ def build_url(self, post):
+ fmt = self.file_url if len(post["tim"]) < 64 else self.file_url_v2
+ return fmt.format_map(post)
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
new file mode 100644
index 0000000..6fbf6b5
--- /dev/null
+++ b/gallery_dl/extractor/8muses.py
@@ -0,0 +1,129 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.8muses.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class _8musesAlbumExtractor(Extractor):
+ """Extractor for image albums on www.8muses.com"""
+ category = "8muses"
+ subcategory = "album"
+ directory_fmt = ("{category}", "{album[path]}")
+ filename_fmt = "{page:>03}.{extension}"
+ archive_fmt = "{hash}"
+ root = "https://www.8muses.com"
+ pattern = (r"(?:https?://)?(?:www\.)?8muses\.com"
+ r"(/comics/album/[^?&#]+)(\?[^#]+)?")
+ test = (
+ ("https://www.8muses.com/comics/album/Fakku-Comics/santa/Im-Sorry", {
+ "url": "82449d6a26a29204695cba5d52c3ec60170bc159",
+ "keyword": {
+ "url" : str,
+ "hash" : str,
+ "page" : int,
+ "count": 16,
+ "album": {
+ "id" : 10457,
+ "title" : "Im Sorry",
+ "path" : "Fakku Comics/santa/Im Sorry",
+ "private": False,
+ "url" : str,
+ "parent" : 10454,
+ "views" : int,
+ "likes" : int,
+ "date" : "type:datetime",
+ },
+ },
+ }),
+ ("https://www.8muses.com/comics/album/Fakku-Comics/santa", {
+ "count": ">= 3",
+ "pattern": pattern,
+ "keyword": {
+ "url" : str,
+ "name" : str,
+ "private": False,
+ },
+ }),
+ ("https://www.8muses.com/comics/album/Fakku-Comics/6?sort=az", {
+ "count": ">= 70",
+ "keyword": {"name": r"re:^[S-Zs-z]"},
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+ self.params = match.group(2) or ""
+
+ def items(self):
+ url = self.root + self.path + self.params
+
+ while True:
+ data = self._unobfuscate(text.extract(
+ self.request(url).text,
+ 'id="ractive-public" type="text/plain">', '</script>')[0])
+
+ images = data.get("pictures")
+ if images:
+ count = len(images)
+ album = self._make_album(data["album"])
+ yield Message.Directory, {"album": album, "count": count}
+ for num, image in enumerate(images, 1):
+ url = self.root + "/image/fl/" + image["publicUri"]
+ img = {
+ "url" : url,
+ "page" : num,
+ "hash" : image["publicUri"],
+ "count" : count,
+ "album" : album,
+ "extension": "jpg",
+ }
+ yield Message.Url, url, img
+
+ albums = data.get("albums")
+ if albums:
+ for album in albums:
+ url = self.root + "/comics/album/" + album["permalink"]
+ album = {
+ "url" : url,
+ "name" : album["name"],
+ "private": album["isPrivate"],
+ }
+ yield Message.Queue, url, album
+
+ if data["page"] >= data["pages"]:
+ return
+ path, _, num = self.path.rstrip("/").rpartition("/")
+ path = path if num.isdecimal() else self.path
+ url = "{}{}/{}{}".format(
+ self.root, path, data["page"] + 1, self.params)
+
+ def _make_album(self, album):
+ return {
+ "id" : album["id"],
+ "path" : album["path"],
+ "title" : album["name"],
+ "private": album["isPrivate"],
+ "url" : self.root + album["permalink"],
+ "parent" : text.parse_int(album["parentId"]),
+ "views" : text.parse_int(album["numberViews"]),
+ "likes" : text.parse_int(album["numberLikes"]),
+ "date" : text.parse_datetime(
+ album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"),
+ }
+
+ @staticmethod
+ def _unobfuscate(data):
+ return json.loads("".join([
+ chr(33 + (ord(c) + 14) % 94) if c != " " else c
+ for c in text.unescape(data.strip("\t\n\r !"))
+ ]))
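
The character shuffle in _unobfuscate() is ROT47 over the 94 printable ASCII characters (spaces pass through): chr(33 + (ord(c) + 14) % 94) is the same as chr(33 + (ord(c) - 33 + 47) % 94). ROT47 is its own inverse, which is what makes the embedded JSON recoverable. A small hedged check:

    rot47 = lambda c: chr(33 + (ord(c) + 14) % 94) if c != " " else c
    assert rot47("{") == "L" and rot47("L") == "{"
    assert all(rot47(rot47(chr(n))) == chr(n) for n in range(33, 127))
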
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
new file mode 100644
index 0000000..81d480e
--- /dev/null
+++ b/gallery_dl/extractor/__init__.py
@@ -0,0 +1,189 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import re
+import importlib
+
+modules = [
+ "2chan",
+ "35photo",
+ "3dbooru",
+ "4chan",
+ "500px",
+ "8chan",
+ "8muses",
+ "artstation",
+ "behance",
+ "bobx",
+ "danbooru",
+ "deviantart",
+ "dynastyscans",
+ "e621",
+ "exhentai",
+ "fallenangels",
+ "flickr",
+ "gelbooru",
+ "gfycat",
+ "hbrowse",
+ "hentai2read",
+ "hentaicafe",
+ "hentaifoundry",
+ "hentaifox",
+ "hentaihere",
+ "hentainexus",
+ "hitomi",
+ "hypnohub",
+ "idolcomplex",
+ "imagebam",
+ "imagefap",
+ "imgbox",
+ "imgth",
+ "imgur",
+ "instagram",
+ "keenspot",
+ "khinsider",
+ "kissmanga",
+ "komikcast",
+ "konachan",
+ "livedoor",
+ "luscious",
+ "mangadex",
+ "mangafox",
+ "mangahere",
+ "mangapanda",
+ "mangapark",
+ "mangareader",
+ "mangastream",
+ "mangoxo",
+ "myportfolio",
+ "newgrounds",
+ "ngomik",
+ "nhentai",
+ "nijie",
+ "nsfwalbum",
+ "paheal",
+ "patreon",
+ "photobucket",
+ "piczel",
+ "pinterest",
+ "pixiv",
+ "pixnet",
+ "plurk",
+ "pornhub",
+ "pururin",
+ "reactor",
+ "readcomiconline",
+ "reddit",
+ "rule34",
+ "safebooru",
+ "sankaku",
+ "sankakucomplex",
+ "seiga",
+ "senmanga",
+ "sexcom",
+ "simplyhentai",
+ "slickpic",
+ "slideshare",
+ "smugmug",
+ "tsumino",
+ "tumblr",
+ "twitter",
+ "vanillarock",
+ "wallhaven",
+ "warosu",
+ "weibo",
+ "wikiart",
+ "xhamster",
+ "xvideos",
+ "yandere",
+ "yaplog",
+ "yuki",
+ "foolfuuka",
+ "foolslide",
+ "mastodon",
+ "shopify",
+ "imagehosts",
+ "directlink",
+ "recursive",
+ "oauth",
+ "test",
+]
+
+
+def find(url):
+ """Find a suitable extractor for the given URL"""
+ for cls in _list_classes():
+ match = cls.pattern.match(url)
+ if match and cls not in _blacklist:
+ return cls(match)
+ return None
+
+
+def add(cls):
+ """Add 'cls' to the list of available extractors"""
+ cls.pattern = re.compile(cls.pattern)
+ _cache.append(cls)
+ return cls
+
+
+def add_module(module):
+ """Add all extractors in 'module' to the list of available extractors"""
+ classes = _get_classes(module)
+ for cls in classes:
+ cls.pattern = re.compile(cls.pattern)
+ _cache.extend(classes)
+ return classes
+
+
+def extractors():
+ """Yield all available extractor classes"""
+ return sorted(
+ _list_classes(),
+ key=lambda x: x.__name__
+ )
+
+
+class blacklist():
+ """Context Manager to blacklist extractor modules"""
+ def __init__(self, categories, extractors=None):
+ self.extractors = extractors or []
+ for cls in _list_classes():
+ if cls.category in categories:
+ self.extractors.append(cls)
+
+ def __enter__(self):
+ _blacklist.update(self.extractors)
+
+ def __exit__(self, etype, value, traceback):
+ _blacklist.clear()
+
+
+# --------------------------------------------------------------------
+# internals
+
+_cache = []
+_blacklist = set()
+_module_iter = iter(modules)
+
+
+def _list_classes():
+ """Yield all available extractor classes"""
+ yield from _cache
+
+ for module_name in _module_iter:
+ module = importlib.import_module("."+module_name, __package__)
+ yield from add_module(module)
+
+
+def _get_classes(module):
+ """Return a list of all extractor classes in a module"""
+ return [
+ cls for cls in module.__dict__.values() if (
+ hasattr(cls, "pattern") and cls.__module__ == module.__name__
+ )
+ ]
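
A short usage sketch of the registry above (hypothetical call sites, not part of the patch): find() walks the lazily imported extractor classes and instantiates the first one whose compiled pattern matches, while the blacklist context manager temporarily hides whole categories from that search.

    from gallery_dl import extractor

    extr = extractor.find("https://boards.4chan.org/tg/thread/15396072/")
    print(extr.category)   # 4chan

    with extractor.blacklist(["4chan"]):
        extractor.find("https://boards.4chan.org/tg/thread/15396072/")
        # the matching class sits in _blacklist while the context is active
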
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
new file mode 100644
index 0000000..24197ad
--- /dev/null
+++ b/gallery_dl/extractor/artstation.py
@@ -0,0 +1,369 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.artstation.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+import random
+import string
+
+
+class ArtstationExtractor(Extractor):
+ """Base class for artstation extractors"""
+ category = "artstation"
+ filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}"
+ directory_fmt = ("{category}", "{userinfo[username]}")
+ archive_fmt = "{asset[id]}"
+ root = "https://www.artstation.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1) or match.group(2)
+ self.external = self.config("external", False)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for project in self.projects():
+ for asset in self.get_project_assets(project["hash_id"]):
+ asset.update(data)
+ adict = asset["asset"]
+
+ if adict["has_embedded_player"] and self.external:
+ player = adict["player_embedded"]
+ url = text.extract(player, 'src="', '"')[0]
+ if not url.startswith(self.root):
+ yield Message.Url, "ytdl:" + url, asset
+ continue
+
+ if adict["has_image"]:
+ url = adict["image_url"]
+ text.nameext_from_url(url, asset)
+ yield Message.Url, self._no_cache(url), asset
+
+ def metadata(self):
+ """Return general metadata"""
+ return {"userinfo": self.get_user_info(self.user)}
+
+ def projects(self):
+ """Return an iterable containing all relevant project IDs"""
+
+ def get_project_assets(self, project_id):
+ """Return all assets associated with 'project_id'"""
+ url = "{}/projects/{}.json".format(self.root, project_id)
+ data = self.request(url).json()
+
+ data["title"] = text.unescape(data["title"])
+ data["description"] = text.unescape(text.remove_html(
+ data["description"]))
+
+ assets = data["assets"]
+ del data["assets"]
+
+ if len(assets) == 1:
+ data["asset"] = assets[0]
+ yield data
+ else:
+ for asset in assets:
+ data["asset"] = asset
+ yield data.copy()
+
+ def get_user_info(self, username):
+ """Return metadata for a specific user"""
+ url = "{}/users/{}/quick.json".format(self.root, username.lower())
+ response = self.request(url, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("user")
+ return response.json()
+
+ def _pagination(self, url, params=None):
+ if not params:
+ params = {}
+ params["page"] = 1
+ total = 0
+
+ while True:
+ data = self.request(url, params=params).json()
+ yield from data["data"]
+
+ total += len(data["data"])
+ if total >= data["total_count"]:
+ return
+
+ params["page"] += 1
+
+ @staticmethod
+ def _no_cache(url, alphabet=(string.digits + string.ascii_letters)):
+ """Cause a cache miss to prevent Cloudflare 'optimizations'
+
+ Cloudflare's 'Polish' optimization strips image metadata and may even
+ recompress an image as lossy JPEG. This can be prevented by causing
+ a cache miss when requesting an image by adding a random dummy query
+ parameter.
+
+ Ref:
+ https://github.com/r888888888/danbooru/issues/3528
+ https://danbooru.donmai.us/forum_topics/14952
+ """
+ param = "gallerydl_no_cache=" + util.bencode(
+ random.getrandbits(64), alphabet)
+ sep = "&" if "?" in url else "?"
+ return url + sep + param
+
+
+class ArtstationUserExtractor(ArtstationExtractor):
+ """Extractor for all projects of an artstation user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
+ r"/(?!artwork|projects|search)([^/?&#]+)(?:/albums/all)?"
+ r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$")
+ test = (
+ ("https://www.artstation.com/gaerikim/", {
+ "pattern": r"https://\w+\.artstation\.com/p/assets"
+ r"/images/images/\d+/\d+/\d+/large/[^/]+",
+ "count": ">= 6",
+ }),
+ ("https://www.artstation.com/gaerikim/albums/all/"),
+ ("https://gaerikim.artstation.com/"),
+ ("https://gaerikim.artstation.com/projects/"),
+ )
+
+ def projects(self):
+ url = "{}/users/{}/projects.json".format(self.root, self.user)
+ return self._pagination(url)
+
+
+class ArtstationAlbumExtractor(ArtstationExtractor):
+ """Extractor for all projects in an artstation album"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{userinfo[username]}", "Albums",
+ "{album[id]} - {album[title]}")
+ archive_fmt = "a_{album[id]}_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
+ r"/(?!artwork|projects|search)([^/?&#]+)"
+ r"|((?!www)\w+)\.artstation\.com)/albums/(\d+)")
+ test = (
+ ("https://www.artstation.com/huimeiye/albums/770899", {
+ "count": 2,
+ }),
+ ("https://www.artstation.com/huimeiye/albums/770898", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://huimeiye.artstation.com/albums/770899"),
+ )
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.album_id = text.parse_int(match.group(3))
+
+ def metadata(self):
+ userinfo = self.get_user_info(self.user)
+ album = None
+
+ for album in userinfo["albums_with_community_projects"]:
+ if album["id"] == self.album_id:
+ break
+ else:
+ raise exception.NotFoundError("album")
+
+ return {
+ "userinfo": userinfo,
+ "album": album
+ }
+
+ def projects(self):
+ url = "{}/users/{}/projects.json".format(self.root, self.user)
+ params = {"album_id": self.album_id}
+ return self._pagination(url, params)
+
+
+class ArtstationLikesExtractor(ArtstationExtractor):
+ """Extractor for liked projects of an artstation user"""
+ subcategory = "likes"
+ directory_fmt = ("{category}", "{userinfo[username]}", "Likes")
+ archive_fmt = "f_{userinfo[id]}_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
+ r"/(?!artwork|projects|search)([^/?&#]+)/likes/?")
+ test = (
+ ("https://www.artstation.com/mikf/likes", {
+ "pattern": r"https://\w+\.artstation\.com/p/assets"
+ r"/images/images/\d+/\d+/\d+/large/[^/]+",
+ "count": 6,
+ }),
+ # no likes
+ ("https://www.artstation.com/sungchoi/likes", {
+ "count": 0,
+ }),
+ )
+
+ def projects(self):
+ url = "{}/users/{}/likes.json".format(self.root, self.user)
+ return self._pagination(url)
+
+
+class ArtstationChallengeExtractor(ArtstationExtractor):
+ """Extractor for submissions of artstation challenges"""
+ subcategory = "challenge"
+ filename_fmt = "{submission_id}_{asset_id}_{filename}.{extension}"
+ directory_fmt = ("{category}", "Challenges",
+ "{challenge[id]} - {challenge[title]}")
+ archive_fmt = "c_{challenge[id]}_{asset_id}"
+ pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
+ r"/contests/[^/?&#]+/challenges/(\d+)"
+ r"/?(?:\?sorting=([a-z]+))?")
+ test = (
+ ("https://www.artstation.com/contests/thu-2017/challenges/20"),
+ (("https://www.artstation.com/contests/beyond-human"
+ "/challenges/23?sorting=winners"), {
+ "range": "1-30",
+ "count": 30,
+ }),
+ )
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.challenge_id = match.group(1)
+ self.sorting = match.group(2) or "popular"
+
+ def items(self):
+ challenge_url = "{}/contests/_/challenges/{}.json".format(
+ self.root, self.challenge_id)
+ submission_url = "{}/contests/_/challenges/{}/submissions.json".format(
+ self.root, self.challenge_id)
+ update_url = "{}/contests/submission_updates.json".format(
+ self.root)
+
+ challenge = self.request(challenge_url).json()
+ yield Message.Version, 1
+ yield Message.Directory, {"challenge": challenge}
+
+ params = {"sorting": self.sorting}
+ for submission in self._pagination(submission_url, params):
+
+ params = {"submission_id": submission["id"]}
+ for update in self._pagination(update_url, params=params):
+
+ del update["replies"]
+ update["challenge"] = challenge
+ for url in text.extract_iter(
+ update["body_presentation_html"], ' href="', '"'):
+ update["asset_id"] = self._id_from_url(url)
+ text.nameext_from_url(url, update)
+ yield Message.Url, self._no_cache(url), update
+
+ @staticmethod
+ def _id_from_url(url):
+ """Get an image's submission ID from its URL"""
+ parts = url.split("/")
+ return text.parse_int("".join(parts[7:10]))
+
+
+class ArtstationSearchExtractor(ArtstationExtractor):
+ """Extractor for artstation search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Searches", "{search[searchterm]}")
+ archive_fmt = "s_{search[searchterm]}_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
+ r"/search/?\?([^#]+)")
+ test = ("https://www.artstation.com/search?sorting=recent&q=ancient",)
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ query = text.parse_query(match.group(1))
+ self.searchterm = query.get("q", "")
+ self.order = query.get("sorting", "recent").lower()
+
+ def metadata(self):
+ return {"search": {
+ "searchterm": self.searchterm,
+ "order": self.order,
+ }}
+
+ def projects(self):
+ order = "likes_count" if self.order == "likes" else "published_at"
+ url = "{}/search/projects.json".format(self.root)
+ params = {
+ "direction": "desc",
+ "order": order,
+ "q": self.searchterm,
+ # "show_pro_first": "true",
+ }
+ return self._pagination(url, params)
+
+
+class ArtstationArtworkExtractor(ArtstationExtractor):
+ """Extractor for projects on artstation's artwork page"""
+ subcategory = "artwork"
+ directory_fmt = ("{category}", "Artworks", "{artwork[sorting]!c}")
+ archive_fmt = "A_{asset[id]}"
+ pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com"
+ r"/artwork/?\?([^#]+)")
+ test = ("https://www.artstation.com/artwork?sorting=latest",)
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.query = text.parse_query(match.group(1))
+
+ def metadata(self):
+ return {"artwork": self.query}
+
+ def projects(self):
+ url = "{}/projects.json".format(self.root)
+ params = self.query.copy()
+ params["page"] = 1
+ return self._pagination(url, params)
+
+
+class ArtstationImageExtractor(ArtstationExtractor):
+ """Extractor for images from a single artstation project"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:\w+\.)?artstation\.com/(?:artwork|projects|search)"
+ r"|artstn\.co/p)/(\w+)")
+ test = (
+ ("https://www.artstation.com/artwork/LQVJr", {
+ "pattern": r"https?://\w+\.artstation\.com/p/assets"
+ r"/images/images/008/760/279/large/.+",
+ "content": "1f645ce7634e44675ebde8f6b634d36db0617d3c",
+ # SHA1 hash without _no_cache()
+ # "content": "2e8aaf6400aeff2345274f45e90b6ed3f2a0d946",
+ }),
+ # multiple images per project
+ ("https://www.artstation.com/artwork/Db3dy", {
+ "count": 4,
+ }),
+ # embedded youtube video
+ ("https://www.artstation.com/artwork/g4WPK", {
+ "range": "2",
+ "options": (("external", True),),
+ "pattern": "ytdl:https://www.youtube.com/embed/JNFfJtwwrU0",
+ }),
+ # alternate URL patterns
+ ("https://sungchoi.artstation.com/projects/LQVJr"),
+ ("https://artstn.co/p/LQVJr"),
+ )
+
+ def __init__(self, match):
+ ArtstationExtractor.__init__(self, match)
+ self.project_id = match.group(1)
+ self.assets = None
+
+ def metadata(self):
+ self.assets = list(ArtstationExtractor.get_project_assets(
+ self, self.project_id))
+ self.user = self.assets[0]["user"]["username"]
+ return ArtstationExtractor.metadata(self)
+
+ def projects(self):
+ return ({"hash_id": self.project_id},)
+
+ def get_project_assets(self, project_id):
+ return self.assets
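
_no_cache() above defeats Cloudflare's image recompression by making every asset request unique. A hedged sketch with a hypothetical URL and token (util.bencode() normally derives the token from 64 random bits):

    url = "https://example.com/assets/image.jpg"
    token = "3fQb9XkZ1aR"   # stand-in for util.bencode(random.getrandbits(64), alphabet)
    sep = "&" if "?" in url else "?"
    print(url + sep + "gallerydl_no_cache=" + token)
    # https://example.com/assets/image.jpg?gallerydl_no_cache=3fQb9XkZ1aR
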
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
new file mode 100644
index 0000000..111d560
--- /dev/null
+++ b/gallery_dl/extractor/behance.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.behance.net/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class BehanceExtractor(Extractor):
+ """Base class for behance extractors"""
+ category = "behance"
+ root = "https://www.behance.net"
+
+ def items(self):
+ yield Message.Version, 1
+ for gallery in self.galleries():
+ gallery["_extractor"] = BehanceGalleryExtractor
+ yield Message.Queue, gallery["url"], self._update(gallery)
+
+ def galleries(self):
+ """Return all relevant gallery URLs"""
+
+ @staticmethod
+ def _update(data):
+ # compress data to simple lists
+ data["fields"] = [field["name"] for field in data["fields"]]
+ data["owners"] = [owner["display_name"] for owner in data["owners"]]
+ if "tags" in data:
+ data["tags"] = [tag["title"] for tag in data["tags"]]
+
+ # backwards compatibility
+ data["gallery_id"] = data["id"]
+ data["title"] = data["name"]
+ data["user"] = ", ".join(data["owners"])
+
+ return data
+
+
+class BehanceGalleryExtractor(BehanceExtractor):
+ """Extractor for image galleries from www.behance.net"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{owners:J, }", "{id} {name}")
+ filename_fmt = "{category}_{id}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ pattern = r"(?:https?://)?(?:www\.)?behance\.net/gallery/(\d+)"
+ test = (
+ ("https://www.behance.net/gallery/17386197/A-Short-Story", {
+ "count": 2,
+ "url": "ab79bd3bef8d3ae48e6ac74fd995c1dfaec1b7d2",
+ "keyword": {
+ "id": 17386197,
+ "name": 're:"Hi". A short story about the important things ',
+ "owners": ["Place Studio", "Julio César Velazquez"],
+ "fields": ["Animation", "Character Design", "Directing"],
+ "tags": list,
+ "module": dict,
+ },
+ }),
+ ("https://www.behance.net/gallery/21324767/Nevada-City", {
+ "count": 6,
+ "url": "0258fe194fe7d828d6f2c7f6086a9a0a4140db1d",
+ "keyword": {"owners": ["Alex Strohl"]},
+ }),
+ )
+
+ def __init__(self, match):
+ BehanceExtractor.__init__(self, match)
+ self.gallery_id = match.group(1)
+
+ def items(self):
+ data = self.get_gallery_data()
+ imgs = self.get_images(data)
+ data["count"] = len(imgs)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], (url, module) in enumerate(imgs, 1):
+ data["module"] = module
+ data["extension"] = text.ext_from_url(url)
+ yield Message.Url, url, data
+
+ def get_gallery_data(self):
+ """Collect gallery info dict"""
+ url = "{}/gallery/{}/a".format(self.root, self.gallery_id)
+ cookies = {
+ "_evidon_consent_cookie":
+ '{"consent_date":"2019-01-31T09:41:15.132Z"}',
+ "bcp": "815b5eee-8bdf-4898-ac79-33c2bcc0ed19",
+ "gk_suid": "66981391",
+ "gki": '{"feature_project_view":false,'
+ '"feature_discover_login_prompt":false,'
+ '"feature_project_login_prompt":false}',
+ "ilo0": "true",
+ }
+ page = self.request(url, cookies=cookies).text
+
+ data = json.loads(text.extract(
+ page, 'id="beconfig-store_state">', '</script>')[0])
+ return self._update(data["project"]["project"])
+
+ @staticmethod
+ def get_images(data):
+ """Extract image results from an API response"""
+ results = []
+
+ for module in data["modules"]:
+
+ if module["type"] == "image":
+ url = module["sizes"]["original"]
+ results.append((url, module))
+
+ elif module["type"] == "embed":
+ embed = module.get("original_embed") or module.get("embed")
+ url = "ytdl:" + text.extract(embed, 'src="', '"')[0]
+ results.append((url, module))
+
+ return results
+
+
+class BehanceUserExtractor(BehanceExtractor):
+ """Extractor for a user's galleries from www.behance.net"""
+ subcategory = "user"
+ categorytransfer = True
+ pattern = r"(?:https?://)?(?:www\.)?behance\.net/([^/?&#]+)/?$"
+ test = ("https://www.behance.net/alexstrohl", {
+ "count": ">= 8",
+ "pattern": BehanceGalleryExtractor.pattern,
+ })
+
+ def __init__(self, match):
+ BehanceExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def galleries(self):
+ url = "{}/{}/projects".format(self.root, self.user)
+ headers = {"X-Requested-With": "XMLHttpRequest"}
+ params = {"offset": 0}
+
+ while True:
+ data = self.request(url, headers=headers, params=params).json()
+ work = data["profile"]["activeSection"]["work"]
+ yield from work["projects"]
+ if not work["hasMore"]:
+ return
+ params["offset"] += len(work["projects"])
+
+
+class BehanceCollectionExtractor(BehanceExtractor):
+ """Extractor for a collection's galleries from www.behance.net"""
+ subcategory = "collection"
+ categorytransfer = True
+ pattern = r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)"
+ test = ("https://www.behance.net/collection/170615607/Sky", {
+ "count": ">= 13",
+ "pattern": BehanceGalleryExtractor.pattern,
+ })
+
+ def __init__(self, match):
+ BehanceExtractor.__init__(self, match)
+ self.collection_id = match.group(1)
+
+ def galleries(self):
+ url = "{}/collection/{}/a".format(self.root, self.collection_id)
+ headers = {"X-Requested-With": "XMLHttpRequest"}
+ params = {}
+
+ while True:
+ data = self.request(url, headers=headers, params=params).json()
+ yield from data["output"]
+ if not data.get("offset"):
+ return
+ params["offset"] = data["offset"]
diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py
new file mode 100644
index 0000000..67427a7
--- /dev/null
+++ b/gallery_dl/extractor/bobx.py
@@ -0,0 +1,112 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://www.bobx.com/dark/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class BobxExtractor(Extractor):
+ """Base class for bobx extractors"""
+ category = "bobx"
+ root = "http://www.bobx.com"
+ per_page = 80
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+
+class BobxGalleryExtractor(BobxExtractor):
+ """Extractor for individual image galleries on bobx.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{model}", "{title}")
+ filename_fmt = "{model}_{image_id}_{num:>03}.{extension}"
+ archive_fmt = "{image_id}"
+ pattern = (r"(?:https?://)?(?:www\.)?bobx\.com"
+ r"/([^/]+/[^/]+/photoset/[\w-]+)-\d+-\d+-\d+\.html")
+ test = (
+ (("http://www.bobx.com/idol/mikoto-hibi"
+ "/photoset/wpb-2018-_11-0-2-8.html"), {
+ "url": "93972d6a661f6627e963d62c9d15531e6b36a389",
+ "keyword": "6c620862db494ed05e69356ba30e604b167b0670",
+ "content": "3f176b7fe752524cec21a763aa55567e41181e07",
+ }),
+ (("http://www.bobx.com/idol/nashiko-momotsuki"
+ "/photoset/wpb-net-_221---2018-08---magic-of-summer-0-10-10.html"), {
+ "url": "f5d6c0cd0881ae6f504c21a90d86e3464dc54e8e",
+ "keyword": "f4819c75f494044348889ecd27771508464c0f5f",
+ }),
+ )
+
+ def items(self):
+ num = 0
+ while True:
+ url = "{}/{}-{}-10-8.html".format(self.root, self.path, num)
+ page = self.request(url, encoding="utf-8").text
+
+ if num == 0:
+ data = self.metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ data["num"] = 0
+
+ for url in self.images(page):
+ url = text.urljoin(self.root, url.replace("-preview-", "-"))
+ data = text.nameext_from_url(url, data)
+ data["image_id"] = text.parse_int(
+ data["filename"].rpartition("-")[2])
+ data["num"] += 1
+ yield Message.Url, url, data
+
+ num += self.per_page
+ if num >= data["count"]:
+ return
+
+ @staticmethod
+ def metadata(page):
+ """Collect metadata for extractor-job"""
+ info = text.extract(page, "<title>", "</title>")[0]
+ model, _, info = info.partition(" in ")
+ info, _, count = info.rpartition(" of ")
+ title = info.rpartition(" - @")[0]
+ return {
+ "title": text.unquote(title),
+ "model": text.unquote(model),
+ "count": text.parse_int(count),
+ }
+
+ @staticmethod
+ def images(page):
+ """Extract all image-urls"""
+ page = text.extract(page, "<table CELLPADDING=", "<script ")[0]
+ return text.extract_iter(page, '<img src="/thumbnail', '"')
+
+
+class BobxIdolExtractor(BobxExtractor):
+ """Extractor for an idol's image galleries on bobx.com"""
+ subcategory = "idol"
+ pattern = r"(?:https?://)?(?:www\.)?bobx\.com/([^/]+/[^/?&#]+)/?$"
+ test = ("http://www.bobx.com/idol/rin-okabe/", {
+ "url": "74d80bfcd53b738b31909bb42e5cc97c41b475b8",
+ })
+
+ def items(self):
+ url = "{}/{}/".format(self.root, self.path)
+ data = {"_extractor": BobxGalleryExtractor}
+ page = self.request(url).text
+ skip = True
+
+ yield Message.Version, 1
+ for part in text.extract_iter(page, '="photoset/', '"'):
+ # skip every other entry
+ skip = not skip
+ if skip:
+ continue
+ yield Message.Queue, "{}photoset/{}".format(url, part), data
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
new file mode 100644
index 0000000..c63085a
--- /dev/null
+++ b/gallery_dl/extractor/booru.py
@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Base classes for extractors for danbooru and co"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text, exception
+from xml.etree import ElementTree
+import collections
+import datetime
+import operator
+import re
+
+
+class BooruExtractor(SharedConfigMixin, Extractor):
+ """Base class for all booru extractors"""
+ basecategory = "booru"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
+ api_url = ""
+ post_url = ""
+ per_page = 50
+ page_start = 1
+ page_limit = None
+ sort = False
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params = {}
+ self.extags = self.post_url and self.config("tags", False)
+
+ def skip(self, num):
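+ # convert the number of posts to skip into whole pages,
+ # clamped so that 'page_limit' is never exceeded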
+ pages = num // self.per_page
+ if self.page_limit and pages + self.page_start > self.page_limit:
+ pages = self.page_limit - self.page_start
+ self.page_start += pages
+ return pages * self.per_page
+
+ def items(self):
+ data = self.get_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ self.reset_page()
+ while True:
+ images = self.parse_response(
+ self.request(self.api_url, params=self.params))
+
+ for image in images:
+ try:
+ url = image["file_url"]
+ except KeyError:
+ continue
+ if url.startswith("/"):
+ url = text.urljoin(self.api_url, url)
+ image.update(data)
+ if self.extags:
+ self.extended_tags(image)
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
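+ # a page with fewer posts than 'per_page' is the last one;
+ # otherwise advance to the next page based on the last post seen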
+ if len(images) < self.per_page:
+ return
+ self.update_page(image)
+
+ def reset_page(self):
+ """Initialize params to point to the first page"""
+ self.params["page"] = self.page_start
+
+ def update_page(self, data):
+ """Update params to point to the next page"""
+
+ def parse_response(self, response):
+ """Parse JSON API response"""
+ images = response.json()
+ if self.sort:
+ images.sort(key=operator.itemgetter("score", "id"),
+ reverse=True)
+ return images
+
+ def get_metadata(self):
+ """Collect metadata for extractor-job"""
+ return {}
+
+ def extended_tags(self, image, page=None):
+ """Retrieve extended tag information"""
+ if not page:
+ url = self.post_url.format(image["id"])
+ page = self.request(url).text
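+ # group tag names by their type (e.g. artist, character, copyright)
+ # as listed in the post page's tag sidebar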
+ tags = collections.defaultdict(list)
+ tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+ pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S)
+ for tag_type, tag_name in pattern.findall(tags_html or ""):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ image["tags_" + key] = " ".join(value)
+
+
+class XmlParserMixin():
+ """Mixin for XML based API responses"""
+ def parse_response(self, response):
+ root = ElementTree.fromstring(response.text)
+ return [post.attrib for post in root]
+
+
+class DanbooruPageMixin():
+ """Pagination for Danbooru v2"""
+ def update_page(self, data):
+ self.params["page"] = "b{}".format(data["id"])
+
+
+class MoebooruPageMixin():
+ """Pagination for Moebooru and Danbooru v1"""
+ def update_page(self, data):
+ if self.page_limit:
+ self.params["page"] = None
+ self.params["before_id"] = data["id"]
+ else:
+ self.params["page"] += 1
+
+
+class GelbooruPageMixin():
+ """Pagination for Gelbooru-like sites"""
+ page_start = 0
+
+ def reset_page(self):
+ self.params["pid"] = self.page_start
+
+ def update_page(self, data):
+ self.params["pid"] += 1
+
+
+class TagMixin():
+ """Extraction of images based on search-tags"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.tags = text.unquote(match.group("tags").replace("+", " "))
+ self.params["tags"] = self.tags
+ self.params["limit"] = self.per_page
+
+ def get_metadata(self):
+ return {"search_tags": self.tags}
+
+
+class PoolMixin():
+ """Extraction of image-pools"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool}")
+ archive_fmt = "p_{pool}_{id}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.pool = match.group("pool")
+ self.params["tags"] = "pool:" + self.pool
+ self.params["limit"] = self.per_page
+
+ def get_metadata(self):
+ return {"pool": text.parse_int(self.pool)}
+
+
+class GelbooruPoolMixin(PoolMixin):
+ """Image-pool extraction for Gelbooru-like sites"""
+ per_page = 1
+
+ def get_metadata(self):
+ page = self.request(self.pool_url.format(self.pool)).text
+ name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
+ if not name:
+ name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
+ if not name:
+ raise exception.NotFoundError("pool")
+ self.posts = list(text.extract_iter(page, 'id="p', '"', pos))
+
+ return {
+ "pool": text.parse_int(self.pool),
+ "pool_name": text.unescape(name),
+ "count": len(self.posts),
+ }
+
+ def reset_page(self):
+ self.index = self.page_start
+ self.update_page(None)
+
+ def update_page(self, data):
+ try:
+ post = self.posts[self.index]
+ self.index += 1
+ except IndexError:
+ post = "0"
+ self.params["tags"] = "id:" + post
+
+
+class PostMixin():
+ """Extraction of a single image-post"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.post = match.group("post")
+ self.params["tags"] = "id:" + self.post
+
+
+class PopularMixin():
+ """Extraction and metadata handling for Danbooru v2"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "popular", "{scale}", "{date}")
+ archive_fmt = "P_{scale[0]}_{date}_{id}"
+ page_start = None
+ sort = True
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params.update(text.parse_query(match.group("query")))
+
+ def get_metadata(self, fmt="%Y-%m-%d"):
+ date = self.get_date() or datetime.datetime.utcnow().strftime(fmt)
+ scale = self.get_scale() or "day"
+
+ if scale == "week":
+ dt = datetime.datetime.strptime(date, fmt)
+ dt -= datetime.timedelta(days=dt.weekday())
+ date = dt.strftime(fmt)
+ elif scale == "month":
+ date = date[:-3]
+
+ return {"date": date, "scale": scale}
+
+ def get_scale(self):
+ if "scale" in self.params:
+ return self.params["scale"]
+ return None
+
+ def get_date(self):
+ if "date" in self.params:
+ return self.params["date"][:10]
+ return None
+
+
+class MoebooruPopularMixin(PopularMixin):
+ """Extraction and metadata handling for Moebooru and Danbooru v1"""
+ def __init__(self, match):
+ super().__init__(match)
+ self.scale = match.group("scale")
+
+ def get_date(self):
+ if "year" in self.params:
+ return "{:>04}-{:>02}-{:>02}".format(
+ self.params["year"],
+ self.params.get("month", "01"),
+ self.params.get("day", "01"))
+ return None
+
+ def get_scale(self):
+ if self.scale and self.scale.startswith("by_"):
+ return self.scale[3:]
+ return self.scale
diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py
new file mode 100644
index 0000000..5e44fd9
--- /dev/null
+++ b/gallery_dl/extractor/chan.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Base classes for extractors for different Futaba Channel-like boards"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class ChanThreadExtractor(Extractor):
+ """Base class for extractors for Futaba Channel-like boards"""
+ category = "chan"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} - {title}")
+ filename_fmt = "{tim}-{filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ api_url = ""
+ file_url = ""
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.metadata = {
+ "board": match.group(1),
+ "thread": match.group(2),
+ }
+
+ def items(self):
+ yield Message.Version, 1
+ url = self.api_url.format_map(self.metadata)
+ posts = self.request(url).json()["posts"]
+ self.metadata["title"] = self.get_thread_title(posts[0])
+ yield Message.Directory, self.metadata
+ for post in posts:
+ if "filename" not in post:
+ continue
+ self.update(post)
+ yield Message.Url, self.build_url(post), post
+ if "extra_files" in post:
+ for file in post["extra_files"]:
+ self.update(post, file)
+ yield Message.Url, self.build_url(post), post
+
+ def update(self, post, data=None):
+ """Update keyword dictionary"""
+ post.update(data or self.metadata)
+ post["extension"] = post["ext"][1:]
+
+ def build_url(self, post):
+ """Construct an image url out of a post object"""
+ return self.file_url.format_map(post)
+
+ @staticmethod
+ def get_thread_title(post):
+ """Return thread title from first post"""
+ title = post["sub"] if "sub" in post else text.remove_html(post["com"])
+ return text.unescape(title)[:50]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
new file mode 100644
index 0000000..175af63
--- /dev/null
+++ b/gallery_dl/extractor/common.py
@@ -0,0 +1,432 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by extractor modules."""
+
+import re
+import time
+import netrc
+import queue
+import logging
+import requests
+import threading
+import http.cookiejar
+from .message import Message
+from .. import config, text, exception, cloudflare
+
+
+class Extractor():
+
+ category = ""
+ subcategory = ""
+ categorytransfer = False
+ directory_fmt = ("{category}",)
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = ""
+ cookiedomain = ""
+ root = ""
+ test = None
+
+ def __init__(self, match):
+ self.session = requests.Session()
+ self.log = logging.getLogger(self.category)
+ self.url = match.string
+ self._init_headers()
+ self._init_cookies()
+ self._init_proxies()
+ self._retries = self.config("retries", 5)
+ self._timeout = self.config("timeout", 30)
+ self._verify = self.config("verify", True)
+
+ @classmethod
+ def from_url(cls, url):
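+ # compile the class-level pattern on first use, then match the given URL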
+ if isinstance(cls.pattern, str):
+ cls.pattern = re.compile(cls.pattern)
+ match = cls.pattern.match(url)
+ return cls(match) if match else None
+
+ def __iter__(self):
+ return self.items()
+
+ def items(self):
+ yield Message.Version, 1
+
+ def skip(self, num):
+ return 0
+
+ def config(self, key, default=None):
+ return config.interpolate(
+ ("extractor", self.category, self.subcategory, key), default)
+
+ def request(self, url, method="GET", *, session=None,
+ encoding=None, expect=(), retries=None, **kwargs):
+ tries = 0
+ retries = retries or self._retries
+ session = session or self.session
+ kwargs.setdefault("timeout", self._timeout)
+ kwargs.setdefault("verify", self._verify)
+
+ while True:
+ try:
+ response = session.request(method, url, **kwargs)
+ except (requests.exceptions.ConnectionError,
+ requests.exceptions.Timeout,
+ requests.exceptions.ChunkedEncodingError,
+ requests.exceptions.ContentDecodingError) as exc:
+ msg = exc
+ except (requests.exceptions.RequestException) as exc:
+ raise exception.HttpError(exc)
+ else:
+ code = response.status_code
+ if 200 <= code < 400 or code in expect:
+ if encoding:
+ response.encoding = encoding
+ return response
+ if cloudflare.is_challenge(response):
+ self.log.info("Solving Cloudflare challenge")
+ url, domain, cookies = cloudflare.solve_challenge(
+ session, response, kwargs)
+ cloudflare.cookies.update(self.category, (domain, cookies))
+ continue
+
+ msg = "{}: {} for url: {}".format(code, response.reason, url)
+ if code < 500 and code != 429:
+ break
+
+ tries += 1
+ self.log.debug("%s (%d/%d)", msg, tries, retries)
+ if tries >= retries:
+ break
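+ # exponential backoff before the next attempt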
+ time.sleep(2 ** tries)
+
+ raise exception.HttpError(msg)
+
+ def _get_auth_info(self):
+ """Return authentication information as (username, password) tuple"""
+ username = self.config("username")
+ password = None
+
+ if username:
+ password = self.config("password")
+ elif self.config("netrc", False):
+ try:
+ info = netrc.netrc().authenticators(self.category)
+ username, _, password = info
+ except (OSError, netrc.NetrcParseError) as exc:
+ self.log.error("netrc: %s", exc)
+ except TypeError:
+ self.log.warning("netrc: No authentication info")
+
+ return username, password
+
+ def _init_headers(self):
+ """Set additional headers for the 'session' object"""
+ headers = self.session.headers
+ headers.clear()
+
+ headers["User-Agent"] = self.config(
+ "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) "
+ "Gecko/20100101 Firefox/62.0"))
+ headers["Accept"] = "*/*"
+ headers["Accept-Language"] = "en-US,en;q=0.5"
+ headers["Accept-Encoding"] = "gzip, deflate"
+ headers["Connection"] = "keep-alive"
+ headers["Upgrade-Insecure-Requests"] = "1"
+
+ def _init_proxies(self):
+ """Update the session's proxy map"""
+ proxies = self.config("proxy")
+ if proxies:
+ if isinstance(proxies, str):
+ proxies = {"http": proxies, "https": proxies}
+ if isinstance(proxies, dict):
+ for scheme, proxy in proxies.items():
+ if "://" not in proxy:
+ proxies[scheme] = "http://" + proxy.lstrip("/")
+ self.session.proxies = proxies
+ else:
+ self.log.warning("invalid proxy specifier: %s", proxies)
+
+ def _init_cookies(self):
+ """Populate the session's cookiejar"""
+ cookies = self.config("cookies")
+ if cookies:
+ if isinstance(cookies, dict):
+ self._update_cookies_dict(cookies, self.cookiedomain)
+ else:
+ cookiejar = http.cookiejar.MozillaCookieJar()
+ try:
+ cookiejar.load(cookies)
+ except OSError as exc:
+ self.log.warning("cookies: %s", exc)
+ else:
+ self.session.cookies.update(cookiejar)
+
+ cookies = cloudflare.cookies(self.category)
+ if cookies:
+ domain, cookies = cookies
+ self._update_cookies_dict(cookies, domain)
+
+ def _update_cookies(self, cookies, *, domain=""):
+ """Update the session's cookiejar with 'cookies'"""
+ if isinstance(cookies, dict):
+ self._update_cookies_dict(cookies, domain or self.cookiedomain)
+ else:
+ setcookie = self.session.cookies.set_cookie
+ try:
+ cookies = iter(cookies)
+ except TypeError:
+ setcookie(cookies)
+ else:
+ for cookie in cookies:
+ setcookie(cookie)
+
+ def _update_cookies_dict(self, cookiedict, domain):
+ """Update cookiejar with name-value pairs from a dict"""
+ setcookie = self.session.cookies.set
+ for name, value in cookiedict.items():
+ setcookie(name, value, domain=domain)
+
+ def _check_cookies(self, cookienames, *, domain=""):
+ """Check if all 'cookienames' are in the session's cookiejar"""
+ if not domain:
+ domain = self.cookiedomain
+ try:
+ for name in cookienames:
+ self.session.cookies._find(name, domain)
+ except KeyError:
+ return False
+ return True
+
+ @classmethod
+ def _get_tests(cls):
+ """Yield an extractor's test cases as (URL, RESULTS) tuples"""
+ tests = cls.test
+ if not tests:
+ return
+
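+ # a single (URL, RESULTS) pair gets wrapped so it iterates like a tuple of tests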
+ if len(tests) == 2 and (not tests[1] or isinstance(tests[1], dict)):
+ tests = (tests,)
+
+ for test in tests:
+ if isinstance(test, str):
+ test = (test, None)
+ yield test
+
+
+class ChapterExtractor(Extractor):
+
+ subcategory = "chapter"
+ directory_fmt = (
+ "{category}", "{manga}",
+ "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}")
+ filename_fmt = (
+ "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
+ archive_fmt = (
+ "{manga}_{chapter}{chapter_minor}_{page}")
+
+ def __init__(self, match, url=None):
+ Extractor.__init__(self, match)
+ self.chapter_url = url or self.root + match.group(1)
+
+ def items(self):
+ self.login()
+ page = self.request(self.chapter_url).text
+ data = self.metadata(page)
+ imgs = self.images(page)
+
+ if "count" in data:
+ images = zip(
+ range(1, data["count"]+1),
+ imgs,
+ )
+ else:
+ try:
+ data["count"] = len(imgs)
+ except TypeError:
+ pass
+ images = enumerate(imgs, 1)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"], (url, imgdata) in images:
+ if imgdata:
+ data.update(imgdata)
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def login(self):
+ """Login and set necessary cookies"""
+
+ def metadata(self, page):
+ """Return a dict with general metadata"""
+
+ def images(self, page):
+ """Return a list of all (image-url, metadata)-tuples"""
+
+
+class MangaExtractor(Extractor):
+
+ subcategory = "manga"
+ categorytransfer = True
+ chapterclass = None
+ reverse = True
+
+ def __init__(self, match, url=None):
+ Extractor.__init__(self, match)
+ self.manga_url = url or self.root + match.group(1)
+
+ if self.config("chapter-reverse", False):
+ self.reverse = not self.reverse
+
+ def items(self):
+ self.login()
+ page = self.request(self.manga_url).text
+
+ chapters = self.chapters(page)
+ if self.reverse:
+ chapters.reverse()
+
+ yield Message.Version, 1
+ for chapter, data in chapters:
+ data["_extractor"] = self.chapterclass
+ yield Message.Queue, chapter, data
+
+ def login(self):
+ """Login and set necessary cookies"""
+
+ def chapters(self, page):
+ """Return a list of all (chapter-url, metadata)-tuples"""
+
+
+class GalleryExtractor(ChapterExtractor):
+
+ subcategory = "gallery"
+ filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ archive_fmt = "{gallery_id}_{page}"
+
+
+class AsynchronousMixin():
+ """Run info extraction in a separate thread"""
+
+ def __iter__(self):
+ messages = queue.Queue(5)
+ thread = threading.Thread(
+ target=self.async_items,
+ args=(messages,),
+ daemon=True,
+ )
+
+ thread.start()
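+ # consume messages until the worker thread signals completion with None;
+ # exceptions raised inside the worker are re-raised here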
+ while True:
+ msg = messages.get()
+ if msg is None:
+ thread.join()
+ return
+ if isinstance(msg, Exception):
+ thread.join()
+ raise msg
+ yield msg
+ messages.task_done()
+
+ def async_items(self, messages):
+ try:
+ for msg in self.items():
+ messages.put(msg)
+ except Exception as exc:
+ messages.put(exc)
+ messages.put(None)
+
+
+class SharedConfigMixin():
+ """Enable sharing of config settings based on 'basecategory'"""
+ basecategory = ""
+
+ def config(self, key, default=None, *, sentinel=object()):
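+ # look up the option under the extractor's own category first and,
+ # if unset, fall back to the shared 'basecategory' settings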
+ value = Extractor.config(self, key, sentinel)
+ if value is sentinel:
+ cat, self.category = self.category, self.basecategory
+ value = Extractor.config(self, key, default)
+ self.category = cat
+ return value
+
+
+def generate_extractors(extractor_data, symtable, classes):
+ """Dynamically generate Extractor classes"""
+ extractors = config.get(("extractor", classes[0].basecategory))
+ ckey = extractor_data.get("_ckey")
+ prev = None
+
+ if extractors:
+ extractor_data.update(extractors)
+
+ for category, info in extractor_data.items():
+
+ if not isinstance(info, dict):
+ continue
+
+ root = info["root"]
+ domain = root[root.index(":") + 3:]
+ pattern = info.get("pattern") or re.escape(domain)
+ name = (info.get("name") or category).capitalize()
+
+ for cls in classes:
+
+ class Extr(cls):
+ pass
+ Extr.__module__ = cls.__module__
+ Extr.__name__ = Extr.__qualname__ = \
+ name + cls.subcategory.capitalize() + "Extractor"
+ Extr.__doc__ = \
+ "Extractor for " + cls.subcategory + "s from " + domain
+ Extr.category = category
+ Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt
+ Extr.test = info.get("test-" + cls.subcategory)
+ Extr.root = root
+
+ if "extra" in info:
+ for key, value in info["extra"].items():
+ setattr(Extr, key, value)
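+ # link to the class generated in the previous iteration via the
+ # attribute named by '_ckey' (e.g. a manga extractor's 'chapterclass')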
+ if prev and ckey:
+ setattr(Extr, ckey, prev)
+
+ symtable[Extr.__name__] = prev = Extr
+
+
+# Reduce strictness of the expected magic string in cookiejar files.
+# (This allows the use of Wget-generated cookiejars without modification)
+http.cookiejar.MozillaCookieJar.magic_re = re.compile(
+ "#( Netscape)? HTTP Cookie File", re.IGNORECASE)
+
+# Update default cipher list of urllib3
+# to fix issues with Cloudflare and, by extension, Artstation (#227)
+from requests.packages.urllib3.util import ssl_ # noqa
+logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers")
+
+# cipher list taken from urllib3 1.25
+# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py
+# with additions from
+# https://github.com/Anorov/cloudflare-scrape/pull/242
+ssl_.DEFAULT_CIPHERS = (
+ "ECDHE+AESGCM:"
+ "ECDHE+CHACHA20:"
+ "DHE+AESGCM:"
+ "DHE+CHACHA20:"
+ "ECDH+AESGCM:"
+ "DH+AESGCM:"
+ "ECDH+AES:"
+ "DH+AES:"
+ "RSA+AESGCM:"
+ "RSA+AES:"
+ "!ECDHE+SHA:"
+ "!AES128-SHA:"
+ "!aNULL:"
+ "!eNULL:"
+ "!MD5:"
+ "!DSS"
+)
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
new file mode 100644
index 0000000..211c340
--- /dev/null
+++ b/gallery_dl/extractor/danbooru.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://danbooru.donmai.us/"""
+
+from . import booru
+
+
+BASE_PATTERN = (
+ r"(?:https?://)?"
+ r"(?P<subdomain>danbooru|hijiribe|sonohara|safebooru)"
+ r"\.donmai\.us")
+
+
+class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor):
+ """Base class for danbooru extractors"""
+ category = "danbooru"
+ page_limit = 1000
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.subdomain = match.group("subdomain")
+ self.scheme = "https" if self.subdomain == "danbooru" else "http"
+ self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format(
+ scheme=self.scheme, subdomain=self.subdomain)
+
+ username, api_key = self._get_auth_info()
+ if username:
+ self.log.debug("Using HTTP Basic Auth for user '%s'", username)
+ self.session.auth = (username, api_key)
+
+
+class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor):
+ """Extractor for images from danbooru based on search-tags"""
+ pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
+ test = (
+ ("https://danbooru.donmai.us/posts?tags=bonocho", {
+ "content": "b196fb9f1668109d7774a0a82efea3ffdda07746",
+ }),
+ # test page transitions
+ ("https://danbooru.donmai.us/posts?tags=canvas_%28cocktail_soft%29", {
+ "count": ">= 50",
+ }),
+ ("https://hijiribe.donmai.us/posts?tags=bonocho"),
+ ("https://sonohara.donmai.us/posts?tags=bonocho"),
+ ("https://safebooru.donmai.us/posts?tags=bonocho"),
+ )
+
+
+class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor):
+ """Extractor for image-pools from danbooru"""
+ pattern = BASE_PATTERN + r"/pools/(?P<pool>\d+)"
+ test = ("https://danbooru.donmai.us/pools/7659", {
+ "content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",
+ })
+
+
+class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor):
+ """Extractor for single images from danbooru"""
+ pattern = BASE_PATTERN + r"/posts/(?P<post>\d+)"
+ test = ("https://danbooru.donmai.us/posts/294929", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ })
+
+
+class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor):
+ """Extractor for popular images from danbooru"""
+ pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?"
+ test = (
+ ("https://danbooru.donmai.us/explore/posts/popular"),
+ (("https://danbooru.donmai.us/explore/posts/popular"
+ "?date=2013-06-06+03%3A34%3A22+-0400&scale=week"), {
+ "count": ">= 1",
+ }),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ urlfmt = "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json"
+ self.api_url = urlfmt.format(
+ scheme=self.scheme, subdomain=self.subdomain)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
new file mode 100644
index 0000000..ebab040
--- /dev/null
+++ b/gallery_dl/extractor/deviantart.py
@@ -0,0 +1,992 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.deviantart.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache, memcache
+import collections
+import itertools
+import mimetypes
+import math
+import time
+import re
+
+
+BASE_PATTERN = (
+ r"(?:https?://)?(?:"
+ r"(?:www\.)?deviantart\.com/([\w-]+)|"
+ r"(?!www\.)([\w-]+)\.deviantart\.com)"
+)
+
+
+class DeviantartExtractor(Extractor):
+ """Base class for deviantart extractors"""
+ category = "deviantart"
+ directory_fmt = ("{category}", "{author[username]!l}")
+ filename_fmt = "{category}_{index}_{title}.{extension}"
+ root = "https://www.deviantart.com"
+
+ def __init__(self, match=None):
+ Extractor.__init__(self, match)
+ self.offset = 0
+ self.flat = self.config("flat", True)
+ self.extra = self.config("extra", False)
+ self.original = self.config("original", True)
+ self.user = match.group(1) or match.group(2)
+ self.group = False
+ self.api = DeviantartAPI(self)
+
+ if self.original != "image":
+ self._update_content = self._update_content_default
+ else:
+ self._update_content = self._update_content_image
+ self.original = True
+
+ self.commit_journal = {
+ "html": self._commit_journal_html,
+ "text": self._commit_journal_text,
+ }.get(self.config("journals", "html"))
+
+ def skip(self, num):
+ self.offset += num
+ return num
+
+ def items(self):
+ if self.user:
+ self.group = not self.api.user_profile(self.user)
+ if self.group:
+ self.subcategory = "group-" + self.subcategory
+
+ yield Message.Version, 1
+ for deviation in self.deviations():
+ if isinstance(deviation, tuple):
+ url, data = deviation
+ yield Message.Queue, url, data
+ continue
+
+ self.prepare(deviation)
+ yield Message.Directory, deviation
+
+ if "content" in deviation:
+ content = deviation["content"]
+
+ if self.original and deviation["is_downloadable"] and \
+ text.ext_from_url(content["src"]) != "gif":
+ self._update_content(deviation, content)
+
+ if deviation["index"] <= 790677560 and \
+ content["src"].startswith("https://images-wixmp-"):
+ # https://github.com/r888888888/danbooru/issues/4069
+ content["src"] = re.sub(
+ r"(/f/[^/]+/[^/]+)/v\d+/.*",
+ r"/intermediary\1", content["src"])
+
+ yield self.commit(deviation, content)
+
+ elif deviation["is_downloadable"]:
+ content = self.api.deviation_download(deviation["deviationid"])
+ yield self.commit(deviation, content)
+
+ if "videos" in deviation:
+ video = max(deviation["videos"],
+ key=lambda x: text.parse_int(x["quality"][:-1]))
+ yield self.commit(deviation, video)
+
+ if "flash" in deviation:
+ yield self.commit(deviation, deviation["flash"])
+
+ if "excerpt" in deviation and self.commit_journal:
+ journal = self.api.deviation_content(deviation["deviationid"])
+ yield self.commit_journal(deviation, journal)
+
+ if self.extra:
+ for match in DeviantartStashExtractor.pattern.finditer(
+ deviation.get("description", "")):
+ deviation["_extractor"] = DeviantartStashExtractor
+ yield Message.Queue, match.group(0), deviation
+
+ def deviations(self):
+ """Return an iterable containing all relevant Deviation-objects"""
+
+ def prepare(self, deviation):
+ """Adjust the contents of a Deviation-object"""
+ try:
+ deviation["index"] = text.parse_int(
+ deviation["url"].rpartition("-")[2])
+ except KeyError:
+ deviation["index"] = 0
+ if self.user:
+ deviation["username"] = self.user
+ deviation["da_category"] = deviation["category"]
+ deviation["published_time"] = text.parse_int(
+ deviation["published_time"])
+ deviation["date"] = text.parse_timestamp(
+ deviation["published_time"])
+
+ @staticmethod
+ def commit(deviation, target):
+ url = target["src"]
+ deviation["target"] = text.nameext_from_url(url, target.copy())
+ deviation["extension"] = deviation["target"]["extension"]
+ return Message.Url, url, deviation
+
+ def _commit_journal_html(self, deviation, journal):
+ title = text.escape(deviation["title"])
+ url = deviation["url"]
+ thumbs = deviation["thumbs"]
+ html = journal["html"]
+ shadow = SHADOW_TEMPLATE.format_map(thumbs[0]) if thumbs else ""
+
+ if "css" in journal:
+ css, cls = journal["css"], "withskin"
+ else:
+ css, cls = "", "journal-green"
+
+ if html.find('<div class="boxtop journaltop">', 0, 250) != -1:
+ needle = '<div class="boxtop journaltop">'
+ header = HEADER_CUSTOM_TEMPLATE.format(
+ title=title, url=url, date=deviation["date"],
+ )
+ else:
+ needle = '<div usr class="gr">'
+ catlist = deviation["category_path"].split("/")
+ categories = " / ".join(
+ ('<span class="crumb"><a href="{}/{}/"><span>{}</span></a>'
+ '</span>').format(self.root, cpath, cat.capitalize())
+ for cat, cpath in zip(
+ catlist,
+ itertools.accumulate(catlist, lambda t, c: t + "/" + c)
+ )
+ )
+ username = deviation["author"]["username"]
+ urlname = deviation.get("username") or username.lower()
+ header = HEADER_TEMPLATE.format(
+ title=title,
+ url=url,
+ userurl="{}/{}/".format(self.root, urlname),
+ username=username,
+ date=deviation["date"],
+ categories=categories,
+ )
+
+ html = JOURNAL_TEMPLATE_HTML.format(
+ title=title,
+ html=html.replace(needle, header, 1),
+ shadow=shadow,
+ css=css,
+ cls=cls,
+ )
+
+ deviation["extension"] = "htm"
+ return Message.Url, html, deviation
+
+ @staticmethod
+ def _commit_journal_text(deviation, journal):
+ content = "\n".join(
+ text.unescape(text.remove_html(txt))
+ for txt in journal["html"].rpartition("<script")[0].split("<br />")
+ )
+ txt = JOURNAL_TEMPLATE_TEXT.format(
+ title=deviation["title"],
+ username=deviation["author"]["username"],
+ date=deviation["date"],
+ content=content,
+ )
+
+ deviation["extension"] = "txt"
+ return Message.Url, txt, deviation
+
+ @staticmethod
+ def _find_folder(folders, name):
+ pattern = re.compile(
+ r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$")
+ for folder in folders:
+ if pattern.match(folder["name"]):
+ return folder
+ raise exception.NotFoundError("folder")
+
+ def _folder_urls(self, folders, category):
+ url = "{}/{}/{}/0/".format(self.root, self.user, category)
+ return [(url + folder["name"], folder) for folder in folders]
+
+ def _update_content_default(self, deviation, content):
+ content.update(self.api.deviation_download(deviation["deviationid"]))
+
+ def _update_content_image(self, deviation, content):
+ data = self.api.deviation_download(deviation["deviationid"])
+ url = data["src"].partition("?")[0]
+ mtype = mimetypes.guess_type(url, False)[0]
+ if mtype and mtype.startswith("image/"):
+ content.update(data)
+
+ def _html_request(self, url, **kwargs):
+ cookies = {"userinfo": (
+ '__167217c8e6aac1a3331f;{"username":"","uniqueid":"ab2e8b184471bf0'
+ 'e3f8ed3ee7a3220aa","vd":"Bc7vEx,BdC7Fy,A,J,A,,B,A,B,BdC7Fy,BdC7XU'
+ ',J,J,A,BdC7XU,13,A,B,A,,A,A,B,A,A,,A","attr":56}'
+ )}
+ return self.request(url, cookies=cookies, **kwargs)
+
+
+class DeviantartGalleryExtractor(DeviantartExtractor):
+ """Extractor for all deviations from an artist's gallery"""
+ subcategory = "gallery"
+ archive_fmt = "g_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"
+ test = (
+ ("https://www.deviantart.com/shimoda7/gallery/", {
+ "pattern": r"https://(s3.amazonaws.com/origin-(img|orig)"
+ r".deviantart.net/|images-wixmp-\w+.wixmp.com/)",
+ "count": ">= 30",
+ "keyword": {
+ "allows_comments": bool,
+ "author": {
+ "type": "regular",
+ "usericon": str,
+ "userid": "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B",
+ "username": "shimoda7",
+ },
+ "category_path": str,
+ "content": {
+ "filesize": int,
+ "height": int,
+ "src": str,
+ "transparency": bool,
+ "width": int,
+ },
+ "da_category": str,
+ "date": "type:datetime",
+ "deviationid": str,
+ "?download_filesize": int,
+ "extension": str,
+ "index": int,
+ "is_deleted": bool,
+ "is_downloadable": bool,
+ "is_favourited": bool,
+ "is_mature": bool,
+ "preview": {
+ "height": int,
+ "src": str,
+ "transparency": bool,
+ "width": int,
+ },
+ "published_time": int,
+ "stats": {
+ "comments": int,
+ "favourites": int,
+ },
+ "target": dict,
+ "thumbs": list,
+ "title": str,
+ "url": r"re:https://www.deviantart.com/shimoda7/art/[^/]+-\d+",
+ "username": "shimoda7",
+ },
+ }),
+ # group
+ ("https://www.deviantart.com/yakuzafc", {
+ "pattern": r"https://www.deviantart.com/yakuzafc/gallery/0/",
+ "count": ">= 15",
+ }),
+ # 'folders' option (#276)
+ ("https://www.deviantart.com/justatest235723", {
+ "count": 2,
+ "options": (("metadata", 1), ("folders", 1), ("original", 0)),
+ "keyword": {
+ "description": str,
+ "folders": list,
+ "is_watching": bool,
+ "license": str,
+ "tags": list,
+ },
+ }),
+ ("https://www.deviantart.com/shimoda8/gallery/", {
+ "exception": exception.NotFoundError,
+ }),
+ # old-style URLs
+ ("https://www.deviantart.com/shimoda7/gallery/?catpath=/"),
+ ("https://shimoda7.deviantart.com/gallery/"),
+ ("https://yakuzafc.deviantart.com/"),
+ ("https://shimoda7.deviantart.com/gallery/?catpath=/"),
+ )
+
+ def deviations(self):
+ if self.flat and not self.group:
+ return self.api.gallery_all(self.user, self.offset)
+ folders = self.api.gallery_folders(self.user)
+ return self._folder_urls(folders, "gallery")
+
+
+class DeviantartFolderExtractor(DeviantartExtractor):
+ """Extractor for deviations inside an artist's gallery folder"""
+ subcategory = "folder"
+ directory_fmt = ("{category}", "{folder[owner]}", "{folder[title]}")
+ archive_fmt = "F_{folder[uuid]}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?&#]+)"
+ test = (
+ # user
+ ("https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", {
+ "count": 5,
+ "options": (("original", False),),
+ }),
+ # group
+ ("https://www.deviantart.com/yakuzafc/gallery/37412168/Crafts", {
+ "count": ">= 4",
+ "options": (("original", False),),
+ }),
+ ("https://shimoda7.deviantart.com/gallery/722019/Miscellaneous"),
+ ("https://yakuzafc.deviantart.com/gallery/37412168/Crafts"),
+ )
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.fname = match.group(4)
+ self.folder = {"owner": self.user, "index": match.group(3)}
+
+ def deviations(self):
+ folders = self.api.gallery_folders(self.user)
+ folder = self._find_folder(folders, self.fname)
+ self.folder["title"] = folder["name"]
+ self.folder["uuid"] = folder["folderid"]
+ return self.api.gallery(self.user, folder["folderid"], self.offset)
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["folder"] = self.folder
+
+
+class DeviantartDeviationExtractor(DeviantartExtractor):
+ """Extractor for single deviations"""
+ subcategory = "deviation"
+ archive_fmt = "{index}.{extension}"
+ pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)"
+ test = (
+ (("https://www.deviantart.com/shimoda7/art/"
+ "For-the-sake-of-a-memory-10073852"), {
+ "options": (("original", 0),),
+ "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+ }),
+ ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
+ "exception": exception.NotFoundError,
+ }),
+ (("https://www.deviantart.com/myria-moon/art/"
+ "Aime-Moi-part-en-vadrouille-261986576"), {
+ "pattern": (r"https?://s3\.amazonaws\.com/origin-orig\."
+ r"deviantart\.net/a383/f/2013/135/e/7/[^.]+\.jpg\?"),
+ }),
+ # wixmp URL rewrite
+ (("https://www.deviantart.com/citizenfresh/art/"
+ "Hverarond-14-the-beauty-of-the-earth-789295466"), {
+ "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+ r"/intermediary/f/[^/]+/[^.]+\.jpg$")
+ }),
+ # non-download URL for GIFs (#242)
+ (("https://www.deviantart.com/skatergators/art/"
+ "COM-Monique-Model-781571783"), {
+ "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+ r"/f/[^/]+/[^.]+\.gif\?token="),
+ }),
+ # external URLs from description (#302)
+ (("https://www.deviantart.com/uotapo/art/"
+ "INANAKI-Memorial-Humane7-590297498"), {
+ "options": (("extra", 1), ("original", 0)),
+ "pattern": r"https?://sta\.sh/\w+$",
+ "range": "2-",
+ "count": 4,
+ }),
+ # old-style URLs
+ ("https://shimoda7.deviantart.com"
+ "/art/For-the-sake-of-a-memory-10073852"),
+ ("https://myria-moon.deviantart.com"
+ "/art/Aime-Moi-part-en-vadrouille-261986576"),
+ ("https://zzz.deviantart.com/art/zzz-1234567890"),
+ )
+
+ skip = Extractor.skip
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.path = match.group(3)
+
+ def deviations(self):
+ url = "{}/{}/{}".format(self.root, self.user, self.path)
+ response = self._html_request(url, expect=range(400, 500))
+ deviation_id = text.extract(response.text, '//deviation/', '"')[0]
+ if response.status_code >= 400 or not deviation_id:
+ raise exception.NotFoundError("image")
+ return (self.api.deviation(deviation_id),)
+
+
+class DeviantartStashExtractor(DeviantartExtractor):
+ """Extractor for sta.sh-ed deviations"""
+ subcategory = "stash"
+ archive_fmt = "{index}.{extension}"
+ pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)"
+ test = (
+ ("https://sta.sh/022c83odnaxc", {
+ "pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net",
+ "count": 1,
+ }),
+ # multiple stash items
+ ("https://sta.sh/21jf51j7pzl2", {
+ "pattern": pattern,
+ "count": 4,
+ }),
+ # downloadable, but no "content" field (#307)
+ ("https://sta.sh/024t4coz16mi", {
+ "count": 1,
+ }),
+ ("https://sta.sh/abcdefghijkl", {
+ "exception": exception.HttpError,
+ }),
+ )
+
+ skip = Extractor.skip
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.user = None
+ self.stash_id = match.group(1)
+
+ def deviations(self):
+ url = "https://sta.sh/" + self.stash_id
+ page = self.request(url).text
+ deviation_id = text.extract(page, '//deviation/', '"')[0]
+
+ if deviation_id:
+ yield self.api.deviation(deviation_id)
+ else:
+ data = {"_extractor": DeviantartStashExtractor}
+ page = text.extract(
+ page, '<div id="stash-body"', '<div class="footer"')[0]
+ for url in text.extract_iter(page, '<a href="', '"'):
+ yield url, data
+
+
+class DeviantartFavoriteExtractor(DeviantartExtractor):
+ """Extractor for an artist's favorites"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{username}", "Favourites")
+ archive_fmt = "f_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/favourites/?(?:\?catpath=/)?$"
+ test = (
+ ("https://www.deviantart.com/h3813067/favourites/", {
+ "options": (("metadata", True), ("flat", False)), # issue #271
+ "count": 1,
+ }),
+ ("https://www.deviantart.com/h3813067/favourites/", {
+ "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+ }),
+ ("https://www.deviantart.com/h3813067/favourites/?catpath=/"),
+ ("https://h3813067.deviantart.com/favourites/"),
+ ("https://h3813067.deviantart.com/favourites/?catpath=/"),
+ )
+
+ def deviations(self):
+ folders = self.api.collections_folders(self.user)
+ if self.flat:
+ return itertools.chain.from_iterable(
+ self.api.collections(self.user, folder["folderid"])
+ for folder in folders
+ )
+ return self._folder_urls(folders, "favourites")
+
+
+class DeviantartCollectionExtractor(DeviantartExtractor):
+ """Extractor for a single favorite collection"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{collection[owner]}",
+ "Favourites", "{collection[title]}")
+ archive_fmt = "C_{collection[uuid]}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/favourites/(\d+)/([^/?&#]+)"
+ test = (
+ (("https://www.deviantart.com/pencilshadings"
+ "/favourites/70595441/3D-Favorites"), {
+ "count": ">= 20",
+ "options": (("original", False),),
+ }),
+ ("https://pencilshadings.deviantart.com"
+ "/favourites/70595441/3D-Favorites"),
+ )
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ _, _, cid, self.cname = match.groups()
+ self.collection = {"owner": self.user, "index": cid}
+
+ def deviations(self):
+ folders = self.api.collections_folders(self.user)
+ folder = self._find_folder(folders, self.cname)
+ self.collection["title"] = folder["name"]
+ self.collection["uuid"] = folder["folderid"]
+ return self.api.collections(self.user, folder["folderid"], self.offset)
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["collection"] = self.collection
+
+
+class DeviantartJournalExtractor(DeviantartExtractor):
+ """Extractor for an artist's journals"""
+ subcategory = "journal"
+ directory_fmt = ("{category}", "{username}", "Journal")
+ archive_fmt = "j_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/(?:journal|blog)/?(?:\?catpath=/)?$"
+ test = (
+ ("https://www.deviantart.com/angrywhitewanker/journal/", {
+ "url": "38db2a0d3a587a7e0f9dba7ff7d274610ebefe44",
+ }),
+ ("https://www.deviantart.com/angrywhitewanker/journal/", {
+ "url": "b2a8e74d275664b1a4acee0fca0a6fd33298571e",
+ "options": (("journals", "text"),),
+ }),
+ ("https://www.deviantart.com/angrywhitewanker/journal/", {
+ "count": 0,
+ "options": (("journals", "none"),),
+ }),
+ ("https://www.deviantart.com/shimoda7/journal/?catpath=/"),
+ ("https://shimoda7.deviantart.com/journal/"),
+ ("https://shimoda7.deviantart.com/journal/?catpath=/"),
+ )
+
+ def deviations(self):
+ return self.api.browse_user_journals(self.user, self.offset)
+
+
+class DeviantartScrapsExtractor(DeviantartExtractor):
+ """Extractor for an artist's scraps"""
+ subcategory = "scraps"
+ directory_fmt = ("{category}", "{username}", "Scraps")
+ archive_fmt = "s_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/gallery/\?catpath=scraps\b"
+ test = (
+ ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", {
+ "count": 12,
+ "options": (("original", False),),
+ }),
+ ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
+ )
+
+ def deviations(self):
+ url = "{}/{}/gallery/?catpath=scraps".format(self.root, self.user)
+ page = self._html_request(url).text
+ csrf, pos = text.extract(page, '"csrf":"', '"')
+ iid, pos = text.extract(page, '"requestid":"', '"', pos)
+
+ url = "https://www.deviantart.com/dapi/v1/gallery/0"
+ data = {
+ "username": self.user,
+ "offset": self.offset,
+ "limit": "24",
+ "catpath": "scraps",
+ "_csrf": csrf,
+ "dapiIid": iid + "-jsok7403-1.1"
+ }
+
+ while True:
+ content = self.request(
+ url, method="POST", data=data).json()["content"]
+
+ for item in content["results"]:
+ if item["html"].startswith('<div class="ad-container'):
+ continue
+ deviation_url = text.extract(item["html"], 'href="', '"')[0]
+ page = self._html_request(deviation_url).text
+ deviation_id = text.extract(page, '//deviation/', '"')[0]
+ if deviation_id:
+ yield self.api.deviation(deviation_id)
+
+ if not content["has_more"]:
+ return
+ data["offset"] = content["next_offset"]
+
+
+class DeviantartPopularExtractor(DeviantartExtractor):
+ """Extractor for popular deviations"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "Popular",
+ "{popular[range]}", "{popular[search]}")
+ archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}"
+ pattern = (r"(?:https?://)?www\.deviantart\.com"
+ r"((?:/\w+)*)/(?:popular-([^/?&#]+))/?(?:\?([^#]*))?")
+ test = (
+ ("https://www.deviantart.com/popular-24-hours/?q=tree+house", {
+ "options": (("original", False),),
+ }),
+ ("https://www.deviantart.com/artisan/popular-all-time/?q=tree"),
+ )
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.search_term = self.time_range = self.category_path = None
+ self.user = ""
+
+ path, trange, query = match.groups()
+ if path:
+ self.category_path = path.lstrip("/")
+ if trange:
+ self.time_range = trange.replace("-", "").replace("hours", "hr")
+ if query:
+ self.search_term = text.parse_query(query).get("q")
+
+ self.popular = {
+ "search": self.search_term or "",
+ "range": trange or "24-hours",
+ "path": self.category_path,
+ }
+
+ def deviations(self):
+ return self.api.browse_popular(
+ self.search_term, self.time_range, self.category_path, self.offset)
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["popular"] = self.popular
+
+
+class DeviantartAPI():
+ """Minimal interface for the DeviantArt API
+
+ Ref: https://www.deviantart.com/developers/http/v1/20160316
+ """
+ CLIENT_ID = "5388"
+ CLIENT_SECRET = "76b08c69cfb27f26d6161f9ab6d061a1"
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.log = extractor.log
+ self.headers = {}
+
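+ # the request delay is stored as an exponent: the actual wait is 2**delay seconds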
+ delay = extractor.config("wait-min", 0)
+ self.delay = math.ceil(math.log2(delay)) if delay >= 1 else -1
+ self.delay_min = max(2, self.delay)
+
+ self.mature = extractor.config("mature", "true")
+ if not isinstance(self.mature, str):
+ self.mature = "true" if self.mature else "false"
+
+ self.folders = extractor.config("folders", False)
+ self.metadata = extractor.extra or extractor.config("metadata", False)
+
+ self.refresh_token = extractor.config("refresh-token")
+ self.client_id = extractor.config("client-id", self.CLIENT_ID)
+ self.client_secret = extractor.config(
+ "client-secret", self.CLIENT_SECRET)
+
+ def browse_popular(self, query=None, timerange=None,
+ category_path=None, offset=0):
+ """Yield popular deviations"""
+ endpoint = "browse/popular"
+ params = {"q": query, "offset": offset, "limit": 120,
+ "timerange": timerange, "category_path": category_path,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
+ def browse_user_journals(self, username, offset=0):
+ """Yield all journal entries of a specific user"""
+ endpoint = "browse/user/journals"
+ params = {"username": username, "offset": offset, "limit": 50,
+ "mature_content": self.mature, "featured": "false"}
+ return self._pagination(endpoint, params)
+
+ def collections(self, username, folder_id, offset=0):
+ """Yield all Deviation-objects contained in a collection folder"""
+ endpoint = "collections/" + folder_id
+ params = {"username": username, "offset": offset, "limit": 24,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
+ @memcache(keyarg=1)
+ def collections_folders(self, username, offset=0):
+ """Yield all collection folders of a specific user"""
+ endpoint = "collections/folders"
+ params = {"username": username, "offset": offset, "limit": 50,
+ "mature_content": self.mature}
+ return self._pagination_folders(endpoint, params)
+
+ def deviation(self, deviation_id):
+ """Query and return info about a single Deviation"""
+ endpoint = "deviation/" + deviation_id
+ deviation = self._call(endpoint)
+ if self.metadata:
+ self._metadata((deviation,))
+ if self.folders:
+ self._folders((deviation,))
+ return deviation
+
+ def deviation_content(self, deviation_id):
+ """Get extended content of a single Deviation"""
+ endpoint = "deviation/content"
+ params = {"deviationid": deviation_id}
+ return self._call(endpoint, params)
+
+ def deviation_download(self, deviation_id):
+ """Get the original file download (if allowed)"""
+ endpoint = "deviation/download/" + deviation_id
+ params = {"mature_content": self.mature}
+ return self._call(endpoint, params)
+
+ def deviation_metadata(self, deviations):
+ """ Fetch deviation metadata for a set of deviations"""
+ endpoint = "deviation/metadata?" + "&".join(
+ "deviationids[{}]={}".format(num, deviation["deviationid"])
+ for num, deviation in enumerate(deviations)
+ )
+ params = {"mature_content": self.mature}
+ return self._call(endpoint, params)["metadata"]
+
+ def gallery(self, username, folder_id="", offset=0, extend=True):
+ """Yield all Deviation-objects contained in a gallery folder"""
+ endpoint = "gallery/" + folder_id
+ params = {"username": username, "offset": offset, "limit": 24,
+ "mature_content": self.mature, "mode": "newest"}
+ return self._pagination(endpoint, params, extend)
+
+ def gallery_all(self, username, offset=0):
+ """Yield all Deviation-objects of a specific user"""
+ endpoint = "gallery/all"
+ params = {"username": username, "offset": offset, "limit": 24,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
+ @memcache(keyarg=1)
+ def gallery_folders(self, username, offset=0):
+ """Yield all gallery folders of a specific user"""
+ endpoint = "gallery/folders"
+ params = {"username": username, "offset": offset, "limit": 50,
+ "mature_content": self.mature}
+ return self._pagination_folders(endpoint, params)
+
+ @memcache(keyarg=1)
+ def user_profile(self, username):
+ """Get user profile information"""
+ endpoint = "user/profile/" + username
+ return self._call(endpoint, expect_error=True)
+
+ def authenticate(self, refresh_token):
+ """Authenticate the application by requesting an access token"""
+ self.headers["Authorization"] = self._authenticate_impl(refresh_token)
+
+ @cache(maxage=3600, keyarg=1)
+ def _authenticate_impl(self, refresh_token):
+ """Actual authenticate implementation"""
+ url = "https://www.deviantart.com/oauth2/token"
+ if refresh_token:
+ self.log.info("Refreshing private access token")
+ data = {"grant_type": "refresh_token",
+ "refresh_token": _refresh_token_cache(refresh_token)}
+ else:
+ self.log.info("Requesting public access token")
+ data = {"grant_type": "client_credentials"}
+
+ auth = (self.client_id, self.client_secret)
+ response = self.extractor.request(
+ url, method="POST", data=data, auth=auth)
+ data = response.json()
+
+ if response.status_code != 200:
+ raise exception.AuthenticationError('"{} ({})"'.format(
+ data.get("error_description"), data.get("error")))
+ if refresh_token:
+ _refresh_token_cache.update(refresh_token, data["refresh_token"])
+ return "Bearer " + data["access_token"]
+
+ def _call(self, endpoint, params=None, expect_error=False, public=True):
+ """Call an API endpoint"""
+ url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
+ while True:
+ if self.delay >= 0:
+ time.sleep(2 ** self.delay)
+
+ self.authenticate(None if public else self.refresh_token)
+ response = self.extractor.request(
+ url,
+ params=params,
+ headers=self.headers,
+ expect=range(400, 500),
+ )
+ data = response.json()
+ status = response.status_code
+
+ if 200 <= status < 400:
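+ # after a successful request, gradually lower the delay again (not below 'delay_min')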
+ if self.delay > self.delay_min:
+ self.delay -= 1
+ return data
+ if expect_error:
+ return None
+ if data.get("error_description") == "User not found.":
+ raise exception.NotFoundError("user or group")
+
+ self.log.debug(response.text)
+ msg = "API responded with {} {}".format(
+ status, response.reason)
+ if status == 429:
+ self.delay += 1
+ self.log.warning("%s. Using %ds delay.", msg, 2 ** self.delay)
+ else:
+ self.log.error(msg)
+ return data
+
+ def _pagination(self, endpoint, params, extend=True):
+ public = True
+ while True:
+ data = self._call(endpoint, params, public=public)
+ if "results" not in data:
+ self.log.error("Unexpected API response: %s", data)
+ return
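+ # fewer results than requested usually means items were hidden from the
+ # public API; retry the same page with the user's private access token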
+ if (public and self.refresh_token and
+ len(data["results"]) < params["limit"]):
+ self.log.debug("Switching to private access token")
+ public = False
+ continue
+
+ if extend:
+ if self.metadata:
+ self._metadata(data["results"])
+ if self.folders:
+ self._folders(data["results"])
+ yield from data["results"]
+
+ if not data["has_more"]:
+ return
+ params["offset"] = data["next_offset"]
+
+ def _pagination_folders(self, endpoint, params):
+ result = []
+ result.extend(self._pagination(endpoint, params, False))
+ return result
+
+ def _metadata(self, deviations):
+ """Add extended metadata to each deviation object"""
+ for deviation, metadata in zip(
+ deviations, self.deviation_metadata(deviations)):
+ deviation.update(metadata)
+ deviation["tags"] = [t["tag_name"] for t in deviation["tags"]]
+ return deviations
+
+ def _folders(self, deviations):
+ """Add a list of all containing folders to each deviation object"""
+ for deviation in deviations:
+ deviation["folders"] = self._folders_map(
+ deviation["author"]["username"])[deviation["deviationid"]]
+
+ @memcache(keyarg=1)
+ def _folders_map(self, username):
+ """Generate a deviation_id -> folders mapping for 'username'"""
+ self.log.info("Collecting folder information for '%s'", username)
+ folders = self.gallery_folders(username)
+
+ # add parent names to folders, but ignore "Featured" as parent
+ fmap = {}
+ featured = folders[0]["folderid"]
+ for folder in folders:
+ if folder["parent"] and folder["parent"] != featured:
+ folder["name"] = fmap[folder["parent"]] + "/" + folder["name"]
+ fmap[folder["folderid"]] = folder["name"]
+
+ # map deviationids to folder names
+ dmap = collections.defaultdict(list)
+ for folder in folders:
+ for deviation in self.gallery(
+ username, folder["folderid"], 0, False):
+ dmap[deviation["deviationid"]].append(folder["name"])
+ return dmap
+
+
+@cache(maxage=10*365*24*3600, keyarg=0)
+def _refresh_token_cache(original_token, new_token=None):
+ return new_token or original_token
+
+
+SHADOW_TEMPLATE = """
+<span class="shadow">
+ <img src="{src}" class="smshadow" width="{width}" height="{height}">
+</span>
+<br><br>
+"""
+
+HEADER_TEMPLATE = """<div usr class="gr">
+<div class="metadata">
+ <h2><a href="{url}">{title}</a></h2>
+ <ul>
+ <li class="author">
+ by <span class="name"><span class="username-with-symbol u">
+ <a class="u regular username" href="{userurl}">{username}</a>\
+<span class="user-symbol regular"></span></span></span>,
+ <span>{date}</span>
+ </li>
+ <li class="category">
+ {categories}
+ </li>
+ </ul>
+</div>
+"""
+
+HEADER_CUSTOM_TEMPLATE = """<div class='boxtop journaltop'>
+<h2>
+ <img src="https://st.deviantart.net/minish/gruzecontrol/icons/journal.gif\
+?2" style="vertical-align:middle" alt=""/>
+ <a href="{url}">{title}</a>
+</h2>
+Journal Entry: <span>{date}</span>
+"""
+
+JOURNAL_TEMPLATE_HTML = """text:<!DOCTYPE html>
+<html>
+<head>
+ <meta charset="utf-8">
+ <title>{title}</title>
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/deviantart-network_lc.css?3843780832">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/group_secrets_lc.css?3250492874">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/v6core_lc.css?4246581581">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/sidebar_lc.css?1490570941">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/writer_lc.css?3090682151">
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+css/v6loggedin_lc.css?3001430805">
+ <style>{css}</style>
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+roses/cssmin/core.css?1488405371919" >
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+roses/cssmin/peeky.css?1487067424177" >
+ <link rel="stylesheet" href="https://st.deviantart.net/\
+roses/cssmin/desktop.css?1491362542749" >
+</head>
+<body id="deviantART-v7" class="bubble no-apps loggedout w960 deviantart">
+ <div id="output">
+ <div class="dev-page-container bubbleview">
+ <div class="dev-page-view view-mode-normal">
+ <div class="dev-view-main-content">
+ <div class="dev-view-deviation">
+ {shadow}
+ <div class="journal-wrapper tt-a">
+ <div class="journal-wrapper2">
+ <div class="journal {cls} journalcontrol">
+ {html}
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+ </div>
+</body>
+</html>
+"""
+
+JOURNAL_TEMPLATE_TEXT = """text:{title}
+by {username}, {date}
+
+{content}
+"""
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
new file mode 100644
index 0000000..77a19f6
--- /dev/null
+++ b/gallery_dl/extractor/directlink.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Direct link handling"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class DirectlinkExtractor(Extractor):
+ """Extractor for direct links to images and other media files"""
+ category = "directlink"
+ filename_fmt = "{domain}/{path}"
+ archive_fmt = "{domain}/{path}"
+ pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\."
+ r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
+ r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$")
+ test = (
+ (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), {
+ "url": "18c5d00077332e98e53be9fed2ee4be66154b88d",
+ "keyword": "e81b9fe3022e971365dd859f38e4ef717a6c69ed",
+ }),
+ # more complex example
+ ("https://example.org/path/file.webm?que=1&ry=2#fragment", {
+ "url": "fd4aec8a32842343394e6078a06c3e6b647bf671",
+ "keyword": "ff75764b1ae66615b723a6357b8193fa2de84678",
+ }),
+ # percent-encoded characters
+ ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {
+ "url": "2627e8140727fdf743f86fe18f69f99a052c9718",
+ "keyword": "4d19dc12e41ffcb4cbec2013e335cf482377c35e",
+ }),
+ # upper case file extension (#296)
+ ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
+ ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
+ "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.data = match.groupdict()
+
+ def items(self):
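+ # derive 'filename' and 'extension' from the URL,
+ # then percent-decode all captured URL components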
+ text.nameext_from_url(self.url, self.data)
+ for key, value in self.data.items():
+ if value:
+ self.data[key] = text.unquote(value)
+
+ yield Message.Version, 1
+ yield Message.Directory, self.data
+ yield Message.Url, self.url, self.data
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
new file mode 100644
index 0000000..b10bd35
--- /dev/null
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from https://dynasty-scans.com/"""
+
+from .common import ChapterExtractor, Extractor, Message
+from .. import text
+import json
+import re
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
+
+
+class DynastyscansBase():
+ """Base class for dynastyscans extractors"""
+ category = "dynastyscans"
+ root = "https://dynasty-scans.com"
+
+ def _parse_image_page(self, image_id):
+ url = "{}/images/{}".format(self.root, image_id)
+ extr = text.extract_from(self.request(url).text)
+
+ date = extr("class='create_at'>", "</span>")
+ tags = extr("class='tags'>", "</span>")
+ src = extr("class='btn-group'>", "</div>")
+ url = extr(' src="', '"')
+
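+ # keep a source URL only if the button group contains a "Source" link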
+ src = text.extract(src, 'href="', '"')[0] if "Source<" in src else ""
+
+ return {
+ "url" : self.root + url,
+ "image_id": text.parse_int(image_id),
+ "tags" : text.split_html(text.unescape(tags)),
+ "date" : text.remove_html(date),
+ "source" : text.unescape(src),
+ }
+
+
+class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
+ """Extractor for manga-chapters from dynasty-scans.com"""
+ pattern = BASE_PATTERN + r"(/chapters/[^/?&#]+)"
+ test = (
+ (("http://dynasty-scans.com/chapters/"
+ "hitoribocchi_no_oo_seikatsu_ch33"), {
+ "url": "dce64e8c504118f1ab4135c00245ea12413896cb",
+ "keyword": "1564965671ac69bb7fbc340538397f6bd0aa269b",
+ }),
+ (("http://dynasty-scans.com/chapters/"
+ "new_game_the_spinoff_special_13"), {
+ "url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538",
+ "keyword": "22b35029bc65d6d95db2e2c147b0a37f2d290f29",
+ }),
+ )
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ match = re.match(
+ (r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name
+ r"(?: ch(\d+)([^:<]*))?" # chapter info
+ r"(?:: (.+))?"), # title
+ extr("<h3 id='chapter-title'><b>", "</b>"),
+ )
+ author = extr(" by ", "</a>")
+ group = extr('"icon-print"></i> ', '</span>')
+
+ return {
+ "manga" : text.unescape(match.group(1)),
+ "chapter" : text.parse_int(match.group(2)),
+ "chapter_minor": match.group(3) or "",
+ "title" : text.unescape(match.group(4) or ""),
+ "author" : text.remove_html(author),
+ "group" : (text.remove_html(group) or
+ text.extract(group, ' alt="', '"')[0] or ""),
+ "date" : extr('"icon-calendar"></i> ', '<'),
+ "lang" : "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ data = text.extract(page, "var pages = ", ";\n")[0]
+ return [
+ (self.root + img["image"], None)
+ for img in json.loads(data)
+ ]
+
+
+class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
+ """Extrator for image search results on dynasty-scans.com"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Images")
+ filename_fmt = "{image_id}.{extension}"
+ archive_fmt = "i_{image_id}"
+ pattern = BASE_PATTERN + r"/images/?(?:\?([^#]+))?$"
+ test = (
+ ("https://dynasty-scans.com/images?with[]=4930&with[]=5211", {
+ "url": "6b570eedd8a741c2cd34fb98b22a49d772f84191",
+ "keyword": "a1e2d05c1406a08b02f347389616a6babb1b50bf",
+ }),
+ ("https://dynasty-scans.com/images", {
+ "range": "1",
+ "count": 1,
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1) or ""
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, {}
+ for image_id in self.images():
+ image = self._parse_image_page(image_id)
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def images(self):
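+ # percent-encode the literal '[]' of array-style query parameters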
+ url = self.root + "/images?" + self.query.replace("[]", "%5B%5D")
+ params = {"page": 1}
+
+ while True:
+ page = self.request(url, params=params).text
+ yield from text.extract_iter(page, '"/images/', '"')
+ if 'rel="next"' not in page:
+ return
+ params["page"] += 1
+
+
+class DynastyscansImageExtractor(DynastyscansSearchExtractor):
+ """Extractor for individual images on dynasty-scans.com"""
+ subcategory = "image"
+ pattern = BASE_PATTERN + r"/images/(\d+)"
+ test = ("https://dynasty-scans.com/images/1245", {
+ "url": "15e54bd94148a07ed037f387d046c27befa043b2",
+ "keyword": "3b630c6139e5ff06e141541d57960f8a2957efbb",
+ })
+
+ def images(self):
+ return (self.query,)
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
new file mode 100644
index 0000000..f245ddf
--- /dev/null
+++ b/gallery_dl/extractor/e621.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://e621.net/"""
+
+from . import booru
+
+
+class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for e621 extractors"""
+ category = "e621"
+ api_url = "https://e621.net/post/index.json"
+ post_url = "https://e621.net/post/show/{}"
+ page_limit = 750
+
+
+class E621TagExtractor(booru.TagMixin, E621Extractor):
+ """Extractor for images from e621.net based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?e621\.net/post"
+ r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)")
+ test = (
+ ("https://e621.net/post/index/1/anry", {
+ "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
+ "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
+ }),
+ ("https://e621.net/post?tags=anry"),
+ )
+
+
+class E621PoolExtractor(booru.PoolMixin, E621Extractor):
+ """Extractor for image-pools from e621.net"""
+ pattern = r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(?P<pool>\d+)"
+ test = ("https://e621.net/pool/show/73", {
+ "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
+ "content": "c2c87b7a9150509496cddc75ccab08109922876a",
+ })
+
+
+class E621PostExtractor(booru.PostMixin, E621Extractor):
+ """Extractor for single images from e621.net"""
+ pattern = r"(?:https?://)?(?:www\.)?e621\.net/post/show/(?P<post>\d+)"
+ test = ("https://e621.net/post/show/535", {
+ "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
+ "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "anry",
+ "tags_general": str,
+ "tags_species": str,
+ },
+ })
+
+
+class E621PopularExtractor(booru.MoebooruPopularMixin, E621Extractor):
+ """Extractor for popular images from 621.net"""
+ pattern = (r"(?:https?://)?(?:www\.)?e621\.net"
+ r"/post/popular_by_(?P<scale>day|week|month)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = ("https://e621.net/post/popular_by_month?month=6&year=2013", {
+ "count": 32,
+ })
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "https://e621.net/post/popular_by_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
new file mode 100644
index 0000000..d67c58a
--- /dev/null
+++ b/gallery_dl/extractor/exhentai.py
@@ -0,0 +1,382 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from galleries at https://exhentai.org/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+import itertools
+import random
+import time
+import math
+
+
+BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
+
+
+class ExhentaiExtractor(Extractor):
+ """Base class for exhentai extractors"""
+ category = "exhentai"
+ directory_fmt = ("{category}", "{gallery_id}")
+ filename_fmt = (
+ "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}")
+ archive_fmt = "{gallery_id}_{num}"
+ cookiedomain = ".exhentai.org"
+ cookienames = ("ipb_member_id", "ipb_pass_hash")
+ root = "https://exhentai.org"
+
+ def __init__(self, match):
+ if match.group(1) != "ex":
+ self.root = "https://e-hentai.org"
+ self.cookiedomain = ".e-hentai.org"
+ Extractor.__init__(self, match)
+ self.limits = self.config("limits", True)
+ self.original = self.config("original", True)
+ self.wait_min = self.config("wait-min", 3)
+ self.wait_max = self.config("wait-max", 6)
+
+ self._remaining = 0
+ if self.wait_max < self.wait_min:
+ self.wait_max = self.wait_min
+ self.session.headers["Referer"] = self.root + "/"
+
+ def request(self, *args, **kwargs):
+ response = Extractor.request(self, *args, **kwargs)
+ if self._is_sadpanda(response):
+ self.log.info("sadpanda.jpg")
+ raise exception.AuthorizationError()
+ return response
+
+ def wait(self, waittime=None):
+ """Wait for a randomly chosen amount of seconds"""
+ if not waittime:
+ waittime = random.uniform(self.wait_min, self.wait_max)
+ else:
+ waittime = random.uniform(waittime * 0.66, waittime * 1.33)
+ time.sleep(waittime)
+
+ def login(self):
+ """Login and set necessary cookies"""
+ if self._check_cookies(self.cookienames):
+ return
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ else:
+ self.log.info("no username given; using e-hentai.org")
+ self.root = "https://e-hentai.org"
+ self.original = False
+ self.limits = False
+ self.session.cookies["nw"] = "1"
+
+ @cache(maxage=90*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
+ headers = {
+ "Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1",
+ }
+ data = {
+ "CookieDate": "1",
+ "b": "d",
+ "bt": "1-1",
+ "UserName": username,
+ "PassWord": password,
+ "ipb_login_submit": "Login!",
+ }
+
+ response = self.request(url, method="POST", headers=headers, data=data)
+ if "You are now logged in as:" not in response.text:
+ raise exception.AuthenticationError()
+ return {c: response.cookies[c] for c in self.cookienames}
+
+ @staticmethod
+ def _is_sadpanda(response):
+ """Return True if the response object contains a sad panda"""
+ return (
+ response.headers.get("Content-Length") == "9615" and
+ "sadpanda.jpg" in response.headers.get("Content-Disposition", "")
+ )
+
+
+class ExhentaiGalleryExtractor(ExhentaiExtractor):
+ """Extractor for image galleries from exhentai.org"""
+ subcategory = "gallery"
+ pattern = (BASE_PATTERN +
+ r"(?:/g/(\d+)/([\da-f]{10})"
+ r"|/s/([\da-f]{10})/(\d+)-(\d+))")
+ test = (
+ ("https://exhentai.org/g/960460/4f0e369d82/", {
+ "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
+ "content": "493d759de534355c9f55f8e365565b62411de146",
+ }),
+ ("https://exhentai.org/g/960461/4f0e369d82/", {
+ "exception": exception.NotFoundError,
+ }),
+ ("http://exhentai.org/g/962698/7f02358e00/", {
+ "exception": exception.AuthorizationError,
+ }),
+ ("https://exhentai.org/s/3957343c3b/960460-5", {
+ "count": 2,
+ }),
+ ("https://e-hentai.org/s/3957343c3b/960460-5", {
+ "count": 2,
+ }),
+ ("https://g.e-hentai.org/g/960460/4f0e369d82/"),
+ )
+
+ def __init__(self, match):
+ ExhentaiExtractor.__init__(self, match)
+ self.key = {}
+ self.count = 0
+ self.gallery_id = text.parse_int(match.group(2) or match.group(5))
+ self.gallery_token = match.group(3)
+ self.image_token = match.group(4)
+ self.image_num = text.parse_int(match.group(6), 1)
+
+ def items(self):
+ self.login()
+
+ if self.gallery_token:
+ gpage = self._gallery_page()
+ self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
+ self.wait()
+ ipage = self._image_page()
+ else:
+ ipage = self._image_page()
+ part = text.extract(ipage, 'hentai.org/g/', '"')[0]
+ self.gallery_token = part.split("/")[1]
+ self.wait()
+ gpage = self._gallery_page()
+
+ data = self.get_metadata(gpage)
+ self.count = data["count"]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ images = itertools.chain(
+ (self.image_from_page(ipage),), self.images_from_api())
+ for url, image in images:
+ data.update(image)
+ if self.limits:
+ self._check_limits(data)
+ if "/fullimg.php" in url:
+ data["extension"] = ""
+ self.wait(1.5)
+ yield Message.Url, url, data
+
+ def get_metadata(self, page):
+ """Extract gallery metadata"""
+ extr = text.extract_from(page)
+ data = {
+ "gallery_id" : self.gallery_id,
+ "gallery_token": self.gallery_token,
+ "title" : text.unescape(extr('<h1 id="gn">', '</h1>')),
+ "title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')),
+ "date" : text.parse_datetime(extr(
+ '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
+ "parent" : extr(
+ '>Parent:</td><td class="gdt2"><a href="', '"'),
+ "visible" : extr(
+ '>Visible:</td><td class="gdt2">', '<'),
+ "language" : extr(
+ '>Language:</td><td class="gdt2">', ' '),
+ "gallery_size" : text.parse_bytes(extr(
+ '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
+ "count" : text.parse_int(extr(
+ '>Length:</td><td class="gdt2">', ' ')),
+ }
+
+ data["lang"] = util.language_to_code(data["language"])
+ data["tags"] = [
+ text.unquote(tag)
+ for tag in text.extract_iter(page, 'hentai.org/tag/', '"')
+ ]
+
+ return data
+
+ def image_from_page(self, page):
+ """Get image url and data from webpage"""
+ pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
+ extr = text.extract_from(page, pos)
+
+ self.key["next"] = extr("'", "'")
+ iurl = extr('<img id="img" src="', '"')
+ orig = extr('hentai.org/fullimg.php', '"')
+
+ if self.original and orig:
+ url = self.root + "/fullimg.php" + text.unescape(orig)
+ data = self._parse_original_info(extr('ownload original', '<'))
+ else:
+ url = iurl
+ data = self._parse_image_info(url)
+
+ data["num"] = self.image_num
+ data["image_token"] = self.key["start"] = extr('var startkey="', '";')
+ self.key["show"] = extr('var showkey="', '";')
+
+ return url, text.nameext_from_url(iurl, data)
+
+ def images_from_api(self):
+ """Get image url and data from api calls"""
+ api_url = self.root + "/api.php"
+ nextkey = self.key["next"]
+ request = {
+ "method" : "showpage",
+ "gid" : self.gallery_id,
+ "imgkey" : nextkey,
+ "showkey": self.key["show"],
+ }
+ for request["page"] in range(self.image_num + 1, self.count + 1):
+ self.wait()
+ page = self.request(api_url, method="POST", json=request).json()
+ imgkey = nextkey
+ nextkey, pos = text.extract(page["i3"], "'", "'")
+ imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
+ origurl, pos = text.extract(page["i7"], '<a href="', '"')
+
+ if self.original and origurl:
+ url = text.unescape(origurl)
+ data = self._parse_original_info(
+ text.extract(page["i7"], "ownload original", "<", pos)[0])
+ else:
+ url = imgurl
+ data = self._parse_image_info(url)
+
+ data["num"] = request["page"]
+ data["image_token"] = imgkey
+ yield url, text.nameext_from_url(imgurl, data)
+
+ request["imgkey"] = nextkey
+
+ def _gallery_page(self):
+ url = "{}/g/{}/{}/".format(
+ self.root, self.gallery_id, self.gallery_token)
+ response = self.request(url, expect=range(400, 500))
+ page = response.text
+
+ if response.status_code == 404 and "Gallery Not Available" in page:
+ raise exception.AuthorizationError()
+ if page.startswith(("Key missing", "Gallery not found")):
+ raise exception.NotFoundError("gallery")
+ return page
+
+ def _image_page(self):
+ url = "{}/s/{}/{}-{}".format(
+ self.root, self.image_token, self.gallery_id, self.image_num)
+ page = self.request(url, expect=range(400, 500)).text
+
+ if page.startswith(("Invalid page", "Keep trying")):
+ raise exception.NotFoundError("image page")
+ return page
+
+ def _check_limits(self, data):
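+ # refresh the remaining image quota on the first image and every 20th one after that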
+ if not self._remaining or data["num"] % 20 == 0:
+ self._update_limits()
+ self._remaining -= data["cost"]
+
+ if self._remaining <= 0:
+ url = "{}/s/{}/{}-{}".format(
+ self.root, data["image_token"], self.gallery_id, data["num"])
+ self.log.error(
+ "Image limit reached! Reset it and continue with "
+ "'%s' as URL.", url)
+ raise exception.StopExtraction()
+
+ def _update_limits(self):
+ url = "https://e-hentai.org/home.php"
+ cookies = {
+ cookie.name: cookie.value
+ for cookie in self.session.cookies
+ if cookie.domain == self.cookiedomain and cookie.name != "igneous"
+ }
+
+ page = self.request(url, cookies=cookies).text
+ current, pos = text.extract(page, "<strong>", "</strong>")
+ maximum, pos = text.extract(page, "<strong>", "</strong>", pos)
+ self._remaining = text.parse_int(maximum) - text.parse_int(current)
+
+ @staticmethod
+ def _parse_image_info(url):
+ parts = url.split("/")[4].split("-")
+ return {
+ "width": text.parse_int(parts[2]),
+ "height": text.parse_int(parts[3]),
+ "size": text.parse_int(parts[1]),
+ "cost": 1,
+ }
+
+ @staticmethod
+ def _parse_original_info(info):
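+ # 'info' has the form '<width> x <height> <size> <unit>';
+ # an original counts as 1 + 5 per MiB (rounded up) towards the image limit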
+ parts = info.lstrip().split(" ")
+ size = text.parse_bytes(parts[3] + parts[4][0])
+ return {
+ "width": text.parse_int(parts[0]),
+ "height": text.parse_int(parts[2]),
+ "size": size,
+ "cost": 1 + math.ceil(size * 5 / 1024 / 1024)
+ }
+
+
+class ExhentaiSearchExtractor(ExhentaiExtractor):
+ """Extractor for exhentai search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/?\?(.*)$"
+ test = (
+ ("https://exhentai.org/?f_search=touhou"),
+ (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
+ "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
+ "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
+ "pattern": ExhentaiGalleryExtractor.pattern,
+ "range": "1-30",
+ "count": 30,
+ }),
+ )
+
+ def __init__(self, match):
+ ExhentaiExtractor.__init__(self, match)
+ self.params = text.parse_query(match.group(2))
+ self.params["page"] = text.parse_int(self.params.get("page"))
+ self.search_url = self.root
+
+ def items(self):
+ self.login()
+ yield Message.Version, 1
+
+ while True:
+ last = None
+ page = self.request(self.search_url, params=self.params).text
+
+ for gallery in ExhentaiGalleryExtractor.pattern.finditer(page):
+ url = gallery.group(0)
+ if url == last:
+ continue
+ last = url
+ yield Message.Queue, url, {}
+
+ if 'class="ptdd">&gt;<' in page or ">No hits found</p>" in page:
+ return
+ self.params["page"] += 1
+ self.wait()
+
+
+class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
+ """Extractor for favorited exhentai galleries"""
+ subcategory = "favorite"
+ pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"
+ test = (
+ ("https://exhentai.org/favorites.php"),
+ ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou"
+ "&f_apply=Search+Favorites"),
+ )
+
+ def __init__(self, match):
+ ExhentaiSearchExtractor.__init__(self, match)
+ self.search_url = self.root + "/favorites.php"
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
new file mode 100644
index 0000000..a2d8c04
--- /dev/null
+++ b/gallery_dl/extractor/fallenangels.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from https://www.fascans.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, util
+import json
+
+
+class FallenangelsChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from fascans.com"""
+ category = "fallenangels"
+ pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com"
+ r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?")
+ test = (
+ ("https://manga.fascans.com/manga/chronos-ruler/20/1", {
+ "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3",
+ "keyword": "2dfcc50020e32cd207be88e2a8fac0933e36bdfb",
+ }),
+ ("http://truyen.fascans.com/manga/hungry-marie/8", {
+ "url": "1f923d9cb337d5e7bbf4323719881794a951c6ae",
+ "keyword": "2bdb7334c0e3eceb9946ffd3132df679b4a94f6a",
+ }),
+ ("http://manga.fascans.com/manga/rakudai-kishi-no-eiyuutan/19.5", {
+ "keyword": "9fcca4c1a90d11f00764f62477ebe10bd408021c",
+ }),
+ )
+
+ def __init__(self, match):
+ self.version, self.manga, self.chapter, self.minor = match.groups()
+ url = "https://{}.fascans.com/manga/{}/{}/1".format(
+ self.version, self.manga, self.chapter)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ lang = "vi" if self.version == "truyen" else "en"
+ return {
+ "manga" : extr('name="description" content="', ' Chapter '),
+ "title" : extr(': ', ' - Page 1'),
+ "chapter" : self.chapter,
+ "chapter_minor": self.minor or "",
+ "lang" : lang,
+ "language": util.code_to_language(lang),
+ }
+
+ @staticmethod
+ def images(page):
+ return [
+ (img["page_image"], None)
+ for img in json.loads(
+ text.extract(page, "var pages = ", ";")[0]
+ )
+ ]
+
+
+class FallenangelsMangaExtractor(MangaExtractor):
+ """Extractor for manga from fascans.com"""
+ chapterclass = FallenangelsChapterExtractor
+ category = "fallenangels"
+ pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$"
+ test = (
+ ("http://manga.fascans.com/manga/trinity-seven", {
+ "url": "293057f264de6c438b979bd1c3de4719568db452",
+ "keyword": "50e0374dba60734230e4284b5ffdadef5104ae62",
+ }),
+ ("https://truyen.fascans.com/manga/rakudai-kishi-no-eiyuutan", {
+ "url": "51a731a6b82d5eb7a335fbae6b02d06aeb2ab07b",
+ "keyword": "2d2a2a5d9ea5925eb9a47bb13d848967f3af086c",
+ }),
+ )
+
+ def __init__(self, match):
+ url = "https://" + match.group(1)
+ self.lang = "vi" if match.group(2) == "truyen" else "en"
+ MangaExtractor.__init__(self, match, url)
+
+ def chapters(self, page):
+ extr = text.extract_from(page)
+ results = []
+ language = util.code_to_language(self.lang)
+ while extr('<li style="', '"'):
+ vol = extr('class="volume-', '"')
+ url = extr('href="', '"')
+ cha = extr('>', '<')
+ title = extr('<em>', '</em>')
+
+ manga, _, chapter = cha.rpartition(" ")
+ chapter, dot, minor = chapter.partition(".")
+ results.append((url, {
+ "manga" : manga,
+ "title" : text.unescape(title),
+ "volume" : text.parse_int(vol),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": dot + minor,
+ "lang" : self.lang,
+ "language": language,
+ }))
+ return results
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
new file mode 100644
index 0000000..d941d76
--- /dev/null
+++ b/gallery_dl/extractor/flickr.py
@@ -0,0 +1,503 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.flickr.com/"""
+
+from .common import Extractor, Message
+from .. import text, oauth, util, exception
+
+
+class FlickrExtractor(Extractor):
+ """Base class for flickr extractors"""
+ category = "flickr"
+ filename_fmt = "{category}_{id}.{extension}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = FlickrAPI(self)
+ self.item_id = match.group(1)
+ self.user = None
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for photo in self.photos():
+ photo.update(data)
+ url = photo["url"]
+ yield Message.Url, url, text.nameext_from_url(url, photo)
+
+ def metadata(self):
+ """Return general metadata"""
+ self.user = self.api.urls_lookupUser(self.item_id)
+ return {"user": self.user}
+
+ def photos(self):
+ """Return an iterable with all relevant photo objects"""
+
+
+class FlickrImageExtractor(FlickrExtractor):
+ """Extractor for individual images from flickr.com"""
+ subcategory = "image"
+ archive_fmt = "{id}"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
+ r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
+ r"|flic\.kr/p/([A-Za-z1-9]+))")
+ test = (
+ ("https://www.flickr.com/photos/departingyyz/16089302239", {
+ "pattern": pattern,
+ "content": "0821a28ee46386e85b02b67cf2720063440a228c",
+ "keyword": {
+ "comments": int,
+ "description": str,
+ "extension": "jpg",
+ "filename": "16089302239_de18cd8017_b",
+ "id": 16089302239,
+ "height": 683,
+ "label": "Large",
+ "media": "photo",
+ "url": str,
+ "views": int,
+ "width": 1024,
+ },
+ }),
+ ("https://www.flickr.com/photos/145617051@N08/46733161535", {
+ "count": 1,
+ "keyword": {"media": "video"},
+ }),
+ ("http://c2.staticflickr.com/2/1475/24531000464_9a7503ae68_b.jpg", {
+ "pattern": pattern}),
+ ("https://farm2.static.flickr.com/1035/1188352415_cb139831d0.jpg", {
+ "pattern": pattern}),
+ ("https://flic.kr/p/FPVo9U", {
+ "pattern": pattern}),
+ ("https://www.flickr.com/photos/zzz/16089302238", {
+ "exception": exception.NotFoundError}),
+ )
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ if not self.item_id:
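+ # flic.kr short links encode the photo ID in base58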
+ alphabet = ("123456789abcdefghijkmnopqrstu"
+ "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ")
+ self.item_id = util.bdecode(match.group(2), alphabet)
+
+ def items(self):
+ photo = self.api.photos_getInfo(self.item_id)
+
+ if photo["media"] == "video" and self.api.videos:
+ self.api._extract_video(photo)
+ else:
+ self.api._extract_photo(photo)
+
+ photo["title"] = photo["title"]["_content"]
+ photo["comments"] = text.parse_int(photo["comments"]["_content"])
+ photo["description"] = photo["description"]["_content"]
+ photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]]
+ photo["date"] = text.parse_timestamp(photo["dateuploaded"])
+ photo["views"] = text.parse_int(photo["views"])
+ photo["id"] = text.parse_int(photo["id"])
+
+ if "location" in photo:
+ location = photo["location"]
+ for key, value in location.items():
+ if isinstance(value, dict):
+ location[key] = value["_content"]
+
+ url = photo["url"]
+ yield Message.Version, 1
+ yield Message.Directory, photo
+ yield Message.Url, url, text.nameext_from_url(url, photo)
+
+
+class FlickrAlbumExtractor(FlickrExtractor):
+ """Extractor for photo albums from flickr.com"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{subcategory}s",
+ "{album[id]} - {album[title]}")
+ archive_fmt = "a_{album[id]}_{id}"
+ pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
+ r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
+ test = (
+ (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": 6,
+ }),
+ ("https://www.flickr.com/photos/shona_s/albums", {
+ "pattern": pattern,
+ "count": 2,
+ }),
+ )
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ self.album_id = match.group(2)
+
+ def items(self):
+ if self.album_id:
+ return FlickrExtractor.items(self)
+ return self._album_items()
+
+ def _album_items(self):
+ yield Message.Version, 1
+ data = FlickrExtractor.metadata(self)
+ data["_extractor"] = FlickrAlbumExtractor
+
+ for album in self.api.photosets_getList(self.user["nsid"]):
+ self.api._clean_info(album).update(data)
+ url = "https://www.flickr.com/photos/{}/albums/{}".format(
+ self.user["path_alias"], album["id"])
+ yield Message.Queue, url, album
+
+ def metadata(self):
+ data = FlickrExtractor.metadata(self)
+ data["album"] = self.api.photosets_getInfo(
+ self.album_id, self.user["nsid"])
+ return data
+
+ def photos(self):
+ return self.api.photosets_getPhotos(self.album_id)
+
+
+class FlickrGalleryExtractor(FlickrExtractor):
+ """Extractor for photo galleries from flickr.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "galleries",
+ "{user[username]} {gallery[id]}")
+ archive_fmt = "g_{gallery[id]}_{id}"
+ pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
+ r"photos/([^/]+)/galleries/(\d+)")
+ test = (("https://www.flickr.com/photos/flickr/"
+ "galleries/72157681572514792/"), {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": ">= 10",
+ })
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+
+ def metadata(self):
+ data = FlickrExtractor.metadata(self)
+ data["gallery"] = self.api.galleries_getInfo(self.gallery_id)
+ return data
+
+ def photos(self):
+ return self.api.galleries_getPhotos(self.gallery_id)
+
+
+class FlickrGroupExtractor(FlickrExtractor):
+ """Extractor for group pools from flickr.com"""
+ subcategory = "group"
+ directory_fmt = ("{category}", "{subcategory}s", "{group[groupname]}")
+ archive_fmt = "G_{group[nsid]}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
+ test = ("https://www.flickr.com/groups/bird_headshots/", {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": "> 150",
+ })
+
+ def metadata(self):
+ self.group = self.api.urls_lookupGroup(self.item_id)
+ return {"group": self.group}
+
+ def photos(self):
+ return self.api.groups_pools_getPhotos(self.group["nsid"])
+
+
+class FlickrUserExtractor(FlickrExtractor):
+ """Extractor for the photostream of a flickr user"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "{user[username]}")
+ archive_fmt = "u_{user[nsid]}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
+ test = ("https://www.flickr.com/photos/shona_s/", {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": 28,
+ })
+
+ def photos(self):
+ return self.api.people_getPhotos(self.user["nsid"])
+
+
+class FlickrFavoriteExtractor(FlickrExtractor):
+ """Extractor for favorite photos of a flickr user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{subcategory}s", "{user[username]}")
+ archive_fmt = "f_{user[nsid]}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
+ test = ("https://www.flickr.com/photos/shona_s/favorites", {
+ "pattern": FlickrImageExtractor.pattern,
+ "count": 4,
+ })
+
+ def photos(self):
+ return self.api.favorites_getList(self.user["nsid"])
+
+
+class FlickrSearchExtractor(FlickrExtractor):
+ """Extractor for flickr photos based on search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "{subcategory}", "{search[text]}")
+ archive_fmt = "s_{search}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
+ test = (
+ ("https://flickr.com/search/?text=mountain"),
+ ("https://flickr.com/search/?text=tree%20cloud%20house"
+ "&color_codes=4&styles=minimalism"),
+ )
+
+ def __init__(self, match):
+ FlickrExtractor.__init__(self, match)
+ self.search = text.parse_query(match.group(1))
+ if "text" not in self.search:
+ self.search["text"] = ""
+
+ def metadata(self):
+ return {"search": self.search}
+
+ def photos(self):
+ return self.api.photos_search(self.search)
+
+
+class FlickrAPI(oauth.OAuth1API):
+ """Minimal interface for the flickr API"""
+ API_URL = "https://api.flickr.com/services/rest/"
+ API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
+ API_SECRET = "3adb0f568dc68393"
+ FORMATS = [
+ ("o", "Original" , None),
+ ("k", "Large 2048" , 2048),
+ ("h", "Large 1600" , 1600),
+ ("l", "Large" , 1024),
+ ("c", "Medium 800" , 800),
+ ("z", "Medium 640" , 640),
+ ("m", "Medium" , 500),
+ ("n", "Small 320" , 320),
+ ("s", "Small" , 240),
+ ("q", "Large Square", 150),
+ ("t", "Thumbnail" , 100),
+ ("s", "Square" , 75),
+ ]
+ VIDEO_FORMATS = {
+ "orig" : 9,
+ "1080p" : 8,
+ "720p" : 7,
+ "360p" : 6,
+ "288p" : 5,
+ "700" : 4,
+ "300" : 3,
+ "100" : 2,
+ "appletv" : 1,
+ "iphone_wifi": 0,
+ }
+
+ def __init__(self, extractor):
+ oauth.OAuth1API.__init__(self, extractor)
+
+ self.videos = extractor.config("videos", True)
+ self.maxsize = extractor.config("size-max")
+ if isinstance(self.maxsize, str):
+ for fmt, fmtname, fmtwidth in self.FORMATS:
+ if self.maxsize == fmt or self.maxsize == fmtname:
+ self.maxsize = fmtwidth
+ break
+ else:
+ self.maxsize = None
+ extractor.log.warning(
+ "Could not match '%s' to any format", self.maxsize)
+ if self.maxsize:
+ self.formats = [fmt for fmt in self.FORMATS
+ if not fmt[2] or fmt[2] <= self.maxsize]
+ else:
+ self.formats = self.FORMATS
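+ # only consider the four largest formats when selecting an image URL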
+ self.formats = self.formats[:4]
+
+ def favorites_getList(self, user_id):
+ """Returns a list of the user's favorite photos."""
+ params = {"user_id": user_id}
+ return self._pagination("favorites.getList", params)
+
+ def galleries_getInfo(self, gallery_id):
+ """Gets information about a gallery."""
+ params = {"gallery_id": gallery_id}
+ gallery = self._call("galleries.getInfo", params)["gallery"]
+ return self._clean_info(gallery)
+
+ def galleries_getPhotos(self, gallery_id):
+ """Return the list of photos for a gallery."""
+ params = {"gallery_id": gallery_id}
+ return self._pagination("galleries.getPhotos", params)
+
+ def groups_pools_getPhotos(self, group_id):
+ """Returns a list of pool photos for a given group."""
+ params = {"group_id": group_id}
+ return self._pagination("groups.pools.getPhotos", params)
+
+ def people_getPhotos(self, user_id):
+ """Return photos from the given user's photostream."""
+ params = {"user_id": user_id}
+ return self._pagination("people.getPhotos", params)
+
+ def photos_getInfo(self, photo_id):
+ """Get information about a photo."""
+ params = {"photo_id": photo_id}
+ return self._call("photos.getInfo", params)["photo"]
+
+ def photos_getSizes(self, photo_id):
+ """Returns the available sizes for a photo."""
+ params = {"photo_id": photo_id}
+ sizes = self._call("photos.getSizes", params)["sizes"]["size"]
+ if self.maxsize:
+ for index, size in enumerate(sizes):
+ if index > 0 and (int(size["width"]) > self.maxsize or
+ int(size["height"]) > self.maxsize):
+ del sizes[index:]
+ break
+ return sizes
+
+ def photos_search(self, params):
+ """Return a list of photos matching some criteria."""
+ return self._pagination("photos.search", params.copy())
+
+ def photosets_getInfo(self, photoset_id, user_id):
+ """Gets information about a photoset."""
+ params = {"photoset_id": photoset_id, "user_id": user_id}
+ photoset = self._call("photosets.getInfo", params)["photoset"]
+ return self._clean_info(photoset)
+
+ def photosets_getList(self, user_id):
+ """Returns the photosets belonging to the specified user."""
+ params = {"user_id": user_id}
+ return self._pagination_sets("photosets.getList", params)
+
+ def photosets_getPhotos(self, photoset_id):
+ """Get the list of photos in a set."""
+ params = {"photoset_id": photoset_id}
+ return self._pagination("photosets.getPhotos", params, "photoset")
+
+ def urls_lookupGroup(self, groupname):
+ """Returns a group NSID, given the url to a group's page."""
+ params = {"url": "https://www.flickr.com/groups/" + groupname}
+ group = self._call("urls.lookupGroup", params)["group"]
+ return {"nsid": group["id"],
+ "path_alias": groupname,
+ "groupname": group["groupname"]["_content"]}
+
+ def urls_lookupUser(self, username):
+ """Returns a user NSID, given the url to a user's photos or profile."""
+ params = {"url": "https://www.flickr.com/photos/" + username}
+ user = self._call("urls.lookupUser", params)["user"]
+ return {"nsid": user["id"],
+ "path_alias": username,
+ "username": user["username"]["_content"]}
+
+ def video_getStreamInfo(self, video_id, secret=None):
+ """Returns all available video streams"""
+ params = {"photo_id": video_id}
+ if not secret:
+ secret = self._call("photos.getInfo", params)["photo"]["secret"]
+ params["secret"] = secret
+ stream = self._call("video.getStreamInfo", params)["streams"]["stream"]
+ return max(stream, key=lambda s: self.VIDEO_FORMATS.get(s["type"], 0))
+
+ def _call(self, method, params):
+ params["method"] = "flickr." + method
+ params["format"] = "json"
+ params["nojsoncallback"] = "1"
+ if self.api_key:
+ params["api_key"] = self.api_key
+ data = self.request(self.API_URL, params=params).json()
+ if "code" in data:
+ if data["code"] == 1:
+ raise exception.NotFoundError(self.extractor.subcategory)
+ elif data["code"] == 98:
+ raise exception.AuthenticationError(data.get("message"))
+ elif data["code"] == 99:
+ raise exception.AuthorizationError()
+ self.log.error("API call failed: %s", data.get("message"))
+ raise exception.StopExtraction()
+ return data
+
+ def _pagination(self, method, params, key="photos"):
+ params["extras"] = "description,date_upload,tags,views,media,"
+ params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats)
+ params["page"] = 1
+
+ while True:
+ data = self._call(method, params)[key]
+ yield from map(self._extract_format, data["photo"])
+ if params["page"] >= data["pages"]:
+ return
+ params["page"] += 1
+
+ def _pagination_sets(self, method, params):
+ params["page"] = 1
+
+ while True:
+ data = self._call(method, params)["photosets"]
+ yield from data["photoset"]
+ if params["page"] >= data["pages"]:
+ return
+ params["page"] += 1
+
+ def _extract_format(self, photo):
+ photo["description"] = photo["description"]["_content"].strip()
+ photo["views"] = text.parse_int(photo["views"])
+ photo["date"] = text.parse_timestamp(photo["dateupload"])
+ photo["tags"] = photo["tags"].split()
+ photo["id"] = text.parse_int(photo["id"])
+
+ if photo["media"] == "video" and self.videos:
+ return self._extract_video(photo)
+
+ for fmt, fmtname, fmtwidth in self.formats:
+ key = "url_" + fmt
+ if key in photo:
+ photo["width"] = text.parse_int(photo["width_" + fmt])
+ photo["height"] = text.parse_int(photo["height_" + fmt])
+ if self.maxsize and (photo["width"] > self.maxsize or
+ photo["height"] > self.maxsize):
+ continue
+ photo["url"] = photo[key]
+ photo["label"] = fmtname
+
+ # remove excess data
+ keys = [
+ key for key in photo
+ if key.startswith(("url_", "width_", "height_"))
+ ]
+ for key in keys:
+ del photo[key]
+ break
+ else:
+ self._extract_photo(photo)
+
+ return photo
+
+ def _extract_photo(self, photo):
+ size = self.photos_getSizes(photo["id"])[-1]
+ photo["url"] = size["source"]
+ photo["label"] = size["label"]
+ photo["width"] = text.parse_int(size["width"])
+ photo["height"] = text.parse_int(size["height"])
+ return photo
+
+ def _extract_video(self, photo):
+ stream = self.video_getStreamInfo(photo["id"], photo.get("secret"))
+ photo["url"] = stream["_content"]
+ photo["label"] = stream["type"]
+ photo["width"] = photo["height"] = 0
+ return photo
+
+ @staticmethod
+ def _clean_info(info):
+ info["title"] = info["title"]["_content"]
+ info["description"] = info["description"]["_content"]
+ return info
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
new file mode 100644
index 0000000..5f4c5b8
--- /dev/null
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for 4chan archives based on FoolFuuka"""
+
+from .common import Extractor, Message, SharedConfigMixin, generate_extractors
+from .. import text
+import itertools
+import operator
+
+
+class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
+ """Base extractor for FoolFuuka based boards/archives"""
+ basecategory = "foolfuuka"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board[shortname]}",
+ "{thread_num}{title:? - //}")
+ filename_fmt = "{media[media]}"
+ archive_fmt = "{board[shortname]}_{num}_{timestamp}"
+ pattern_fmt = r"/([^/]+)/thread/(\d+)"
+ external = "default"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+ self.session.headers["Referer"] = self.root
+ if self.external == "direct":
+ self.remote = self._remote_direct
+
+ def items(self):
+ op = True
+ yield Message.Version, 1
+ for post in self.posts():
+ if op:
+ yield Message.Directory, post
+ op = False
+ if not post["media"]:
+ continue
+
+ media = post["media"]
+ url = media["media_link"]
+
+ if not url and "remote_media_link" in media:
+ url = self.remote(media)
+ if url.startswith("/"):
+ url = self.root + url
+
+ post["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, post
+
+ def posts(self):
+ """Return an iterable with all posts in this thread"""
+ url = self.root + "/_/api/chan/thread/"
+ params = {"board": self.board, "num": self.thread}
+ data = self.request(url, params=params).json()[self.thread]
+
+ # sort post-objects by key
+ posts = sorted(data.get("posts", {}).items())
+ posts = map(operator.itemgetter(1), posts)
+
+ return itertools.chain((data["op"],), posts)
+
+ def remote(self, media):
+ """Resolve a remote media link"""
+ needle = '<meta http-equiv="Refresh" content="0; url='
+ page = self.request(media["remote_media_link"]).text
+ return text.extract(page, needle, '"')[0]
+
+ @staticmethod
+ def _remote_direct(media):
+ return media["remote_media_link"]
+
+
+EXTRACTORS = {
+ "4plebs": {
+ "name": "fourplebs",
+ "root": "https://archive.4plebs.org",
+ "pattern": r"(?:archive\.)?4plebs\.org",
+ "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", {
+ "url": "07452944164b602502b02b24521f8cee5c484d2a",
+ }),
+ },
+ "archivedmoe": {
+ "root": "https://archived.moe",
+ "test-thread": (
+ ("https://archived.moe/gd/thread/309639/", {
+ "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
+ "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
+ }),
+ ("https://archived.moe/a/thread/159767162/", {
+ "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
+ }),
+ ),
+ },
+ "archiveofsins": {
+ "root": "https://archiveofsins.com",
+ "pattern": r"(?:www\.)?archiveofsins\.com",
+ "test-thread": ("https://archiveofsins.com/h/thread/4668813/", {
+ "url": "f612d287087e10a228ef69517cf811539db9a102",
+ "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
+ }),
+ },
+ "b4k": {
+ "root": "https://arch.b4k.co",
+ "extra": {"external": "direct"},
+ "test-thread": ("https://arch.b4k.co/meta/thread/196/", {
+ "url": "9b0ae01292133268fe9178b71332da1ee25b7704",
+ }),
+ },
+ "desuarchive": {
+ "root": "https://desuarchive.org",
+ "test-thread": ("https://desuarchive.org/a/thread/159542679/", {
+ "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
+ }),
+ },
+ "fireden": {
+ "root": "https://boards.fireden.net",
+ "test-thread": ("https://boards.fireden.net/a/thread/159803223/", {
+ "url": "01b7baacfb0656a68e566368290e3072b27f86c9",
+ }),
+ },
+ "nyafuu": {
+ "root": "https://archive.nyafuu.org",
+ "pattern": r"(?:archive\.)?nyafuu\.org",
+ "test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", {
+ "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
+ }),
+ },
+ "rbt": {
+ "root": "https://rbt.asia",
+ "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
+ "test-thread": (
+ ("https://rbt.asia/g/thread/61487650/", {
+ "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ }),
+ ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
+ "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ }),
+ ),
+ },
+ "thebarchive": {
+ "root": "https://thebarchive.com",
+ "pattern": r"thebarchive\.com",
+ "test-thread": ("https://thebarchive.com/b/thread/739772332/", {
+ "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
+ }),
+ },
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+ FoolfuukaThreadExtractor,
+))
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
new file mode 100644
index 0000000..14baa36
--- /dev/null
+++ b/gallery_dl/extractor/foolslide.py
@@ -0,0 +1,240 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for FoOlSlide based sites"""
+
+from .common import (
+ Extractor,
+ ChapterExtractor,
+ MangaExtractor,
+ SharedConfigMixin,
+ Message,
+ generate_extractors,
+)
+from .. import text, util
+import base64
+import json
+
+
+class FoolslideBase(SharedConfigMixin):
+ """Base class for FoOlSlide extractors"""
+ basecategory = "foolslide"
+
+ def request(self, url):
+ return Extractor.request(
+ self, url, encoding="utf-8", method="POST", data={"adult": "true"})
+
+ @staticmethod
+ def parse_chapter_url(url, data):
+ info = url.partition("/read/")[2].rstrip("/").split("/")
+ lang = info[1].partition("-")[0]
+ data["lang"] = lang
+ data["language"] = util.code_to_language(lang)
+ data["volume"] = text.parse_int(info[2])
+ data["chapter"] = text.parse_int(info[3])
+ data["chapter_minor"] = "." + info[4] if len(info) >= 5 else ""
+ data["title"] = data["chapter_string"].partition(":")[2].strip()
+ return data
+
+
+class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
+ """Base class for chapter extractors for FoOlSlide based sites"""
+ directory_fmt = (
+ "{category}", "{manga}", "{chapter_string}")
+ archive_fmt = "{id}"
+ pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
+ decode = "default"
+
+ def items(self):
+ page = self.request(self.chapter_url).text
+ data = self.metadata(page)
+ imgs = self.images(page)
+
+ data["count"] = len(imgs)
+ data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"])
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"], image in enumerate(imgs, 1):
+ try:
+ url = image["url"]
+ del image["url"]
+ del image["chapter_id"]
+ del image["thumb_url"]
+ except KeyError:
+ pass
+ for key in ("height", "id", "size", "width"):
+ image[key] = text.parse_int(image[key])
+ data.update(image)
+ text.nameext_from_url(data["filename"], data)
+ yield Message.Url, url, data
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ extr('<h1 class="tbtitle dnone">', '')
+ return self.parse_chapter_url(self.chapter_url, {
+ "manga" : text.unescape(extr('title="', '"')).strip(),
+ "chapter_string": text.unescape(extr('title="', '"')),
+ })
+
+ def images(self, page):
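+ # the page list is embedded as a JSON array; its encoding ('base64', 'double', or plain) depends on the site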
+ if self.decode == "base64":
+ base64_data = text.extract(page, 'atob("', '"')[0].encode()
+ data = base64.b64decode(base64_data).decode()
+ elif self.decode == "double":
+ pos = page.find("[{")
+ data = text.extract(page, " = ", ";", pos)[0]
+ else:
+ data = text.extract(page, "var pages = ", ";")[0]
+ return json.loads(data)
+
+
+class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
+ """Base class for manga extractors for FoOlSlide based sites"""
+ pattern_fmt = r"(/series/[^/?&#]+)"
+
+ def chapters(self, page):
+ extr = text.extract_from(page)
+ manga = text.unescape(extr('<h1 class="title">', '</h1>')).strip()
+ author = extr('<b>Author</b>: ', '<br')
+ artist = extr('<b>Artist</b>: ', '<br')
+
+ results = []
+ while True:
+ url = extr('<div class="title"><a href="', '"')
+ if not url:
+ return results
+ results.append((url, self.parse_chapter_url(url, {
+ "manga": manga, "author": author, "artist": artist,
+ "chapter_string": extr('title="', '"'),
+ "group" : extr('title="', '"'),
+ })))
+
+
+EXTRACTORS = {
+ "dokireader": {
+ "root": "https://kobato.hologfx.com/reader",
+ "test-chapter":
+ (("https://kobato.hologfx.com/reader/read/"
+ "hitoribocchi_no_oo_seikatsu/en/3/34"), {
+ "keyword": "6e719ac86f0c6dab89390dd7e507e678459e0dbc",
+ }),
+ "test-manga":
+ (("https://kobato.hologfx.com/reader/series/"
+ "boku_ha_ohimesama_ni_narenai/"), {
+ "url": "1c1f5a7258ce4f631f5fc32be548d78a6a57990d",
+ "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995",
+ }),
+ },
+ "jaiminisbox": {
+ "root": "https://jaiminisbox.com/reader",
+ "pattern": r"(?:www\.)?jaiminisbox\.com/reader",
+ "extra": {"decode": "base64"},
+ "test-chapter": (
+ ("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", {
+ "keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673",
+ }),
+ ("https://jaiminisbox.com/reader/read/dr-stone/en/0/16/", {
+ "keyword": "8607375c24b1d0db7f52d059ef5baff793aa458e",
+ }),
+ ),
+ "test-manga":
+ ("https://jaiminisbox.com/reader/series/sora_no_kian/", {
+ "url": "66612be177dc3b3fa1d1f537ef02f4f701b163ea",
+ "keyword": "0908a4145bb03acc4210f5d01169988969f5acd1",
+ }),
+ },
+ "kireicake": {
+ "root": "https://reader.kireicake.com",
+ "test-chapter":
+ ("https://reader.kireicake.com/read/wonderland/en/1/1/", {
+ "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e",
+ "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6",
+ }),
+ "test-manga":
+ ("https://reader.kireicake.com/series/wonderland/", {
+ "url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
+ "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
+ }),
+ },
+ "powermanga": {
+ "root": "https://read.powermanga.org",
+ "pattern": r"read(?:er)?\.powermanga\.org",
+ "test-chapter":
+ (("https://read.powermanga.org"
+ "/read/one_piece_digital_colour_comics/en/0/75/"), {
+ "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384",
+ "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe",
+ }),
+ "test-manga":
+ (("https://read.powermanga.org"
+ "/series/one_piece_digital_colour_comics/"), {
+ "count": ">= 1",
+ "keyword": {
+ "chapter": int,
+ "chapter_minor": str,
+ "chapter_string": str,
+ "group": "PowerManga",
+ "lang": "en",
+ "language": "English",
+ "manga": "One Piece Digital Colour Comics",
+ "title": str,
+ "volume": int,
+ },
+ }),
+ },
+ "sensescans": {
+ "root": "http://sensescans.com/reader",
+ "pattern": r"(?:(?:www\.)?sensescans\.com/reader"
+ r"|reader\.sensescans\.com)",
+ "test-chapter": (
+ (("http://sensescans.com/reader/read/"
+ "magi__labyrinth_of_magic/en/37/369/"), {
+ "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812",
+ "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988",
+ }),
+ (("http://reader.sensescans.com/read/"
+ "magi__labyrinth_of_magic/en/37/369/"), {
+ "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812",
+ "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988",
+ }),
+ ),
+ "test-manga":
+ ("http://sensescans.com/reader/series/hakkenden/", {
+ "url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2",
+ "keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23",
+ }),
+ },
+ "worldthree": {
+ "root": "http://www.slide.world-three.org",
+ "pattern": r"(?:www\.)?slide\.world-three\.org",
+ "test-chapter": (
+ (("http://www.slide.world-three.org"
+ "/read/black_bullet/en/2/7/page/1"), {
+ "url": "be2f04f6e2d311b35188094cfd3e768583271584",
+ "keyword": "967d536a65de4d52478d5b666a1760b181eddb6e",
+ }),
+ (("http://www.slide.world-three.org"
+ "/read/idolmster_cg_shuffle/en/0/4/2/"), {
+ "url": "6028ea5ca282744f925dfad92eeb98509f9cc78c",
+ "keyword": "f3cfe2ad3388991f1d045c85d0fa94795a7694dc",
+ }),
+ ),
+ "test-manga":
+ ("http://www.slide.world-three.org/series/black_bullet/", {
+ "url": "5743b93512d26e6b540d90a7a5d69208b6d4a738",
+ "keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120",
+ }),
+ },
+ "_ckey": "chapterclass",
+}
+
+generate_extractors(EXTRACTORS, globals(), (
+ FoolslideChapterExtractor,
+ FoolslideMangaExtractor,
+))
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
new file mode 100644
index 0000000..15bd0a8
--- /dev/null
+++ b/gallery_dl/extractor/gelbooru.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://gelbooru.com/"""
+
+from . import booru
+from .common import Message
+from .. import text, util
+
+
+class GelbooruExtractor(booru.XmlParserMixin,
+ booru.GelbooruPageMixin,
+ booru.BooruExtractor):
+ """Base class for gelbooru extractors"""
+ category = "gelbooru"
+ api_url = "https://gelbooru.com/index.php"
+ post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}"
+ pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}"
+
+ def __init__(self, match):
+ super().__init__(match)
+
+ self.use_api = self.config("api", True)
+ if self.use_api:
+ self.params.update({"page": "dapi", "s": "post", "q": "index"})
+ else:
+ self.items = self.items_noapi
+
+ def items_noapi(self):
+ data = self.get_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for post in self.get_posts():
+ post = self.get_post_data(post)
+ url = post["file_url"]
+ post.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ def get_posts(self):
+ """Return an iterable containing all relevant post objects"""
+
+ def get_post_data(self, post_id):
+ """Extract metadata of a single post"""
+ page = self.request(self.post_url.format(post_id)).text
+ data = text.extract_all(page, (
+ (None , '<meta name="keywords"', ''),
+ ("tags" , ' imageboard, ', '"'),
+ ("id" , '<li>Id: ', '<'),
+ ("created_at", '<li>Posted: ', '<'),
+ ("width" , '<li>Size: ', 'x'),
+ ("height" , '', '<'),
+ ("source" , '<li>Source: <a href="', '"'),
+ ("rating" , '<li>Rating: ', '<'),
+ (None , '<li>Score: ', ''),
+ ("score" , '>', '<'),
+ ("file_url" , '<li><a href="http', '"'),
+ ("change" , ' id="lupdated" value="', '"'),
+ ))[0]
+ data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
+ data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
+ data["rating"] = (data["rating"] or "?")[0].lower()
+ data["tags"] = " ".join(
+ [tag.replace(" ", "_") for tag in data["tags"].split(", ")])
+ if self.extags:
+ self.extended_tags(data, page)
+ return data
+
+
+class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
+ """Extractor for images from gelbooru.com based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
+ r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+ test = (
+ ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
+ "count": 5,
+ }),
+ ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
+ "options": (("api", False),),
+ "count": 5,
+ }),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ if not self.use_api:
+ self.per_page = 42
+
+ def get_posts(self):
+ url = "https://gelbooru.com/index.php?page=post&s=list"
+ params = {"tags": self.tags, "pid": self.page_start * self.per_page}
+
+ while True:
+ page = self.request(url, params=params).text
+ ids = list(text.extract_iter(page, '<a id="p', '"'))
+ yield from ids
+ if len(ids) < self.per_page:
+ return
+ params["pid"] += self.per_page
+
+
+class GelbooruPoolExtractor(booru.GelbooruPoolMixin, GelbooruExtractor):
+ """Extractor for image-pools from gelbooru.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
+ r"\?page=pool&s=show&id=(?P<pool>\d+)")
+ test = ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
+ "count": 6,
+ })
+
+ def get_posts(self):
+ return util.advance(self.posts, self.page_start)
+
+
+class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
+ """Extractor for single images from gelbooru.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
+ r"\?page=post&s=view&id=(?P<post>\d+)")
+ test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ "count": 1,
+ })
+
+ def get_posts(self):
+ return (self.post,)
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
new file mode 100644
index 0000000..1dcb3c8
--- /dev/null
+++ b/gallery_dl/extractor/gfycat.py
@@ -0,0 +1,83 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://gfycat.com/"""
+
+from .common import Extractor, Message
+
+
+class GfycatExtractor(Extractor):
+ """Base class for gfycat extractors"""
+ category = "gfycat"
+ filename_fmt = "{category}_{gfyName}.{extension}"
+ archive_fmt = "{gfyName}"
+ root = "https://gfycat.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.formats = (self.config("format", "mp4"), "mp4", "webm", "gif")
+
+ def _select_format(self, gfyitem):
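+ # try the user-configured format first, then fall back to mp4, webm, gif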
+ for fmt in self.formats:
+ key = fmt + "Url"
+ if key in gfyitem:
+ url = gfyitem[key]
+ gfyitem["extension"] = url.rpartition(".")[2]
+ return url
+ return ""
+
+ def _get_info(self, gfycat_id):
+ url = "https://api.gfycat.com/v1/gfycats/" + gfycat_id
+ return self.request(url).json()["gfyItem"]
+
+
+class GfycatImageExtractor(GfycatExtractor):
+ """Extractor for individual images from gfycat.com"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:\w+\.)?gfycat\.com"
+ r"/(?:gifs/detail/|\w+/)?([A-Za-z]+)")
+ test = (
+ ("https://gfycat.com/GrayGenerousCowrie", {
+ "url": "e0b5e1d7223108249b15c3c7898dd358dbfae045",
+ "content": "5786028e04b155baa20b87c5f4f77453cd5edc37",
+ "keyword": {
+ "gfyId": "graygenerouscowrie",
+ "gfyName": "GrayGenerousCowrie",
+ "gfyNumber": "755075459",
+ "title": "Bottom's up",
+ "userName": "jackson3oh3",
+ "createDate": 1495884169,
+ "md5": "a4796e05b0db9ba9ce5140145cd318aa",
+ "width": 400,
+ "height": 224,
+ "frameRate": 23,
+ "numFrames": 158,
+ "views": int,
+ },
+ }),
+ (("https://thumbs.gfycat.com/SillyLameIsabellinewheatear"
+ "-size_restricted.gif"), {
+ "url": "13b32e6cc169d086577d7dd3fd36ee6cdbc02726",
+ }),
+ ("https://gfycat.com/detail/UnequaledHastyAnkole?tagname=aww", {
+ "url": "e24c9f69897fd223343782425a429c5cab6a768e",
+ }),
+ ("https://gfycat.com/gifs/detail/UnequaledHastyAnkole"),
+ ("https://gfycat.com/ifr/UnequaledHastyAnkole"),
+ ("https://gfycat.com/ru/UnequaledHastyAnkole"),
+ )
+
+ def __init__(self, match):
+ GfycatExtractor.__init__(self, match)
+ self.gfycat_id = match.group(1)
+
+ def items(self):
+ gfyitem = self._get_info(self.gfycat_id)
+ yield Message.Version, 1
+ yield Message.Directory, gfyitem
+ yield Message.Url, self._select_format(gfyitem), gfyitem
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
new file mode 100644
index 0000000..01793dc
--- /dev/null
+++ b/gallery_dl/extractor/hbrowse.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.hbrowse.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, exception
+import json
+
+
+class HbrowseBase():
+ """Base class for hbrowse extractors"""
+ category = "hbrowse"
+ root = "https://www.hbrowse.com"
+
+ def parse_page(self, page, data):
+ """Parse metadata on 'page' and add it to 'data'"""
+ data, pos = text.extract_all(page, (
+ ('manga' , '<td class="listLong">', '</td>'),
+ ('artist', '<td class="listLong">', '</td>'),
+ ('total' , '<td class="listLong">', ' '),
+ ('origin', '<td class="listLong">', '</td>'),
+ ), values=data)
+
+ if not data["manga"] and "<b>Warning</b>" in page:
+ msg = page.rpartition(">")[2].strip()
+ self.log.error("Site is not accessible: '%s'", msg)
+ raise exception.StopExtraction()
+
+ tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]
+
+ data["manga"] = text.unescape(data["manga"])
+ data["total"] = text.parse_int(data["total"])
+ data["artist"] = text.remove_html(data["artist"])
+ data["origin"] = text.remove_html(data["origin"])
+ data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
+ return data
+
+
+class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
+ """Extractor for manga-chapters from hbrowse.com"""
+ directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
+ filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
+ "{page:>03}.{extension}")
+ archive_fmt = "{manga_id}_{chapter}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
+ test = ("https://www.hbrowse.com/10363/c00000", {
+ "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6",
+ "keyword": "274996f6c809e5250b6ff3abbc5147e29f89d9a5",
+ "content": "44578ebbe176c2c27434966aef22945787e2781e",
+ })
+
+ def __init__(self, match):
+ self.path, self.gid, self.chapter = match.groups()
+ self.path += "/"
+ ChapterExtractor.__init__(self, match)
+
+ def metadata(self, page):
+ return self.parse_page(page, {
+ "manga_id": text.parse_int(self.gid),
+ "chapter": text.parse_int(self.chapter)
+ })
+
+ def images(self, page):
+ base = self.root + "/data" + self.path
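+ # the page embeds the image list as a JS array terminated by a "zzz"
+ # sentinel entry; cut before it and close the JSON array by hand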
+ json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
+ return [(base + name, None) for name in json.loads(json_data)]
+
+
+class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
+ """Extractor for manga from hbrowse.com"""
+ chapterclass = HbrowseChapterExtractor
+ reverse = False
+ pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
+ test = ("https://www.hbrowse.com/10363", {
+ "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6",
+ "keyword": "4b15fda1858a69de1fbf5afddfe47dd893397312",
+ })
+
+ def chapters(self, page):
+ results = []
+ data = self.parse_page(page, {
+ "manga_id": text.parse_int(
+ self.manga_url.rstrip("/").rpartition("/")[2])
+ })
+
+ pos = 0
+ needle = '<td class="listMiddle">\n<a class="listLink" href="'
+ while True:
+ url, pos = text.extract(page, needle, '"', pos)
+ if not url:
+ return results
+ title, pos = text.extract(page, '>View ', '<', pos)
+ data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
+ data["title"] = title
+ results.append((text.urljoin(self.root, url), data.copy()))
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
new file mode 100644
index 0000000..354acbf
--- /dev/null
+++ b/gallery_dl/extractor/hentai2read.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract hentai-manga from https://hentai2read.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import json
+import re
+
+
+class Hentai2readBase():
+ """Base class for hentai2read extractors"""
+ category = "hentai2read"
+ root = "https://hentai2read.com"
+
+
+class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
+ """Extractor for a single manga chapter from hentai2read.com"""
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+/(\d+))"
+ test = ("https://hentai2read.com/amazon_elixir/1/", {
+ "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
+ "keyword": "ff84b8f751f0e4ee37717efc4332ff1db71951d9",
+ })
+
+ def __init__(self, match):
+ self.chapter = match.group(2)
+ ChapterExtractor.__init__(self, match)
+
+ def metadata(self, page):
+ title, pos = text.extract(page, "<title>", "</title>")
+ manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
+ chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
+ match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
+ r"(\d+): (.+) . Page 1 ", title)
+ return {
+ "manga": match.group(1),
+ "manga_id": text.parse_int(manga_id),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_id": text.parse_int(chapter_id),
+ "type": match.group(2),
+ "author": match.group(3),
+ "title": match.group(5),
+ "lang": "en",
+ "language": "English",
+ }
+
+ @staticmethod
+ def images(page):
+ images = text.extract(page, "'images' : ", ",\n")[0]
+ return [
+ ("https://hentaicdn.com/hentai" + part, None)
+ for part in json.loads(images)
+ ]
+
+
+class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor):
+ """Extractor for hmanga from hentai2read.com"""
+ chapterclass = Hentai2readChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+)/?$"
+ test = (
+ ("https://hentai2read.com/amazon_elixir/", {
+ "url": "273073752d418ec887d7f7211e42b832e8c403ba",
+ "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac",
+ }),
+ ("https://hentai2read.com/oshikage_riot/", {
+ "url": "6595f920a3088a15c2819c502862d45f8eb6bea6",
+ "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36",
+ }),
+ )
+
+ def chapters(self, page):
+ results = []
+ manga, pos = text.extract(
+ page, '<span itemprop="name">', '</span>')
+ mtype, pos = text.extract(
+ page, '<small class="text-danger">[', ']</small>', pos)
+ manga_id = text.parse_int(text.extract(
+ page, 'data-mid="', '"', pos)[0])
+
+ while True:
+ chapter_id, pos = text.extract(page, ' data-cid="', '"', pos)
+ if not chapter_id:
+ return results
+ _ , pos = text.extract(page, ' href="', '"', pos)
+ url, pos = text.extract(page, ' href="', '"', pos)
+ chapter, pos = text.extract(page, '>', '<', pos)
+
+ chapter, _, title = text.unescape(chapter).strip().partition(" - ")
+ results.append((url, {
+ "manga_id": manga_id, "manga": manga, "type": mtype,
+ "chapter_id": text.parse_int(chapter_id),
+ "chapter": text.parse_int(chapter),
+ "title": title, "lang": "en", "language": "English",
+ }))
diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py
new file mode 100644
index 0000000..e95467b
--- /dev/null
+++ b/gallery_dl/extractor/hentaicafe.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentai.cafe/"""
+
+from . import foolslide
+from .. import text
+from ..cache import memcache
+import re
+
+
+class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
+ """Extractor for manga-chapters from hentai.cafe"""
+ category = "hentaicafe"
+ directory_fmt = ("{category}", "{manga}")
+ pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe"
+ r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
+ test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
+ "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
+ "keyword": "6913608267d883c82b887303b9ced13821188329",
+ })
+ root = "https://hentai.cafe"
+
+ def metadata(self, page):
+ info = text.unescape(text.extract(page, '<title>', '</title>')[0])
+ manga, _, chapter_string = info.partition(" :: ")
+
+ data = self._data(self.chapter_url.split("/")[5])
+ data["manga"] = manga
+ data["chapter_string"] = chapter_string.rstrip(" :")
+ return self.parse_chapter_url(self.chapter_url, data)
+
+ @memcache(keyarg=1)
+ def _data(self, manga):
+ return {"artist": [], "tags": []}
+
+
+class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
+ """Extractor for manga from hentai.cafe"""
+ category = "hentaicafe"
+ pattern = (r"(?:https?://)?" + r"(?:www\.)?hentai\.cafe"
+ r"((?:/manga/series)?/[^/?&#]+)/?$")
+ test = (
+ # single chapter
+ ("https://hentai.cafe/hazuki-yuuto-summer-blues/", {
+ "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b",
+ "keyword": "eb9f98544098c961bd8cf5dbe69e6da51c4fb2f6",
+ }),
+ # multi-chapter
+ ("https://hentai.cafe/saitom-saitom-box/", {
+ "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
+ "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb",
+ }),
+ # foolslide URL
+ ("https://hentai.cafe/manga/series/saitom-box/", {
+ "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
+ "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c",
+ }),
+ )
+ root = "https://hentai.cafe"
+ reverse = False
+ chapterclass = HentaicafeChapterExtractor
+
+ def chapters(self, page):
+ if "/manga/series/" in self.manga_url:
+ chapters = foolslide.FoolslideMangaExtractor.chapters(self, page)
+ chapters.reverse()
+ return chapters
+
+ tags , pos = text.extract(page, "<p>Tags: ", "</br>")
+ artist, pos = text.extract(page, "\nArtists: ", "</br>", pos)
+ manga , pos = text.extract(page, "/manga/read/", "/", pos)
+ data = {
+ "tags" : text.split_html(tags)[::2],
+ "artist": text.split_html(artist),
+ }
+ HentaicafeChapterExtractor._data(manga).update(data)
+
+ return [
+ (url, data)
+ for url in re.findall(
+ r'<a +class="x-btn[^"]*" +href="([^"]+)"', page)
+ ]
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
new file mode 100644
index 0000000..d31f66f
--- /dev/null
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -0,0 +1,264 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.hentai-foundry.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+
+class HentaifoundryExtractor(Extractor):
+ """Base class for hentaifoundry extractors"""
+ category = "hentaifoundry"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{category}_{index}_{title}.{extension}"
+ archive_fmt = "{index}"
+ root = "https://www.hentai-foundry.com"
+ per_page = 25
+
+ def __init__(self, match, user="", page=1):
+ Extractor.__init__(self, match)
+ self.page_url = ""
+ self.user = user
+ self.start_post = 0
+ self.start_page = text.parse_int(page, 1)
+
+ def items(self):
+ data = self.get_job_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ self.set_filters()
+ for page_url in util.advance(self.get_image_pages(), self.start_post):
+ url, image = self.get_image_metadata(page_url)
+ image.update(data)
+ yield Message.Url, url, image
+
+ def skip(self, num):
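+ # split the number of posts to skip into full result pages
+ # plus a remainder within the first fetched page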
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+
+ def get_job_metadata(self):
+ """Collect metadata for extractor-job"""
+ self.request(self.root + "/?enterAgree=1")
+ return {"user": self.user}
+
+ def get_image_pages(self):
+ """Yield urls of all relevant image pages"""
+ num = self.start_page
+
+ while True:
+ page = self.request("{}/page/{}".format(self.page_url, num)).text
+ yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
+
+ if 'class="pager"' not in page or 'class="last hidden"' in page:
+ return
+ num += 1
+
+ def get_image_metadata(self, page_url):
+ """Collect url and metadata from an image page"""
+ page = self.request(text.urljoin(self.root, page_url)).text
+ index = page_url.rsplit("/", 2)[1]
+ title , pos = text.extract(page, '<title>', '</title>')
+ _ , pos = text.extract(page, 'id="picBox"', '', pos)
+ width , pos = text.extract(page, 'width="', '"', pos)
+ height, pos = text.extract(page, 'height="', '"', pos)
+ url , pos = text.extract(page, 'src="', '"', pos)
+
+ title, _, artist = title.rpartition(" - ")[0].rpartition(" by ")
+
+ data = text.nameext_from_url(url, {
+ "title": text.unescape(title),
+ "artist": text.unescape(artist),
+ "index": text.parse_int(index),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ })
+ if not data["extension"]:
+ data["extension"] = "jpg"
+ return text.urljoin(self.root, url), data
+
+ def set_filters(self):
+ """Set site-internal filters to show all images"""
+ token = text.unquote(text.extract(
+ self.session.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0])
+ data = {
+ "YII_CSRF_TOKEN": token,
+ "rating_nudity": 3,
+ "rating_violence": 3,
+ "rating_profanity": 3,
+ "rating_racism": 3,
+ "rating_sex": 3,
+ "rating_spoilers": 3,
+ "rating_yaoi": 1,
+ "rating_yuri": 1,
+ "rating_teen": 1,
+ "rating_guro": 1,
+ "rating_furry": 1,
+ "rating_beast": 1,
+ "rating_male": 1,
+ "rating_female": 1,
+ "rating_futa": 1,
+ "rating_other": 1,
+ "rating_scat": 1,
+ "rating_incest": 1,
+ "rating_rape": 1,
+ "filter_media": "A",
+ "filter_order": "date_new",
+ "filter_type": 0,
+ }
+ url = self.root + "/site/filters"
+ self.request(url, method="POST", data=data)
+
+
+class HentaifoundryUserExtractor(HentaifoundryExtractor):
+ """Extractor for all images of a hentai-foundry-user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/(?:pictures/user/([^/]+)(?:/page/(\d+))?/?$"
+ r"|user/([^/]+)/profile)")
+ test = (
+ ("https://www.hentai-foundry.com/pictures/user/Tenpura", {
+ "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28",
+ "keyword": "63ad576f87f82fa166ca4676761762f7f8496cf5",
+ }),
+ ("https://www.hentai-foundry.com/pictures/user/Tenpura/page/3"),
+ ("https://www.hentai-foundry.com/user/Tenpura/profile"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(
+ self, match, match.group(1) or match.group(3), match.group(2))
+ self.page_url = "{}/pictures/user/{}".format(self.root, self.user)
+
+ def get_job_metadata(self):
+ page = self.request(self.page_url + "?enterAgree=1").text
+ count = text.extract(page, ">Pictures (", ")")[0]
+ return {"user": self.user, "count": text.parse_int(count)}
+
+
+class HentaifoundryScrapsExtractor(HentaifoundryExtractor):
+ """Extractor for scrap images of a hentai-foundry-user"""
+ subcategory = "scraps"
+ directory_fmt = ("{category}", "{user}", "Scraps")
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/pictures/user/([^/]+)/scraps(?:/page/(\d+))?")
+ test = (
+ ("https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", {
+ "url": "00a11e30b73ff2b00a1fba0014f08d49da0a68ec",
+ "keyword": "410c6c900cfd23a8dd1e53dfcc97a79ea68c3359",
+ }),
+ ("https://www.hentai-foundry.com"
+ "/pictures/user/Evulchibi/scraps/page/3"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(
+ self, match, match.group(1), match.group(2))
+ self.page_url = "{}/pictures/user/{}/scraps".format(
+ self.root, self.user)
+
+ def get_job_metadata(self):
+ page = self.request(self.page_url + "?enterAgree=1").text
+ count = text.extract(page, ">Scraps (", ")")[0]
+ return {"user": self.user, "count": text.parse_int(count)}
+
+
+class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
+ """Extractor for favorite images of a hentai-foundry-user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{user}", "Favorites")
+ archive_fmt = "f_{user}_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/user/([^/]+)/faves/pictures(?:/page/(\d+))?")
+ test = (
+ ("https://www.hentai-foundry.com/user/Tenpura/faves/pictures", {
+ "url": "56f9ae2e89fe855e9fe1da9b81e5ec6212b0320b",
+ "keyword": "2b9478725e66d46ea043fa87476bbd28546958e7",
+ }),
+ ("https://www.hentai-foundry.com"
+ "/user/Tenpura/faves/pictures/page/3"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(
+ self, match, match.group(1), match.group(2))
+ self.page_url = "{}/user/{}/faves/pictures".format(
+ self.root, self.user)
+
+
+class HentaifoundryRecentExtractor(HentaifoundryExtractor):
+ """Extractor for 'Recent Pictures' on hentaifoundry.com"""
+ subcategory = "recent"
+ directory_fmt = ("{category}", "Recent Pictures", "{date}")
+ archive_fmt = "r_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/pictures/recent/(\d+-\d+-\d+)(?:/page/(\d+))?")
+ test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20",)
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(self, match, "", match.group(2))
+ self.date = match.group(1)
+ self.page_url = "{}/pictures/recent/{}".format(self.root, self.date)
+
+ def get_job_metadata(self):
+ self.request(self.root + "/?enterAgree=1")
+ return {"date": self.date}
+
+
+class HentaifoundryPopularExtractor(HentaifoundryExtractor):
+ """Extractor for popular images on hentaifoundry.com"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "Popular Pictures")
+ archive_fmt = "p_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+ r"/pictures/popular(?:/page/(\d+))?")
+ test = ("http://www.hentai-foundry.com/pictures/popular",)
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(self, match, "", match.group(1))
+ self.page_url = self.root + "/pictures/popular"
+
+
+class HentaifoundryImageExtractor(HentaifoundryExtractor):
+ """Extractor for a single image from hentaifoundry.com"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com"
+ r"/(?:pictures/user|[^/])/([^/]+)/(\d+)")
+ test = (
+ (("https://www.hentai-foundry.com"
+ "/pictures/user/Tenpura/407501/shimakaze"), {
+ "url": "fbf2fd74906738094e2575d2728e8dc3de18a8a3",
+ "keyword": "cbb9381e6c2acce58db4adf4efc0ad7d138bddc4",
+ "content": "91bf01497c39254b6dfb234a18e8f01629c77fd1",
+ }),
+ ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/", {
+ "exception": exception.HttpError,
+ }),
+ ("https://pictures.hentai-foundry.com"
+ "/t/Tenpura/407501/Tenpura-407501-shimakaze.png"),
+ )
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(self, match, match.group(1))
+ self.index = match.group(2)
+
+ def items(self):
+ post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format(
+ self.root, self.user, self.index)
+ url, data = self.get_image_metadata(post_url)
+ data["user"] = self.user
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+ def skip(self, _):
+ return 0
diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py
new file mode 100644
index 0000000..cf4871f
--- /dev/null
+++ b/gallery_dl/extractor/hentaifox.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentaifox.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text
+
+
+class HentaifoxBase():
+ """Base class for hentaifox extractors"""
+ category = "hentaifox"
+ root = "https://hentaifox.com"
+
+
+class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
+ """Extractor for image galleries on hentaifox.com"""
+ pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
+ test = ("https://hentaifox.com/gallery/56622/", {
+ "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
+ "count": 24,
+ "keyword": "38f8517605feb6854d48833297da6b05c6541b69",
+ })
+
+ def __init__(self, match):
+ GalleryExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+
+ def metadata(self, page, split=text.split_html):
+ extr = text.extract_from(page)
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : text.unescape(extr("<h1>", "</h1>")),
+ "parody" : split(extr(">Parodies:" , "</a></span>"))[::2],
+ "characters": split(extr(">Characters:", "</a></span>"))[::2],
+ "tags" : split(extr(">Tags:" , "</a></span>"))[::2],
+ "artist" : split(extr(">Artists:" , "</a></span>"))[::2],
+ "group" : split(extr(">Groups:" , "</a></span>"))[::2],
+ "type" : text.remove_html(extr(">Category:", "</a></span>")),
+ "language" : "English",
+ "lang" : "en",
+ }
+
+ def images(self, page):
+ return [
+ (text.urljoin(self.root, url.replace("t.", ".")), None)
+ for url in text.extract_iter(page, 'data-src="', '"')
+ ]
+
+
+class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
+ """Extractor for search results and listings on hentaifox.com"""
+ subcategory = "search"
+ pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
+ r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)")
+ test = (
+ ("https://hentaifox.com/parody/touhou-project/"),
+ ("https://hentaifox.com/character/reimu-hakurei/"),
+ ("https://hentaifox.com/artist/distance/"),
+ ("https://hentaifox.com/search/touhou/"),
+ ("https://hentaifox.com/tag/full-colour/", {
+ "pattern": HentaifoxGalleryExtractor.pattern,
+ "count": ">= 40",
+ "keyword": {
+ "url": str,
+ "gallery_id": int,
+ "thumbnail": r"re:https://i\d*.hentaifox.com/\d+/\d+/thumb\.",
+ "title": str,
+ "tags": list,
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for gallery in self.galleries():
+ yield Message.Queue, gallery["url"], gallery
+
+ def galleries(self):
+ url = "{}/{}/".format(self.root, self.path)
+
+ while True:
+ page = self.request(url).text
+ info, gpos = text.extract(
+ page, 'class="galleries_overview">', 'class="clear">')
+
+ for ginfo in text.extract_iter(info, '<div class="item', '</a>'):
+ tags , pos = text.extract(ginfo, '', '"')
+ url , pos = text.extract(ginfo, 'href="', '"', pos)
+ title, pos = text.extract(ginfo, 'alt="', '"', pos)
+ thumb, pos = text.extract(ginfo, 'src="', '"', pos)
+
+ yield {
+ "url": text.urljoin(self.root, url),
+ "gallery_id": text.parse_int(
+ url.strip("/").rpartition("/")[2]),
+ "thumbnail": text.urljoin(self.root, thumb),
+ "title": text.unescape(title),
+ "tags": tags.split(),
+ "_extractor": HentaifoxGalleryExtractor,
+ }
+
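+ # follow the link right after the 'class="current"' page marker;
+ # stop if there is no marker or the link is not another listing page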
+ pos = page.find('class="current"', gpos)
+ url = text.extract(page, 'href="', '"', pos)[0]
+ if pos == -1 or "/pag" not in url:
+ return
+ url = text.urljoin(self.root, url)
diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py
new file mode 100644
index 0000000..8083a9b
--- /dev/null
+++ b/gallery_dl/extractor/hentaihere.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract hentai-manga from https://hentaihere.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import json
+import re
+
+
+class HentaihereBase():
+ """Base class for hentaihere extractors"""
+ category = "hentaihere"
+ root = "https://hentaihere.com"
+
+
+class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
+ """Extractor for a single manga chapter from hentaihere.com"""
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)"
+ test = ("https://hentaihere.com/m/S13812/1/1/", {
+ "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
+ "keyword": "cbcee0c0eb178c4b87f06a834085784f8dddad24",
+ })
+
+ def __init__(self, match):
+ self.manga_id, self.chapter = match.groups()
+ url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ title = text.extract(page, "<title>", "</title>")[0]
+ chapter_id = text.extract(page, 'report/C', '"')[0]
+ pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
+ match = re.match(pattern, title)
+ return {
+ "manga": match.group(1),
+ "manga_id": text.parse_int(self.manga_id),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_id": text.parse_int(chapter_id),
+ "type": match.group(2),
+ "title": match.group(3),
+ "author": match.group(4),
+ "lang": "en",
+ "language": "English",
+ }
+
+ @staticmethod
+ def images(page):
+ images = text.extract(page, "var rff_imageList = ", ";")[0]
+ return [
+ ("https://hentaicdn.com/hentai" + part, None)
+ for part in json.loads(images)
+ ]
+
+
+class HentaihereMangaExtractor(HentaihereBase, MangaExtractor):
+ """Extractor for hmanga from hentaihere.com"""
+ chapterclass = HentaihereChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com(/m/S\d+)/?$"
+ test = (
+ ("https://hentaihere.com/m/S13812", {
+ "url": "d1ba6e28bb2162e844f8559c2b2725ba0a093559",
+ "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac",
+ }),
+ ("https://hentaihere.com/m/S7608", {
+ "url": "6c5239758dc93f6b1b4175922836c10391b174f7",
+ "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36",
+ }),
+ )
+
+ def chapters(self, page):
+ results = []
+ manga_id = text.parse_int(
+ self.manga_url.rstrip("/").rpartition("/")[2][1:])
+ manga, pos = text.extract(
+ page, '<span itemprop="name">', '</span>')
+ mtype, pos = text.extract(
+ page, '<span class="mngType text-danger">[', ']</span>', pos)
+
+ while True:
+ marker, pos = text.extract(
+ page, '<li class="sub-chp clearfix">', '', pos)
+ if marker is None:
+ return results
+ url, pos = text.extract(page, '<a href="', '"', pos)
+ chapter, pos = text.extract(page, 'title="Tagged: -">\n', '<', pos)
+ chapter_id, pos = text.extract(page, '/C', '"', pos)
+ chapter, _, title = text.unescape(chapter).strip().partition(" - ")
+ results.append((url, {
+ "manga_id": manga_id, "manga": manga, "type": mtype,
+ "chapter_id": text.parse_int(chapter_id),
+ "chapter": text.parse_int(chapter),
+ "title": title, "lang": "en", "language": "English",
+ }))
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
new file mode 100644
index 0000000..d875817
--- /dev/null
+++ b/gallery_dl/extractor/hentainexus.py
@@ -0,0 +1,96 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentainexus.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import json
+
+
+class HentainexusGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries on hentainexus.com"""
+ category = "hentainexus"
+ root = "https://hentainexus.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+ r"/(?:view|read)/(\d+)")
+ test = (
+ ("https://hentainexus.com/view/5688", {
+ "url": "746d0043e20030f1171aae5ea113176607302517",
+ "keyword": "b05986369fbaf29cfa08b118960d92c49e59524b",
+ }),
+ ("https://hentainexus.com/read/5688"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/view/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ rmve = text.remove_html
+ extr = text.extract_from(page)
+ data = {
+ "gallery_id" : text.parse_int(self.gallery_id),
+ "tags" : extr('"og:description" content="', '"').split(", "),
+ "thumbnail" : extr('"og:image" content="', '"'),
+ "title" : extr('<h1 class="title">', '</h1>'),
+ "artist" : rmve(extr('viewcolumn">Artist</td>' , '</td>')),
+ "book" : rmve(extr('viewcolumn">Book</td>' , '</td>')),
+ "language" : rmve(extr('viewcolumn">Language</td>' , '</td>')),
+ "magazine" : rmve(extr('viewcolumn">Magazine</td>' , '</td>')),
+ "parody" : rmve(extr('viewcolumn">Parody</td>' , '</td>')),
+ "publisher" : rmve(extr('viewcolumn">Publisher</td>' , '</td>')),
+ "description": rmve(extr('viewcolumn">Description</td>', '</td>')),
+ }
+ data["lang"] = util.language_to_code(data["language"])
+ return data
+
+ def images(self, page):
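+ # the /read/ page passes the image URL list to initReader();
+ # extract it and re-append the closing bracket cut off by the marker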
+ url = "{}/read/{}".format(self.root, self.gallery_id)
+ extr = text.extract_from(self.request(url).text)
+ urls = extr("initReader(", "]") + "]"
+ return [(url, None) for url in json.loads(urls)]
+
+
+class HentainexusSearchExtractor(Extractor):
+ """Extractor for search results on hentainexus.com"""
+ category = "hentainexus"
+ subcategory = "search"
+ root = "https://hentainexus.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
+ r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$")
+ test = (
+ ("https://hentainexus.com/?q=tag:%22heart+pupils%22%20tag:group", {
+ "pattern": HentainexusGalleryExtractor.pattern,
+ "count": ">= 50",
+ }),
+ ("https://hentainexus.com/page/3?q=tag:%22heart+pupils%22"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ params = self.params
+ path = "/"
+
+ while path:
+ page = self.request(self.root + path, params=params).text
+ extr = text.extract_from(page)
+ data = {"_extractor": HentainexusGalleryExtractor}
+
+ while True:
+ gallery_id = extr('<a href="/view/', '"')
+ if not gallery_id:
+ break
+ yield Message.Queue, self.root + "/view/" + gallery_id, data
+
+ path = extr('class="pagination-next" href="', '"')
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
new file mode 100644
index 0000000..c112465
--- /dev/null
+++ b/gallery_dl/extractor/hitomi.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://hitomi.la/"""
+
+from .common import GalleryExtractor
+from .. import text, util
+import string
+
+
+class HitomiGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from hitomi.la"""
+ category = "hitomi"
+ root = "https://hitomi.la"
+ pattern = r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)"
+ test = (
+ ("https://hitomi.la/galleries/867789.html", {
+ "url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
+ "keyword": "067b5d9b9c0f98530cd5dd2444e0f5a5b4b00d38",
+ }),
+ ("https://hitomi.la/galleries/1036181.html", {
+ # "aa" subdomain for gallery-id ending in 1 (#142)
+ "pattern": r"https://aa\.hitomi\.la/",
+ }),
+ ("https://hitomi.la/galleries/1401410.html", {
+ # download test
+ "range": "1",
+ "content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
+ }),
+ ("https://hitomi.la/galleries/733697.html", {
+ # Game CG with scenes (#321)
+ "url": "c2a84185f467450b8b9b72fbe40c0649029ce007",
+ "count": 210,
+ }),
+ ("https://hitomi.la/reader/867789.html"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = text.parse_int(match.group(1))
+ url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
+ data = {
+ "gallery_id": self.gallery_id,
+ "title" : text.unescape(extr('.html">', '<').strip()),
+ "artist" : self._prep(extr('<h2>', '</h2>')),
+ "group" : self._prep(extr('<td>Group</td><td>', '</td>')),
+ "type" : self._prep_1(extr('<td>Type</td><td>', '</td>')),
+ "language" : self._prep_1(extr('<td>Language</td><td>', '</td>')),
+ "parody" : self._prep(extr('<td>Series</td><td>', '</td>')),
+ "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
+ "tags" : self._prep(extr('<td>Tags</td><td>', '</td>')),
+ "date" : self._date(extr('<span class="date">', '</span>')),
+ }
+ if data["language"] == "N/a":
+ data["language"] = None
+ data["lang"] = util.language_to_code(data["language"])
+ return data
+
+ def images(self, page):
+ # see https://ltn.hitomi.la/common.js
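+ # galleries whose ID ends in 1 always use subdomain "aa";
+ # otherwise alternate between "aa" and "ba" based on the ID's parity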
+ offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
+ subdomain = chr(97 + offset) + "a"
+ base = "https://" + subdomain + ".hitomi.la/galleries/"
+
+ # set Referer header before image downloads (#239)
+ self.session.headers["Referer"] = self.chapter_url
+
+ # handle Game CG galleries with scenes (#321)
+ scenes = text.extract(page, "var scene_indexes = [", "]")[0]
+ if scenes and scenes.strip():
+ url = "{}/reader/{}.html".format(self.root, self.gallery_id)
+ page = self.request(url).text
+ begin, end = ">//g.hitomi.la/galleries/", "</div>"
+ else:
+ begin, end = "'//tn.hitomi.la/smalltn/", ".jpg',"
+
+ return [
+ (base + urlpart, None)
+ for urlpart in text.extract_iter(page, begin, end)
+ ]
+
+ @staticmethod
+ def _prep(value):
+ return [
+ text.unescape(string.capwords(v))
+ for v in text.extract_iter(value or "", '.html">', '<')
+ ]
+
+ @staticmethod
+ def _prep_1(value):
+ return text.remove_html(value).capitalize()
+
+ @staticmethod
+ def _date(value):
+ return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")
diff --git a/gallery_dl/extractor/hypnohub.py b/gallery_dl/extractor/hypnohub.py
new file mode 100644
index 0000000..bf2db96
--- /dev/null
+++ b/gallery_dl/extractor/hypnohub.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hypnohub.net/"""
+
+from . import booru
+
+
+class HypnohubExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for hypnohub extractors"""
+ category = "hypnohub"
+ api_url = "https://hypnohub.net/post.json"
+ post_url = "https://hypnohub.net/post/show/{}"
+
+
+class HypnohubTagExtractor(booru.TagMixin, HypnohubExtractor):
+ """Extractor for images from hypnohub.net based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net"
+ r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
+ test = ("https://hypnohub.net/post?tags=gonoike_biwa", {
+ "url": "6bebc4318489ee37e0c3b814352acd6783ba95d6",
+ })
+
+
+class HypnohubPoolExtractor(booru.PoolMixin, HypnohubExtractor):
+ """Extractor for image-pools from hypnohub.net"""
+ pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/pool/show/(?P<pool>\d+)"
+ test = ("https://hypnohub.net/pool/show/61", {
+ "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
+ })
+
+
+class HypnohubPostExtractor(booru.PostMixin, HypnohubExtractor):
+ """Extractor for single images from hypnohub.net"""
+ pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/post/show/(?P<post>\d+)"
+ test = ("https://hypnohub.net/post/show/73964", {
+ "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "gonoike_biwa icontrol_(manipper)",
+ "tags_character": "komaru_naegi",
+ "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode",
+ "tags_general": str,
+ },
+ })
+
+
+class HypnohubPopularExtractor(booru.MoebooruPopularMixin, HypnohubExtractor):
+ """Extractor for popular images from hypnohub.net"""
+ pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = (
+ ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
+ "count": 20,
+ }),
+ ("https://hypnohub.net/post/popular_recent"),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "https://hypnohub.net/post/popular_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
new file mode 100644
index 0000000..dcb4a54
--- /dev/null
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -0,0 +1,59 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://idol.sankakucomplex.com/"""
+
+from . import sankaku
+
+
+class IdolcomplexExtractor(sankaku.SankakuExtractor):
+ """Base class for idolcomplex extractors"""
+ category = "idolcomplex"
+ cookiedomain = "idol.sankakucomplex.com"
+ subdomain = "idol"
+
+
+class IdolcomplexTagExtractor(IdolcomplexExtractor,
+ sankaku.SankakuTagExtractor):
+ """Extractor for images from idol.sankakucomplex.com by search-tags"""
+ pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
+ test = (
+ ("https://idol.sankakucomplex.com/?tags=lyumos+wreath", {
+ "count": ">= 6",
+ "pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
+ r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
+ }),
+ ("https://idol.sankakucomplex.com"
+ "/?tags=lyumos+wreath&page=3&next=694215"),
+ )
+
+
+class IdolcomplexPoolExtractor(IdolcomplexExtractor,
+ sankaku.SankakuPoolExtractor):
+ """Extractor for image-pools from idol.sankakucomplex.com"""
+ pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)"
+ test = ("https://idol.sankakucomplex.com/pool/show/145", {
+ "count": 3,
+ })
+
+
+class IdolcomplexPostExtractor(IdolcomplexExtractor,
+ sankaku.SankakuPostExtractor):
+ """Extractor for single images from idol.sankakucomplex.com"""
+ pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)"
+ test = ("https://idol.sankakucomplex.com/post/show/694215", {
+ "content": "694ec2491240787d75bf5d0c75d0082b53a85afd",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_character": "shani_(the_witcher)",
+ "tags_copyright": "the_witcher",
+ "tags_idol": str,
+ "tags_medium": str,
+ "tags_general": str,
+ },
+ })
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
new file mode 100644
index 0000000..6980185
--- /dev/null
+++ b/gallery_dl/extractor/imagebam.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from http://www.imagebam.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+class ImagebamExtractor(Extractor):
+ """Base class for imagebam extractors"""
+ category = "imagebam"
+ root = "http://www.imagebam.com"
+
+ def get_image_data(self, page_url, data):
+ """Fill 'data' and return image URL"""
+ page = self.request(page_url).text
+ image_url = text.extract(page, 'property="og:image" content="', '"')[0]
+ data["extension"] = image_url.rpartition(".")[2]
+ data["image_key"] = page_url.rpartition("/")[2]
+ data["image_id"] = data["image_key"][6:]
+ return image_url
+
+ def request_page(self, url):
+ """Retrive the main part of a gallery page"""
+ page = self.request(text.urljoin(self.root, url)).text
+ return text.extract(page, "<fieldset>", "</fieldset>")[0]
+
+
+class ImagebamGalleryExtractor(ImagebamExtractor):
+ """Extractor for image galleries from imagebam.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{title} - {gallery_key}")
+ filename_fmt = "{num:>03}-{image_key}.{extension}"
+ archive_fmt = "{gallery_key}_{image_key}"
+ pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"
+ test = (
+ ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", {
+ "url": "fb01925129a1ff1941762eaa3a2783a66de6847f",
+ "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a",
+ "content": "596e6bfa157f2c7169805d50075c2986549973a8",
+ }),
+ ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", {
+ # more than 100 images; see issue #219
+ "count": 107,
+ "url": "f92ce5b17676b6ea69288f0aef26f4cdbea7fd8d",
+ }),
+ ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ ImagebamExtractor.__init__(self, match)
+ self.gallery_key = match.group(1)
+
+ def items(self):
+ url = "{}/gallery/{}".format(self.root, self.gallery_key)
+ page = self.request_page(url)
+ if not page or ">Error<" in page:
+ raise exception.NotFoundError("gallery")
+
+ data = self.get_metadata(page)
+ imgs = self.get_image_pages(page)
+ data["count"] = len(imgs)
+ data["gallery_key"] = self.gallery_key
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], page_url in enumerate(imgs, 1):
+ image_url = self.get_image_data(page_url, data)
+ yield Message.Url, image_url, data
+
+ @staticmethod
+ def get_metadata(page):
+ """Return gallery metadata"""
+ return text.extract_all(page, (
+ ("title" , "'> ", " <span "),
+ (None , "'>", "</span>"),
+ ("description", ":#FCFCFC;'>", "</div>"),
+ ))[0]
+
+ def get_image_pages(self, page):
+ """Return a list of all image pages"""
+ pages = []
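+ # gather image-page links from the current gallery page, then follow
+ # the link next to the "pagination_current" marker until none is left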
+ while True:
+ pages.extend(text.extract_iter(page, "\n<a href='", "'"))
+ pos = page.find('"pagination_current"')
+ if pos > 0:
+ url = text.extract(page, "<a href='", "'", pos)[0]
+ if url:
+ page = self.request_page(url)
+ continue
+ return pages
+
+
+class ImagebamImageExtractor(ImagebamExtractor):
+ """Extractor for single images from imagebam.com"""
+ subcategory = "image"
+ filename_fmt = "{image_key}.{extension}"
+ archive_fmt = "{image_key}"
+ pattern = (r"(?:https?://)?(?:\w+\.)?imagebam\.com"
+ r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)")
+ test = (
+ ("http://www.imagebam.com/image/94d56c502511890", {
+ "url": "b384893c35a01a09c58018db71ddc4cf2480be95",
+ "keyword": "4263d4840007524129792b8587a562b5d20c2687",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ ("http://images3.imagebam.com/1d/8c/44/94d56c502511890.png"),
+ )
+
+ def __init__(self, match):
+ ImagebamExtractor.__init__(self, match)
+ self.image_key = match.group(1)
+
+ def items(self):
+ page_url = "{}/image/{}".format(self.root, self.image_key)
+ data = {}
+ image_url = self.get_image_data(page_url, data)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, image_url, data
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
new file mode 100644
index 0000000..152b631
--- /dev/null
+++ b/gallery_dl/extractor/imagefap.py
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://imagefap.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class ImagefapExtractor(Extractor):
+ """Base class for imagefap extractors"""
+ category = "imagefap"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ filename_fmt = "{category}_{gallery_id}_{filename}.{extension}"
+ archive_fmt = "{gallery_id}_{image_id}"
+ root = "https://www.imagefap.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.session.headers["Referer"] = self.root
+
+
+class ImagefapGalleryExtractor(ImagefapExtractor):
+ """Extractor for image galleries from imagefap.com"""
+ subcategory = "gallery"
+ pattern = (r"(?:https?://)?(?:www\.)?imagefap\.com/"
+ r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)")
+ test = (
+ ("https://www.imagefap.com/pictures/7102714", {
+ "url": "268995eac5d01ddecd0fe58cfa9828390dc85a84",
+ "keyword": "b5bd65ab2ff574ed1639db9a43c7b1b8583c85ef",
+ "content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab",
+ }),
+ ("https://www.imagefap.com/gallery/5486966", {
+ "url": "14906b4f0b8053d1d69bc730a325acb793cbc898",
+ "keyword": "ab90972f3527a2011478fabc621a2c99a541f752",
+ }),
+ ("https://www.imagefap.com/gallery.php?gid=7102714"),
+ )
+
+ def __init__(self, match):
+ ImagefapExtractor.__init__(self, match)
+ self.gid = match.group(1)
+ self.image_id = ""
+
+ def items(self):
+ url = "{}/pictures/{}/".format(self.root, self.gid)
+ page = self.request(url).text
+ data = self.get_job_metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for url, image in self.get_images():
+ data.update(image)
+ yield Message.Url, url, data
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ descr, pos = text.extract(
+ page, '<meta name="description" content="Browse ', '"')
+ count, pos = text.extract(page, ' 1 of ', ' pics"', pos)
+ self.image_id = text.extract(page, 'id="img_ed_', '"', pos)[0]
+
+ title, _, descr = descr.partition(" porn picture gallery by ")
+ uploader, _, tags = descr.partition(" to see hottest ")
+ return {
+ "gallery_id": text.parse_int(self.gid),
+ "title": text.unescape(title),
+ "uploader": uploader,
+ "tags": tags[:-11].split(", "),
+ "count": text.parse_int(count),
+ }
+
+ def get_images(self):
+ """Collect image-urls and -metadata"""
+ num = 0
+ url = "{}/photo/{}/".format(self.root, self.image_id)
+ params = {"gid": self.gid, "idx": 0, "partial": "true"}
+ while True:
+ pos = 0
+ page = self.request(url, params=params).text
+ for _ in range(24):
+ imgurl, pos = text.extract(page, '<a href="', '"', pos)
+ if not imgurl:
+ return
+ num += 1
+ _, imgid, name = imgurl.rsplit("/", 2)
+ data = {"image_id": text.parse_int(imgid), "num": num}
+ yield imgurl, text.nameext_from_url(name, data)
+ params["idx"] += 24
+
+
+class ImagefapImageExtractor(ImagefapExtractor):
+ """Extractor for single images from imagefap.com"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)"
+ test = ("https://www.imagefap.com/photo/1369341772/", {
+ "url": "b31ee405b61ff0450020a1bf11c0581ca9adb471",
+ "keyword": "eadaa8f8012298384996efd21cf1f9e9e0dddb9b",
+ })
+
+ def __init__(self, match):
+ ImagefapExtractor.__init__(self, match)
+ self.image_id = match.group(1)
+
+ def items(self):
+ data = self.get_job_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, data["url"], data
+
+ def get_job_metadata(self):
+ """Collect metadata for extractor-job"""
+ url = "{}/photo/{}/".format(self.root, self.image_id)
+ page = self.request(url).text
+ info = json.loads(text.extract(
+ page, '<script type="application/ld+json">', '</script>')[0])
+ parts = info["contentUrl"].rsplit("/", 3)
+ return text.nameext_from_url(parts[3], {
+ "url": info["contentUrl"],
+ "title": text.unescape(info["name"]),
+ "uploader": info["author"],
+ "date": info["datePublished"],
+ "width": text.parse_int(info["width"]),
+ "height": text.parse_int(info["height"]),
+ "gallery_id": text.parse_int(parts[1]),
+ "image_id": text.parse_int(parts[2]),
+ })
+
+
+class ImagefapUserExtractor(ImagefapExtractor):
+ """Extractor for all galleries from a user at imagefap.com"""
+ subcategory = "user"
+ categorytransfer = True
+ pattern = (r"(?:https?://)?(?:www\.)?imagefap\.com/"
+ r"(?:profile(?:\.php\?user=|/)([^/?&#]+)"
+ r"|usergallery\.php\?userid=(\d+))")
+ test = (
+ ("https://www.imagefap.com/profile/LucyRae/galleries", {
+ "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd",
+ }),
+ ("https://www.imagefap.com/usergallery.php?userid=1862791", {
+ "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd",
+ }),
+ ("https://www.imagefap.com/profile.php?user=LucyRae"),
+ )
+
+ def __init__(self, match):
+ ImagefapExtractor.__init__(self, match)
+ self.user, self.user_id = match.groups()
+
+ def items(self):
+ yield Message.Version, 1
+ for gid, name in self.get_gallery_data():
+ url = "{}/gallery/{}".format(self.root, gid)
+ data = {
+ "gallery_id": text.parse_int(gid),
+ "title": text.unescape(name),
+ "_extractor": ImagefapGalleryExtractor,
+ }
+ yield Message.Queue, url, data
+
+ def get_gallery_data(self):
+ """Yield all gallery_ids of a specific user"""
+ folders = self.get_gallery_folders()
+ url = "{}/ajax_usergallery_folder.php".format(self.root)
+ params = {"userid": self.user_id}
+ for folder_id in folders:
+ params["id"] = folder_id
+ page = self.request(url, params=params).text
+
+ pos = 0
+ while True:
+ gid, pos = text.extract(page, '<a href="/gallery/', '"', pos)
+ if not gid:
+ break
+ name, pos = text.extract(page, "<b>", "<", pos)
+ yield gid, name
+
+ def get_gallery_folders(self):
+ """Create a list of all folder_ids of a specific user"""
+ if self.user:
+ url = "{}/profile/{}/galleries".format(self.root, self.user)
+ else:
+ url = "{}/usergallery.php?userid={}".format(
+ self.root, self.user_id)
+ page = self.request(url).text
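+ # the hidden "tgl_all" input lists all folder IDs joined by "|" with a
+ # trailing separator; [:-1] drops the resulting empty element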
+ self.user_id, pos = text.extract(page, '?userid=', '"')
+ folders, pos = text.extract(page, ' id="tgl_all" value="', '"', pos)
+ return folders.split("|")[:-1]
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
new file mode 100644
index 0000000..954c1f0
--- /dev/null
+++ b/gallery_dl/extractor/imagehosts.py
@@ -0,0 +1,251 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of extractors for various imagehosts"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text, exception
+from ..cache import memcache
+from os.path import splitext
+
+
+class ImagehostImageExtractor(SharedConfigMixin, Extractor):
+ """Base class for single-image extractors for various imagehosts"""
+ basecategory = "imagehost"
+ subcategory = "image"
+ archive_fmt = "{token}"
+ https = False
+ method = "post"
+ params = "simple"
+ cookies = None
+ encoding = None
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_url = "http{}://{}".format(
+ "s" if self.https else "", match.group(1))
+ self.token = match.group(2)
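+ # 'params' selects the form data POSTed to bypass the
+ # "Continue to image" interstitial; any other value means a plain GET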
+ if self.params == "simple":
+ self.params = {
+ "imgContinue": "Continue+to+image+...+",
+ }
+ elif self.params == "complex":
+ self.params = {
+ "op": "view",
+ "id": self.token,
+ "pre": "1",
+ "adb": "1",
+ "next": "Continue+to+image+...+",
+ }
+ else:
+ self.params = {}
+ self.method = "get"
+
+ def items(self):
+ page = self.request(
+ self.page_url,
+ method=self.method,
+ data=self.params,
+ cookies=self.cookies,
+ encoding=self.encoding,
+ ).text
+
+ url, filename = self.get_info(page)
+ data = text.nameext_from_url(filename, {"token": self.token})
+ if self.https and url.startswith("http:"):
+ url = "https:" + url[5:]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+ def get_info(self, page):
+ """Find image-url and string to get filename from"""
+
+
+class ImxtoImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imx.to"""
+ category = "imxto"
+ pattern = (r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)"
+ r"/(?:i/|img-)(\w+)(\.html)?)")
+ test = (
+ ("https://imx.to/i/1qdeva", { # new-style URL
+ "url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130",
+ "keyword": "1153a986c939d7aed599905588f5c940048bc517",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ ("https://imx.to/img-57a2050547b97.html", { # old-style URL
+ "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
+ "keyword": "fd2240aee77a21b8252d5b829a1f7e542f927f09",
+ "content": "54592f2635674c25677c6872db3709d343cdf92f",
+ }),
+ ("https://img.yt/img-57a2050547b97.html", { # img.yt domain
+ "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
+ }),
+ ("https://imx.to/img-57a2050547b98.html", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+ https = True
+ encoding = "utf-8"
+
+ def __init__(self, match):
+ ImagehostImageExtractor.__init__(self, match)
+ if "/img-" in self.page_url:
+ self.page_url = self.page_url.replace("img.yt", "imx.to")
+ self.url_ext = True
+ else:
+ self.url_ext = False
+
+ def get_info(self, page):
+ url, pos = text.extract(
+ page, '<div style="text-align:center;"><a href="', '"')
+ if not url:
+ raise exception.NotFoundError("image")
+ filename, pos = text.extract(page, ' title="', '"', pos)
+ if self.url_ext and filename:
+ filename += splitext(url)[1]
+ return url, filename or url
+
+
+class AcidimgImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from acidimg.cc"""
+ category = "acidimg"
+ pattern = r"(?:https?://)?((?:www\.)?acidimg\.cc/img-([a-z0-9]+)\.html)"
+ test = ("https://acidimg.cc/img-5acb6b9de4640.html", {
+ "url": "f132a630006e8d84f52d59555191ed82b3b64c04",
+ "keyword": "a8bb9ab8b2f6844071945d31f8c6e04724051f37",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ encoding = "utf-8"
+
+ def get_info(self, page):
+ url, pos = text.extract(page, "<img class='centred' src='", "'")
+ if not url:
+ raise exception.NotFoundError("image")
+ filename, pos = text.extract(page, " alt='", "'", pos)
+ return url, (filename + splitext(url)[1]) if filename else url
+
+
+class ImagevenueImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imagevenue.com"""
+ category = "imagevenue"
+ pattern = (r"(?:https?://)?(img\d+\.imagevenue\.com"
+ r"/img\.php\?image=(?:[a-z]+_)?(\d+)_[^&#]+)")
+ test = (("http://img28116.imagevenue.com/img.php"
+ "?image=th_52709_test_122_64lo.jpg"), {
+ "url": "46812995d557f2c6adf0ebd0e631e6e4e45facde",
+ "content": "59ec819cbd972dd9a71f25866fbfc416f2f215b3",
+ })
+ params = None
+
+ def get_info(self, page):
+ url = text.extract(page, "SRC='", "'")[0]
+ return text.urljoin(self.page_url, url), url
+
+
+class ImagetwistImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imagetwist.com"""
+ category = "imagetwist"
+ pattern = r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))"
+ test = ("https://imagetwist.com/4e46hv31tu0q/test.jpg", {
+ "url": "c999dc1a5dec0525ac9eb8c092f173dfe6dba0b0",
+ "keyword": "a9f2e01757ec96d4ee4752cbd8446ede80f7935e",
+ "content": "96b1fd099b06faad5879fce23a7e4eb8290d8810",
+ })
+ https = True
+ params = None
+
+ @property
+ @memcache(maxage=3*3600)
+ def cookies(self):
+ return self.request(self.page_url).cookies
+
+ def get_info(self, page):
+ url , pos = text.extract(page, 'center;"><img src="', '"')
+ filename, pos = text.extract(page, ' alt="', '"', pos)
+ return url, filename
+
+
+class ImgspiceImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from imgspice.com"""
+ category = "imgspice"
+ pattern = r"(?:https?://)?((?:www\.)?imgspice\.com/([^/?&#]+))"
+ test = ("https://imgspice.com/nwfwtpyog50y/test.png.html", {
+ "url": "b8c30a8f51ee1012959a4cfd46197fabf14de984",
+ "keyword": "100e310a19a2fa22d87e1bbc427ecb9f6501e0c0",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ params = None
+
+ def get_info(self, page):
+ pos = page.find('id="imgpreview"')
+ if pos < 0:
+ raise exception.NotFoundError("image")
+ url , pos = text.extract(page, 'src="', '"', pos)
+ name, pos = text.extract(page, 'alt="', '"', pos)
+ return url, text.unescape(name)
+
+
+class PixhostImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from pixhost.to"""
+ category = "pixhost"
+ pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)"
+ r"/show/\d+/(\d+)_[^/?&#]+)")
+ test = ("https://pixhost.to/show/224/96246707_test-.png", {
+ "url": "8f3d41fdd2dbec4c844e5ee45bf49961fbd79c67",
+ "keyword": "ecefe2d5814286f9d1dff3d88d9bdc78dd456c5d",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ params = None
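+ # preset cookies that appear to bypass pixhost's ad interstitial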
+ cookies = {"pixhostads": "1", "pixhosttest": "1"}
+
+ def get_info(self, page):
+ url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"")
+ filename, pos = text.extract(page, "alt=\"", "\"", pos)
+ return url, filename
+
+
+class PostimgImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from postimages.org"""
+ category = "postimg"
+ pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)"
+ r"/(?:image/)?([^/?&#]+)/?)")
+ test = ("https://postimg.cc/Wtn2b3hC", {
+ "url": "0794cfda9b8951a8ac3aa692472484200254ab86",
+ "keyword": "2d05808d04e4e83e33200db83521af06e3147a84",
+ "content": "cfaa8def53ed1a575e0c665c9d6d8cf2aac7a0ee",
+ })
+ https = True
+ params = None
+
+ def get_info(self, page):
+ url , pos = text.extract(page, 'id="main-image" src="', '"')
+ filename, pos = text.extract(page, 'class="imagename">', '<', pos)
+ return url, text.unescape(filename)
+
+
+class TurboimagehostImageExtractor(ImagehostImageExtractor):
+ """Extractor for single images from www.turboimagehost.com"""
+ category = "turboimagehost"
+ pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com"
+ r"/p/(\d+)/[^/?&#]+\.html)")
+ test = ("https://www.turboimagehost.com/p/39078423/test--.png.html", {
+ "url": "b94de43612318771ced924cb5085976f13b3b90e",
+ "keyword": "704757ca8825f51cec516ec44c1e627c1f2058ca",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ })
+ https = True
+ params = None
+
+ def get_info(self, page):
+ url = text.extract(page, 'src="', '"', page.index("<img "))[0]
+ return url, url
diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py
new file mode 100644
index 0000000..516ef18
--- /dev/null
+++ b/gallery_dl/extractor/imgbox.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from galleries at https://imgbox.com/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, exception
+import re
+
+
+class ImgboxExtractor(Extractor):
+ """Base class for imgbox extractors"""
+ category = "imgbox"
+ root = "https://imgbox.com"
+
+ def items(self):
+ data = self.get_job_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for image_key in self.get_image_keys():
+ imgpage = self.request(self.root + "/" + image_key).text
+ imgdata = self.get_image_metadata(imgpage)
+ if imgdata["filename"]:
+ imgdata.update(data)
+ imgdata["image_key"] = image_key
+ text.nameext_from_url(imgdata["filename"], imgdata)
+ yield Message.Url, self.get_image_url(imgpage), imgdata
+
+ @staticmethod
+ def get_job_metadata():
+ """Collect metadata for extractor-job"""
+ return {}
+
+ @staticmethod
+ def get_image_keys():
+ """Return an iterable containing all image-keys"""
+ return []
+
+ @staticmethod
+ def get_image_metadata(page):
+ """Collect metadata for a downloadable file"""
+ return text.extract_all(page, (
+ ("num" , '</a> &nbsp; ', ' of '),
+ (None , 'class="image-container"', ''),
+ ("filename" , ' title="', '"'),
+ ))[0]
+
+ @staticmethod
+ def get_image_url(page):
+ """Extract download-url"""
+ pos = page.index(">Image</a>")
+ return text.extract(page, '<a href="', '"', pos)[0]
+
+
+class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor):
+ """Extractor for image galleries from imgbox.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{title} - {gallery_key}")
+ filename_fmt = "{num:>03}-{filename}.{extension}"
+ archive_fmt = "{gallery_key}_{image_key}"
+ pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})"
+ test = (
+ ("https://imgbox.com/g/JaX5V5HX7g", {
+ "url": "678f0bca1251d810372326ea4f16582cafa800e4",
+ "keyword": "4b1e62820ac2c6205b7ad0b6322cc8e00dbe1b0c",
+ "content": "d20307dc8511ac24d688859c55abf2e2cc2dd3cc",
+ }),
+ ("https://imgbox.com/g/cUGEkRbdZZ", {
+ "url": "d839d47cbbbeb121f83c520072512f7e51f52107",
+ "keyword": "fb0427b87983197849fb2887905e758f3e50cb6e",
+ }),
+ ("https://imgbox.com/g/JaX5V5HX7h", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ ImgboxExtractor.__init__(self, match)
+ self.gallery_key = match.group(1)
+ self.image_keys = []
+
+ def get_job_metadata(self):
+ page = self.request(self.root + "/g/" + self.gallery_key).text
+ if "The specified gallery could not be found." in page:
+ raise exception.NotFoundError("gallery")
+ self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page)
+
+ title = text.extract(page, "<h1>", "</h1>")[0]
+ title, _, count = title.rpartition(" - ")
+ return {
+ "gallery_key": self.gallery_key,
+ "title": text.unescape(title),
+ "count": count[:-7],
+ }
+
+ def get_image_keys(self):
+ return self.image_keys
+
+
+class ImgboxImageExtractor(ImgboxExtractor):
+ """Extractor for single images from imgbox.com"""
+ subcategory = "image"
+ archive_fmt = "{image_key}"
+ pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})"
+ test = (
+ ("https://imgbox.com/qHhw7lpG", {
+ "url": "d931f675a9b848fa7cb9077d6c2b14eb07bdb80f",
+ "keyword": "dfc72310026b45f3feb4f9cada20c79b2575e1af",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ ("https://imgbox.com/qHhw7lpH", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ ImgboxExtractor.__init__(self, match)
+ self.image_key = match.group(1)
+
+ def get_image_keys(self):
+ return (self.image_key,)
+
+ @staticmethod
+ def get_image_metadata(page):
+ data = ImgboxExtractor.get_image_metadata(page)
+ if not data["filename"]:
+ raise exception.NotFoundError("image")
+ return data
diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py
new file mode 100644
index 0000000..a97f2e0
--- /dev/null
+++ b/gallery_dl/extractor/imgth.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://imgth.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class ImgthGalleryExtractor(Extractor):
+ """Extractor for image galleries from imgth.com"""
+ category = "imgth"
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+ archive_fmt = "{gallery_id}_{num}"
+ pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)"
+ test = ("http://imgth.com/gallery/37/wallpaper-anime", {
+ "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
+ "keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.gid = match.group(1)
+ self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/"
+
+ def items(self):
+ page = self.request(self.url_base + "0").text
+ data = self.metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], url in enumerate(self.images(page), 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def images(self, page):
+ """Yield all image urls for this gallery"""
+ pnum = 0
+ while True:
+ thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0]
+ for url in text.extract_iter(thumbs, '<img src="', '"'):
+ yield "https://imgth.com/images/" + url[24:]
+ if '<li class="next">' not in page:
+ return
+ pnum += 1
+ page = self.request(self.url_base + str(pnum)).text
+
+ def metadata(self, page):
+ """Collect metadata for extractor-job"""
+ return text.extract_all(page, (
+ ("title", '<h1>', '</h1>'),
+ ("count", 'total of images in this gallery: ', ' '),
+ ("date" , 'created on ', ' by <'),
+ (None , 'href="/users/', ''),
+ ("user" , '>', '<'),
+ ), values={"gallery_id": self.gid})[0]
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
new file mode 100644
index 0000000..0468c0b
--- /dev/null
+++ b/gallery_dl/extractor/imgur.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://imgur.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import json
+
+
+class ImgurExtractor(Extractor):
+ """Base class for imgur extractors"""
+ category = "imgur"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_id = match.group(1)
+ self.mp4 = self.config("mp4", True)
+
+ def _get_data(self, urlpart):
+ response = self.request("https://imgur.com/" + urlpart, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError(self.subcategory)
+ data = text.extract(response.text, "image : ", ",\n")[0]
+ return self._clean(json.loads(data))
+
+ def _prepare(self, image):
+ image["ext"] = image["ext"].partition("?")[0]
+ if image["ext"] == ".gif" and (
+ (self.mp4 and image["prefer_video"]) or self.mp4 == "always"):
+ image["ext"] = ".mp4"
+ url = "https://i.imgur.com/" + image["hash"] + image["ext"]
+ image["extension"] = image["ext"][1:]
+ return url
+
+ @staticmethod
+ def _clean(data):
+ try:
+ del data["adConfig"]
+ del data["isAd"]
+ except KeyError:
+ pass
+ return data
+
+
+class ImgurImageExtractor(ImgurExtractor):
+ """Extractor for individual images from imgur.com"""
+ subcategory = "image"
+ filename_fmt = "{category}_{hash}{title:?_//}.{extension}"
+ archive_fmt = "{hash}"
+ pattern = (r"(?:https?://)?(?:www\.|[im]\.|)?imgur\.com"
+ r"/(?!gallery)(\w{7}|\w{5})[sbtmlh]?\.?")
+ test = (
+ ("https://imgur.com/21yMxCS", {
+ "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ "keyword": {
+ "animated": False,
+ "datetime": "2016-11-10 14:24:35",
+ "description": str,
+ "ext": ".png",
+ "extension": "png",
+ "hash": "21yMxCS",
+ "height": "32",
+ "is_moderated": False,
+ "is_safe": False,
+ "is_viral": 0,
+ "looping": False,
+ "mimetype": "image/png",
+ "name": None,
+ "prefer_video": False,
+ "size": 182,
+ "source": "",
+ "title": "Test",
+ "video_host": None,
+ "video_source": None,
+ "width": "64",
+ },
+ }),
+ ("http://imgur.com/0gybAXR", { # gifv/mp4 video
+ "url": "a2220eb265a55b0c95e0d3d721ec7665460e3fd7",
+ "content": "a3c080e43f58f55243ab830569ba02309d59abfc",
+ }),
+ ("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1'
+ "url": "73f361b50753ab25da64160aa50bc5d139480d45",
+ }),
+ ("https://imgur.com/zzzzzzz", { # not found
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.imgur.com/21yMxCS"), # www
+ ("https://m.imgur.com/21yMxCS"), # mobile
+ ("https://imgur.com/zxaY6"), # 5 character key
+ ("https://i.imgur.com/21yMxCS.png"), # direct link
+ ("https://i.imgur.com/21yMxCSh.png"), # direct link thumbnail
+ ("https://i.imgur.com/zxaY6.gif"), # direct link (short)
+ ("https://i.imgur.com/zxaY6s.gif"), # direct link (short; thumb)
+ )
+
+ def items(self):
+ image = self._get_data(self.item_id)
+ url = self._prepare(image)
+
+ yield Message.Version, 1
+ yield Message.Directory, image
+ yield Message.Url, url, image
+
+
+class ImgurAlbumExtractor(ImgurExtractor):
+ """Extractor for image albums from imgur.com"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}")
+ filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}"
+ archive_fmt = "{album[hash]}_{hash}"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com"
+ r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})")
+ test = (
+ ("https://imgur.com/a/TcBmP", {
+ "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
+ "keyword": {
+ "album": {
+ "album_cover": "693j2Kr",
+ "album_description": None,
+ "cover": "693j2Kr",
+ "datetime": "2015-10-09 10:37:50",
+ "description": None,
+ "hash": "TcBmP",
+ "id": "TcBmP",
+ "is_album": True,
+ "num_images": "19",
+ "title": "138",
+ "title_clean": "TcBmP",
+ "views": str,
+ },
+ "animated": bool,
+ "datetime": str,
+ "extension": str,
+ "hash": str,
+ "height": int,
+ "num": int,
+ "prefer_video": bool,
+ "size": int,
+ "title": str,
+ "width": int,
+ },
+ }),
+ ("https://imgur.com/gallery/eD9CT", { # large album
+ "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937",
+ }),
+ ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash
+ "url": "695ef0c950023362a0163ee5041796300db76674",
+ }),
+ ("https://imgur.com/t/unmuted/YMqBcua", { # unmuted URL
+ "url": "86b4747f8147cec7602f0214e267309af73a8655",
+ }),
+ ("https://imgur.com/a/TcBmQ", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.imgur.com/a/TcBmP"), # www
+ ("https://m.imgur.com/a/TcBmP"), # mobile
+ )
+
+ def items(self):
+ album = self._get_data("a/" + self.item_id + "/all")
+ images = album["album_images"]["images"]
+ del album["album_images"]
+
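+ # the embedded image list is truncated for large albums; fall back
+ # to the 'ajaxalbums' endpoint to retrieve the complete list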
+ if int(album["num_images"]) > len(images):
+ url = ("https://imgur.com/ajaxalbums/getimages/" +
+ self.item_id + "/hit.json")
+ images = self.request(url).json()["data"]["images"]
+
+ yield Message.Version, 1
+ yield Message.Directory, {"album": album, "count": len(images)}
+ for num, image in enumerate(images, 1):
+ url = self._prepare(image)
+ image["num"] = num
+ image["album"] = album
+ yield Message.Url, url, image
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
new file mode 100644
index 0000000..871236b
--- /dev/null
+++ b/gallery_dl/extractor/instagram.py
@@ -0,0 +1,277 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.instagram.com/"""
+
+import hashlib
+import json
+from .common import Extractor, Message
+from .. import text
+
+
+class InstagramExtractor(Extractor):
+ """Base class for instagram extractors"""
+ category = "instagram"
+ directory_fmt = ("{category}", "{username}")
+ filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}"
+ archive_fmt = "{media_id}"
+ root = "https://www.instagram.com"
+
+ def get_metadata(self):
+ return {}
+
+ def items(self):
+ yield Message.Version, 1
+
+ metadata = self.get_metadata()
+ for data in self.instagrams():
+ data.update(metadata)
+ yield Message.Directory, data
+
+ if data['typename'] == 'GraphImage':
+ yield Message.Url, data['display_url'], \
+ text.nameext_from_url(data['display_url'], data)
+ elif data['typename'] == 'GraphVideo':
+ yield Message.Url, \
+ 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data
+
+ def _extract_shared_data(self, page):
+ return json.loads(text.extract(page,
+ 'window._sharedData = ', ';</script>')[0])
+
+ def _extract_postpage(self, url):
+ page = self.request(url).text
+ shared_data = self._extract_shared_data(page)
+ media = shared_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']
+
+ common = {
+ 'date': text.parse_timestamp(media['taken_at_timestamp']),
+ 'likes': text.parse_int(media['edge_media_preview_like']['count']),
+ 'owner_id': media['owner']['id'],
+ 'username': media['owner']['username'],
+ 'fullname': media['owner']['full_name'],
+ 'description': text.parse_unicode_escapes('\n'.join(
+ edge['node']['text']
+ for edge in media['edge_media_to_caption']['edges']
+ )),
+ }
+
+ medias = []
+ if media['__typename'] == 'GraphSidecar':
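+ # count embedded videos so '_ytdl_index' can presumably tell the
+ # ytdl downloader which entry of the sidecar to download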
+ yi = 0
+ for n in media['edge_sidecar_to_children']['edges']:
+ children = n['node']
+ media_data = {
+ 'media_id': children['id'],
+ 'shortcode': children['shortcode'],
+ 'typename': children['__typename'],
+ 'display_url': children['display_url'],
+ 'height': text.parse_int(children['dimensions']['height']),
+ 'width': text.parse_int(children['dimensions']['width']),
+ 'sidecar_media_id': media['id'],
+ 'sidecar_shortcode': media['shortcode'],
+ }
+ if children['__typename'] == 'GraphVideo':
+ media_data["_ytdl_index"] = yi
+ yi += 1
+ media_data.update(common)
+ medias.append(media_data)
+
+ else:
+ media_data = {
+ 'media_id': media['id'],
+ 'shortcode': media['shortcode'],
+ 'typename': media['__typename'],
+ 'display_url': media['display_url'],
+ 'height': text.parse_int(media['dimensions']['height']),
+ 'width': text.parse_int(media['dimensions']['width']),
+ }
+ media_data.update(common)
+ medias.append(media_data)
+
+ return medias
+
+ def _extract_page(self, url, page_type):
+ shared_data_fields = {
+ 'ProfilePage': {
+ 'node': 'user',
+ 'node_id': 'id',
+ 'edge_to_medias': 'edge_owner_to_timeline_media',
+ 'variables_id': 'id',
+ 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41',
+ },
+ 'TagPage': {
+ 'node': 'hashtag',
+ 'node_id': 'name',
+ 'edge_to_medias': 'edge_hashtag_to_media',
+ 'variables_id': 'tag_name',
+ 'query_hash': 'f92f56d47dc7a55b606908374b43a314',
+ },
+ }
+
+ page = self.request(url).text
+ shared_data = self._extract_shared_data(page)
+ psdf = shared_data_fields[page_type]
+
+ while True:
+ # Deal with different structure of pages: the first page
+ # has interesting data in `entry_data', next pages in `data'.
+ if 'entry_data' in shared_data:
+ base_shared_data = shared_data['entry_data'][page_type][0]['graphql']
+
+ # variables_id is available only in the first page
+ variables_id = base_shared_data[psdf['node']][psdf['node_id']]
+ else:
+ base_shared_data = shared_data['data']
+
+ medias = base_shared_data[psdf['node']][psdf['edge_to_medias']]
+ has_next_page = medias['page_info']['has_next_page']
+ shortcodes = [n['node']['shortcode'] for n in medias['edges']]
+
+ for s in shortcodes:
+ url = '{}/p/{}/'.format(self.root, s)
+ yield from self._extract_postpage(url)
+
+ if not has_next_page:
+ break
+
+ end_cursor = medias['page_info']['end_cursor']
+ variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format(
+ psdf['variables_id'],
+ variables_id,
+ end_cursor,
+ )
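+ # sign the GraphQL request: 'X-Instagram-GIS' is the MD5 hash of the
+ # query variables (apparently required for these query_hash endpoints)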
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(),
+ }
+ url = '{}/graphql/query/?query_hash={}&variables={}'.format(
+ self.root,
+ psdf['query_hash'],
+ variables,
+ )
+ shared_data = self.request(url, headers=headers).json()
+
+ def _extract_profilepage(self, url):
+ yield from self._extract_page(url, 'ProfilePage')
+
+ def _extract_tagpage(self, url):
+ yield from self._extract_page(url, 'TagPage')
+
+
+class InstagramImageExtractor(InstagramExtractor):
+ """Extractor for PostPage"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)"
+ test = (
+ # GraphImage
+ ("https://www.instagram.com/p/BqvsDleB3lV/", {
+ "pattern": r"https://[^/]+\.(cdninstagram\.com|fbcdn\.net)"
+ r"/vp/[0-9a-f]+/[0-9A-F]+/t51.2885-15/e35"
+ r"/44877605_725955034447492_3123079845831750529_n.jpg",
+ "keyword": {
+ "date": "type:datetime",
+ "description": str,
+ "height": int,
+ "likes": int,
+ "media_id": "1922949326347663701",
+ "shortcode": "BqvsDleB3lV",
+ "typename": "GraphImage",
+ "username": "instagram",
+ "width": int,
+ }
+ }),
+
+ # GraphSidecar
+ ("https://www.instagram.com/p/BoHk1haB5tM/", {
+ "count": 5,
+ "keyword": {
+ "sidecar_media_id": "1875629777499953996",
+ "sidecar_shortcode": "BoHk1haB5tM",
+ "likes": int,
+ "username": "instagram",
+ }
+ }),
+
+ # GraphVideo
+ ("https://www.instagram.com/p/Bqxp0VSBgJg/", {
+ "url": "8f38c1cf460c9804842f7306c487410f33f82e7e",
+ "keyword": {
+ "date": "type:datetime",
+ "description": str,
+ "height": int,
+ "likes": int,
+ "media_id": "1923502432034620000",
+ "shortcode": "Bqxp0VSBgJg",
+ "typename": "GraphVideo",
+ "username": "instagram",
+ "width": int,
+ }
+ }),
+
+ # GraphSidecar with 2 embedded GraphVideo objects
+ ("https://www.instagram.com/p/BtOvDOfhvRr/", {
+ "count": 2,
+ "url": "e290d4180a58ae50c910d51d3b04d5f5c4622cd7",
+ "keyword": {
+ "sidecar_media_id": "1967717017113261163",
+ "sidecar_shortcode": "BtOvDOfhvRr",
+ "_ytdl_index": int,
+ }
+ })
+ )
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.shortcode = match.group(1)
+
+ def instagrams(self):
+ url = '{}/p/{}/'.format(self.root, self.shortcode)
+ return self._extract_postpage(url)
+
+
+class InstagramUserExtractor(InstagramExtractor):
+ """Extractor for ProfilePage"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)")
+ test = ("https://www.instagram.com/instagram/", {
+ "range": "1-12",
+ "count": ">= 12",
+ })
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.username = match.group(1)
+
+ def instagrams(self):
+ url = '{}/{}/'.format(self.root, self.username)
+ return self._extract_profilepage(url)
+
+
+class InstagramTagExtractor(InstagramExtractor):
+ """Extractor for TagPage"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{subcategory}", "{tag}")
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/explore/tags/([^/?&#]+)")
+ test = ("https://www.instagram.com/explore/tags/instagram/", {
+ "range": "1-12",
+ "count": ">= 12",
+ })
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.tag = match.group(1)
+
+ def get_metadata(self):
+ return {"tag": self.tag}
+
+ def instagrams(self):
+ url = '{}/explore/tags/{}/'.format(self.root, self.tag)
+ return self._extract_tagpage(url)
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
new file mode 100644
index 0000000..5902333
--- /dev/null
+++ b/gallery_dl/extractor/keenspot.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for http://www.keenspot.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class KeenspotComicExtractor(Extractor):
+ """Extractor for webcomics from keenspot.com"""
+ category = "keenspot"
+ subcategory = "comic"
+ directory_fmt = ("{category}", "{comic}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{comic}_{filename}"
+ pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?"
+ test = (
+ ("http://marksmen.keenspot.com/", { # link
+ "range": "1-3",
+ "url": "83bcf029103bf8bc865a1988afa4aaeb23709ba6",
+ }),
+ ("http://barkercomic.keenspot.com/", { # id
+ "range": "1-3",
+ "url": "c4080926db18d00bac641fdd708393b7d61379e6",
+ }),
+ ("http://crowscare.keenspot.com/", { # id v2
+ "range": "1-3",
+ "url": "a00e66a133dd39005777317da90cef921466fcaa"
+ }),
+ ("http://supernovas.keenspot.com/", { # ks
+ "range": "1-3",
+ "url": "de21b12887ef31ff82edccbc09d112e3885c3aab"
+ }),
+ ("http://twokinds.keenspot.com/comic/1066/", { # "random" access
+ "range": "1-3",
+ "url": "97e2a6ed8ba1709314f2449f84b6b1ce5db21c04",
+ })
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.comic = match.group(1).lower()
+ self.path = match.group(2)
+ self.root = "http://" + self.comic + ".keenspot.com"
+
+ self._needle = ""
+ self._image = 'class="ksc"'
+ self._next = self._next_needle
+
+ def items(self):
+ data = {"comic": self.comic}
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ url = self._first(self.request(self.root + "/").text)
+ if self.path:
+ url = self.root + self.path
+
+ prev = None
+ ilen = len(self._image)
+ while url and url != prev:
+ prev = url
+ page = self.request(text.urljoin(self.root, url)).text
+
+ pos = 0
+ while True:
+ pos = page.find(self._image, pos)
+ if pos < 0:
+ break
+ img, pos = text.extract(page, 'src="', '"', pos + ilen)
+ if img.endswith(".js"):
+ continue
+ if img[0] == "/":
+ img = self.root + img
+ elif "youtube.com/" in img:
+ img = "ytdl:" + img
+ yield Message.Url, img, text.nameext_from_url(img, data)
+
+ url = self._next(page)
+
+ def _first(self, page):
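+ # probe the front page for one of several known KeenSpot layouts and
+ # pick the matching image marker and next-page strategy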
+ if self.comic == "brawlinthefamily":
+ self._next = self._next_brawl
+ self._image = '<div id="comic">'
+ return "http://brawlinthefamily.keenspot.com/comic/theshowdown/"
+
+ url = text.extract(page, '<link rel="first" href="', '"')[0]
+ if url:
+ if self.comic == "porcelain":
+ self._needle = 'id="porArchivetop_"'
+ else:
+ self._next = self._next_link
+ return url
+
+ pos = page.find('id="first_day1"')
+ if pos >= 0:
+ self._next = self._next_id
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('>FIRST PAGE<')
+ if pos >= 0:
+ if self.comic == "lastblood":
+ self._next = self._next_lastblood
+ self._image = '<div id="comic">'
+ else:
+ self._next = self._next_id
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('<div id="kscomicpart"')
+ if pos >= 0:
+ self._needle = '<a href="/archive.html'
+ return text.extract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('>First Comic<') # twokinds
+ if pos >= 0:
+ self._image = '</header>'
+ self._needle = 'class="navarchive"'
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ pos = page.find('id="flip_FirstDay"') # flipside
+ if pos >= 0:
+ self._image = 'class="flip_Pages ksc"'
+ self._needle = 'id="flip_ArcButton"'
+ return text.rextract(page, 'href="', '"', pos)[0]
+
+ self.log.error("Unrecognized page layout")
+ return None
+
+ def _next_needle(self, page):
+ pos = page.index(self._needle) + len(self._needle)
+ return text.extract(page, 'href="', '"', pos)[0]
+
+ @staticmethod
+ def _next_link(page):
+ return text.extract(page, '<link rel="next" href="', '"')[0]
+
+ @staticmethod
+ def _next_id(page):
+ pos = page.find('id="next_')
+ return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None
+
+ @staticmethod
+ def _next_lastblood(page):
+ pos = page.index("link rel='next'")
+ return text.extract(page, "href='", "'", pos)[0]
+
+ @staticmethod
+ def _next_brawl(page):
+ pos = page.index("comic-nav-next")
+ url = text.rextract(page, 'href="', '"', pos)[0]
+ return None if "?random" in url else url
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
new file mode 100644
index 0000000..c9e6959
--- /dev/null
+++ b/gallery_dl/extractor/khinsider.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract soundtracks from https://downloads.khinsider.com/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, exception
+
+
+class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
+ """Extractor for soundtracks from khinsider.com"""
+ category = "khinsider"
+ subcategory = "soundtrack"
+ directory_fmt = ("{category}", "{album}")
+ archive_fmt = "{album}_{filename}.{extension}"
+ pattern = (r"(?:https?://)?downloads\.khinsider\.com"
+ r"/game-soundtracks/album/([^/?&#]+)")
+ test = (("https://downloads.khinsider.com"
+ "/game-soundtracks/album/horizon-riders-wii"), {
+ "pattern": r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+"
+ r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3",
+ "count": 1,
+ "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679",
+ })
+ root = "https://downloads.khinsider.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.album = match.group(1)
+
+ def items(self):
+ url = (self.root + "/game-soundtracks/album/" + self.album)
+ page = self.request(url, encoding="utf-8").text
+ data = self.get_job_metadata(page)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for url, track in self.get_album_tracks(page):
+ track.update(data)
+ yield Message.Url, url, track
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ if "Download all songs at once:" not in page:
+ raise exception.NotFoundError("soundtrack")
+ data = text.extract_all(page, (
+ ("album", "Album name: <b>", "</b>"),
+ ("count", "Number of Files: <b>", "</b>"),
+ ("size" , "Total Filesize: <b>", "</b>"),
+ ("date" , "Date added: <b>", "</b>"),
+ ("type" , "Album type: <b>", "</b>"),
+ ))[0]
+ data["album"] = text.unescape(data["album"])
+ return data
+
+ def get_album_tracks(self, page):
+ """Collect url and metadata for all tracks of a soundtrack"""
+ page = text.extract(page, '<table id="songlist">', '</table>')[0]
+ for num, url in enumerate(text.extract_iter(
+ page, '<td class="clickable-row"><a href="', '"'), 1):
+ url = text.urljoin(self.root, url)
+ page = self.request(url, encoding="utf-8").text
+ url = text.extract(
+ page, '<p><a style="color: #21363f;" href="', '"')[0]
+ yield url, text.nameext_from_url(url, {"num": num})
diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py
new file mode 100644
index 0000000..6314a94
--- /dev/null
+++ b/gallery_dl/extractor/kissmanga.py
@@ -0,0 +1,223 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://kissmanga.com/"""
+
+from .common import ChapterExtractor, MangaExtractor, Extractor
+from .. import text, aes, exception
+from ..cache import cache
+import hashlib
+import ast
+import re
+
+
+class RedirectMixin():
+ """Detect and handle redirects to CAPTCHA pages"""
+
+ def request(self, url):
+ while True:
+ response = Extractor.request(self, url)
+ if not response.history or "/AreYouHuman" not in response.url:
+ return response
+ if self.config("captcha", "stop") == "wait":
+ self.log.warning(
+ "Redirect to \n%s\nVisit this URL in your browser, solve "
+ "the CAPTCHA, and press ENTER to continue", response.url)
+ try:
+ input()
+ except (EOFError, OSError):
+ pass
+ else:
+ self.log.error(
+ "Redirect to \n%s\nVisit this URL in your browser and "
+ "solve the CAPTCHA to continue", response.url)
+ raise exception.StopExtraction()
+
+
+class KissmangaBase(RedirectMixin):
+ """Base class for kissmanga extractors"""
+ category = "kissmanga"
+ archive_fmt = "{chapter_id}_{page}"
+ root = "https://kissmanga.com"
+
+ @staticmethod
+ def parse_chapter_string(data):
+ """Parse 'chapter_string' value contained in 'data'"""
+ data["chapter_string"] = text.unescape(data["chapter_string"])
+
+ match = re.match((
+ r"(?:[Vv]ol\.0*(\d+) )?"
+ r"(?:[Cc]h\.)?0*(\d+)"
+ r"(?:[.:]0*(\d+))?"
+ r"(?: *[:-]? *(.+))?"
+ ), data["chapter_string"])
+
+ if not match:
+ match = re.match((
+ r".+?(?: -)? ()"
+ r"0*(\d+)(?:[Vv.]0*(\d+))?"
+ r"(?: *[:-]? *(.+))?"
+ ), data["chapter_string"])
+
+ if match:
+ volume, chapter, minor, title = match.groups()
+ else:
+ volume, chapter, minor, title = 0, 0, "", data["chapter_string"]
+
+ data["volume"] = text.parse_int(volume)
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = "." + minor if minor else ""
+ data["title"] = title if title and title != "Read Online" else ""
+ return data
+
+
+class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
+ """Extractor for manga-chapters from kissmanga.com"""
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com"
+ r"(/Manga/[^/?&#]+/[^/?&#]+\?id=(\d+))")
+ test = (
+ ("https://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", {
+ "url": "46e63fd63e9e16f19bc1e6c7a45dc060815642fd",
+ "keyword": "1cd0b5214ac7ae4d53e2fd8fec40ceec84cd09bf",
+ }),
+ ("https://kissmanga.com/Manga/Urban-Tales/a?id=256717", {
+ "url": "c26be8bf9c2abacee2076979d021634092cf38f1",
+ "keyword": "e1d16780df8e04076ed2b5f0637c5b710ec2f2ea",
+ }),
+ ("https://kissmanga.com/Manga/Monster/Monster-79?id=7608", {
+ "count": 23,
+ "keyword": "f433a7a8fae840e17dace316a243fa27faab86de",
+ }),
+ ("https://kissmanga.com/Manga/Houseki-no-Kuni/Oneshot?id=404189", {
+ "count": 49,
+ "keyword": "d44d1b21d08e4dbf888b0c450a3f1bc919588b4f",
+ }),
+ ("https://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608"),
+ )
+
+ def __init__(self, match):
+ ChapterExtractor.__init__(self, match)
+ self.chapter_id = match.group(2)
+ self.session.headers["Referer"] = self.root
+
+ def metadata(self, page):
+ title = text.extract(page, "<title>", "</title>")[0].strip()
+ manga, cinfo = title.split("\n")[1:3]
+ data = {
+ "manga": manga.strip(),
+ "chapter_string": cinfo.strip(),
+ "chapter_id": text.parse_int(self.chapter_id),
+ "lang": "en",
+ "language": "English",
+ }
+ return self.parse_chapter_string(data)
+
+ def images(self, page):
+ self.session.headers["Referer"] = None
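+ # image URLs are AES-CBC encrypted arguments of 'lstImages.push(wrapKA(...))'
+ # calls; decrypt them with the page-specific key and a fixed IV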
+ try:
+ key = self.build_aes_key(page)
+ iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
+ 0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3)
+ return [
+ (aes.aes_cbc_decrypt_text(
+ data, key, iv).partition("&")[0], None)
+ for data in text.extract_iter(
+ page, 'lstImages.push(wrapKA("', '"'
+ )
+ ]
+ except UnicodeDecodeError:
+ self.log.error("Failed to decrypt image URLs")
+ except (ValueError, IndexError):
+ self.log.error("Failed to get AES key")
+ return []
+
+ def build_aes_key(self, page):
+ chko = self._chko_from_external_script()
+
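+ # emulate the obfuscated inline JavaScript: small <script> blocks
+ # append pieces to 'chko', and the AES key is the SHA-256 digest
+ # of the resulting string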
+ for script in self._scripts(page):
+ for stmt in [s.strip() for s in script.split(";")]:
+
+ if stmt.startswith("var _"):
+ name, _, value = stmt[4:].partition(" = ")
+ name += "[0]"
+ value = ast.literal_eval(value)[0]
+
+ elif stmt.startswith("chko = "):
+ stmt = stmt[7:]
+ if stmt == name:
+ chko = value
+ elif stmt == "chko + " + name:
+ chko = chko + value
+ elif stmt == name + " + chko":
+ chko = value + chko
+ else:
+ self.log.warning("unrecognized expression: '%s'", stmt)
+
+ elif stmt.startswith("key = "):
+ pass
+
+ else:
+ self.log.warning("unrecognized statement: '%s'", stmt)
+
+ return list(hashlib.sha256(chko.encode("ascii")).digest())
+
+ @staticmethod
+ def _scripts(page):
+ end = 0
+ while True:
+ pos = page.find("key = ", end)
+ if pos == -1:
+ return
+ beg = page.rindex('<script type="text/javascript">', 0, pos) + 31
+ end = page.index('</script>', pos)
+ yield page[beg:end]
+
+ @cache(maxage=3600)
+ def _chko_from_external_script(self):
+ script = self.request(self.root + "/Scripts/lo.js").text
+
+ pos = script.index("var chko")
+ var = text.extract(script, "=", "[", pos)[0].lstrip()
+ idx = text.extract(script, "[", "]", pos)[0]
+
+ pos = script.index(var)
+ lst = text.extract(script, "=", ";", pos)[0]
+ return ast.literal_eval(lst.strip())[int(idx)]
+
+
+class KissmangaMangaExtractor(KissmangaBase, MangaExtractor):
+ """Extractor for manga from kissmanga.com"""
+ chapterclass = KissmangaChapterExtractor
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com"
+ r"(/Manga/[^/?&#]+/?)$")
+ test = (
+ ("https://kissmanga.com/Manga/Dropout", {
+ "url": "9e3a6f715b229aa3fafa42a1d5da5d65614cb532",
+ "keyword": "32b09711c28b481845acc32e3bb6054cfc90224d",
+ }),
+ ("https://kissmanga.com/manga/feng-shen-ji"), # lowercase
+ )
+
+ def chapters(self, page):
+ results = []
+ manga, pos = text.extract(page, ' class="barTitle">', '\ninformation')
+ page , pos = text.extract(page, ' class="listing">', '</table>', pos)
+ manga = manga.strip()
+ needle = '" title="Read ' + manga + ' '
+ manga = text.unescape(manga)
+
+ for item in text.extract_iter(page, '<a href="', ' online">'):
+ url, _, chapter = item.partition(needle)
+ data = {
+ "manga": manga, "chapter_string": chapter,
+ "chapter_id": text.parse_int(url.rpartition("=")[2]),
+ "lang": "en", "language": "English",
+ }
+ self.parse_chapter_string(data)
+ results.append((self.root + url, data))
+ return results
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
new file mode 100644
index 0000000..8541e4f
--- /dev/null
+++ b/gallery_dl/extractor/komikcast.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://komikcast.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import re
+
+
+class KomikcastBase():
+ """Base class for komikcast extractors"""
+ category = "komikcast"
+ root = "https://komikcast.com"
+
+ @staticmethod
+ def parse_chapter_string(chapter_string, data=None):
+ """Parse 'chapter_string' value and add its info to 'data'"""
+ if not data:
+ data = {}
+
+ match = re.match(
+ r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?",
+ text.unescape(chapter_string),
+ )
+ manga, chapter, data["chapter_minor"], title = match.groups()
+
+ if manga:
+ data["manga"] = manga.partition(" Chapter ")[0]
+ if title and title.lower() != "bahasa indonesia":
+ data["title"] = title.strip()
+ else:
+ data["title"] = ""
+ data["chapter"] = text.parse_int(chapter)
+ data["lang"] = "id"
+ data["language"] = "Indonesian"
+
+ return data
+
+
+class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
+ """Extractor for manga-chapters from komikcast.com"""
+ pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?&#]+/)"
+ test = (
+ (("https://komikcast.com/chapter/"
+ "apotheosis-chapter-02-2-bahasa-indonesia/"), {
+ "url": "f6b43fbc027697749b3ea1c14931c83f878d7936",
+ "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4",
+ }),
+ (("https://komikcast.com/chapter/"
+ "tonari-no-kashiwagi-san-chapter-18b/"), {
+ "url": "aff90dd21dbb945a726778b10bdef522af7c42fe",
+ "keyword": "19b5783864c4299913de436513b124b028b557c1",
+ }),
+ (("https://komikcast.com/chapter/090-eko-to-issho-chapter-1/"), {
+ "url": "cda104a32ea2b06b3d6b096726622f519ed1fa33",
+ }),
+ )
+
+ def metadata(self, page):
+ info = text.extract(page, '<b>', "</b>")[0]
+ return self.parse_chapter_string(info)
+
+ @staticmethod
+ def images(page):
+ readerarea = text.extract(
+ page, '<div id="readerarea">', '<div class="navig">')[0]
+ return [
+ (text.unescape(url), None)
+ for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
+ if "/Banner-" not in url
+ ]
+
+
+class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
+ """Extractor for manga from komikcast.com"""
+ chapterclass = KomikcastChapterExtractor
+ pattern = (r"(?:https?://)?(?:www\.)?komikcast\.com"
+ r"(/(?:komik/)?[^/?&#]+)/?$")
+ test = (
+ ("https://komikcast.com/komik/090-eko-to-issho/", {
+ "url": "dc798d107697d1f2309b14ca24ca9dba30c6600f",
+ "keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1",
+ }),
+ ("https://komikcast.com/tonari-no-kashiwagi-san/"),
+ )
+
+ def chapters(self, page):
+ results = []
+ data = self.metadata(page)
+
+ for item in text.extract_iter(
+ page, '<span class="leftoff"><a href="', '</a>'):
+ url, _, chapter_string = item.rpartition('">Chapter ')
+ self.parse_chapter_string(chapter_string, data)
+ results.append((url, data.copy()))
+ return results
+
+ @staticmethod
+ def metadata(page):
+ """Return a dict with general metadata"""
+ manga , pos = text.extract(page, "<title>" , "</title>")
+ genres, pos = text.extract(page, ">Genres:", "</span>", pos)
+ author, pos = text.extract(page, ">Author:", "</span>", pos)
+ mtype , pos = text.extract(page, ">Type:" , "</span>", pos)
+
+ return {
+ "manga": text.unescape(manga[:-12]),
+ "author": text.remove_html(author),
+ "genres": text.split_html(genres)[::2],
+ "type": text.remove_html(mtype),
+ }
diff --git a/gallery_dl/extractor/konachan.py b/gallery_dl/extractor/konachan.py
new file mode 100644
index 0000000..a9d8b3a
--- /dev/null
+++ b/gallery_dl/extractor/konachan.py
@@ -0,0 +1,85 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://konachan.com/"""
+
+from . import booru
+
+
+class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for konachan extractors"""
+ category = "konachan"
+
+ def __init__(self, match):
+ root = "https://konachan." + match.group("tld")
+ self.api_url = root + "/post.json"
+ self.post_url = root + "/post/show/{}"
+ super().__init__(match)
+
+
+class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
+ """Extractor for images from konachan.com based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
+ test = (
+ ("https://konachan.com/post?tags=patata", {
+ "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
+ }),
+ ("https://konachan.net/post?tags=patata"),
+ )
+
+
+class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
+ """Extractor for image-pools from konachan.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/pool/show/(?P<pool>\d+)")
+ test = (
+ ("https://konachan.com/pool/show/95", {
+ "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
+ }),
+ ("https://konachan.net/pool/show/95"),
+ )
+
+
+class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
+ """Extractor for single images from konachan.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/post/show/(?P<post>\d+)")
+ test = (
+ ("https://konachan.com/post/show/205189", {
+ "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "patata",
+ "tags_character": "clownpiece",
+ "tags_copyright": "touhou",
+ "tags_general": str,
+ },
+ }),
+ ("https://konachan.net/post/show/205189"),
+ )
+
+
+class KonachanPopularExtractor(booru.MoebooruPopularMixin, KonachanExtractor):
+ """Extractor for popular images from konachan.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = (
+ ("https://konachan.com/post/popular_by_month?month=11&year=2010", {
+ "count": 20,
+ }),
+ ("https://konachan.com/post/popular_recent"),
+ ("https://konachan.net/post/popular_recent"),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = (
+ "https://konachan.{tld}/post/popular_{scale}.json".format(
+ tld=match.group("tld"), scale=self.scale))
diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py
new file mode 100644
index 0000000..ed72f4c
--- /dev/null
+++ b/gallery_dl/extractor/livedoor.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for http://blog.livedoor.jp/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class LivedoorExtractor(Extractor):
+ """Base class for livedoor extractors"""
+ category = "livedoor"
+ root = "http://blog.livedoor.jp"
+ filename_fmt = "{post[id]}_{post[title]}_{num:>02}.{extension}"
+ directory_fmt = ("{category}", "{post[user]}")
+ archive_fmt = "{post[id]}_{hash}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for post in self.posts():
+ images = self._images(post)
+ if images:
+ yield Message.Directory, {"post": post}
+ for image in images:
+ yield Message.Url, image["url"], image
+
+ def posts(self):
+ """Return an iterable with post objects"""
+
+ def _load(self, data, body):
+ extr = text.extract_from(data)
+ tags = text.extract(body, '</dt><dd>', '</dl>')[0]
+
+ return {
+ "id" : text.parse_int(extr("id : '", "'")),
+ "title" : text.unescape(extr("title : '", "'")),
+ "categories": [extr("name:'", "'"), extr("name:'", "'")],
+ "date" : text.parse_datetime(
+ extr("date : '", "'"), "%Y-%m-%d %H:%M:%S"),
+ "tags" : text.split_html(tags),
+ "user" : self.user,
+ "body" : body,
+ }
+
+ def _images(self, post):
+ imgs = []
+ body = post.pop("body")
+
+ for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1):
+ src = text.extract(img, 'src="', '"')[0]
+ alt = text.extract(img, 'alt="', '"')[0]
+
+ if not src:
+ continue
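+ # images hosted on livedoor.blogimg.jp link to a scaled-down "-s"
+ # variant; dropping that suffix yields the full-size file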
+ if "://livedoor.blogimg.jp/" in src:
+ url = src.replace("-s.", ".")
+ else:
+ url = text.urljoin(self.root, src)
+ name, _, ext = url.rpartition("/")[2].rpartition(".")
+
+ imgs.append({
+ "url" : url,
+ "num" : num,
+ "hash" : name,
+ "filename" : alt or name,
+ "extension": ext,
+ "post" : post,
+ })
+
+ return imgs
+
+
+class LivedoorBlogExtractor(LivedoorExtractor):
+ """Extractor for a user's blog on blog.livedoor.jp"""
+ subcategory = "blog"
+ pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])"
+ test = (
+ ("http://blog.livedoor.jp/zatsu_ke/", {
+ "range": "1-50",
+ "count": 50,
+ "pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+",
+ "keyword": {
+ "post": {
+ "categories": list,
+ "date": "type:datetime",
+ "id": int,
+ "tags": list,
+ "title": str,
+ "user": "zatsu_ke"
+ },
+ "filename": str,
+ "hash": r"re:\w{4,}",
+ "num": int,
+ },
+ }),
+ ("http://blog.livedoor.jp/uotapo/", {
+ "range": "1-5",
+ "count": 5,
+ }),
+ )
+
+ def posts(self):
+ url = "{}/{}".format(self.root, self.user)
+
+ while url:
+ extr = text.extract_from(self.request(url).text)
+ while True:
+ data = extr('.articles.push(', ');')
+ if not data:
+ break
+ body = extr('class="article-body-inner">',
+ 'class="article-footer">')
+ yield self._load(data, body)
+ url = extr('<a rel="next" href="', '"')
+
+
+class LivedoorPostExtractor(LivedoorExtractor):
+ """Extractor for images from a blog post on blog.livedoor.jp"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/archives/(\d+)"
+ test = (
+ ("http://blog.livedoor.jp/zatsu_ke/archives/51493859.html", {
+ "url": "8826fe623f19dc868e7538e8519bf8491e92a0a2",
+ "keyword": "52fcba9253a000c339bcd658572d252e282626af",
+ }),
+ ("http://blog.livedoor.jp/amaumauma/archives/7835811.html", {
+ "url": "fc1d6a9557245b5a27d3a10bf0fa9922ef377215",
+ "keyword": "0229072abb5cd8a221df72e0ffdfc13336c0e9ce",
+ }),
+ ("http://blog.livedoor.jp/uotapo/archives/1050616939.html", {
+ "url": "3f3581807ec4776e6a67ed7985a22494d4bc4904",
+ "keyword": "2eb3e383c68e909c4dd3d563c16d0b6e2fe6627b",
+ }),
+ )
+
+ def __init__(self, match):
+ LivedoorExtractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def posts(self):
+ url = "{}/{}/archives/{}.html".format(
+ self.root, self.user, self.post_id)
+ extr = text.extract_from(self.request(url).text)
+ data = extr('articles :', '</script>')
+ body = extr('class="article-body-inner">',
+ 'class="article-footer">')
+ return (self._load(data, body),)
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
new file mode 100644
index 0000000..65ae843
--- /dev/null
+++ b/gallery_dl/extractor/luscious.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://luscious.net/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class LusciousBase(Extractor):
+ """Base class for luscious extractors"""
+ category = "luscious"
+ cookiedomain = ".luscious.net"
+ root = "https://members.luscious.net"
+
+ def login(self):
+ """Login and set necessary cookies"""
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=14*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "https://members.luscious.net/accounts/login/"
+ headers = {"Referer": "https://members.luscious.net/login/"}
+ data = {
+ "login": username,
+ "password": password,
+ "remember": "on",
+ "next": "/",
+ }
+
+ response = self.request(url, method="POST", headers=headers, data=data)
+ if "/accounts/login/" in response.url or not response.history:
+ raise exception.AuthenticationError()
+ for cookie in response.history[0].cookies:
+ if cookie.name.startswith("sessionid_"):
+ return {cookie.name: cookie.value}
+ raise exception.AuthenticationError()
+
+ @staticmethod
+ def _parse_tags(tags):
+ return [
+ text.unescape(tag.replace(":_", ":"))
+ for tag in text.extract_iter(tags or "", "/tags/", "/")
+ ]
+
+
+class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
+ """Extractor for image albums from luscious.net"""
+ subcategory = "album"
+ archive_fmt = "{gallery_id}_{image_id}"
+ pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net"
+ r"/(?:albums|pictures/c/[^/?&#]+/album)/([^/?&#]+_(\d+))")
+ test = (
+ ("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
+ "url": "7e4984a271a1072ac6483e4228a045895aff86f3",
+ "keyword": "c597c132834f4990f90bf5dee5de2a9d4ba263a4",
+ "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
+ }),
+ ("https://luscious.net/albums/virgin-killer-sweater_282582/", {
+ "url": "21cc68a7548f4d71dfd67d8caf96349dde7e791c",
+ "keyword": "e1202078b504adeccd521aa932f456a5a85479a0",
+ }),
+ ("https://luscious.net/albums/not-found_277035/", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://members.luscious.net/albums/login-required_323871/", {
+ "options": (("username", None),),
+ "exception": exception.AuthorizationError,
+ }),
+ ("https://www.luscious.net/albums/okinami_277031/"),
+ ("https://members.luscious.net/albums/okinami_277031/"),
+ ("https://luscious.net/pictures/c/video_game_manga/album"
+ "/okinami-no-koigokoro_277031/sorted/position/id/16528978/@_1"),
+ )
+
+ def __init__(self, match):
+ path, self.gallery_id = match.groups()
+ url = "{}/albums/{}/".format(self.root, path)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ pos = page.find("<h1>404 Not Found</h1>")
+ if pos >= 0:
+ msg = text.extract(page, '<div class="content">', '</div>', pos)[0]
+ if msg and "content is not available" in msg:
+ raise exception.AuthorizationError()
+ raise exception.NotFoundError("album")
+
+ title, pos = text.extract(page, '"og:title" content="', '"')
+ info , pos = text.extract(page, '<li class="user_info">', "", pos)
+ if info is None:
+ count, pos = text.extract(page, '>Pages:', '<', pos)
+ else:
+ count, pos = text.extract(page, '<p>', ' ', pos)
+ genre, pos = text.extract(page, '<p>Genre:', '</p>', pos)
+ adnce, pos = text.extract(page, '<p>Audience:', '</p>', pos)
+ tags , pos = text.extract(page, '"tag_list static">', '</ol>', pos)
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : text.unescape(title or ""),
+ "count" : text.parse_int(count),
+ "genre" : text.remove_html(genre),
+ "audience" : text.remove_html(adnce),
+ "tags" : self._parse_tags(tags),
+ }
+
+ def images(self, page):
+ extr = text.extract
+
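+ # start at the album's oldest picture and follow the rel="next"
+ # links until the trailing "more_like_this" page is reached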
+ url = "{}/pictures/album/x_{}/sorted/old/page/1/".format(
+ self.root, self.gallery_id)
+ page = self.request(url).text
+ pos = page.find('<div id="picture_page_')
+ url = extr(page, '<a href="', '"', pos)[0]
+ iurl = None
+
+ while url and not url.endswith("/more_like_this/"):
+ page = self.request(self.root + url).text
+
+ if not iurl: # first loop iteration
+ current = extr(page, '"pj_current_page" value="', '"')[0]
+ if current and current != "1":
+ url = "{}/albums/{}/jump_to_page/1/".format(
+ self.root, self.gallery_id)
+ page = self.request(url, method="POST").text
+
+ iid , pos = extr(url , '/id/', '/')
+ url , pos = extr(page, '<link rel="next" href="', '"')
+ name, pos = extr(page, '<h1 id="picture_title">', '</h1>', pos)
+ _ , pos = extr(page, '<ul class="image_option_icons">', '', pos)
+ iurl, pos = extr(page, '<li><a href="', '"', pos+100)
+
+ if iurl[0] == "/":
+ iurl = text.urljoin(self.root, iurl)
+
+ yield iurl, {
+ "name": name,
+ "image_id": text.parse_int(iid),
+ }
+
+
+class LusciousSearchExtractor(LusciousBase, Extractor):
+ """Extractor for album searches on luscious.net"""
+ subcategory = "search"
+ pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net"
+ r"/(albums(?:/(?![^/?&#]+_\d+)[^/?&#]+)+|manga|pictures)/?$")
+ test = (
+ ("https://luscious.net/manga/"),
+ ("https://members.luscious.net/albums/sorted/updated/album_type/manga"
+ "/content_id/2/tagged/+full_color/page/1/", {
+ "pattern": LusciousAlbumExtractor.pattern,
+ "range": "20-40",
+ "count": 21,
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1).partition("/page/")[0]
+ if not self.path.startswith("albums/"):
+ self.path = "albums/sorted/updated/album_type/" + self.path
+
+ def items(self):
+ self.login()
+ yield Message.Version, 1
+ for album in self.albums():
+ url, data = self.parse_album(album)
+ yield Message.Queue, url, data
+
+ def albums(self, pnum=1):
+ while True:
+ url = "{}/{}/page/{}/.json/".format(self.root, self.path, pnum)
+ data = self.request(url).json()
+
+ yield from text.extract_iter(
+ data["html"], "<figcaption>", "</figcaption>")
+
+ if data["paginator_complete"]:
+ return
+ pnum += 1
+
+ def parse_album(self, album):
+ url , pos = text.extract(album, 'href="', '"')
+ title, pos = text.extract(album, ">", "<", pos)
+ count, pos = text.extract(album, "# of pictures:", "<", pos)
+ date , pos = text.extract(album, "Updated:&nbsp;", "<", pos)
+ desc , pos = text.extract(album, "class='desc'>", "<", pos)
+ tags , pos = text.extract(album, "<ol ", "</ol>", pos)
+
+ return text.urljoin(self.root, url), {
+ "title": text.unescape(title or ""),
+ "description": text.unescape(desc or ""),
+ "gallery_id": text.parse_int(url.rpartition("_")[2].rstrip("/")),
+ "count": text.parse_int(count),
+ "date": date,
+ "tags": self._parse_tags(tags),
+ "_extractor": LusciousAlbumExtractor,
+ }
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
new file mode 100644
index 0000000..d0eb2a9
--- /dev/null
+++ b/gallery_dl/extractor/mangadex.py
@@ -0,0 +1,180 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://mangadex.org/"""
+
+from .common import Extractor, Message
+from .. import text, util
+from ..cache import memcache
+
+
+class MangadexExtractor(Extractor):
+ """Base class for mangadex extractors"""
+ category = "mangadex"
+ root = "https://mangadex.org"
+
+ # mangadex-to-iso639-1 codes
+ iso639_map = {
+ "br": "pt",
+ "ct": "ca",
+ "gb": "en",
+ "vn": "vi",
+ }
+
+ def chapter_data(self, chapter_id):
+ """Request API results for 'chapter_id'"""
+ url = "{}/api/chapter/{}".format(self.root, chapter_id)
+ return self.request(url).json()
+
+ @memcache(keyarg=1)
+ def manga_data(self, manga_id):
+ """Request API results for 'manga_id'"""
+ url = "{}/api/manga/{}".format(self.root, manga_id)
+ return self.request(url).json()
+
+
+class MangadexChapterExtractor(MangadexExtractor):
+ """Extractor for manga-chapters from mangadex.org"""
+ subcategory = "chapter"
+ directory_fmt = (
+ "{category}", "{manga}",
+ "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}")
+ filename_fmt = (
+ "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)"
+ test = (
+ ("https://mangadex.org/chapter/122094", {
+ "keyword": "1c834dca33025f521e1874aee1f71c51e28ebf99",
+ "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
+ }),
+ # oneshot
+ ("https://mangadex.org/chapter/138086", {
+ "count": 64,
+ "keyword": "178777bd0352fb19eb934cbee5630d16e3fb60ab",
+ }),
+ )
+
+ def __init__(self, match):
+ MangadexExtractor.__init__(self, match)
+ self.chapter_id = match.group(1)
+ self.data = None
+
+ def items(self):
+ data = self.metadata()
+ imgs = self.images()
+ data["count"] = len(imgs)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def metadata(self):
+ """Return a dict with general metadata"""
+ cdata = self.chapter_data(self.chapter_id)
+ mdata = self.manga_data(cdata["manga_id"])
+ self.data = cdata
+
+ chapter, sep, minor = cdata["chapter"].partition(".")
+ return {
+ "manga": mdata["manga"]["title"],
+ "manga_id": cdata["manga_id"],
+ "artist": mdata["manga"]["artist"],
+ "author": mdata["manga"]["author"],
+ "title": text.unescape(cdata["title"]),
+ "volume": text.parse_int(cdata["volume"]),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "chapter_id": cdata["id"],
+ "group": mdata["chapter"][self.chapter_id]["group_name"],
+ "date": cdata["timestamp"],
+ "lang": util.language_to_code(cdata["lang_name"]),
+ "language": cdata["lang_name"],
+ }
+
+ def images(self):
+ """Return a list of all image URLs"""
+ base = self.data["server"] + self.data["hash"] + "/"
+ if base.startswith("/"):
+ base = text.urljoin(self.root, base)
+ return [base + page for page in self.data["page_array"]]
+
+
+class MangadexMangaExtractor(MangadexExtractor):
+ """Extractor for manga from mangadex.org"""
+ subcategory = "manga"
+ categorytransfer = True
+ pattern = (r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)"
+ r"/(?:title|manga)/(\d+)")
+ test = (
+ ("https://mangadex.org/manga/2946/souten-no-koumori", {
+ "pattern": r"https://mangadex.org/chapter/\d+",
+ "keywords": {
+ "manga": "Souten no Koumori",
+ "manga_id": 2946,
+ "title": "Oneshot",
+ "volume": 0,
+ "chapter": 0,
+ "chapter_minor": "",
+ "chapter_id": int,
+ "group": str,
+ "date": int,
+ "lang": str,
+ "language": str,
+ },
+ }),
+ ("https://mangadex.org/manga/13318/dagashi-kashi/chapters/2/", {
+ "count": ">= 100",
+ }),
+ ("https://mangadex.org/title/13004/yorumori-no-kuni-no-sora-ni", {
+ "count": 0,
+ }),
+ ("https://mangadex.org/title/2946/souten-no-koumori"),
+ )
+
+ def __init__(self, match):
+ MangadexExtractor.__init__(self, match)
+ self.manga_id = text.parse_int(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ for data in self.chapters():
+ url = "{}/chapter/{}".format(self.root, data["chapter_id"])
+ yield Message.Queue, url, data
+
+ def chapters(self):
+ """Return a sorted list of chapter-metadata dicts"""
+ data = self.manga_data(self.manga_id)
+ if "chapter" not in data:
+ return ()
+ manga = data["manga"]
+
+ results = []
+ for chid, info in data["chapter"].items():
+ chapter, sep, minor = info["chapter"].partition(".")
+ lang = self.iso639_map.get(info["lang_code"], info["lang_code"])
+ results.append({
+ "manga": manga["title"],
+ "manga_id": self.manga_id,
+ "artist": manga["artist"],
+ "author": manga["author"],
+ "title": text.unescape(info["title"]),
+ "volume": text.parse_int(info["volume"]),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "chapter_id": text.parse_int(chid),
+ "group": text.unescape(info["group_name"]),
+ "date": info["timestamp"],
+ "lang": lang,
+ "language": util.code_to_language(lang),
+ "_extractor": MangadexChapterExtractor,
+ })
+
+ results.sort(key=lambda x: (x["chapter"], x["chapter_minor"]))
+ return results
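
A rough sketch of the request flow these extractors implement, using the ID from the first chapter test above (only fields the code actually reads are listed; this is an illustration, not part of the patch):

    # GET https://mangadex.org/api/chapter/122094
    #   -> server, hash, page_array, manga_id, chapter, volume, title, id, timestamp, lang_name
    # GET https://mangadex.org/api/manga/<manga_id>
    #   -> manga (title, artist, author), chapter (per-id group_name, ...)
    # manga_data() is wrapped in @memcache, so repeated lookups of the same
    # manga_id within one run can be answered from memory.
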
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
new file mode 100644
index 0000000..1b8a4a6
--- /dev/null
+++ b/gallery_dl/extractor/mangafox.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://fanfox.net/"""
+
+from .common import ChapterExtractor
+from .. import text
+
+
+class MangafoxChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from fanfox.net"""
+ category = "mangafox"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?(?:mangafox\.me|fanfox\.net)"
+ r"(/manga/[^/]+/((?:v(\d+)/)?c(\d+)([^/?&#]*)))")
+ test = (
+ ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
+ "keyword": "5661dab258d42d09d98f194f7172fb9851a49766",
+ "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
+ }),
+ ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/"),
+ )
+ root = "https://m.fanfox.net"
+
+ def __init__(self, match):
+ base, self.cstr, self.volume, self.chapter, self.minor = match.groups()
+ self.urlbase = self.root + base
+ ChapterExtractor.__init__(self, match, self.urlbase + "/1.html")
+
+ def metadata(self, page):
+ manga, pos = text.extract(page, "<title>", "</title>")
+ count, pos = text.extract(
+ page, ">", "<", page.find("</select>", pos) - 20)
+ sid , pos = text.extract(page, "var series_id =", ";", pos)
+ cid , pos = text.extract(page, "var chapter_id =", ";", pos)
+
+ return {
+ "manga": text.unescape(manga),
+ "volume": text.parse_int(self.volume),
+ "chapter": text.parse_int(self.chapter),
+ "chapter_minor": self.minor or "",
+ "chapter_string": self.cstr,
+ "count": text.parse_int(count),
+ "sid": text.parse_int(sid),
+ "cid": text.parse_int(cid),
+ }
+
+ def images(self, page):
+ pnum = 1
+ while True:
+ url, pos = text.extract(page, '<img src="', '"')
+ yield url, None
+ url, pos = text.extract(page, ' src="', '"', pos)
+ yield url, None
+
+ pnum += 2
+ page = self.request("{}/{}.html".format(self.urlbase, pnum)).text
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
new file mode 100644
index 0000000..e15acbe
--- /dev/null
+++ b/gallery_dl/extractor/mangahere.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://www.mangahere.cc/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import re
+
+
+class MangahereBase():
+ """Base class for mangahere extractors"""
+ category = "mangahere"
+ root = "https://www.mangahere.cc"
+ mobile_root = "https://m.mangahere.cc"
+ url_fmt = mobile_root + "/manga/{}/{}.html"
+
+
+class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
+ """Extractor for manga-chapters from mangahere.cc"""
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/"
+ r"([^/]+(?:/v0*(\d+))?/c([^/?&#]+))")
+ test = (
+ ("https://www.mangahere.cc/manga/dongguo_xiaojie/c004.2/", {
+ "keyword": "7c98d7b50a47e6757b089aa875a53aa970cac66f",
+ "content": "708d475f06893b88549cbd30df1e3f9428f2c884",
+ }),
+ ("http://www.mangahere.co/manga/dongguo_xiaojie/c003.2/"),
+ ("http://m.mangahere.co/manga/dongguo_xiaojie/c003.2/"),
+ )
+
+ def __init__(self, match):
+ self.part, self.volume, self.chapter = match.groups()
+ url = self.url_fmt.format(self.part, 1)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ pos = page.index("</select>")
+ count , pos = text.extract(page, ">", "<", pos - 20)
+ manga_id , pos = text.extract(page, "series_id = ", ";", pos)
+ chapter_id, pos = text.extract(page, "chapter_id = ", ";", pos)
+ manga , pos = text.extract(page, '"name":"', '"', pos)
+ chapter, dot, minor = self.chapter.partition(".")
+
+ return {
+ "manga": text.unescape(manga),
+ "manga_id": text.parse_int(manga_id),
+ "title": self._get_title(),
+ "volume": text.parse_int(self.volume),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": dot + minor,
+ "chapter_id": text.parse_int(chapter_id),
+ "count": text.parse_int(count),
+ "lang": "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ pnum = 1
+
+ while True:
+ url, pos = text.extract(page, '<img src="', '"')
+ yield url, None
+ url, pos = text.extract(page, ' src="', '"', pos)
+ yield url, None
+ pnum += 2
+ page = self.request(self.url_fmt.format(self.part, pnum)).text
+
+ def _get_title(self):
+ url = "{}/manga/{}/".format(self.root, self.part)
+ page = self.request(url).text
+
+ try:
+ pos = page.index(self.part) + len(self.part)
+ pos = page.index(self.part, pos) + len(self.part)
+ return text.extract(page, ' title="', '"', pos)[0]
+ except ValueError:
+ return ""
+
+
+class MangahereMangaExtractor(MangahereBase, MangaExtractor):
+ """Extractor for manga from mangahere.cc"""
+ chapterclass = MangahereChapterExtractor
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]"
+ r"(/manga/[^/]+)/?(?:#.*)?$")
+ test = (
+ ("https://www.mangahere.cc/manga/aria/", {
+ "url": "23ad9256f7392de5973b79a36f6875e9fdcb7563",
+ "keyword": "79e326641e7d5d2fed43a1eb9949471b8162a9e0",
+ }),
+ ("https://www.mangahere.cc/manga/hiyokoi/#50", {
+ "url": "654850570aa03825cd57e2ae2904af489602c523",
+ "keyword": "c8084d89a9ea6cf40353093669f9601a39bf5ca2",
+ }),
+ ("https://www.mangahere.co/manga/aria/"),
+ ("https://m.mangahere.co/manga/aria/"),
+ )
+
+ def chapters(self, page):
+ results = []
+ manga, pos = text.extract(page, '<meta name="og:title" content="', '"')
+ manga = text.unescape(manga)
+
+ page = text.extract(
+ page, 'id="chapterlist"', 'class="detail-main-list-more"', pos)[0]
+ pos = 0
+ while True:
+ url, pos = text.extract(page, ' href="', '"', pos)
+ if not url:
+ return results
+ info, pos = text.extract(page, 'class="title3">', '<', pos)
+ date, pos = text.extract(page, 'class="title2">', '<', pos)
+
+ match = re.match(
+ r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?", info)
+ if match:
+ volume, chapter, minor, title = match.groups()
+ else:
+ chapter, _, minor = url[:-1].rpartition("/c")[2].partition(".")
+ minor = "." + minor
+ volume = 0
+ title = ""
+
+ results.append((text.urljoin(self.root, url), {
+ "manga": manga,
+ "title": text.unescape(title) if title else "",
+ "volume": text.parse_int(volume),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": minor,
+ "date": date,
+ "lang": "en",
+ "language": "English",
+ }))
diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py
new file mode 100644
index 0000000..18ef005
--- /dev/null
+++ b/gallery_dl/extractor/mangapanda.py
@@ -0,0 +1,36 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://www.mangapanda.com/"""
+
+from .mangareader import MangareaderMangaExtractor, MangareaderChapterExtractor
+
+
+class MangapandaBase():
+ """Base class for mangapanda extractors"""
+ category = "mangapanda"
+ root = "https://www.mangapanda.com"
+
+
+class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
+ """Extractor for manga-chapters from mangapanda.com"""
+ pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))"
+ test = ("https://www.mangapanda.com/red-storm/2", {
+ "url": "1f633f776e950531ba9b1e81965316458e785261",
+ "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb",
+ })
+
+
+class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor):
+ """Extractor for manga from mangapanda.com"""
+ chapterclass = MangapandaChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?&#]+)/?$"
+ test = ("https://www.mangapanda.com/mushishi", {
+ "url": "357f965732371cac1990fee8b480f62e29141a42",
+ "keyword": "031b3ea085921c552de017ecbb9b906e462229c9",
+ })
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
new file mode 100644
index 0000000..ee11231
--- /dev/null
+++ b/gallery_dl/extractor/mangapark.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://mangapark.me/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text, exception
+import json
+
+
+class MangaparkBase():
+ """Base class for mangapark extractors"""
+ category = "mangapark"
+ root_fmt = "https://mangapark.{}"
+
+ @staticmethod
+ def parse_chapter_path(path, data):
+ """Get volume/chapter information from url-path of a chapter"""
+ data["volume"], data["chapter_minor"] = 0, ""
+ for part in path.split("/")[1:]:
+ key, value = part[0], part[1:]
+ if key == "c":
+ chapter, dot, minor = value.partition(".")
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = dot + minor
+ elif key == "i":
+ data["chapter_id"] = text.parse_int(value)
+ elif key == "v":
+ data["volume"] = text.parse_int(value)
+ elif key == "s":
+ data["stream"] = text.parse_int(value)
+ elif key == "e":
+ data["chapter_minor"] = "v" + value
+
+
+class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
+ """Extractor for manga-chapters from mangapark.me"""
+ pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
+ r"/manga/([^?&#]+/i\d+)")
+ test = (
+ ("https://mangapark.me/manga/gosu/i811615/c55/1", {
+ "count": 50,
+ "keyword": "373d678048d29492f9763743ccaa9b6d840f17cf",
+ }),
+ (("https://mangapark.me/manga"
+ "/ad-astra-per-aspera-hata-kenjirou/i662054/c001.2/1"), {
+ "count": 40,
+ "keyword": "8e9cce4ed0e25d12a45e02f840d6f32ef838e257",
+ }),
+ ("https://mangapark.me/manga/gekkan-shoujo-nozaki-kun/i655476/c70/1", {
+ "count": 15,
+ "keyword": "19f730617074d65f91c0781f429de324890925bf",
+ }),
+ ("https://mangapark.net/manga/gosu/i811615/c55/1"),
+ ("https://mangapark.com/manga/gosu/i811615/c55/1"),
+ )
+
+ def __init__(self, match):
+ tld, self.path = match.groups()
+ self.root = self.root_fmt.format(tld)
+ url = "{}/manga/{}?zoom=2".format(self.root, self.path)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ data = text.extract_all(page, (
+ ("manga_id" , "var _manga_id = '", "'"),
+ ("chapter_id", "var _book_id = '", "'"),
+ ("stream" , "var _stream = '", "'"),
+ ("path" , "var _book_link = '", "'"),
+ ("manga" , "<h2>", "</h2>"),
+ ("title" , "</a>", "<"),
+ ), values={"lang": "en", "language": "English"})[0]
+
+ if not data["path"]:
+ raise exception.NotFoundError("chapter")
+ self.parse_chapter_path(data["path"], data)
+
+ data["manga"], _, data["type"] = data["manga"].rpartition(" ")
+ data["manga"] = text.unescape(data["manga"])
+ data["title"] = data["title"].partition(": ")[2]
+ for key in ("manga_id", "chapter_id", "stream"):
+ data[key] = text.parse_int(data[key])
+
+ return data
+
+ def images(self, page):
+ data = json.loads(text.extract(
+ page, "var _load_pages =", ";")[0] or "[]")
+ return [
+ (text.urljoin(self.root, item["u"]), {
+ "width": text.parse_int(item["w"]),
+ "height": text.parse_int(item["h"]),
+ })
+ for item in data
+ ]
+
+
+class MangaparkMangaExtractor(MangaparkBase, MangaExtractor):
+ """Extractor for manga from mangapark.me"""
+ chapterclass = MangaparkChapterExtractor
+ pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)"
+ r"(/manga/[^/?&#]+)/?$")
+ test = (
+ ("https://mangapark.me/manga/aria", {
+ "url": "a58be23ef3874fe9705b0b41dd462b67eaaafd9a",
+ "keyword": "b3b5a30aa2a326bc0ca8b74c65b5ecd4bf676ebf",
+ }),
+ ("https://mangapark.net/manga/aria"),
+ ("https://mangapark.com/manga/aria"),
+ )
+
+ def __init__(self, match):
+ self.root = self.root_fmt.format(match.group(1))
+ MangaExtractor.__init__(self, match, self.root + match.group(2))
+
+ def chapters(self, page):
+ results = []
+ data = {"lang": "en", "language": "English"}
+ data["manga"] = text.unescape(
+ text.extract(page, '<title>', ' Manga - ')[0])
+
+ for stream in page.split('<div id="stream_')[1:]:
+ data["stream"] = text.parse_int(text.extract(stream, '', '"')[0])
+
+ for chapter in text.extract_iter(stream, '<li ', '</li>'):
+ path , pos = text.extract(chapter, 'href="', '"')
+ title, pos = text.extract(chapter, '>: </span>', '<', pos)
+ count, pos = text.extract(chapter, ' of ', ' ', pos)
+
+ self.parse_chapter_path(path[8:], data)
+ data["title"] = title.strip() if title else ""
+ data["count"] = text.parse_int(count)
+ results.append((self.root + path, data.copy()))
+
+ return results
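
parse_chapter_path() above keys each path segment on its leading letter (v: volume, c: chapter plus optional ".minor", i: chapter_id, s: stream, e: chapter_minor as "v<n>"). A minimal sketch of its effect, assuming the gallery_dl package is importable and using segments that mirror the second chapter test URL:

    from gallery_dl.extractor.mangapark import MangaparkBase

    data = {}
    MangaparkBase.parse_chapter_path("/i662054/c001.2", data)
    # data == {"volume": 0, "chapter": 1, "chapter_minor": ".2", "chapter_id": 662054}
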
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
new file mode 100644
index 0000000..d24d452
--- /dev/null
+++ b/gallery_dl/extractor/mangareader.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://www.mangareader.net/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+
+
+class MangareaderBase():
+ """Base class for mangareader extractors"""
+ category = "mangareader"
+ root = "https://www.mangareader.net"
+
+ @staticmethod
+ def parse_page(page, data):
+ """Parse metadata on 'page' and add it to 'data'"""
+ text.extract_all(page, (
+ ("manga" , '<h2 class="aname">', '</h2>'),
+ ("release", '>Year of Release:</td>\n<td>', '</td>'),
+ ('author' , '>Author:</td>\n<td>', '</td>'),
+ ('artist' , '>Artist:</td>\n<td>', '</td>'),
+ ), values=data)
+ data["manga"] = data["manga"].strip()
+ data["author"] = text.unescape(data["author"])
+ data["artist"] = text.unescape(data["artist"])
+ return data
+
+
+class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
+ """Extractor for manga-chapters from mangareader.net"""
+ archive_fmt = "{manga}_{chapter}_{page}"
+ pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"
+ test = (("https://www.mangareader.net"
+ "/karate-shoukoushi-kohinata-minoru/11"), {
+ "url": "061cc92a07edf17bb991ce0821fa4c77a147a860",
+ "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6",
+ })
+
+ def __init__(self, match):
+ path, self.url_title, self.chapter = match.groups()
+ ChapterExtractor.__init__(self, match, self.root + path)
+
+ def metadata(self, chapter_page):
+ page = self.request(self.root + self.url_title).text
+ data = self.parse_page(page, {
+ "chapter": text.parse_int(self.chapter),
+ "lang": "en",
+ "language": "English",
+ })
+ text.extract_all(page, (
+ ('title', ' ' + self.chapter + '</a> : ', '</td>'),
+ ('date', '<td>', '</td>'),
+ ), page.index('<div id="chapterlist">'), data)
+ data["count"] = text.parse_int(text.extract(
+ chapter_page, '</select> of ', '<')[0]
+ )
+ return data
+
+ def images(self, page):
+ while True:
+ next_url, image_url, image_data = self.get_image_metadata(page)
+ yield image_url, image_data
+
+ if not next_url:
+ return
+ page = self.request(next_url).text
+
+ def get_image_metadata(self, page):
+ """Collect next url, image-url and metadata for one manga-page"""
+ extr = text.extract
+ width = None
+ test , pos = extr(page, "document['pu']", '')
+ if test is None:
+ return None, None, None
+ if page.find("document['imgwidth']", pos, pos+200) != -1:
+ width , pos = extr(page, "document['imgwidth'] = ", ";", pos)
+ height, pos = extr(page, "document['imgheight'] = ", ";", pos)
+ _ , pos = extr(page, '<div id="imgholder">', '')
+ url, pos = extr(page, ' href="', '"', pos)
+ if width is None:
+ width , pos = extr(page, '<img id="img" width="', '"', pos)
+ height, pos = extr(page, ' height="', '"', pos)
+ image, pos = extr(page, ' src="', '"', pos)
+ return self.root + url, image, {
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ }
+
+
+class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
+ """Extractor for manga from mangareader.net"""
+ chapterclass = MangareaderChapterExtractor
+ reverse = False
+ pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?&#]+)/?$"
+ test = ("https://www.mangareader.net/mushishi", {
+ "url": "bc203b858b4ad76e5d77e39118a7be0350e357da",
+ "keyword": "031b3ea085921c552de017ecbb9b906e462229c9",
+ })
+
+ def chapters(self, page):
+ results = []
+ data = self.parse_page(page, {"lang": "en", "language": "English"})
+
+ needle = '<div class="chico_manga"></div>\n<a href="'
+ pos = page.index('<div id="chapterlist">')
+ while True:
+ url, pos = text.extract(page, needle, '"', pos)
+ if not url:
+ return results
+ data["title"], pos = text.extract(page, '</a> : ', '</td>', pos)
+ data["date"] , pos = text.extract(page, '<td>', '</td>', pos)
+ data["chapter"] = text.parse_int(url.rpartition("/")[2])
+ results.append((self.root + url, data.copy()))
diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py
new file mode 100644
index 0000000..7ff0239
--- /dev/null
+++ b/gallery_dl/extractor/mangastream.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from https://readms.net/"""
+
+from .common import ChapterExtractor
+from .. import text
+
+
+class MangastreamChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from mangastream.com"""
+ category = "mangastream"
+ archive_fmt = "{chapter_id}_{page}"
+ pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)"
+ r"/r(?:ead)?/([^/]*/([^/]+)/(\d+))")
+ test = (
+ ("https://readms.net/r/onepunch_man/087/4874/1"),
+ ("https://mangastream.com/r/onepunch_man/087/4874/1"),
+ )
+ root = "https://readms.net"
+
+ def __init__(self, match):
+ self.part, self.chapter, self.chapter_id = match.groups()
+ url = "{}/r/{}".format(self.root, self.part)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ manga, pos = text.extract(
+ page, '<span class="hidden-xs hidden-sm">', "<")
+ pos = page.find(self.part, pos)
+ title, pos = text.extract(page, ' - ', '<', pos)
+ count, pos = text.extract(page, 'Last Page (', ')', pos)
+ return {
+ "manga": manga,
+ "chapter": text.unquote(self.chapter),
+ "chapter_id": text.parse_int(self.chapter_id),
+ "title": title,
+ "count": text.parse_int(count, 1),
+ "lang": "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ while True:
+ pos = page.index(' class="page"')
+ next_url = text.extract(page, ' href="', '"', pos)[0]
+ image_url = text.extract(page, ' src="', '"', pos)[0]
+ yield text.urljoin(self.root, image_url), None
+ page = self.request(text.urljoin(self.root, next_url)).text
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
new file mode 100644
index 0000000..4ad8da2
--- /dev/null
+++ b/gallery_dl/extractor/mangoxo.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.mangoxo.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import hashlib
+
+
+class MangoxoExtractor(Extractor):
+ """Base class for mangoxo extractors"""
+ category = "mangoxo"
+ root = "https://www.mangoxo.com"
+ cookiedomain = "www.mangoxo.com"
+ cookienames = ("SESSION",)
+ _warning = True
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ elif MangoxoExtractor._warning:
+ MangoxoExtractor._warning = False
+ self.log.warning("Unauthenticated users cannot see "
+ "more than 5 images per album")
+
+ @cache(maxage=3*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ page = self.request(self.root + "/login/").text
+ token = text.extract(page, 'id="loginToken" value="', '"')[0]
+ if not token:
+ self.log.debug("failed to extract 'loginToken'")
+
+ url = self.root + "/login/loginxmm"
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": self.root + "/login",
+ }
+ data = {
+ "name": username,
+ "password": hashlib.md5(password.encode()).hexdigest(),
+ "loginToken": token,
+ }
+ response = self.request(url, method="POST", headers=headers, data=data)
+
+ if response.json().get("result") != "1":
+ raise exception.AuthenticationError()
+ return {"SESSION": self.session.cookies.get("SESSION")}
+
+ @staticmethod
+ def _total_pages(page):
+ return text.parse_int(text.extract(page, "total :", ",")[0])
+
+
+class MangoxoAlbumExtractor(MangoxoExtractor):
+ """Extractor for albums on mangoxo.com"""
+ subcategory = "album"
+ filename_fmt = "{album[id]}_{num:>03}.{extension}"
+ directory_fmt = ("{category}", "{channel[name]}", "{album[name]}")
+ archive_fmt = "{album[id]}_{num}"
+ pattern = r"(?:https?://)?(?:www\.)?mangoxo\.com/album/(\w+)"
+ test = ("https://www.mangoxo.com/album/lzVOv1Q9", {
+ "url": "ad921fe62663b06e7d73997f7d00646cab7bdd0d",
+ "keyword": {
+ "channel": {
+ "id": "QeYKRkO0",
+ "name": "美女图社",
+ "cover": str,
+ },
+ "album": {
+ "id": "lzVOv1Q9",
+ "name": "池永康晟 Ikenaga Yasunari 透出古朴气息的日本美女人像画作",
+ "date": "2019.3.22 14:42",
+ "description": str,
+ },
+ "num": int,
+ "count": 65,
+ },
+ })
+
+ def __init__(self, match):
+ MangoxoExtractor.__init__(self, match)
+ self.album_id = match.group(1)
+
+ def items(self):
+ self.login()
+ url = "{}/album/{}/".format(self.root, self.album_id)
+ page = self.request(url).text
+ data = self.metadata(page)
+ imgs = self.images(url, page)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], image in enumerate(imgs, 1):
+ yield Message.Url, image, text.nameext_from_url(image, data)
+
+ def metadata(self, page):
+ """Return general metadata"""
+ title, pos = text.extract(page, '<title>', '</title>')
+ count, pos = text.extract(page, 'id="pic-count">', '<', pos)
+ cover, pos = text.extract(page, ' src="', '"', pos)
+ cid , pos = text.extract(page, '//www.mangoxo.com/channel/', '"', pos)
+ cname, pos = text.extract(page, '>', '<', pos)
+ date , pos = text.extract(page, '</i>', '<', pos)
+ descr, pos = text.extract(page, '<pre>', '</pre>', pos)
+
+ return {
+ "channel": {
+ "id": cid,
+ "name": text.unescape(cname),
+ "cover": cover,
+ },
+ "album": {
+ "id": self.album_id,
+ "name": text.unescape(title),
+ "date": date.strip(),
+ "description": text.unescape(descr),
+ },
+ "count": text.parse_int(count),
+ }
+
+ def images(self, url, page):
+ """Generator; Yields all image URLs"""
+ total = self._total_pages(page)
+ num = 1
+
+ while True:
+ yield from text.extract_iter(
+ page, 'class="lightgallery-item" href="', '"')
+ if num >= total:
+ return
+ num += 1
+ page = self.request(url + str(num)).text
+
+
+class MangoxoChannelExtractor(MangoxoExtractor):
+ """Extractor for all albums on a mangoxo channel"""
+ subcategory = "channel"
+ pattern = r"(?:https?://)?(?:www\.)?mangoxo\.com/channel/(\w+)"
+ test = ("https://www.mangoxo.com/channel/QeYKRkO0", {
+ "pattern": MangoxoAlbumExtractor.pattern,
+ "range": "1-30",
+ "count": "> 20",
+ })
+
+ def __init__(self, match):
+ MangoxoExtractor.__init__(self, match)
+ self.channel_id = match.group(1)
+
+ def items(self):
+ self.login()
+ num = total = 1
+ url = "{}/channel/{}/album/".format(self.root, self.channel_id)
+ yield Message.Version, 1
+
+ while True:
+ page = self.request(url + str(num)).text
+
+ for album in text.extract_iter(
+ page, '<a class="link black" href="', '"'):
+ yield Message.Queue, album, {}
+
+ if num == 1:
+ total = self._total_pages(page)
+ if num >= total:
+ return
+ num += 1
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
new file mode 100644
index 0000000..28a2c2d
--- /dev/null
+++ b/gallery_dl/extractor/mastodon.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for mastodon instances"""
+
+from .common import Extractor, Message
+from .. import text, config, exception
+import re
+
+
+class MastodonExtractor(Extractor):
+ """Base class for mastodon extractors"""
+ basecategory = "mastodon"
+ directory_fmt = ("mastodon", "{instance}", "{account[username]}")
+ filename_fmt = "{category}_{id}_{media[id]}.{extension}"
+ archive_fmt = "{media[id]}"
+ instance = None
+ root = None
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = MastodonAPI(self)
+
+ def config(self, key, default=None, *, sentinel=object()):
+ value = Extractor.config(self, key, sentinel)
+ if value is not sentinel:
+ return value
+ return config.interpolate(
+ ("extractor", "mastodon", self.instance, self.subcategory, key),
+ default,
+ )
+
+ def items(self):
+ yield Message.Version, 1
+ for status in self.statuses():
+ attachments = self.prepare(status)
+ yield Message.Directory, status
+ for media in attachments:
+ status["media"] = media
+ url = media["url"]
+ yield Message.Url, url, text.nameext_from_url(url, status)
+
+ def statuses(self):
+ """Return an iterable containing all relevant Status-objects"""
+ return ()
+
+ def prepare(self, status):
+ """Prepare a status object"""
+ status["instance"] = self.instance
+ status["tags"] = [tag["name"] for tag in status["tags"]]
+ attachments = status["media_attachments"]
+ del status["media_attachments"]
+ return attachments
+
+
+class MastodonUserExtractor(MastodonExtractor):
+ """Extractor for all images of an account/user"""
+ subcategory = "user"
+
+ def __init__(self, match):
+ MastodonExtractor.__init__(self, match)
+ self.account_name = match.group(1)
+
+ def statuses(self):
+ results = self.api.account_search("@" + self.account_name, 1)
+ for account in results:
+ if account["username"] == self.account_name:
+ break
+ else:
+ raise exception.NotFoundError("account")
+ return self.api.account_statuses(account["id"])
+
+
+class MastodonStatusExtractor(MastodonExtractor):
+ """Extractor for images from a status"""
+ subcategory = "status"
+
+ def __init__(self, match):
+ MastodonExtractor.__init__(self, match)
+ self.status_id = match.group(1)
+
+ def statuses(self):
+ return (self.api.status(self.status_id),)
+
+
+class MastodonAPI():
+ """Minimal interface for the Mastodon API
+
+ https://github.com/tootsuite/mastodon
+ https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
+ """
+
+ def __init__(self, extractor, access_token=None):
+ self.root = extractor.root
+ self.extractor = extractor
+
+ if not access_token:
+ access_token = extractor.config(
+ "access-token", extractor.access_token)
+ self.headers = {"Authorization": "Bearer {}".format(access_token)}
+
+ def account_search(self, query, limit=40):
+ """Search for content"""
+ params = {"q": query, "limit": limit}
+ return self._call("accounts/search", params)
+
+ def account_statuses(self, account_id):
+ """Get an account's statuses"""
+ endpoint = "accounts/{}/statuses".format(account_id)
+ params = {"only_media": "1"}
+ return self._pagination(endpoint, params)
+
+ def status(self, status_id):
+ """Fetch a Status"""
+ return self._call("statuses/" + status_id)
+
+ def _call(self, endpoint, params=None):
+ url = "{}/api/v1/{}".format(self.root, endpoint)
+ response = self.extractor.request(
+ url, params=params, headers=self.headers)
+ return self._parse(response)
+
+ def _pagination(self, endpoint, params):
+ url = "{}/api/v1/{}".format(self.root, endpoint)
+ while url:
+ response = self.extractor.request(
+ url, params=params, headers=self.headers)
+ yield from self._parse(response)
+ url = response.links.get("next", {}).get("url")
+
+ @staticmethod
+ def _parse(response):
+ """Parse an API response"""
+ if response.status_code == 404:
+ raise exception.NotFoundError()
+ return response.json()
+
+
+def generate_extractors():
+ """Dynamically generate Extractor classes for Mastodon instances"""
+
+ symtable = globals()
+ extractors = config.get(("extractor", "mastodon"))
+ if extractors:
+ EXTRACTORS.update(extractors)
+ config.set(("extractor", "mastodon"), EXTRACTORS)
+
+ for instance, info in EXTRACTORS.items():
+
+ if not isinstance(info, dict):
+ continue
+
+ category = info.get("category") or instance.replace(".", "")
+ root = info.get("root") or "https://" + instance
+ name = (info.get("name") or category).capitalize()
+ token = info.get("access-token")
+ pattern = info.get("pattern") or re.escape(instance)
+
+ class Extr(MastodonUserExtractor):
+ pass
+
+ Extr.__name__ = Extr.__qualname__ = name + "UserExtractor"
+ Extr.__doc__ = "Extractor for all images of a user on " + instance
+ Extr.category = category
+ Extr.instance = instance
+ Extr.pattern = (r"(?:https?://)?" + pattern +
+ r"/@([^/?&#]+)(?:/media)?/?$")
+ Extr.root = root
+ Extr.access_token = token
+ symtable[Extr.__name__] = Extr
+
+ class Extr(MastodonStatusExtractor):
+ pass
+
+ Extr.__name__ = Extr.__qualname__ = name + "StatusExtractor"
+ Extr.__doc__ = "Extractor for images from a status on " + instance
+ Extr.category = category
+ Extr.instance = instance
+ Extr.pattern = r"(?:https?://)?" + pattern + r"/@[^/?&#]+/(\d+)"
+ Extr.root = root
+ Extr.access_token = token
+ symtable[Extr.__name__] = Extr
+
+
+EXTRACTORS = {
+ "pawoo.net": {
+ "category" : "pawoo",
+ "access-token" : "286462927198d0cf3e24683e91c8259a"
+ "ac4367233064e0570ca18df2ac65b226",
+ "client-id" : "97b142b6904abf97a1068d51a7bc2f2f"
+ "cf9323cef81f13cb505415716dba7dac",
+ "client-secret": "e45bef4bad45b38abf7d9ef88a646b73"
+ "75e7fb2532c31a026327a93549236481",
+ },
+}
+
+
+generate_extractors()
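
generate_extractors() merges any instances configured under ("extractor", "mastodon") into EXTRACTORS and creates a pair of extractor classes per instance, so new instances need no code changes. A hedged sketch (the instance name and token are placeholders, not part of this patch):

    from gallery_dl import config
    from gallery_dl.extractor import mastodon

    config.set(("extractor", "mastodon"), {
        "example.social": {                  # hypothetical instance
            "category": "examplesocial",
            "access-token": "<your token>",  # placeholder
        },
    })
    mastodon.generate_extractors()
    # now defines ExamplesocialUserExtractor and ExamplesocialStatusExtractor
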
diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py
new file mode 100644
index 0000000..1831620
--- /dev/null
+++ b/gallery_dl/extractor/message.py
@@ -0,0 +1,54 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+
+class Message():
+ """Enum for message identifiers
+
+ Extractors yield their results as message-tuples, where the first element
+ is one of the following identifiers. This message-identifier determines
+ the type and meaning of the other elements in such a tuple.
+
+ - Message.Version:
+ - Message protocol version (currently always '1')
+ - 2nd element specifies the version of all following messages as integer
+
+ - Message.Directory:
+ - Sets the target directory for all following images
+ - 2nd element is a dictionary containing general metadata
+
+ - Message.Url:
+ - Image URL and its metadata
+ - 2nd element is the URL as a string
+ - 3rd element is a dictionary with image-specific metadata
+
+ - Message.Headers: # obsolete
+ - HTTP headers to use while downloading
+ - 2nd element is a dictionary with header-name and -value pairs
+
+ - Message.Cookies: # obsolete
+ - Cookies to use while downloading
+ - 2nd element is a dictionary with cookie-name and -value pairs
+
+ - Message.Queue:
+ - (External) URL that should be handled by another extractor
+ - 2nd element is the (external) URL as a string
+ - 3rd element is a dictionary containing URL-specific metadata
+
+ - Message.Urllist:
+ - Same as Message.Url, but its 2nd element is a list of multiple URLs
+ - The additional URLs serve as a fallback if the primary one fails
+ """
+
+ Version = 1
+ Directory = 2
+ Url = 3
+ # Headers = 4
+ # Cookies = 5
+ Queue = 6
+ Urllist = 7
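
To make the protocol above concrete, an extractor's items() method yields tuples such as the following (a minimal sketch; URLs, metadata values, and the class name are invented):

    yield Message.Version, 1
    yield Message.Directory, {"category": "example", "title": "some album"}
    yield Message.Url, "https://example.org/files/1.jpg", {"num": 1, "extension": "jpg"}
    yield Message.Queue, "https://example.org/album/123", {"_extractor": SomeExtractorClass}

Several extractors in this patch attach an "_extractor" key to Queue metadata to point the queued URL at a specific handler class.
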
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
new file mode 100644
index 0000000..1515f53
--- /dev/null
+++ b/gallery_dl/extractor/myportfolio.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.myportfolio.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class MyportfolioGalleryExtractor(Extractor):
+ """Extractor for an image gallery on www.myportfolio.com"""
+ category = "myportfolio"
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user}", "{title}")
+ filename_fmt = "{num:>02}.{extension}"
+ archive_fmt = "{user}_{filename}"
+ pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
+ r"(?:https?://)?([^.]+\.myportfolio\.com))"
+ r"(/[^/?&#]+)?")
+ test = (
+ ("https://hannahcosgrove.myportfolio.com/robyn", {
+ "url": "93b5430e765e53564b13e7d9c64c30c286011a6b",
+ "keyword": "25cb3dbdad6b011242a133f30ec598318b7512e8",
+ }),
+ ("https://hannahcosgrove.myportfolio.com/lfw", {
+ "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$",
+ "count": ">= 8",
+ }),
+ ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {
+ "count": 3,
+ }),
+ ("myportfolio:https://tooco.com.ar/", {
+ "count": ">= 40",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ domain1, domain2, self.path = match.groups()
+ self.domain = domain1 or domain2
+ self.prefix = "myportfolio:" if domain1 else ""
+
+ def items(self):
+ yield Message.Version, 1
+ url = "https://" + self.domain + (self.path or "")
+ page = self.request(url).text
+
+ projects = text.extract(
+ page, '<section class="project-covers', '</section>')[0]
+
+ if projects:
+ data = {"_extractor": MyportfolioGalleryExtractor}
+ base = self.prefix + "https://" + self.domain
+ for path in text.extract_iter(projects, ' href="', '"'):
+ yield Message.Queue, base + path, data
+ else:
+ data = self.metadata(page)
+ imgs = self.images(page)
+ data["count"] = len(imgs)
+ yield Message.Directory, data
+ for data["num"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ @staticmethod
+ def metadata(page):
+ """Collect general image metadata"""
+ # og:title contains data as "<user> - <title>", but both
+ # <user> and <title> can contain a "-" as well, so we get the title
+ # from somewhere else and cut that amount from the og:title content
+
+ user, pos = text.extract(
+ page, 'property=og:title content="', '"')
+ desc, pos = text.extract(
+ page, 'property=og:description content="', '"', pos)
+ title, pos = text.extract(
+ page, '<h1 ', '</h1>', pos)
+
+ title = title.partition(">")[2]
+ user = user[:-len(title)-3]
+
+ return {
+ "user": text.unescape(user),
+ "title": text.unescape(title),
+ "description": text.unescape(desc or ""),
+ }
+
+ @staticmethod
+ def images(page):
+ """Extract and return a list of all image-urls"""
+ return list(text.extract_iter(page, 'js-lightbox" data-src="', '"'))
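
A worked example of the og:title slicing done in metadata() above (the names are invented):

    # og:title   = "Jane - Doe - Self - Portraits"
    # <h1> title = "Self - Portraits"
    # user = og_title[:-len(title) - 3]  ->  "Jane - Doe"
    # (the extra 3 characters drop the " - " separator)
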
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
new file mode 100644
index 0000000..9e0aaa3
--- /dev/null
+++ b/gallery_dl/extractor/newgrounds.py
@@ -0,0 +1,155 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.newgrounds.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class NewgroundsExtractor(Extractor):
+ """Base class for newgrounds extractors"""
+ category = "newgrounds"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{category}_{index}_{title}.{extension}"
+ archive_fmt = "{index}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.root = "https://{}.newgrounds.com".format(self.user)
+
+ def items(self):
+ data = self.get_metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for page_url in self.get_page_urls():
+ image = self.parse_page_data(page_url)
+ image.update(data)
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def get_metadata(self):
+ """Collect metadata for extractor-job"""
+ return {"user": self.user}
+
+ def get_page_urls(self):
+ """Return urls of all relevant image pages"""
+
+ def parse_page_data(self, page_url):
+ """Collect url and metadata from an image page"""
+ extr = text.extract_from(self.request(page_url).text)
+ full = text.extract_from(json.loads(extr('"full_image_text":', '});')))
+ data = {
+ "description": text.unescape(extr(':description" content="', '"')),
+ "date" : extr('itemprop="datePublished" content="', '"'),
+ "rating" : extr('class="rated-', '"'),
+ "favorites" : text.parse_int(extr('id="faves_load">', '<')),
+ "score" : text.parse_float(extr('id="score_number">', '<')),
+ "url" : full('src="', '"'),
+ "title" : text.unescape(full('alt="', '"')),
+ "width" : text.parse_int(full('width="', '"')),
+ "height" : text.parse_int(full('height="', '"')),
+ }
+
+ tags = text.split_html(extr('<dd class="tags momag">', '</dd>'))
+ tags.sort()
+ data["tags"] = tags
+
+ data["date"] = text.parse_datetime(data["date"])
+ data["index"] = text.parse_int(
+ data["url"].rpartition("/")[2].partition("_")[0])
+ return data
+
+ def _pagination(self, url):
+ headers = {
+ "Referer": self.root,
+ "X-Requested-With": "XMLHttpRequest",
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ }
+
+ while True:
+ data = self.request(url, headers=headers).json()
+
+ for year in data["sequence"]:
+ for item in data["years"][str(year)]["items"]:
+ page_url = text.extract(item, 'href="', '"')[0]
+ yield text.urljoin(self.root, page_url)
+
+ if not data["more"]:
+ return
+ url = text.urljoin(self.root, data["more"])
+
+
+class NewgroundsUserExtractor(NewgroundsExtractor):
+ """Extractor for all images of a newgrounds user"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com(?:/art)?/?$"
+ test = (
+ ("https://blitzwuff.newgrounds.com/art", {
+ "url": "24b19c4a135a09889fac7b46a74e427e4308d02b",
+ "keyword": "2aab0532a894ff3cf88dd01ce5c60f114011b268",
+ }),
+ ("https://blitzwuff.newgrounds.com/"),
+ )
+
+ def get_page_urls(self):
+ return self._pagination(self.root + "/art/page/1")
+
+
+class NewgroundsImageExtractor(NewgroundsExtractor):
+ """Extractor for a single image from newgrounds.com"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:www\.)?newgrounds\.com/art/view/([^/?&#]+)/[^/?&#]+"
+ r"|art\.ngfiles\.com/images/\d+/\d+_([^_]+)_([^.]+))")
+ test = (
+ ("https://www.newgrounds.com/art/view/blitzwuff/ffx", {
+ "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818",
+ "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e",
+ "content": "cb067d6593598710292cdd340d350d14a26fe075",
+ }),
+ ("https://art.ngfiles.com/images/587000/587551_blitzwuff_ffx.png", {
+ "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818",
+ "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e",
+ }),
+ )
+
+ def __init__(self, match):
+ NewgroundsExtractor.__init__(self, match)
+ if match.group(2):
+ self.user = match.group(2)
+ self.page_url = "https://www.newgrounds.com/art/view/{}/{}".format(
+ self.user, match.group(3))
+ else:
+ self.page_url = match.group(0)
+
+ def get_page_urls(self):
+ return (self.page_url,)
+
+
+class NewgroundsVideoExtractor(NewgroundsExtractor):
+ """Extractor for all videos of a newgrounds user"""
+ subcategory = "video"
+ filename_fmt = "{category}_{index}.{extension}"
+ pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$"
+ test = ("https://twistedgrim.newgrounds.com/movies", {
+ "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+",
+ "count": ">= 29",
+ })
+
+ def get_page_urls(self):
+ return self._pagination(self.root + "/movies/page/1")
+
+ def parse_page_data(self, page_url):
+ return {
+ "url" : "ytdl:" + page_url,
+ "index": text.parse_int(page_url.rpartition("/")[2]),
+ }
diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py
new file mode 100644
index 0000000..8135a8a
--- /dev/null
+++ b/gallery_dl/extractor/ngomik.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from http://ngomik.in/"""
+
+from .common import ChapterExtractor
+from .. import text
+import re
+
+
+class NgomikChapterExtractor(ChapterExtractor):
+ """Extractor for manga-chapters from ngomik.in"""
+ category = "ngomik"
+ root = "http://ngomik.in"
+ pattern = (r"(?:https?://)?(?:www\.)?ngomik\.in"
+ r"(/[^/?&#]+-chapter-[^/?&#]+)")
+ test = (
+ ("https://www.ngomik.in/14-sai-no-koi-chapter-1-6/", {
+ "url": "8e67fdf751bbc79bc6f4dead7675008ddb8e32a4",
+ "keyword": "204d177f09d438fd50c9c28d98c73289194640d8",
+ }),
+ ("https://ngomik.in/break-blade-chapter-26/", {
+ "count": 34,
+ }),
+ )
+
+ def metadata(self, page):
+ info = text.extract(page, '<title>', "</title>")[0]
+ manga, _, chapter = info.partition(" Chapter ")
+ chapter, sep, minor = chapter.partition(" ")[0].partition(".")
+
+ return {
+ "manga": text.unescape(manga),
+ "chapter": text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "lang": "id",
+ "language": "Indonesian",
+ }
+
+ @staticmethod
+ def images(page):
+ readerarea = text.extract(page, 'id=readerarea', 'class=chnav')[0]
+ return [
+ (text.unescape(url), None)
+ for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea)
+ ]
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
new file mode 100644
index 0000000..746144a
--- /dev/null
+++ b/gallery_dl/extractor/nhentai.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://nhentai.net/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import collections
+import json
+
+
+class NhentaiBase():
+ """Base class for nhentai extractors"""
+ category = "nhentai"
+ root = "https://nhentai.net"
+ media_url = "https://i.nhentai.net"
+
+
+class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
+ """Extractor for image galleries from nhentai.net"""
+ pattern = r"(?:https?://)?nhentai\.net(/g/(\d+))"
+ test = ("https://nhentai.net/g/147850/", {
+ "url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0",
+ "keyword": {
+ "title" : r"re:\[Morris\] Amazon no Hiyaku \| Amazon Elixir",
+ "title_en" : str,
+ "title_ja" : str,
+ "gallery_id": 147850,
+ "media_id" : 867789,
+ "count" : 16,
+ "date" : 1446050915,
+ "scanlator" : "",
+ "artist" : ["morris"],
+ "group" : list,
+ "parody" : list,
+ "characters": list,
+ "tags" : list,
+ "type" : "manga",
+ "lang" : "en",
+ "language" : "English",
+ "width" : int,
+ "height" : int,
+ },
+ })
+
+ def __init__(self, match):
+ GalleryExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+ self.data = None
+
+ def metadata(self, page):
+ data = json.loads(text.extract(page, "N.gallery(", ");")[0])
+ self.data = data
+
+ title_en = data["title"].get("english", "")
+ title_ja = data["title"].get("japanese", "")
+
+ info = collections.defaultdict(list)
+ for tag in data["tags"]:
+ info[tag["type"]].append(tag["name"])
+
+ language = ""
+ for language in info["language"]:
+ if language != "translated":
+ language = language.capitalize()
+ break
+
+ return {
+ "title" : title_en or title_ja,
+ "title_en" : title_en,
+ "title_ja" : title_ja,
+ "gallery_id": data["id"],
+ "media_id" : text.parse_int(data["media_id"]),
+ "date" : data["upload_date"],
+ "scanlator" : data["scanlator"],
+ "artist" : info["artist"],
+ "group" : info["group"],
+ "parody" : info["parody"],
+ "characters": info["character"],
+ "tags" : info["tag"],
+ "type" : info["category"][0] if info["category"] else "",
+ "lang" : util.language_to_code(language),
+ "language" : language,
+ }
+
+ def images(self, _):
+ ufmt = "{}/galleries/{}/{{}}.{{}}".format(
+ self.media_url, self.data["media_id"])
+ extdict = {"j": "jpg", "p": "png", "g": "gif"}
+
+ return [
+ (ufmt.format(num, extdict.get(img["t"], "jpg")), {
+ "width": img["w"], "height": img["h"],
+ })
+ for num, img in enumerate(self.data["images"]["pages"], 1)
+ ]
+
+
+class NhentaiSearchExtractor(NhentaiBase, Extractor):
+ """Extractor for nhentai search results"""
+ category = "nhentai"
+ subcategory = "search"
+ pattern = r"(?:https?://)?nhentai\.net/search/?\?([^#]+)"
+ test = ("https://nhentai.net/search/?q=touhou", {
+ "pattern": NhentaiGalleryExtractor.pattern,
+ "count": 30,
+ "range": "1-30",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ data = {"_extractor": NhentaiGalleryExtractor}
+ for gallery_id in self._pagination(self.params):
+ url = "{}/g/{}/".format(self.root, gallery_id)
+ yield Message.Queue, url, data
+
+ def _pagination(self, params):
+ url = "{}/search/".format(self.root)
+ params["page"] = text.parse_int(params.get("page"), 1)
+
+ while True:
+ page = self.request(url, params=params).text
+ yield from text.extract_iter(page, 'href="/g/', '/')
+ if 'class="next"' not in page:
+ return
+ params["page"] += 1
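
In NhentaiGalleryExtractor.images() above, each URL is built from media_id, the 1-based page number, and the type letter mapped through extdict. A sketch using the media_id from the test gallery and a hypothetical page entry {"t": "j", "w": 1200, "h": 1700} as the third page:

    # -> https://i.nhentai.net/galleries/867789/3.jpg  (width 1200, height 1700)
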
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
new file mode 100644
index 0000000..abf1eaa
--- /dev/null
+++ b/gallery_dl/extractor/nijie.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://nijie.info/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, exception
+from ..cache import cache
+
+
+class NijieExtractor(AsynchronousMixin, Extractor):
+ """Base class for nijie extractors"""
+ category = "nijie"
+ directory_fmt = ("{category}", "{user_id}")
+ filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}"
+ archive_fmt = "{image_id}_{index}"
+ cookiedomain = "nijie.info"
+ cookienames = ("nemail", "nlogin")
+ root = "https://nijie.info"
+ view_url = "https://nijie.info/view.php?id="
+ popup_url = "https://nijie.info/view_popup.php?id="
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user_id = match.group(1)
+ self.session.headers["Referer"] = self.root + "/"
+
+ def items(self):
+ self.login()
+ data = self.get_job_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for image_id in self.get_image_ids():
+ for image_url, image_data in self.get_image_data(image_id):
+ image_data.update(data)
+ if not image_data["extension"]:
+ image_data["extension"] = "jpg"
+ yield Message.Url, image_url, image_data
+
+ def get_job_metadata(self):
+ """Collect metadata for extractor-job"""
+ return {"user_id": text.parse_int(self.user_id)}
+
+ def get_image_ids(self):
+ """Collect all relevant image-ids"""
+
+ def get_image_data(self, image_id):
+ """Get URL and metadata for images specified by 'image_id'"""
+ page = self.request(self.view_url + image_id).text
+ return self.extract_image_data(page, image_id)
+
+ def extract_image_data(self, page, image_id):
+ """Get URL and metadata for images from 'page'"""
+ title, pos = text.extract(
+ page, '<meta property="og:title" content="', '"')
+ description, pos = text.extract(
+ page, '<meta property="og:description" content="', '"', pos)
+ artist_id, pos = text.extract(
+ page, '"sameAs": "https://nijie.info/members.php?id=', '"', pos)
+ images = list(text.extract_iter(
+ page, '<a href="./view_popup.php', '</a>', pos))
+
+ title = title.rpartition("|")[0].strip()
+ image_id = text.parse_int(image_id)
+ artist_id = text.parse_int(artist_id)
+
+ for index, image in enumerate(images):
+ url = "https:" + text.extract(image, 'src="', '"')[0]
+ url = url.replace("/__rs_l120x120/", "/", 1)
+
+ yield url, text.nameext_from_url(url, {
+ "index": index,
+ "count": len(images),
+ "title": title,
+ "description": description,
+ "image_id": image_id,
+ "artist_id": artist_id,
+ })
+
+ def login(self):
+ """Login and obtain session cookies"""
+ if not self._check_cookies(self.cookienames):
+ username, password = self._get_auth_info()
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=150*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "{}/login_int.php".format(self.root)
+ data = {"email": username, "password": password, "save": "on"}
+
+ response = self.request(url, method="POST", data=data)
+ if "//nijie.info/login.php" in response.text:
+ raise exception.AuthenticationError()
+ return self.session.cookies
+
+ def _pagination(self, path):
+ url = "{}/{}.php".format(self.root, path)
+ params = {"id": self.user_id, "p": 1}
+
+ while True:
+ response = self.request(url, params=params, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("artist")
+
+ page = response.text
+ ids = list(text.extract_iter(page, ' illust_id="', '"'))
+ yield from ids
+
+ if '<a rel="next"' not in page:
+ return
+ params["p"] += 1
+
+
+class NijieUserExtractor(NijieExtractor):
+ """Extractor for works of a nijie-user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
+ r"/members(?:_illust)?\.php\?id=(\d+)")
+ test = (
+ ("https://nijie.info/members_illust.php?id=44", {
+ "url": "585d821df4716b1098660a0be426d01db4b65f2a",
+ "keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a",
+ }),
+ ("https://nijie.info/members_illust.php?id=43", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://nijie.info/members.php?id=44"),
+ )
+
+ def get_image_ids(self):
+ return self._pagination("members_illust")
+
+
+class NijieDoujinExtractor(NijieExtractor):
+ """Extractor for doujin entries of a nijie-user"""
+ subcategory = "doujin"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info/"
+ r"members_dojin\.php\?id=(\d+)")
+ test = ("https://nijie.info/members_dojin.php?id=6782", {
+ "count": ">= 18",
+ })
+
+ def get_image_ids(self):
+ return self._pagination("members_dojin")
+
+
+class NijieFavoriteExtractor(NijieExtractor):
+ """Extractor for all favorites/bookmarks of a nijie-user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "bookmarks", "{user_id}")
+ archive_fmt = "f_{user_id}_{image_id}_{index}"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
+ r"/user_like_illust_view\.php\?id=(\d+)")
+ test = ("https://nijie.info/user_like_illust_view.php?id=44", {
+ "count": ">= 16",
+ })
+
+ def get_image_ids(self):
+ return self._pagination("user_like_illust_view")
+
+
+class NijieImageExtractor(NijieExtractor):
+ """Extractor for a work/image from nijie.info"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
+ r"/view(?:_popup)?\.php\?id=(\d+)")
+ test = (
+ ("https://nijie.info/view.php?id=70720", {
+ "url": "a10d4995645b5f260821e32c60a35f73546c2699",
+ "keyword": "408393d010307c76d52cbd0a4368d6d357805aea",
+ "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6",
+ }),
+ ("https://nijie.info/view.php?id=70724", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://nijie.info/view_popup.php?id=70720"),
+ )
+
+ def __init__(self, match):
+ NijieExtractor.__init__(self, match)
+ self.image_id = match.group(1)
+ self.page = ""
+
+ def get_job_metadata(self):
+ response = self.request(self.view_url + self.image_id, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("image")
+ self.page = response.text
+ self.user_id = text.extract(
+ self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
+ return NijieExtractor.get_job_metadata(self)
+
+ def get_image_ids(self):
+ return (self.image_id,)
+
+ def get_image_data(self, _):
+ return self.extract_image_data(self.page, self.image_id)
diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py
new file mode 100644
index 0000000..c55f80a
--- /dev/null
+++ b/gallery_dl/extractor/nsfwalbum.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nsfwalbum.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class NsfwalbumAlbumExtractor(GalleryExtractor):
+ """Extractor for image albums on nsfwalbum.com"""
+ category = "nsfwalbum"
+ subcategory = "album"
+ root = "https://nsfwalbum.com"
+ filename_fmt = "{album_id}_{page:>03}_{id}.{extension}"
+ directory_fmt = ("{category}", "{album_id} {title}")
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))"
+ test = ("https://nsfwalbum.com/album/295201", {
+ "range": "1-5",
+ "url": "e60eced1873215f5deee1ca7226d60cb4dcc051c",
+ "keyword": "e0573ecb1966611e96d10172a3ca1db1078a7984",
+ })
+
+ def __init__(self, match):
+ self.album_id = match.group(2)
+ GalleryExtractor.__init__(self, match)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ return {
+ "album_id": text.parse_int(self.album_id),
+ "title" : text.unescape(extr('<h6>', '</h6>')),
+ "models" : text.split_html(extr('"models"> Models:', '</div>')),
+ "studio" : text.remove_html(extr('"models"> Studio:', '</div>')),
+ }
+
+ def images(self, page):
+ iframe = self.root + "/iframe_image.php?id="
+ backend = self.root + "/backend.php"
+ for image_id in text.extract_iter(page, 'data-img-id="', '"'):
+ spirit = text.extract(self.request(
+ iframe + image_id).text, 'giraffe.annihilate("', '"')[0]
+ params = {"spirit": self._annihilate(spirit), "photo": image_id}
+ data = self.request(backend, params=params).json()
+ yield data[0], {
+ "id" : text.parse_int(image_id),
+ "width" : text.parse_int(data[1]),
+ "height": text.parse_int(data[2]),
+ }
+
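+ # the 'spirit' value is de-obfuscated by XOR-ing each character with 'base' before it is sent to backend.php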
+ @staticmethod
+ def _annihilate(value, base=6):
+ return "".join(
+ chr(ord(char) ^ base)
+ for char in value
+ )
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
new file mode 100644
index 0000000..e26eae1
--- /dev/null
+++ b/gallery_dl/extractor/oauth.py
@@ -0,0 +1,375 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Utility classes to setup OAuth and link a users account to gallery-dl"""
+
+from .common import Extractor, Message
+from . import deviantart, flickr, reddit, smugmug, tumblr
+from .. import text, oauth, config, exception
+from ..cache import cache
+import os
+import urllib.parse
+
+
+class OAuthBase(Extractor):
+ """Base class for OAuth Helpers"""
+ category = "oauth"
+ redirect_uri = "http://localhost:6414/"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.client = None
+
+ def oauth_config(self, key, default=None):
+ return config.interpolate(
+ ("extractor", self.subcategory, key), default)
+
+ def recv(self):
+ """Open local HTTP server and recv callback parameters"""
+ import socket
+ print("Waiting for response. (Cancel with Ctrl+c)")
+ server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+ server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ server.bind(("localhost", 6414))
+ server.listen(1)
+
+ # workaround for ctrl+c not working during server.accept on Windows
+ if os.name == "nt":
+ server.settimeout(1.0)
+ while True:
+ try:
+ self.client = server.accept()[0]
+ break
+ except socket.timeout:
+ pass
+ server.close()
+
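+ # read the callback request and parse its query string into a dict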
+ data = self.client.recv(1024).decode()
+ path = data.split(" ", 2)[1]
+ return text.parse_query(path.partition("?")[2])
+
+ def send(self, msg):
+ """Send 'msg' to the socket opened in 'recv()'"""
+ print(msg)
+ self.client.send(b"HTTP/1.1 200 OK\r\n\r\n" + msg.encode())
+ self.client.close()
+
+ def open(self, url, params):
+ """Open 'url' in browser amd return response parameters"""
+ import webbrowser
+ url += "?" + urllib.parse.urlencode(params)
+ if not self.config("browser", True) or not webbrowser.open(url):
+ print("Please open this URL in your browser:")
+ print(url, end="\n\n", flush=True)
+ return self.recv()
+
+ def _oauth1_authorization_flow(
+ self, request_token_url, authorize_url, access_token_url):
+ """Perform the OAuth 1.0a authorization flow"""
+ # get a request token
+ params = {"oauth_callback": self.redirect_uri}
+ data = self.session.get(request_token_url, params=params).text
+
+ data = text.parse_query(data)
+ self.session.auth.token_secret = data["oauth_token_secret"]
+
+ # get the user's authorization
+ params = {"oauth_token": data["oauth_token"], "perms": "read"}
+ data = self.open(authorize_url, params)
+
+ # exchange the request token for an access token
+ data = self.session.get(access_token_url, params=data).text
+
+ data = text.parse_query(data)
+ self.send(OAUTH1_MSG_TEMPLATE.format(
+ category=self.subcategory,
+ token=data["oauth_token"],
+ token_secret=data["oauth_token_secret"],
+ ))
+
+ def _oauth2_authorization_code_grant(
+ self, client_id, client_secret, auth_url, token_url,
+ scope="read", key="refresh_token", auth=True,
+ message_template=None):
+ """Perform an OAuth2 authorization code grant"""
+
+ state = "gallery-dl_{}_{}".format(
+ self.subcategory,
+ oauth.nonce(8),
+ )
+
+ auth_params = {
+ "client_id": client_id,
+ "response_type": "code",
+ "state": state,
+ "redirect_uri": self.redirect_uri,
+ "duration": "permanent",
+ "scope": scope,
+ }
+
+ # receive an authorization code
+ params = self.open(auth_url, auth_params)
+
+ # check authorization response
+ if state != params.get("state"):
+ self.send("'state' mismatch: expected {}, got {}.".format(
+ state, params.get("state")
+ ))
+ return
+ if "error" in params:
+ self.send(params["error"])
+ return
+
+ # exchange the authorization code for a token
+ data = {
+ "grant_type": "authorization_code",
+ "code": params["code"],
+ "redirect_uri": self.redirect_uri,
+ }
+
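+ # send client credentials either via HTTP Basic auth or as regular POST parameters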
+ if auth:
+ auth = (client_id, client_secret)
+ else:
+ auth = None
+ data["client_id"] = client_id
+ data["client_secret"] = client_secret
+
+ data = self.session.post(token_url, data=data, auth=auth).json()
+
+ # check token response
+ if "error" in data:
+ self.send(data["error"])
+ return
+
+ # display token
+ part = key.partition("_")[0]
+ template = message_template or OAUTH2_MSG_TEMPLATE
+ self.send(template.format(
+ category=self.subcategory,
+ key=part,
+ Key=part.capitalize(),
+ token=data[key],
+ instance=getattr(self, "instance", ""),
+ client_id=client_id,
+ client_secret=client_secret,
+ ))
+
+
+class OAuthDeviantart(OAuthBase):
+ subcategory = "deviantart"
+ pattern = "oauth:deviantart$"
+ redirect_uri = "https://mikf.github.io/gallery-dl/oauth-redirect.html"
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth2_authorization_code_grant(
+ self.oauth_config(
+ "client-id", deviantart.DeviantartAPI.CLIENT_ID),
+ self.oauth_config(
+ "client-secret", deviantart.DeviantartAPI.CLIENT_SECRET),
+ "https://www.deviantart.com/oauth2/authorize",
+ "https://www.deviantart.com/oauth2/token",
+ scope="browse",
+ )
+
+
+class OAuthFlickr(OAuthBase):
+ subcategory = "flickr"
+ pattern = "oauth:flickr$"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.session = oauth.OAuth1Session(
+ self.oauth_config("api-key", flickr.FlickrAPI.API_KEY),
+ self.oauth_config("api-secret", flickr.FlickrAPI.API_SECRET),
+ )
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth1_authorization_flow(
+ "https://www.flickr.com/services/oauth/request_token",
+ "https://www.flickr.com/services/oauth/authorize",
+ "https://www.flickr.com/services/oauth/access_token",
+ )
+
+
+class OAuthReddit(OAuthBase):
+ subcategory = "reddit"
+ pattern = "oauth:reddit$"
+
+ def items(self):
+ yield Message.Version, 1
+
+ self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT
+ self._oauth2_authorization_code_grant(
+ self.oauth_config("client-id", reddit.RedditAPI.CLIENT_ID),
+ "",
+ "https://www.reddit.com/api/v1/authorize",
+ "https://www.reddit.com/api/v1/access_token",
+ scope="read",
+ )
+
+
+class OAuthSmugmug(OAuthBase):
+ subcategory = "smugmug"
+ pattern = "oauth:smugmug$"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.session = oauth.OAuth1Session(
+ self.oauth_config("api-key", smugmug.SmugmugAPI.API_KEY),
+ self.oauth_config("api-secret", smugmug.SmugmugAPI.API_SECRET),
+ )
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth1_authorization_flow(
+ "https://api.smugmug.com/services/oauth/1.0a/getRequestToken",
+ "https://api.smugmug.com/services/oauth/1.0a/authorize",
+ "https://api.smugmug.com/services/oauth/1.0a/getAccessToken",
+ )
+
+
+class OAuthTumblr(OAuthBase):
+ subcategory = "tumblr"
+ pattern = "oauth:tumblr$"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.session = oauth.OAuth1Session(
+ self.oauth_config("api-key", tumblr.TumblrAPI.API_KEY),
+ self.oauth_config("api-secret", tumblr.TumblrAPI.API_SECRET),
+ )
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth1_authorization_flow(
+ "https://www.tumblr.com/oauth/request_token",
+ "https://www.tumblr.com/oauth/authorize",
+ "https://www.tumblr.com/oauth/access_token",
+ )
+
+
+class OAuthMastodon(OAuthBase):
+ subcategory = "mastodon"
+ pattern = "oauth:mastodon:(?:https?://)?([^/?&#]+)"
+
+ def __init__(self, match):
+ OAuthBase.__init__(self, match)
+ self.instance = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+
+ application = self.oauth_config(self.instance)
+ if not application:
+ application = self._register(self.instance)
+
+ self._oauth2_authorization_code_grant(
+ application["client-id"],
+ application["client-secret"],
+ "https://{}/oauth/authorize".format(self.instance),
+ "https://{}/oauth/token".format(self.instance),
+ key="access_token",
+ message_template=MASTODON_MSG_TEMPLATE,
+ )
+
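+ # registered client credentials are cached per instance for roughly 10 years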
+ @cache(maxage=10*365*24*3600, keyarg=1)
+ def _register(self, instance):
+ self.log.info("Registering application for '%s'", instance)
+
+ url = "https://{}/api/v1/apps".format(instance)
+ data = {
+ "client_name": "gdl:" + oauth.nonce(8),
+ "redirect_uris": self.redirect_uri,
+ "scopes": "read",
+ }
+ data = self.session.post(url, data=data).json()
+
+ if "client_id" not in data or "client_secret" not in data:
+ self.log.error("Failed to register new application: '%s'", data)
+ raise exception.StopExtraction()
+
+ data["client-id"] = data.pop("client_id")
+ data["client-secret"] = data.pop("client_secret")
+
+ self.log.info("client-id:\n%s", data["client-id"])
+ self.log.info("client-secret:\n%s", data["client-secret"])
+
+ return data
+
+
+OAUTH1_MSG_TEMPLATE = """
+Your Access Token and Access Token Secret are
+
+{token}
+{token_secret}
+
+Put these values into your configuration file as
+'extractor.{category}.access-token' and
+'extractor.{category}.access-token-secret'.
+
+Example:
+{{
+ "extractor": {{
+ "{category}": {{
+ "access-token": "{token}",
+ "access-token-secret": "{token_secret}"
+ }}
+ }}
+}}
+"""
+
+
+OAUTH2_MSG_TEMPLATE = """
+Your {Key} Token is
+
+{token}
+
+Put this value into your configuration file as
+'extractor.{category}.{key}-token'.
+
+Example:
+{{
+ "extractor": {{
+ "{category}": {{
+ "{key}-token": "{token}"
+ }}
+ }}
+}}
+"""
+
+
+MASTODON_MSG_TEMPLATE = """
+Your {Key} Token is
+
+{token}
+
+Put this value into your configuration file as
+'extractor.mastodon.{instance}.{key}-token'.
+
+You can also add your 'client-id' and 'client-secret' values
+if you want to register another account in the future.
+
+Example:
+{{
+ "extractor": {{
+ "mastodon": {{
+ "{instance}": {{
+ "{key}-token": "{token}",
+ "client-id": "{client_id}",
+ "client-secret": "{client_secret}"
+ }}
+ }}
+ }}
+}}
+"""
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
new file mode 100644
index 0000000..a4731d0
--- /dev/null
+++ b/gallery_dl/extractor/paheal.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://rule34.paheal.net/"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+
+
+class PahealExtractor(SharedConfigMixin, Extractor):
+ """Base class for paheal extractors"""
+ basecategory = "booru"
+ category = "paheal"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://rule34.paheal.net"
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, self.get_metadata()
+
+ for data in self.get_posts():
+ url = data["file_url"]
+ for key in ("id", "width", "height"):
+ data[key] = text.parse_int(data[key])
+ data["tags"] = text.unquote(data["tags"])
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def get_metadata(self):
+ """Return general metadata"""
+ return {}
+
+ def get_posts(self):
+ """Return an iterable containing data of all relevant posts"""
+
+
+class PahealTagExtractor(PahealExtractor):
+ """Extractor for images from rule34.paheal.net by search-tags"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
+ r"/post/list/([^/?&#]+)")
+ test = ("https://rule34.paheal.net/post/list/k-on/1", {
+ "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
+ "count": ">= 15"
+ })
+ per_page = 70
+
+ def __init__(self, match):
+ PahealExtractor.__init__(self, match)
+ self.tags = text.unquote(match.group(1))
+
+ def get_metadata(self):
+ return {"search_tags": self.tags}
+
+ def get_posts(self):
+ pnum = 1
+ while True:
+ url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
+ page = self.request(url).text
+
+ for post in text.extract_iter(
+ page, '<img id="thumb_', '>Image Only<'):
+ yield self._extract_data(post)
+
+ if ">Next<" not in page:
+ return
+ pnum += 1
+
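+ # a thumbnail's 'title' attribute packs tags, dimensions, and file size, separated by ' // '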
+ @staticmethod
+ def _extract_data(post):
+ pid , pos = text.extract(post, '', '"')
+ data, pos = text.extract(post, 'title="', '"', pos)
+ md5 , pos = text.extract(post, '/_thumbs/', '/', pos)
+ url , pos = text.extract(post, '<a href="', '"', pos)
+
+ tags, dimensions, size, _ = data.split(" // ")
+ width, _, height = dimensions.partition("x")
+
+ return {
+ "id": pid, "md5": md5, "tags": tags, "file_url": url,
+ "width": width, "height": height,
+ "size": text.parse_bytes(size[:-1]),
+ }
+
+
+class PahealPostExtractor(PahealExtractor):
+ """Extractor for single images from rule34.paheal.net"""
+ subcategory = "post"
+ pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
+ r"/post/view/(\d+)")
+ test = ("https://rule34.paheal.net/post/view/481609", {
+ "url": "1142779378f655ec0497d4c301836aa667f788b1",
+ "keyword": "34e9e93d4fa6fa06fac1a56e78c9a52e8cd7b271",
+ "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
+ })
+
+ def __init__(self, match):
+ PahealExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def get_posts(self):
+ url = "{}/post/view/{}".format(self.root, self.post_id)
+ page = self.request(url).text
+
+ tags , pos = text.extract(page, ": ", "<")
+ md5 , pos = text.extract(page, "/_thumbs/", "/", pos)
+ url , pos = text.extract(page, "id='main_image' src='", "'", pos)
+ width , pos = text.extract(page, "data-width='", "'", pos)
+ height, pos = text.extract(page, "data-height='", "'", pos)
+
+ return ({
+ "id": self.post_id, "md5": md5, "tags": tags, "file_url": url,
+ "width": width, "height": height, "size": 0,
+ },)
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
new file mode 100644
index 0000000..4884497
--- /dev/null
+++ b/gallery_dl/extractor/patreon.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.patreon.com/"""
+
+from .common import Extractor, Message
+from .. import text
+from ..cache import memcache
+
+
+class PatreonExtractor(Extractor):
+ """Base class for patreon extractors"""
+ category = "patreon"
+ root = "https://www.patreon.com"
+ directory_fmt = ("{category}", "{creator[full_name]}")
+ filename_fmt = "{id}_{title}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ _warning = True
+
+ def items(self):
+ yield Message.Version, 1
+
+ if self._warning:
+ if "session_id" not in self.session.cookies:
+ self.log.warning("no 'session_id' cookie set")
+ PatreonExtractor._warning = False
+
+ for post in self.posts():
+ yield Message.Directory, post
+
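+ # 'num' is a running index over all files of a post: inline images, the main post file, and attachments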
+ post["num"] = 0
+ content = post.get("content")
+ postfile = post.get("post_file")
+
+ for url in text.extract_iter(content or "", 'src="', '"'):
+ post["num"] += 1
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ if postfile:
+ post["num"] += 1
+ text.nameext_from_url(postfile["name"], post)
+ yield Message.Url, postfile["url"], post
+
+ for attachment in post["attachments"]:
+ post["num"] += 1
+ text.nameext_from_url(attachment["name"], post)
+ yield Message.Url, attachment["url"], post
+
+ def posts(self):
+ """Return all relevant post objects"""
+
+ def _pagination(self, url):
+ headers = {"Referer": self.root}
+ empty = []
+
+ while url:
+ posts = self.request(url, headers=headers).json()
+
+ if "included" not in posts:
+ return
+
+ # collect attachments
+ attachments = {}
+ for inc in posts["included"]:
+ if inc["type"] == "attachment":
+ attachments[inc["id"]] = inc["attributes"]
+
+ # update posts
+ for post in posts["data"]:
+ attr = post["attributes"]
+ attr["id"] = text.parse_int(post["id"])
+ attr["date"] = text.parse_datetime(
+ attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ attr["creator"] = self._user(
+ post["relationships"]["user"]["links"]["related"])
+
+ # add attachments to post attributes
+ files = post["relationships"].get("attachments")
+ if files:
+ attr["attachments"] = [
+ attachments[f["id"]]
+ for f in files["data"]
+ ]
+ else:
+ attr["attachments"] = empty
+
+ yield attr
+
+ if "links" not in posts:
+ return
+ url = posts["links"].get("next")
+
+ @memcache(keyarg=1)
+ def _user(self, url):
+ user = self.request(url).json()["data"]
+ attr = user["attributes"]
+ attr["id"] = user["id"]
+ attr["date"] = text.parse_datetime(
+ attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ return attr
+
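+ # assemble an API URL from the fixed 'include'/'fields' parameters plus an endpoint-specific query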
+ @staticmethod
+ def _build_url(endpoint, query):
+ return (
+ "https://www.patreon.com/api/" + endpoint +
+
+ "?include=user,attachments,user_defined_tags,campaign,poll.choices"
+ ",poll.current_user_responses.user,poll.current_user_responses.cho"
+ "ice,poll.current_user_responses.poll,access_rules.tier.null"
+
+ "&fields[post]=change_visibility_at,comment_count,content,current_"
+ "user_can_delete,current_user_can_view,current_user_has_liked,embe"
+ "d,image,is_paid,like_count,min_cents_pledged_to_view,post_file,pu"
+ "blished_at,patron_count,patreon_url,post_type,pledge_url,thumbnai"
+ "l_url,teaser_text,title,upgrade_url,url,was_posted_by_campaign_ow"
+ "ner"
+ "&fields[user]=image_url,full_name,url"
+ "&fields[campaign]=avatar_photo_url,earnings_visibility,is_nsfw,is"
+ "_monthly,name,url"
+ "&fields[access_rule]=access_rule_type,amount_cents" + query +
+
+ "&json-api-use-default-includes=false"
+ "&json-api-version=1.0"
+ )
+
+
+class PatreonCreatorExtractor(PatreonExtractor):
+ """Extractor for a creator's works"""
+ subcategory = "creator"
+ pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
+ r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?")
+ test = ("https://www.patreon.com/koveliana", {
+ "range": "1-25",
+ "count": ">= 25",
+ "keyword": {
+ "attachments": list,
+ "comment_count": int,
+ "content": str,
+ "creator": dict,
+ "date": "type:datetime",
+ "id": int,
+ "like_count": int,
+ "post_type": str,
+ "published_at": str,
+ "title": str,
+ },
+ })
+
+ def __init__(self, match):
+ PatreonExtractor.__init__(self, match)
+ self.creator = match.group(1).lower()
+
+ def posts(self):
+ url = "{}/{}".format(self.root, self.creator)
+ page = self.request(url).text
+ campaign_id = text.extract(page, "/campaign/", "/")[0]
+
+ url = self._build_url("posts", (
+ "&sort=-published_at"
+ "&filter[is_draft]=false"
+ "&filter[contains_exclusive_posts]=true"
+ "&filter[campaign_id]=" + campaign_id
+ ))
+ return self._pagination(url)
+
+
+class PatreonUserExtractor(PatreonExtractor):
+ """Extractor for media from creators supported by you"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?(?:www\.)?patreon\.com/home$"
+ test = ("https://www.patreon.com/home",)
+
+ def posts(self):
+ url = self._build_url("stream", (
+ "&page[cursor]=null"
+ "&filter[is_following]=true"
+ ))
+ return self._pagination(url)
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
new file mode 100644
index 0000000..83f75a3
--- /dev/null
+++ b/gallery_dl/extractor/photobucket.py
@@ -0,0 +1,178 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://photobucket.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import base64
+import json
+
+
+class PhotobucketAlbumExtractor(Extractor):
+ """Extractor for albums on photobucket.com"""
+ category = "photobucket"
+ subcategory = "album"
+ directory_fmt = ("{category}", "{username}", "{location}")
+ filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
+ archive_fmt = "{id}"
+ pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)"
+ r"/user/[^/?&#]+/library/[^?&#]*")
+ test = (
+ ("https://s258.photobucket.com/user/focolandia/library/", {
+ "pattern": r"https?://[oi]+\d+.photobucket.com/albums/hh280/",
+ "count": ">= 39"
+ }),
+ # subalbums of main "directory"
+ ("https://s271.photobucket.com/user/lakerfanryan/library/", {
+ "options": (("image-filter", "False"),),
+ "pattern": pattern,
+ "count": 1,
+ }),
+ # subalbums of subalbum without images
+ ("https://s271.photobucket.com/user/lakerfanryan/library/Basketball", {
+ "pattern": pattern,
+ "count": ">= 9",
+ }),
+ # private (missing JSON data)
+ ("https://s1277.photobucket.com/user/sinisterkat44/library/", {
+ "count": 0,
+ }),
+ ("https://s1110.photobucket.com/user/chndrmhn100/library/"
+ "Chandu%20is%20the%20King?sort=3&page=1"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.album_path = ""
+ self.root = "https://" + match.group(1)
+ self.session.headers["Referer"] = self.url
+
+ def items(self):
+ yield Message.Version, 1
+ for image in self.images():
+ image["titleOrFilename"] = text.unescape(image["titleOrFilename"])
+ image["title"] = text.unescape(image["title"])
+ image["extension"] = image["ext"]
+ yield Message.Directory, image
+ yield Message.Url, image["fullsizeUrl"], image
+
+ if self.config("subalbums", True):
+ for album in self.subalbums():
+ album["_extractor"] = PhotobucketAlbumExtractor
+ yield Message.Queue, album["url"], album
+
+ def images(self):
+ """Yield all images of the current album"""
+ url = self.url
+ params = {"sort": "3", "page": 1}
+
+ while True:
+ page = self.request(url, params=params).text
+ json_data = text.extract(page, "collectionData:", ",\n")[0]
+ if not json_data:
+ msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0]
+ msg = ' ("{}")'.format(text.remove_html(msg)) if msg else ""
+ self.log.error("Unable to get JSON data%s", msg)
+ return
+ data = json.loads(json_data)
+
+ yield from data["items"]["objects"]
+
+ if data["total"] <= data["offset"] + data["pageSize"]:
+ self.album_path = data["currentAlbumPath"]
+ return
+ params["page"] += 1
+
+ def subalbums(self):
+ """Return all subalbum objects"""
+ url = self.root + "/component/Albums-SubalbumList"
+ params = {
+ "albumPath": self.album_path,
+ "fetchSubAlbumsOnly": "true",
+ "deferCollapsed": "true",
+ "json": "1",
+ }
+
+ data = self.request(url, params=params).json()
+ return data["body"].get("subAlbums", ())
+
+
+class PhotobucketImageExtractor(Extractor):
+ """Extractor for individual images from photobucket.com"""
+ category = "photobucket"
+ subcategory = "image"
+ directory_fmt = ("{category}", "{username}")
+ filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
+ archive_fmt = "{username}_{id}"
+ pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com"
+ r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
+ r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
+ test = (
+ (("https://s271.photobucket.com/user/lakerfanryan"
+ "/media/Untitled-3-1.jpg.html"), {
+ "url": "3b647deeaffc184cc48c89945f67574559c9051f",
+ "keyword": "a2de4e60d584912537b8025c01bdd1d20bdea735",
+ }),
+ (("https://s271.photobucket.com/user/lakerfanryan"
+ "/media/IsotopeswBros.jpg.html?sort=3&o=2"), {
+ "url": "12c1890c09c9cdb8a88fba7eec13f324796a8d7b",
+ "keyword": "61200a223df6c06f45ac3d30c88b3f5b048ce9a8",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1) or match.group(3)
+ self.media_id = match.group(2)
+ self.session.headers["Referer"] = self.url
+
+ def items(self):
+ url = "https://photobucket.com/galleryd/search.php"
+ params = {"userName": self.user, "searchTerm": "", "ref": ""}
+
+ if self.media_id:
+ params["mediaId"] = self.media_id
+ else:
+ params["url"] = self.url
+
+ # retry API call up to 5 times, since it can randomly fail
+ tries = 0
+ while tries < 5:
+ data = self.request(url, method="POST", params=params).json()
+ image = data["mediaDocuments"]
+ if "message" not in image:
+ break # success
+ tries += 1
+ self.log.debug("'%s'", image["message"])
+ else:
+ self.log.error("%s", image["message"])
+ raise exception.StopExtraction()
+
+ # adjust metadata entries to be at least somewhat similar
+ # to what the 'album' extractor provides
+ if "media" in image:
+ image = image["media"][image["mediaIndex"]]
+ image["albumView"] = data["mediaDocuments"]["albumView"]
+ image["username"] = image["ownerId"]
+ else:
+ image["fileUrl"] = image.pop("imageUrl")
+
+ image.setdefault("title", "")
+ image.setdefault("description", "")
+ name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".")
+ image["ext"] = image["extension"] = ext
+ image["titleOrFilename"] = image["title"] or name
+ image["tags"] = image.pop("clarifaiTagList", [])
+
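+ # 'id' is base64-encoded as '<type>:<value>'; keep the value only when its type is 'mediaId'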
+ mtype, _, mid = base64.b64decode(image["id"]).partition(b":")
+ image["pictureId"] = mid.decode() if mtype == b"mediaId" else ""
+
+ yield Message.Version, 1
+ yield Message.Directory, image
+ yield Message.Url, image["fileUrl"], image
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
new file mode 100644
index 0000000..6a5c41c
--- /dev/null
+++ b/gallery_dl/extractor/piczel.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://piczel.tv/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class PiczelExtractor(Extractor):
+ """Base class for piczel extractors"""
+ category = "piczel"
+ directory_fmt = ("{category}", "{user[username]}")
+ filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ root = "https://piczel.tv"
+ api_root = "https://apollo.piczel.tv"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_id = match.group(1)
+
+ def items(self):
+ first = True
+ yield Message.Version, 1
+ for image in self.unpack(self.get_images()):
+ if first:
+ yield Message.Directory, image
+ first = False
+ path = image["image"]["image"]["url"]
+ url = "{}/static/{}".format(self.api_root, path)
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
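+ # posts with 'multi' set carry several files in 'images'; emit one object per file with a running 'num'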
+ @staticmethod
+ def unpack(images):
+ """Unpack 'images' into individual image objects"""
+ for image in images:
+ if image["multi"]:
+ multi = image["images"]
+ del image["images"]
+ for image["num"], img in enumerate(multi):
+ image["image"] = img
+ yield image
+ else:
+ image["num"] = 0
+ yield image
+
+ def get_images(self):
+ """Return an iterable with all relevant image objects"""
+
+
+class PiczelUserExtractor(PiczelExtractor):
+ """Extractor for all images from a user's gallery"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$"
+ test = ("https://piczel.tv/gallery/Lulena", {
+ "count": ">= 13",
+ })
+
+ def get_images(self):
+ url = "{}/api/users/{}/gallery".format(self.api_root, self.item_id)
+ return self.request(url).json()
+
+
+class PiczelFolderExtractor(PiczelExtractor):
+ """Extractor for images inside a user's folder"""
+ subcategory = "folder"
+ directory_fmt = ("{category}", "{user[username]}", "{folder[name]}")
+ archive_fmt = "f{folder[id]}_{id}_{num}"
+ pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv"
+ r"/gallery/(?!image)[^/?&#]+/(\d+)")
+ test = ("https://piczel.tv/gallery/Lulena/1114", {
+ "count": ">= 4",
+ })
+
+ def get_images(self):
+ url = "{}/api/gallery/folder/{}".format(self.api_root, self.item_id)
+ images = self.request(url).json()
+ images.reverse()
+ return images
+
+
+class PiczelImageExtractor(PiczelExtractor):
+ """Extractor for individual images"""
+ subcategory = "image"
+ pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)"
+ test = ("https://piczel.tv/gallery/image/7807", {
+ "url": "9b9e416b6ab7e58676fab84453d5028f306ece34",
+ "content": "df9a053a24234474a19bce2b7e27e0dec23bff87",
+ "keyword": {
+ "created_at": "2018-07-22T05:13:58.000Z",
+ "description": None,
+ "extension": "png",
+ "favorites_count": int,
+ "folder": dict,
+ "folder_id": 1113,
+ "id": 7807,
+ "is_flash": False,
+ "is_video": False,
+ "multi": False,
+ "nsfw": False,
+ "num": 0,
+ "password_protected": False,
+ "tags": "fanart, commission, altair, recreators, ",
+ "title": "Altair",
+ "user": dict,
+ "views": int,
+ },
+ })
+
+ def get_images(self):
+ url = "{}/api/gallery/image/{}".format(self.api_root, self.item_id)
+ return (self.request(url).json(),)
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
new file mode 100644
index 0000000..fa8cd48
--- /dev/null
+++ b/gallery_dl/extractor/pinterest.py
@@ -0,0 +1,260 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.pinterest.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import json
+
+
+BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.\w+"
+
+
+class PinterestExtractor(Extractor):
+ """Base class for pinterest extractors"""
+ category = "pinterest"
+ filename_fmt = "{category}_{id}.{extension}"
+ archive_fmt = "{id}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = PinterestAPI(self)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for pin in self.pins():
+ if "images" in pin:
+ url, pin_data = self.data_from_pin(pin)
+ pin_data.update(data)
+ yield Message.Url, url, pin_data
+
+ def metadata(self):
+ """Return general metadata"""
+
+ def pins(self):
+ """Return all relevant pin-objects"""
+
+ @staticmethod
+ def data_from_pin(pin):
+ """Get image url and metadata from a pin-object"""
+ img = pin["images"]["orig"]
+ url = img["url"]
+ pin["width"] = img["width"]
+ pin["height"] = img["height"]
+ return url, text.nameext_from_url(url, pin)
+
+
+class PinterestPinExtractor(PinterestExtractor):
+ """Extractor for images from a single pin from pinterest.com"""
+ subcategory = "pin"
+ pattern = BASE_PATTERN + r"/pin/([^/?#&]+)(?!.*#related$)"
+ test = (
+ ("https://www.pinterest.com/pin/858146903966145189/", {
+ "url": "afb3c26719e3a530bb0e871c480882a801a4e8a5",
+ # image version depends on CDN server used
+ # "content": "d3e24bc9f7af585e8c23b9136956bd45a4d9b947",
+ # "content": "4c435a66f6bb82bb681db2ecc888f76cf6c5f9ca",
+ }),
+ ("https://www.pinterest.com/pin/858146903966145188/", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.pin_id = match.group(1)
+ self.pin = None
+
+ def metadata(self):
+ self.pin = self.api.pin(self.pin_id)
+ return self.data_from_pin(self.pin)[1]
+
+ def pins(self):
+ return (self.pin,)
+
+
+class PinterestBoardExtractor(PinterestExtractor):
+ """Extractor for images from a board from pinterest.com"""
+ subcategory = "board"
+ directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}")
+ archive_fmt = "{board[id]}_{id}"
+ pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)(?!.*#related$)"
+ test = (
+ ("https://www.pinterest.com/g1952849/test-/", {
+ "pattern": r"https://i\.pinimg\.com/originals/",
+ "count": 2,
+ }),
+ ("https://www.pinterest.com/g1952848/test/", {
+ "exception": exception.GalleryDLException,
+ }),
+ )
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.user = text.unquote(match.group(1))
+ self.board = text.unquote(match.group(2))
+ self.board_id = 0
+
+ def metadata(self):
+ board = self.api.board(self.user, self.board)
+ self.board_id = board["id"]
+ return {"board": board}
+
+ def pins(self):
+ return self.api.board_pins(self.board_id)
+
+
+class PinterestRelatedPinExtractor(PinterestPinExtractor):
+ """Extractor for related pins of another pin from pinterest.com"""
+ subcategory = "related-pin"
+ directory_fmt = ("{category}", "related {original_pin[id]}")
+ pattern = BASE_PATTERN + r"/pin/([^/?#&]+).*#related$"
+ test = ("https://www.pinterest.com/pin/858146903966145189/#related", {
+ "range": "31-50",
+ "count": 20,
+ })
+
+ def metadata(self):
+ pin = self.api.pin(self.pin_id)
+ return {"original_pin": self.data_from_pin(pin)[1]}
+
+ def pins(self):
+ return self.api.pin_related(self.pin_id)
+
+
+class PinterestRelatedBoardExtractor(PinterestBoardExtractor):
+ """Extractor for related pins of a board from pinterest.com"""
+ subcategory = "related-board"
+ directory_fmt = ("{category}", "{board[owner][username]}",
+ "{board[name]}", "related")
+ pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+).*#related$"
+ test = ("https://www.pinterest.com/g1952849/test-/#related", {
+ "range": "31-50",
+ "count": 20,
+ })
+
+ def pins(self):
+ return self.api.board_related(self.board_id)
+
+
+class PinterestPinitExtractor(PinterestExtractor):
+ """Extractor for images from a pin.it URL"""
+ subcategory = "pinit"
+ pattern = r"(?:https?://)?pin\.it/([^/?#&]+)"
+
+ test = (
+ ("https://pin.it/Hvt8hgT", {
+ "url": "8daad8558382c68f0868bdbd17d05205184632fa",
+ }),
+ ("https://pin.it/Hvt8hgS", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.shortened_id = match.group(1)
+
+ def items(self):
+ url = "https://api.pinterest.com/url_shortener/{}/redirect".format(
+ self.shortened_id)
+ response = self.request(url, method="HEAD", allow_redirects=False)
+ location = response.headers.get("Location")
+ if not location or location in ("https://api.pinterest.com/None",
+ "https://pin.it/None",
+ "https://www.pinterest.com"):
+ raise exception.NotFoundError("pin")
+ yield Message.Queue, location, {}
+
+
+class PinterestAPI():
+ """Minimal interface for the Pinterest Web API
+
+ For a better and more complete implementation in PHP, see
+ - https://github.com/seregazhuk/php-pinterest-bot
+ """
+
+ BASE_URL = "https://www.pinterest.com"
+ HEADERS = {
+ "Accept" : "application/json, text/javascript, "
+ "*/*, q=0.01",
+ "Accept-Language" : "en-US,en;q=0.5",
+ "X-Pinterest-AppState": "active",
+ "X-APP-VERSION" : "cb1c7f9",
+ "X-Requested-With" : "XMLHttpRequest",
+ "Origin" : BASE_URL + "/",
+ }
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+
+ def pin(self, pin_id):
+ """Query information about a pin"""
+ options = {"id": pin_id, "field_set_key": "detailed"}
+ return self._call("Pin", options)["resource_response"]["data"]
+
+ def pin_related(self, pin_id):
+ """Yield related pins of another pin"""
+ options = {"pin": pin_id, "add_vase": True, "pins_only": True}
+ return self._pagination("RelatedPinFeed", options)
+
+ def board(self, user, board):
+ """Query information about a board"""
+ options = {"slug": board, "username": user,
+ "field_set_key": "detailed"}
+ return self._call("Board", options)["resource_response"]["data"]
+
+ def board_pins(self, board_id):
+ """Yield all pins of a specific board"""
+ options = {"board_id": board_id}
+ return self._pagination("BoardFeed", options)
+
+ def board_related(self, board_id):
+ """Yield related pins of a specific board"""
+ options = {"board_id": board_id, "add_vase": True}
+ return self._pagination("BoardRelatedPixieFeed", options)
+
+ def _call(self, resource, options):
+ url = "{}/resource/{}Resource/get/".format(self.BASE_URL, resource)
+ params = {"data": json.dumps({"options": options}), "source_url": ""}
+
+ response = self.extractor.request(
+ url, params=params, headers=self.HEADERS, expect=range(400, 500))
+
+ try:
+ data = response.json()
+ except ValueError:
+ data = {}
+
+ if 200 <= response.status_code < 400 and not response.history:
+ return data
+
+ if response.status_code == 404 or response.history:
+ resource = self.extractor.subcategory.rpartition("-")[2]
+ raise exception.NotFoundError(resource)
+ self.extractor.log.error("API request failed")
+ self.extractor.log.debug("%s", response.text)
+ raise exception.StopExtraction()
+
+ def _pagination(self, resource, options):
+ while True:
+ data = self._call(resource, options)
+ yield from data["resource_response"]["data"]
+
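+ # 'bookmarks' is the pagination cursor; '-end-' or a 'Y2JOb25lO'-prefixed value marks the last page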
+ try:
+ bookmarks = data["resource"]["options"]["bookmarks"]
+ if (not bookmarks or bookmarks[0] == "-end-" or
+ bookmarks[0].startswith("Y2JOb25lO")):
+ return
+ options["bookmarks"] = bookmarks
+ except KeyError:
+ return
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
new file mode 100644
index 0000000..af29c4b
--- /dev/null
+++ b/gallery_dl/extractor/pixiv.py
@@ -0,0 +1,517 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images and ugoira from https://www.pixiv.net/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+from datetime import datetime, timedelta
+
+
+class PixivExtractor(Extractor):
+ """Base class for pixiv extractors"""
+ category = "pixiv"
+ directory_fmt = ("{category}", "{user[id]} {user[account]}")
+ filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
+ archive_fmt = "{id}{num}.{extension}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = PixivAppAPI(self)
+ self.user_id = -1
+ self.load_ugoira = self.config("ugoira", True)
+
+ def items(self):
+ metadata = self.get_metadata()
+ yield Message.Version, 1
+
+ for work in self.works():
+ if not work["user"]["id"]:
+ continue
+
+ meta_single_page = work["meta_single_page"]
+ meta_pages = work["meta_pages"]
+ del work["meta_single_page"]
+ del work["image_urls"]
+ del work["meta_pages"]
+ work["num"] = ""
+ work["tags"] = [tag["name"] for tag in work["tags"]]
+ work["date"] = text.parse_datetime(work["create_date"])
+ work.update(metadata)
+
+ yield Message.Directory, work
+
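+ # ugoira works are downloaded as a ZIP archive of animation frames along with their 'frames' timing metadata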
+ if work["type"] == "ugoira":
+ if not self.load_ugoira:
+ continue
+ ugoira = self.api.ugoira_metadata(work["id"])
+
+ url = ugoira["zip_urls"]["medium"].replace(
+ "_ugoira600x600", "_ugoira1920x1080")
+ work["frames"] = ugoira["frames"]
+ work["extension"] = "zip"
+ yield Message.Url, url, work
+
+ elif work["page_count"] == 1:
+ url = meta_single_page["original_image_url"]
+ work["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, work
+
+ else:
+ for num, img in enumerate(meta_pages):
+ url = img["image_urls"]["original"]
+ work["num"] = "_p{:02}".format(num)
+ work["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, work
+
+ def works(self):
+ """Return an iterable containing all relevant 'work'-objects"""
+
+ def get_metadata(self, user=None):
+ """Collect metadata for extractor-job"""
+ if not user:
+ user = self.api.user_detail(self.user_id)
+ return {"user": user}
+
+
+class PixivUserExtractor(PixivExtractor):
+ """Extractor for works of a pixiv-user"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/"
+ r"(?:member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?"
+ r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))")
+ test = (
+ ("http://www.pixiv.net/member_illust.php?id=173530", {
+ "url": "852c31ad83b6840bacbce824d85f2a997889efb7",
+ }),
+ # illusts with specific tag
+ (("https://www.pixiv.net/member_illust.php?id=173530"
+ "&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), {
+ "url": "25b1cd81153a8ff82eec440dd9f20a4a22079658",
+ }),
+ ("http://www.pixiv.net/member_illust.php?id=173531", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.pixiv.net/u/173530"),
+ ("https://www.pixiv.net/user/173530"),
+ ("https://www.pixiv.net/mypage.php#id=173530"),
+ ("https://www.pixiv.net/#id=173530"),
+ ("https://touch.pixiv.net/member_illust.php?id=173530"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.user_id = match.group(1) or match.group(3)
+ self.query = text.parse_query(match.group(2))
+
+ def works(self):
+ works = self.api.user_illusts(self.user_id)
+
+ if "tag" in self.query:
+ tag = text.unquote(self.query["tag"]).lower()
+ works = (
+ work for work in works
+ if tag in [t["name"].lower() for t in work["tags"]]
+ )
+
+ return works
+
+
+class PixivMeExtractor(PixivExtractor):
+ """Extractor for pixiv.me URLs"""
+ subcategory = "me"
+ pattern = r"(?:https?://)?pixiv\.me/([^/?&#]+)"
+ test = (
+ ("https://pixiv.me/del_shannon", {
+ "url": "0b1a18c3e3553c44ee6e0ccc36a7fd906c498e8f",
+ }),
+ ("https://pixiv.me/del_shanno", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.account = match.group(1)
+
+ def items(self):
+ url = "https://pixiv.me/" + self.account
+ response = self.request(
+ url, method="HEAD", allow_redirects=False, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("user")
+ yield Message.Version, 1
+ yield Message.Queue, response.headers["Location"], {}
+
+
+class PixivWorkExtractor(PixivExtractor):
+ """Extractor for a single pixiv work/illustration"""
+ subcategory = "work"
+ pattern = (r"(?:https?://)?(?:(?:www\.|touch\.)?pixiv\.net"
+ r"/member(?:_illust)?\.php\?(?:[^&]+&)*illust_id=(\d+)"
+ r"|(?:i(?:\d+\.pixiv|\.pximg)\.net"
+ r"/(?:(?:.*/)?img-[^/]+/img/\d{4}(?:/\d\d){5}|img\d+/img/[^/]+)"
+ r"|img\d*\.pixiv\.net/img/[^/]+|(?:www\.)?pixiv\.net/i)/(\d+))")
+ test = (
+ (("http://www.pixiv.net/member_illust.php"
+ "?mode=medium&illust_id=966412"), {
+ "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
+ "content": "69a8edfb717400d1c2e146ab2b30d2c235440c5a",
+ }),
+ (("http://www.pixiv.net/member_illust.php"
+ "?mode=medium&illust_id=966411"), {
+ "exception": exception.NotFoundError,
+ }),
+ # ugoira
+ (("https://www.pixiv.net/member_illust.php"
+ "?mode=medium&illust_id=66806629"), {
+ "url": "7267695a985c4db8759bebcf8d21dbdd2d2317ef",
+ "keywords": {"frames": list},
+ }),
+ ("http://i1.pixiv.net/c/600x600/img-master"
+ "/img/2008/06/13/00/29/13/966412_p0_master1200.jpg"),
+ ("https://i.pximg.net/img-original"
+ "/img/2017/04/25/07/33/29/62568267_p0.png"),
+ ("https://www.pixiv.net/i/966412"),
+ ("http://img.pixiv.net/img/soundcross/42626136.jpg"),
+ ("http://i2.pixiv.net/img76/img/snailrin/42672235.jpg"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.illust_id = match.group(1) or match.group(2)
+ self.load_ugoira = True
+ self.work = None
+
+ def works(self):
+ return (self.work,)
+
+ def get_metadata(self, user=None):
+ self.work = self.api.illust_detail(self.illust_id)
+ return PixivExtractor.get_metadata(self, self.work["user"])
+
+
+class PixivFavoriteExtractor(PixivExtractor):
+ """Extractor for all favorites/bookmarks of a pixiv-user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "bookmarks",
+ "{user_bookmark[id]} {user_bookmark[account]}")
+ archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/bookmark\.php(?:\?([^#]*))?")
+ test = (
+ ("https://www.pixiv.net/bookmark.php?id=173530", {
+ "url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
+ }),
+ # bookmarks with specific tag
+ (("https://www.pixiv.net/bookmark.php?id=3137110"
+ "&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), {
+ "count": 2,
+ }),
+ # own bookmarks
+ ("https://www.pixiv.net/bookmark.php", {
+ "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
+ }),
+ # touch URLs
+ ("https://touch.pixiv.net/bookmark.php?id=173530"),
+ ("https://touch.pixiv.net/bookmark.php"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.query = text.parse_query(match.group(1))
+ if "id" not in self.query:
+ self.subcategory = "bookmark"
+
+ def works(self):
+ tag = None
+ restrict = "public"
+
+ if "tag" in self.query:
+ tag = text.unquote(self.query["tag"])
+ if "rest" in self.query and self.query["rest"] == "hide":
+ restrict = "private"
+
+ return self.api.user_bookmarks_illust(self.user_id, tag, restrict)
+
+ def get_metadata(self, user=None):
+ if "id" in self.query:
+ user = self.api.user_detail(self.query["id"])
+ else:
+ self.api.login()
+ user = self.api.user
+
+ self.user_id = user["id"]
+ return {"user_bookmark": user}
+
+
+class PixivRankingExtractor(PixivExtractor):
+ """Extractor for pixiv ranking pages"""
+ subcategory = "ranking"
+ archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}"
+ directory_fmt = ("{category}", "rankings",
+ "{ranking[mode]}", "{ranking[date]}")
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/ranking\.php(?:\?([^#]*))?")
+ test = (
+ ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"),
+ ("https://www.pixiv.net/ranking.php"),
+ ("https://touch.pixiv.net/ranking.php"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.query = match.group(1)
+ self.mode = self.date = None
+
+ def works(self):
+ return self.api.illust_ranking(self.mode, self.date)
+
+ def get_metadata(self, user=None):
+ query = text.parse_query(self.query)
+
+ mode = query.get("mode", "daily").lower()
+ mode_map = {
+ "daily": "day",
+ "daily_r18": "day_r18",
+ "weekly": "week",
+ "weekly_r18": "week_r18",
+ "monthly": "month",
+ "male": "day_male",
+ "male_r18": "day_male_r18",
+ "female": "day_female",
+ "female_r18": "day_female_r18",
+ "original": "week_original",
+ "rookie": "week_rookie",
+ "r18g": "week_r18g",
+ }
+ if mode not in mode_map:
+ self.log.warning("invalid mode '%s'", mode)
+ mode = "daily"
+ self.mode = mode_map[mode]
+
+ date = query.get("date")
+ if date:
+ if len(date) == 8 and date.isdecimal():
+ date = "{}-{}-{}".format(date[0:4], date[4:6], date[6:8])
+ else:
+ self.log.warning("invalid date '%s'", date)
+ date = None
+ if not date:
+ date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d")
+ self.date = date
+
+ return {"ranking": {
+ "mode": mode,
+ "date": self.date,
+ }}
+
+
+class PixivSearchExtractor(PixivExtractor):
+ """Extractor for pixiv search results"""
+ subcategory = "search"
+ archive_fmt = "s_{search[word]}_{id}{num}.{extension}"
+ directory_fmt = ("{category}", "search", "{search[word]}")
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/search\.php\?([^#]+)")
+ test = (
+ ("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original"),
+ ("https://touch.pixiv.net/search.php?word=Original"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.query = match.group(1)
+ self.word = self.sort = self.target = None
+
+ def works(self):
+ return self.api.search_illust(self.word, self.sort, self.target)
+
+ def get_metadata(self, user=None):
+ query = text.parse_query(self.query)
+
+ if "word" in query:
+ self.word = text.unescape(query["word"])
+ else:
+ self.log.error("missing search term")
+ raise exception.StopExtraction()
+
+ sort = query.get("order", "date_d")
+ sort_map = {
+ "date": "date_asc",
+ "date_d": "date_desc",
+ }
+ if sort not in sort_map:
+ self.log.warning("invalid sort order '%s'", sort)
+ sort = "date_d"
+ self.sort = sort_map[sort]
+
+ target = query.get("s_mode", "s_tag")
+ target_map = {
+ "s_tag": "partial_match_for_tags",
+ "s_tag_full": "exact_match_for_tags",
+ "s_tc": "title_and_caption",
+ }
+ if target not in target_map:
+ self.log.warning("invalid search target '%s'", target)
+ target = "s_tag"
+ self.target = target_map[target]
+
+ return {"search": {
+ "word": self.word,
+ "sort": self.sort,
+ "target": self.target,
+ }}
+
+
+class PixivFollowExtractor(PixivExtractor):
+ """Extractor for new illustrations from your followed artists"""
+ subcategory = "follow"
+ archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}"
+ directory_fmt = ("{category}", "following")
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+ r"/bookmark_new_illust\.php")
+ test = (
+ ("https://www.pixiv.net/bookmark_new_illust.php"),
+ ("https://touch.pixiv.net/bookmark_new_illust.php"),
+ )
+
+ def works(self):
+ return self.api.illust_follow()
+
+ def get_metadata(self, user=None):
+ self.api.login()
+ return {"user_follow": self.api.user}
+
+
+class PixivAppAPI():
+ """Minimal interface for the Pixiv App API for mobile devices
+
+ For a more complete implementation or documentation, see
+ - https://github.com/upbit/pixivpy
+ - https://gist.github.com/ZipFile/3ba99b47162c23f8aea5d5942bb557b1
+ """
+ CLIENT_ID = "MOBrBDS8blbauoSck0ZfDbtuzpyT"
+ CLIENT_SECRET = "lsACyCD94FhDUtGTXi3QzcFE2uU1hqtDaKeqrdwj"
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.log = extractor.log
+ self.username, self.password = extractor._get_auth_info()
+ self.user = None
+
+ self.client_id = extractor.config(
+ "client-id", self.CLIENT_ID)
+ self.client_secret = extractor.config(
+ "client-secret", self.CLIENT_SECRET)
+
+ extractor.session.headers.update({
+ "App-OS": "ios",
+ "App-OS-Version": "10.3.1",
+ "App-Version": "6.7.1",
+ "User-Agent": "PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)",
+ "Referer": "https://app-api.pixiv.net/",
+ })
+
+ def login(self):
+ """Login and gain an access token"""
+ self.user, auth = self._login_impl(self.username, self.password)
+ self.extractor.session.headers["Authorization"] = auth
+
+ @cache(maxage=3600, keyarg=1)
+ def _login_impl(self, username, password):
+ url = "https://oauth.secure.pixiv.net/auth/token"
+ data = {
+ "client_id": self.client_id,
+ "client_secret": self.client_secret,
+ "get_secure_url": 1,
+ }
+ refresh_token = _refresh_token_cache(username)
+
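+ # prefer a cached refresh token over a full username/password login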
+ if refresh_token:
+ self.log.info("Refreshing access token")
+ data["grant_type"] = "refresh_token"
+ data["refresh_token"] = refresh_token
+ else:
+ self.log.info("Logging in as %s", username)
+ data["grant_type"] = "password"
+ data["username"] = username
+ data["password"] = password
+
+ response = self.extractor.request(
+ url, method="POST", data=data, expect=(400,))
+ if response.status_code >= 400:
+ raise exception.AuthenticationError()
+
+ data = response.json()["response"]
+ if not refresh_token:
+ _refresh_token_cache.update(username, data["refresh_token"])
+ return data["user"], "Bearer " + data["access_token"]
+
+ def illust_detail(self, illust_id):
+ params = {"illust_id": illust_id}
+ return self._call("v1/illust/detail", params)["illust"]
+
+ def illust_follow(self, restrict="all"):
+ params = {"restrict": restrict}
+ return self._pagination("v2/illust/follow", params)
+
+ def illust_ranking(self, mode="day", date=None):
+ params = {"mode": mode, "date": date}
+ return self._pagination("v1/illust/ranking", params)
+
+ def search_illust(self, word, sort=None, target=None, duration=None):
+ params = {"word": word, "search_target": target,
+ "sort": sort, "duration": duration}
+ return self._pagination("v1/search/illust", params)
+
+ def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
+ params = {"user_id": user_id, "tag": tag, "restrict": restrict}
+ return self._pagination("v1/user/bookmarks/illust", params)
+
+ def user_detail(self, user_id):
+ params = {"user_id": user_id}
+ return self._call("v1/user/detail", params)["user"]
+
+ def user_illusts(self, user_id):
+ params = {"user_id": user_id}
+ return self._pagination("v1/user/illusts", params)
+
+ def ugoira_metadata(self, illust_id):
+ params = {"illust_id": illust_id}
+ return self._call("v1/ugoira/metadata", params)["ugoira_metadata"]
+
+ def _call(self, endpoint, params=None):
+ url = "https://app-api.pixiv.net/" + endpoint
+
+ self.login()
+ response = self.extractor.request(
+ url, params=params, expect=range(400, 500))
+
+ if 200 <= response.status_code < 400:
+ return response.json()
+ if response.status_code == 404:
+ raise exception.NotFoundError()
+ self.log.error("API request failed: %s", response.text)
+ raise exception.StopExtraction()
+
+ def _pagination(self, endpoint, params):
+ while True:
+ data = self._call(endpoint, params)
+ yield from data["illusts"]
+
+ if not data["next_url"]:
+ return
+ query = data["next_url"].rpartition("?")[2]
+ params = text.parse_query(query)
+
+
+@cache(maxage=10*365*24*3600, keyarg=0)
+def _refresh_token_cache(username):
+ return None
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
new file mode 100644
index 0000000..9cada6b
--- /dev/null
+++ b/gallery_dl/extractor/pixnet.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.pixnet.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet\.net"
+
+
+class PixnetExtractor(Extractor):
+ """Base class for pixnet extractors"""
+ category = "pixnet"
+ filename_fmt = "{num:>03}_{id}.{extension}"
+ archive_fmt = "{id}"
+ url_fmt = ""
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.blog, self.item_id = match.groups()
+ self.root = "https://{}.pixnet.net".format(self.blog)
+
+ def items(self):
+ url = self.url_fmt.format(self.root, self.item_id)
+ page = self.request(url, encoding="utf-8").text
+ user = text.extract(page, '<meta name="author" content="', '";')[0]
+ data = {
+ "blog": self.blog,
+ "user": user.rpartition(" (")[0],
+ }
+
+ for info in self._pagination(page):
+ url, pos = text.extract(info, ' href="', '"')
+ alt, pos = text.extract(info, ' alt="', '"', pos)
+ item = {
+ "id" : text.parse_int(url.rpartition("/")[2]),
+ "title" : text.unescape(alt),
+ "_extractor": (PixnetFolderExtractor if "/folder/" in url else
+ PixnetSetExtractor),
+ }
+ item.update(data)
+ yield Message.Queue, url, item
+
+ def _pagination(self, page):
+ while True:
+ yield from text.extract_iter(page, '<li id="', '</li>')
+
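+            # a "nextBtn" element without an href marks the last page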
+ pnext = text.extract(page, 'class="nextBtn"', '>')[0]
+ if "href" not in pnext:
+ return
+ url = self.root + text.extract(pnext, 'href="', '"')[0]
+ page = self.request(url, encoding="utf-8").text
+
+
+class PixnetImageExtractor(PixnetExtractor):
+ """Extractor for a single photo from pixnet.net"""
+ subcategory = "image"
+ filename_fmt = "{id}.{extension}"
+ directory_fmt = ("{category}", "{blog}")
+ pattern = BASE_PATTERN + r"/album/photo/(\d+)"
+ test = ("https://albertayu773.pixnet.net/album/photo/159443828", {
+ "url": "156564c422138914c9fa5b42191677b45c414af4",
+ "keyword": "19971bcd056dfef5593f4328a723a9602be0f087",
+ "content": "0e097bdf49e76dd9b9d57a016b08b16fa6a33280",
+ })
+
+ def items(self):
+ url = "https://api.pixnet.cc/oembed"
+ params = {
+ "url": "https://{}.pixnet.net/album/photo/{}".format(
+ self.blog, self.item_id),
+ "format": "json",
+ }
+
+ data = self.request(url, params=params).json()
+ data["id"] = text.parse_int(
+ data["url"].rpartition("/")[2].partition("-")[0])
+ data["filename"], _, data["extension"] = data["title"].rpartition(".")
+ data["blog"] = self.blog
+ data["user"] = data.pop("author_name")
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, data["url"], data
+
+
+class PixnetSetExtractor(PixnetExtractor):
+ """Extractor for images from a pixnet set"""
+ subcategory = "set"
+ url_fmt = "{}/album/set/{}"
+ directory_fmt = ("{category}", "{blog}",
+ "{folder_id} {folder_title}", "{set_id} {set_title}")
+ pattern = BASE_PATTERN + r"/album/set/(\d+)"
+ test = (
+ ("https://albertayu773.pixnet.net/album/set/15078995", {
+ "url": "6535712801af47af51110542f4938a7cef44557f",
+ "keyword": "bf25d59e5b0959cb1f53e7fd2e2a25f2f67e5925",
+ }),
+ ("https://anrine910070.pixnet.net/album/set/5917493", {
+ "url": "b3eb6431aea0bcf5003432a4a0f3a3232084fc13",
+ "keyword": "bf7004faa1cea18cf9bd856f0955a69be51b1ec6",
+ }),
+ )
+
+ def items(self):
+ url = self.url_fmt.format(self.root, self.item_id)
+ page = self.request(url, encoding="utf-8").text
+ data = self.metadata(page)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, info in enumerate(self._pagination(page), 1):
+ url, pos = text.extract(info, ' href="', '"')
+ src, pos = text.extract(info, ' src="', '"', pos)
+ alt, pos = text.extract(info, ' alt="', '"', pos)
+
+ photo = {
+ "id": text.parse_int(url.rpartition("/")[2].partition("#")[0]),
+ "url": src.replace("_s.", "."),
+ "num": num,
+ "filename": alt,
+ "extension": src.rpartition(".")[2],
+ }
+ photo.update(data)
+ yield Message.Url, photo["url"], photo
+
+ def metadata(self, page):
+ user , pos = text.extract(page, '<meta name="author" content="', '";')
+ _ , pos = text.extract(page, 'id="breadcrumb"', '', pos)
+ fid , pos = text.extract(page, '/folder/', '"', pos)
+ fname, pos = text.extract(page, '>', '<', pos)
+ sid , pos = text.extract(page, '/set/', '"', pos)
+ sname, pos = text.extract(page, '>', '<', pos)
+ return {
+ "blog": self.blog,
+ "user": user.rpartition(" (")[0],
+ "folder_id" : text.parse_int(fid, ""),
+ "folder_title": text.unescape(fname).strip(),
+ "set_id" : text.parse_int(sid),
+ "set_title" : text.unescape(sname),
+ }
+
+
+class PixnetFolderExtractor(PixnetExtractor):
+ """Extractor for all sets in a pixnet folder"""
+ subcategory = "folder"
+ url_fmt = "{}/album/folder/{}"
+ pattern = BASE_PATTERN + r"/album/folder/(\d+)"
+ test = ("https://albertayu773.pixnet.net/album/folder/1405768", {
+ "pattern": PixnetSetExtractor.pattern,
+ "count": ">= 15",
+ })
+
+
+class PixnetUserExtractor(PixnetExtractor):
+ """Extractor for all sets and folders of a pixnet user"""
+ subcategory = "user"
+ url_fmt = "{}{}/album/list"
+ pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?&#])"
+ test = (
+ ("https://albertayu773.pixnet.net/"),
+ ("https://albertayu773.pixnet.net/blog"),
+ ("https://albertayu773.pixnet.net/album"),
+ ("https://albertayu773.pixnet.net/album/list", {
+ "pattern": PixnetFolderExtractor.pattern,
+ "count": ">= 30",
+ }),
+ ("https://anrine910070.pixnet.net/album/list", {
+ "pattern": PixnetSetExtractor.pattern,
+ "count": ">= 14",
+ }),
+ )
diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py
new file mode 100644
index 0000000..325c6a0
--- /dev/null
+++ b/gallery_dl/extractor/plurk.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.plurk.com/"""
+
+from .common import Extractor, Message
+from .. import text, extractor, exception
+import datetime
+import json
+import re
+
+
+class PlurkExtractor(Extractor):
+ """Base class for plurk extractors"""
+ category = "plurk"
+ root = "https://www.plurk.com"
+
+ def items(self):
+ urls = self._urls_ex if self.config("comments", False) else self._urls
+
+ yield Message.Version, 1
+ with extractor.blacklist(("plurk",)):
+ for plurk in self.plurks():
+ for url in urls(plurk):
+ yield Message.Queue, url, plurk
+
+ def plurks(self):
+ """Return an iterable with all relevant 'plurk' objects"""
+
+ @staticmethod
+ def _urls(obj):
+ """Extract URLs from a 'plurk' object"""
+ return text.extract_iter(obj["content"], ' href="', '"')
+
+ def _urls_ex(self, plurk):
+ """Extract URLs from a 'plurk' and its comments"""
+ yield from self._urls(plurk)
+ for comment in self._comments(plurk):
+ yield from self._urls(comment)
+
+ def _comments(self, plurk):
+ """Return an iterable with a 'plurk's comments"""
+ url = "https://www.plurk.com/Responses/get"
+ data = {"plurk_id": plurk["id"], "count": "200"}
+
+ while True:
+ info = self.request(url, "POST", data=data).json()
+ yield from info["responses"]
+ if not info["has_newer"]:
+ return
+ data["from_response_id"] = info["responses"][-1]["id"]
+
+ @staticmethod
+ def _load(data):
+ if not data:
+ raise exception.NotFoundError("user")
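+        # replace JavaScript "new Date(...)" constructs to get valid JSON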
+ return json.loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data))
+
+
+class PlurkTimelineExtractor(PlurkExtractor):
+ """Extractor for URLs from all posts in a Plurk timeline"""
+ subcategory = "timeline"
+ pattern = r"(?:https?://)?(?:www\.)?plurk\.com/(?!p/)(\w+)/?(?:$|[?&#])"
+ test = ("https://www.plurk.com/plurkapi", {
+ "pattern": r"https?://.+",
+ "count": ">= 23"
+ })
+
+ def __init__(self, match):
+ PlurkExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def plurks(self):
+ url = "{}/{}".format(self.root, self.user)
+ page = self.request(url).text
+ user_id, pos = text.extract(page, '"user_id":', ',')
+ plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0])
+
+ url = "https://www.plurk.com/TimeLine/getPlurks"
+ data = {"user_id": user_id.strip()}
+ headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"}
+
+ while plurks:
+ yield from plurks
+
+ offset = datetime.datetime.strptime(
+ plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
+ data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z")
+ response = self.request(url, "POST", headers=headers, data=data)
+ plurks = response.json()["plurks"]
+
+
+class PlurkPostExtractor(PlurkExtractor):
+ """Extractor for URLs from a Plurk post"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)"
+ test = (
+ ("https://www.plurk.com/p/i701j1", {
+ "url": "2115f208564591b8748525c2807a84596aaaaa5f",
+ "count": 3,
+ }),
+ ("https://www.plurk.com/p/i701j1", {
+ "options": (("comments", True),),
+ "count": ">= 210",
+ }),
+ )
+
+ def __init__(self, match):
+ PlurkExtractor.__init__(self, match)
+ self.plurk_id = match.group(1)
+
+ def plurks(self):
+ url = "{}/p/{}".format(self.root, self.plurk_id)
+ page = self.request(url).text
+ user, pos = text.extract(page, " GLOBAL = ", "\n")
+ data, pos = text.extract(page, "plurk = ", ";\n", pos)
+
+ data = self._load(data)
+ data["user"] = self._load(user)["page_user"]
+ return (data,)
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
new file mode 100644
index 0000000..40816b3
--- /dev/null
+++ b/gallery_dl/extractor/pornhub.py
@@ -0,0 +1,157 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.pornhub.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com"
+
+
+class PornhubExtractor(Extractor):
+ """Base class for pornhub extractors"""
+ category = "pornhub"
+ root = "https://www.pornhub.com"
+
+
+class PornhubGalleryExtractor(PornhubExtractor):
+ """Extractor for image galleries on pornhub.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}")
+ filename_fmt = "{num:>03}_{id}.{extension}"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/album/(\d+)"
+ test = (
+ ("https://www.pornhub.com/album/1708982", {
+ "pattern": r"https://\w+.phncdn.com/pics/albums/\d+/\d+/\d+/\d+/",
+ "count": 93,
+ "keyword": {
+ "id": int,
+ "num": int,
+ "score": int,
+ "views": int,
+ "caption": str,
+ "user": "Unknown",
+ "gallery": {
+ "id" : 1708982,
+ "score": int,
+ "views": int,
+ "tags" : list,
+ "title": "Random Hentai",
+ },
+ },
+ }),
+ ("https://www.pornhub.com/album/37180171", {
+ "exception": exception.AuthorizationError,
+ }),
+ )
+
+ def __init__(self, match):
+ PornhubExtractor.__init__(self, match)
+ self.gallery_id = match.group(1)
+ self._first = None
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, image in enumerate(self.images(), 1):
+ url = image["url"]
+ image.update(data)
+ image["num"] = num
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def metadata(self):
+ url = "{}/album/{}".format(
+ self.root, self.gallery_id)
+ extr = text.extract_from(self.request(url).text)
+
+ title = extr("<title>", "</title>")
+ score = extr('<div id="albumGreenBar" style="width:', '"')
+ views = extr('<div id="viewsPhotAlbumCounter">', '<')
+ tags = extr('<div id="photoTagsBox"', '<script')
+ self._first = extr('<a href="/photo/', '"')
+ title, _, user = title.rpartition(" - ")
+
+ return {
+ "user" : text.unescape(user[:-14]),
+ "gallery": {
+ "id" : text.parse_int(self.gallery_id),
+ "title": text.unescape(title),
+ "score": text.parse_int(score.partition("%")[0]),
+ "views": text.parse_int(views.partition(" ")[0]),
+ "tags" : text.split_html(tags)[2:],
+ },
+ }
+
+ def images(self):
+ url = "{}/album/show_album_json?album={}".format(
+ self.root, self.gallery_id)
+ response = self.request(url)
+
+ if response.content == b"Permission denied":
+ raise exception.AuthorizationError()
+ images = response.json()
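+        # the response maps photo IDs to entries linked via their "next"
+        # field; follow this chain from the first photo until it loops back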
+ key = end = self._first
+
+ while True:
+ img = images[key]
+ yield {
+ "url" : img["img_large"],
+ "caption": img["caption"],
+ "id" : text.parse_int(img["id"]),
+ "views" : text.parse_int(img["times_viewed"]),
+ "score" : text.parse_int(img["vote_percent"]),
+ }
+ key = img["next"]
+ if key == end:
+ return
+
+
+class PornhubUserExtractor(PornhubExtractor):
+ """Extractor for all galleries of a pornhub user"""
+ subcategory = "user"
+ pattern = (BASE_PATTERN + r"/(users|model)/([^/?&#]+)"
+ "(?:/photos(?:/(public|private|favorites))?)?/?$")
+ test = (
+ ("https://www.pornhub.com/users/flyings0l0/photos/public", {
+ "pattern": PornhubGalleryExtractor.pattern,
+ "count": ">= 8",
+ }),
+ ("https://www.pornhub.com/users/flyings0l0/"),
+ ("https://www.pornhub.com/users/flyings0l0/photos/public"),
+ ("https://www.pornhub.com/users/flyings0l0/photos/private"),
+ ("https://www.pornhub.com/users/flyings0l0/photos/favorites"),
+ ("https://www.pornhub.com/model/bossgirl/photos"),
+ )
+
+ def __init__(self, match):
+ PornhubExtractor.__init__(self, match)
+ self.type, self.user, self.cat = match.groups()
+
+ def items(self):
+ url = "{}/{}/{}/photos/{}/ajax".format(
+ self.root, self.type, self.user, self.cat or "public")
+ params = {"page": 1}
+ headers = {
+ "Referer": url[:-5],
+ "X-Requested-With": "XMLHttpRequest",
+ }
+
+ data = {"_extractor": PornhubGalleryExtractor}
+ yield Message.Version, 1
+ while True:
+ page = self.request(
+ url, method="POST", headers=headers, params=params).text
+ if not page:
+ return
+ for gid in text.extract_iter(page, 'id="albumphoto', '"'):
+ yield Message.Queue, self.root + "/album/" + gid, data
+ params["page"] += 1
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
new file mode 100644
index 0000000..fa4eb81
--- /dev/null
+++ b/gallery_dl/extractor/pururin.py
@@ -0,0 +1,102 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://pururin.io/"""
+
+from .common import GalleryExtractor
+from .. import text, util
+import json
+
+
+class PururinGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries on pururin.io"""
+ category = "pururin"
+ pattern = r"(?:https?://)?(?:www\.)?pururin\.io/(?:gallery|read)/(\d+)"
+ test = (
+ ("https://pururin.io/gallery/38661/iowant-2", {
+ "pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg",
+ "keyword": {
+ "title" : "Iowant 2!!",
+ "title_en" : "Iowant 2!!",
+ "title_jp" : "",
+ "gallery_id": 38661,
+ "count" : 19,
+ "artist" : ["Shoda Norihiro"],
+ "group" : ["Obsidian Order"],
+ "parody" : ["Kantai Collection"],
+ "characters": ["Iowa", "Teitoku"],
+ "tags" : list,
+ "type" : "Doujinshi",
+ "collection": "",
+ "convention": "C92",
+ "rating" : float,
+ "uploader" : "demo",
+ "scanlator" : "",
+ "lang" : "en",
+ "language" : "English",
+ }
+ }),
+ ("https://pururin.io/gallery/7661/unisis-team-vanilla", {
+ "count": 17,
+ }),
+ )
+ root = "https://pururin.io"
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/gallery/{}/x".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ self._ext = ""
+ self._cnt = 0
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+
+ def _lst(key, e=extr):
+ return [
+ text.unescape(item)
+ for item in text.extract_iter(e(key, "</td>"), 'title="', '"')
+ ]
+
+ def _str(key, e=extr):
+ return text.unescape(text.extract(
+ e(key, "</td>"), 'title="', '"')[0] or "")
+
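+        # the reader page embeds a JSON object with page count and extension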
+ url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
+ page = self.request(url).text
+ info = json.loads(text.unescape(text.extract(
+ page, ':gallery="', '"')[0]))
+ self._ext = info["image_extension"]
+ self._cnt = info["total_pages"]
+
+ data = {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : info["title"] or info.get("j_title") or "",
+ "title_en" : info["title"],
+ "title_jp" : info.get("j_title") or "",
+ "artist" : _lst("<td>Artist</td>"),
+ "group" : _lst("<td>Circle</td>"),
+ "parody" : _lst("<td>Parody</td>"),
+ "tags" : _lst("<td>Contents</td>"),
+ "type" : _str("<td>Category</td>"),
+ "characters": _lst("<td>Character</td>"),
+ "collection": _str("<td>Collection</td>"),
+ "language" : _str("<td>Language</td>"),
+ "scanlator" : _str("<td>Scanlator</td>"),
+ "convention": _str("<td>Convention</td>"),
+ "uploader" : text.remove_html(extr("<td>Uploader</td>", "</td>")),
+ "rating" : text.parse_float(extr(" :rating='" , "'")),
+ }
+ data["lang"] = util.language_to_code(data["language"])
+ return data
+
+ def images(self, _):
+ ufmt = "https://cdn.pururin.io/assets/images/data/{}/{{}}.{}".format(
+ self.gallery_id, self._ext)
+ return [(ufmt.format(num), None) for num in range(1, self._cnt + 1)]
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
new file mode 100644
index 0000000..59d502a
--- /dev/null
+++ b/gallery_dl/extractor/reactor.py
@@ -0,0 +1,338 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Generic extractors for *reactor sites"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+import urllib.parse
+import random
+import time
+import json
+
+
+BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)"
+
+
+class ReactorExtractor(SharedConfigMixin, Extractor):
+ """Base class for *reactor.cc extractors"""
+ basecategory = "reactor"
+ filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
+ archive_fmt = "{post_id}_{num}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.root = "http://" + match.group(1)
+ self.session.headers["Referer"] = self.root
+
+ self.wait_min = self.config("wait-min", 3)
+ self.wait_max = self.config("wait-max", 6)
+ if self.wait_max < self.wait_min:
+ self.wait_max = self.wait_min
+
+ if not self.category:
+ # set category based on domain name
+ netloc = urllib.parse.urlsplit(self.root).netloc
+ self.category = netloc.rpartition(".")[0]
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in self.posts():
+ for image in self._parse_post(post):
+ url = image["url"]
+ image.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def metadata(self):
+ """Collect metadata for extractor-job"""
+ return {}
+
+ def posts(self):
+ """Return all relevant post-objects"""
+ return self._pagination(self.url)
+
+ def _pagination(self, url):
+ while True:
+ time.sleep(random.uniform(self.wait_min, self.wait_max))
+
+ response = self.request(url)
+ if response.history:
+ # sometimes there is a redirect from
+ # the last page of a listing (.../tag/<tag>/1)
+ # to the first page (.../tag/<tag>)
+ # which could cause an endless loop
+ cnt_old = response.history[0].url.count("/")
+ cnt_new = response.url.count("/")
+ if cnt_old == 5 and cnt_new == 4:
+ return
+ page = response.text
+
+ yield from text.extract_iter(
+ page, '<div class="uhead">', '<div class="ufoot">')
+
+ try:
+ pos = page.index("class='next'")
+ pos = page.rindex("class='current'", 0, pos)
+ url = self.root + text.extract(page, "href='", "'", pos)[0]
+ except (ValueError, TypeError):
+ return
+
+ def _parse_post(self, post):
+ post, _, script = post.partition('<script type="application/ld+json">')
+ images = text.extract_iter(post, '<div class="image">', '</div>')
+ script = script[:script.index("</")].strip()
+
+ try:
+ data = json.loads(script)
+ except ValueError:
+ try:
+ # remove control characters and escape backslashes
+ mapping = dict.fromkeys(range(32))
+ script = script.translate(mapping).replace("\\", "\\\\")
+ data = json.loads(script)
+ except ValueError as exc:
+ self.log.warning("Unable to parse JSON data: %s", exc)
+ return
+
+ num = 0
+ date = text.parse_datetime(data["datePublished"])
+ user = data["author"]["name"]
+ description = text.unescape(data["description"])
+ title, _, tags = text.unescape(data["headline"]).partition(" / ")
+ post_id = text.parse_int(
+ data["mainEntityOfPage"]["@id"].rpartition("/")[2])
+
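+        # a headline without a " / " separator consists only of tags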
+ if not tags:
+ title, tags = tags, title
+ tags = tags.split(" :: ")
+
+ for image in images:
+ url = text.extract(image, ' src="', '"')[0]
+ if not url:
+ continue
+ width = text.extract(image, ' width="', '"')[0]
+ height = text.extract(image, ' height="', '"')[0]
+ image_id = url.rpartition("-")[2].partition(".")[0]
+ num += 1
+
+ if image.startswith("<iframe "): # embed
+ url = "ytdl:" + text.unescape(url)
+ elif "/post/webm/" not in url and "/post/mp4/" not in url:
+ url = url.replace("/post/", "/post/full/")
+
+ yield {
+ "url": url,
+ "post_id": post_id,
+ "image_id": text.parse_int(image_id),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ "title": title,
+ "description": description,
+ "tags": tags,
+ "date": date,
+ "user": user,
+ "num": num,
+ }
+
+
+class ReactorTagExtractor(ReactorExtractor):
+ """Extractor for tag searches on *reactor.cc sites"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "{search_tags}_{post_id}_{num}"
+ pattern = BASE_PATTERN + r"/tag/([^/?&#]+)"
+ test = ("http://anime.reactor.cc/tag/Anime+Art",)
+
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.tag = match.group(2)
+
+ def metadata(self):
+ return {"search_tags": text.unescape(self.tag).replace("+", " ")}
+
+
+class ReactorSearchExtractor(ReactorTagExtractor):
+ """Extractor for search results on *reactor.cc sites"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "search", "{search_tags}")
+ archive_fmt = "s_{search_tags}_{post_id}_{num}"
+ pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
+ test = ("http://anime.reactor.cc/search?q=Art",)
+
+
+class ReactorUserExtractor(ReactorExtractor):
+ """Extractor for all posts of a user on *reactor.cc sites"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "user", "{user}")
+ pattern = BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = ("http://anime.reactor.cc/user/Shuster",)
+
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.user = match.group(2)
+
+ def metadata(self):
+ return {"user": text.unescape(self.user).replace("+", " ")}
+
+
+class ReactorPostExtractor(ReactorExtractor):
+ """Extractor for single posts on *reactor.cc sites"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
+ test = ("http://anime.reactor.cc/post/3576250",)
+
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def items(self):
+ yield Message.Version, 1
+ post = self.request(self.url).text
+ pos = post.find('class="uhead">')
+ for image in self._parse_post(post[pos:]):
+ if image["num"] == 1:
+ yield Message.Directory, image
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+
+# --------------------------------------------------------------------
+# JoyReactor
+
+JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))"
+
+
+class JoyreactorTagExtractor(ReactorTagExtractor):
+ """Extractor for tag searches on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/tag/([^/?&#]+)"
+ test = (
+ ("http://joyreactor.cc/tag/Advent+Cirno", {
+ "count": ">= 17",
+ }),
+ ("http://joyreactor.com/tag/Cirno", {
+ "url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914",
+ }),
+ )
+
+
+class JoyreactorSearchExtractor(ReactorSearchExtractor):
+ """Extractor for search results on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
+ test = (
+ ("http://joyreactor.cc/search/Cirno+Gifs", {
+ "range": "1-25",
+ "count": ">= 20",
+ }),
+ ("http://joyreactor.com/search?q=Cirno+Gifs", {
+ "count": 0, # no search results on joyreactor.com
+ }),
+ )
+
+
+class JoyreactorUserExtractor(ReactorUserExtractor):
+ """Extractor for all posts of a user on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = (
+ ("http://joyreactor.cc/user/hemantic"),
+ ("http://joyreactor.com/user/Tacoman123", {
+ "url": "452cd0fa23e2ad0e122c296ba75aa7f0b29329f6",
+ }),
+ )
+
+
+class JoyreactorPostExtractor(ReactorPostExtractor):
+ """Extractor for single posts on joyreactor.cc"""
+ category = "joyreactor"
+ pattern = JR_BASE_PATTERN + r"/post/(\d+)"
+ test = (
+ ("http://joyreactor.com/post/3721876", { # single image
+ "url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663",
+ "keyword": "966d2acd462732a9ed823a9db5ed19f95734fd10",
+ }),
+ ("http://joyreactor.com/post/3713804", { # 4 images
+ "url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304",
+ "keyword": "84e34d402342607045a65fab6d4d593d146c238a",
+ }),
+ ("http://joyreactor.com/post/3726210", { # gif / video
+ "url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b",
+ "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47",
+ }),
+ ("http://joyreactor.com/post/3668724", { # youtube embed
+ "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a",
+ "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651",
+ }),
+ ("http://joyreactor.cc/post/1299", { # "malformed" JSON
+ "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde",
+ }),
+ )
+
+
+# --------------------------------------------------------------------
+# PornReactor
+
+PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor\.com)"
+
+
+class PornreactorTagExtractor(ReactorTagExtractor):
+ """Extractor for tag searches on pornreactor.cc"""
+ category = "pornreactor"
+ pattern = PR_BASE_PATTERN + r"/tag/([^/?&#]+)"
+ test = (
+ ("http://pornreactor.cc/tag/RiceGnat", {
+ "range": "1-25",
+ "count": ">= 25",
+ }),
+ ("http://fapreactor.com/tag/RiceGnat"),
+ )
+
+
+class PornreactorSearchExtractor(ReactorSearchExtractor):
+ """Extractor for search results on pornreactor.cc"""
+ category = "pornreactor"
+ pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)"
+ test = (
+ ("http://pornreactor.cc/search?q=ecchi+hentai", {
+ "range": "1-25",
+ "count": ">= 25",
+ }),
+ ("http://fapreactor.com/search/ecchi+hentai"),
+ )
+
+
+class PornreactorUserExtractor(ReactorUserExtractor):
+ """Extractor for all posts of a user on pornreactor.cc"""
+ category = "pornreactor"
+ pattern = PR_BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = (
+ ("http://pornreactor.cc/user/Disillusion", {
+ "range": "1-25",
+ "count": ">= 25",
+ }),
+ ("http://fapreactor.com/user/Disillusion"),
+ )
+
+
+class PornreactorPostExtractor(ReactorPostExtractor):
+ """Extractor for single posts on pornreactor.cc"""
+ category = "pornreactor"
+ subcategory = "post"
+ pattern = PR_BASE_PATTERN + r"/post/(\d+)"
+ test = (
+ ("http://pornreactor.cc/post/863166", {
+ "url": "680db1e33ca92ff70b2c0e1708c471cbe2201324",
+ "content": "ec6b0568bfb1803648744077da082d14de844340",
+ }),
+ ("http://fapreactor.com/post/863166", {
+ "url": "864ecd5785e4898301aa8d054dd653b1165be158",
+ }),
+ )
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
new file mode 100644
index 0000000..dda4809
--- /dev/null
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -0,0 +1,97 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract comic-issues and entire comics from https://readcomiconline.to/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .kissmanga import RedirectMixin
+from .. import text
+import re
+
+
+class ReadcomiconlineBase(RedirectMixin):
+ """Base class for readcomiconline extractors"""
+ category = "readcomiconline"
+ directory_fmt = ("{category}", "{comic}", "{issue:>03}")
+ filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
+ archive_fmt = "{issue_id}_{page}"
+ root = "https://readcomiconline.to"
+
+
+class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
+ """Extractor for comic-issues from readcomiconline.to"""
+ subcategory = "issue"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
+ r"(/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+))")
+ test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
+ "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
+ "keyword": "30fe110273e871305001f33c18634516a0a51421",
+ })
+
+ def __init__(self, match):
+ ChapterExtractor.__init__(self, match)
+ self.issue_id = match.group(2)
+
+ def metadata(self, page):
+ comic, pos = text.extract(page, " - Read\r\n ", "\r\n")
+ iinfo, pos = text.extract(page, " ", "\r\n", pos)
+ match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo)
+ return {
+ "comic": comic,
+ "issue": match.group(1) or match.group(2),
+ "issue_id": text.parse_int(self.issue_id),
+ "lang": "en",
+ "language": "English",
+ }
+
+ def images(self, page):
+ return [
+ (url, None)
+ for url in text.extract_iter(
+ page, 'lstImages.push("', '"'
+ )
+ ]
+
+
+class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
+ """Extractor for comics from readcomiconline.to"""
+ chapterclass = ReadcomiconlineIssueExtractor
+ subcategory = "comic"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
+ r"(/Comic/[^/?&#]+/?)$")
+ test = (
+ ("https://readcomiconline.to/Comic/W-i-t-c-h", {
+ "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14",
+ "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",
+ }),
+ ("https://readcomiconline.to/Comic/Bazooka-Jules", {
+ "url": "711674cb78ed10bd2557315f7a67552d01b33985",
+ "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",
+ }),
+ )
+
+ def chapters(self, page):
+ results = []
+ comic, pos = text.extract(page, ' class="barTitle">', '<')
+ page , pos = text.extract(page, ' class="listing">', '</table>', pos)
+
+ comic = comic.rpartition("information")[0].strip()
+ needle = ' title="Read {} '.format(comic)
+ comic = text.unescape(comic)
+
+ for item in text.extract_iter(page, ' href="', ' comic online '):
+ url, _, issue = item.partition(needle)
+ url = url.rpartition('"')[0]
+ if issue.startswith('Issue #'):
+ issue = issue[7:]
+ results.append((self.root + url, {
+ "comic": comic, "issue": issue,
+ "issue_id": text.parse_int(url.rpartition("=")[2]),
+ "lang": "en", "language": "English",
+ }))
+ return results
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
new file mode 100644
index 0000000..1a793a0
--- /dev/null
+++ b/gallery_dl/extractor/recursive.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Recursive extractor"""
+
+from .common import Extractor, Message
+from .. import extractor, util
+import requests
+import re
+
+
+class RecursiveExtractor(Extractor):
+ """Extractor that fetches URLs from a remote or local source"""
+ category = "recursive"
+ pattern = r"r(?:ecursive)?:"
+ test = ("recursive:https://pastebin.com/raw/FLwrCYsT", {
+ "url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
+ })
+
+ def items(self):
+ blist = self.config(
+ "blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS)
+
+ self.session.mount("file://", FileAdapter())
+ page = self.request(self.url.partition(":")[2]).text
+
+ yield Message.Version, 1
+ with extractor.blacklist(blist):
+ for match in re.finditer(r"https?://[^\s\"']+", page):
+ yield Message.Queue, match.group(0), {}
+
+
+class FileAdapter(requests.adapters.BaseAdapter):
+ """Requests adapter for local files"""
+
+ def send(self, request, **kwargs):
+ response = requests.Response()
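+        # strip the "file://" prefix (7 characters) and open the path directly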
+ try:
+ response.raw = open(request.url[7:], "rb")
+ except OSError:
+ import io
+ response.raw = io.BytesIO()
+ response.status_code = requests.codes.bad_request
+ else:
+ response.raw.release_conn = response.raw.close
+ response.status_code = requests.codes.ok
+ return response
+
+ def close(self):
+ pass
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
new file mode 100644
index 0000000..0c5a924
--- /dev/null
+++ b/gallery_dl/extractor/reddit.py
@@ -0,0 +1,313 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from subreddits at https://www.reddit.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, extractor, exception
+from ..cache import cache
+import datetime
+import time
+
+
+class RedditExtractor(Extractor):
+ """Base class for reddit extractors"""
+ category = "reddit"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = RedditAPI(self)
+ self.max_depth = int(self.config("recursion", 0))
+ self._visited = set()
+
+ def items(self):
+ subre = RedditSubmissionExtractor.pattern
+ submissions = self.submissions()
+ depth = 0
+
+ yield Message.Version, 1
+ with extractor.blacklist(
+ util.SPECIAL_EXTRACTORS, [RedditSubredditExtractor]):
+ while True:
+ extra = []
+ for url, data in self._urls(submissions):
+ if url[0] == "#":
+ continue
+ if url[0] == "/":
+ url = "https://www.reddit.com" + url
+
+ match = subre.match(url)
+ if match:
+ extra.append(match.group(1))
+ else:
+ yield Message.Queue, text.unescape(url), data
+
+ if not extra or depth == self.max_depth:
+ return
+ depth += 1
+ submissions = (
+ self.api.submission(sid) for sid in extra
+ if sid not in self._visited
+ )
+
+ def submissions(self):
+ """Return an iterable containing all (submission, comments) tuples"""
+
+ def _urls(self, submissions):
+ for submission, comments in submissions:
+ self._visited.add(submission["id"])
+
+ if not submission["is_self"]:
+ yield submission["url"], submission
+
+ for url in text.extract_iter(
+ submission["selftext_html"] or "", ' href="', '"'):
+ yield url, submission
+
+ for comment in comments:
+ for url in text.extract_iter(
+ comment["body_html"] or "", ' href="', '"'):
+ yield url, comment
+
+
+class RedditSubredditExtractor(RedditExtractor):
+ """Extractor for images from subreddits on reddit.com"""
+ subcategory = "subreddit"
+ pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)"
+ r"(/[a-z]+)?/?"
+ r"(?:\?.*?(?:\bt=([a-z]+))?)?$")
+ test = (
+ ("https://www.reddit.com/r/lavaporn/"),
+ ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"),
+ ("https://old.reddit.com/r/lavaporn/"),
+ ("https://np.reddit.com/r/lavaporn/"),
+ ("https://m.reddit.com/r/lavaporn/"),
+ )
+
+ def __init__(self, match):
+ RedditExtractor.__init__(self, match)
+ self.subreddit, self.order, self.timeframe = match.groups()
+
+ def submissions(self):
+ subreddit = self.subreddit + (self.order or "")
+ params = {"t": self.timeframe} if self.timeframe else {}
+ return self.api.submissions_subreddit(subreddit, params)
+
+
+class RedditSubmissionExtractor(RedditExtractor):
+ """Extractor for images from a submission on reddit.com"""
+ subcategory = "submission"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|"
+ r"redd\.it"
+ r")/([a-z0-9]+)")
+ test = (
+ ("https://www.reddit.com/r/lavaporn/comments/2a00np/", {
+ "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg",
+ }),
+ ("https://old.reddit.com/r/lavaporn/comments/2a00np/"),
+ ("https://np.reddit.com/r/lavaporn/comments/2a00np/"),
+ ("https://m.reddit.com/r/lavaporn/comments/2a00np/"),
+ ("https://redd.it/2a00np/"),
+ )
+
+ def __init__(self, match):
+ RedditExtractor.__init__(self, match)
+ self.submission_id = match.group(1)
+
+ def submissions(self):
+ return (self.api.submission(self.submission_id),)
+
+
+class RedditImageExtractor(Extractor):
+ """Extractor for reddit-hosted images"""
+ category = "reddit"
+ subcategory = "image"
+ archive_fmt = "{filename}"
+ pattern = (r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)"
+ r"/[^/?&#]+(?:\?[^#]*)?")
+ test = (
+ ("https://i.redd.it/upjtjcx2npzz.jpg", {
+ "url": "0de614900feef103e580b632190458c0b62b641a",
+ "content": "cc9a68cf286708d5ce23c68e79cd9cf7826db6a3",
+ }),
+ (("https://i.reddituploads.com/0f44f1b1fca2461f957c713d9592617d"
+ "?fit=max&h=1536&w=1536&s=e96ce7846b3c8e1f921d2ce2671fb5e2"), {
+ "url": "f24f25efcedaddeec802e46c60d77ef975dc52a5",
+ "content": "541dbcc3ad77aa01ee21ca49843c5e382371fae7",
+ }),
+ )
+
+ def items(self):
+ data = text.nameext_from_url(self.url)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, self.url, data
+
+
+class RedditAPI():
+ """Minimal interface for the reddit API"""
+ CLIENT_ID = "6N9uN0krSDE-ig"
+ USER_AGENT = "Python:gallery-dl:0.8.4 (by /u/mikf1)"
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.comments = extractor.config("comments", 500)
+ self.morecomments = extractor.config("morecomments", False)
+ self.refresh_token = extractor.config("refresh-token")
+ self.log = extractor.log
+
+ client_id = extractor.config("client-id", self.CLIENT_ID)
+ user_agent = extractor.config("user-agent", self.USER_AGENT)
+
+ if (client_id == self.CLIENT_ID) ^ (user_agent == self.USER_AGENT):
+ self.client_id = None
+ self.log.warning(
+ "Conflicting values for 'client-id' and 'user-agent': "
+ "override either both or none of them.")
+ else:
+ self.client_id = client_id
+ extractor.session.headers["User-Agent"] = user_agent
+
+ def submission(self, submission_id):
+ """Fetch the (submission, comments)=-tuple for a submission id"""
+ endpoint = "/comments/" + submission_id + "/.json"
+ link_id = "t3_" + submission_id if self.morecomments else None
+ submission, comments = self._call(endpoint, {"limit": self.comments})
+ return (submission["data"]["children"][0]["data"],
+ self._flatten(comments, link_id))
+
+ def submissions_subreddit(self, subreddit, params):
+ """Collect all (submission, comments)-tuples of a subreddit"""
+ endpoint = "/r/" + subreddit + "/.json"
+ params["limit"] = 100
+ return self._pagination(endpoint, params)
+
+ def morechildren(self, link_id, children):
+ """Load additional comments from a submission"""
+ endpoint = "/api/morechildren"
+ params = {"link_id": link_id, "api_type": "json"}
+ index, done = 0, False
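+        # request the remaining comments in batches of up to 100 IDs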
+ while not done:
+ if len(children) - index < 100:
+ done = True
+ params["children"] = ",".join(children[index:index + 100])
+ index += 100
+
+ data = self._call(endpoint, params)["json"]
+ for thing in data["data"]["things"]:
+ if thing["kind"] == "more":
+ children.extend(thing["data"]["children"])
+ else:
+ yield thing["data"]
+
+ def authenticate(self):
+ """Authenticate the application by requesting an access token"""
+ access_token = self._authenticate_impl(self.refresh_token)
+ self.extractor.session.headers["Authorization"] = access_token
+
+ @cache(maxage=3600, keyarg=1)
+ def _authenticate_impl(self, refresh_token=None):
+ """Actual authenticate implementation"""
+ url = "https://www.reddit.com/api/v1/access_token"
+ if refresh_token:
+ self.log.info("Refreshing private access token")
+ data = {"grant_type": "refresh_token",
+ "refresh_token": refresh_token}
+ else:
+ self.log.info("Requesting public access token")
+ data = {"grant_type": ("https://oauth.reddit.com/"
+ "grants/installed_client"),
+ "device_id": "DO_NOT_TRACK_THIS_DEVICE"}
+ response = self.extractor.request(
+ url, method="POST", data=data, auth=(self.client_id, ""))
+ if response.status_code != 200:
+ raise exception.AuthenticationError('"{} ({})"'.format(
+ response.json().get("message"), response.status_code))
+ return "Bearer " + response.json()["access_token"]
+
+ def _call(self, endpoint, params):
+ url = "https://oauth.reddit.com" + endpoint
+ params["raw_json"] = 1
+ self.authenticate()
+ response = self.extractor.request(
+ url, params=params, expect=range(400, 500))
+ remaining = response.headers.get("x-ratelimit-remaining")
+ if remaining and float(remaining) < 2:
+ wait = int(response.headers["x-ratelimit-reset"])
+ self.log.info("Waiting %d seconds for ratelimit reset", wait)
+ time.sleep(wait)
+ data = response.json()
+ if "error" in data:
+ if data["error"] == 403:
+ raise exception.AuthorizationError()
+ if data["error"] == 404:
+ raise exception.NotFoundError()
+ raise Exception(data["message"])
+ return data
+
+ def _pagination(self, endpoint, params, _empty=()):
+ date_fmt = self.extractor.config("date-format", "%Y-%m-%dT%H:%M:%S")
+ date_min = self._parse_datetime("date-min", 0, date_fmt)
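+        # the default maximum (~year 9999) effectively disables the upper bound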
+ date_max = self._parse_datetime("date-max", 253402210800, date_fmt)
+
+ id_min = self._parse_id("id-min", 0)
+ id_max = self._parse_id("id-max", 2147483647)
+
+ while True:
+ data = self._call(endpoint, params)["data"]
+
+ for submission in data["children"]:
+ submission = submission["data"]
+ if (date_min <= submission["created_utc"] <= date_max and
+ id_min <= self._decode(submission["id"]) <= id_max):
+ if submission["num_comments"] and self.comments:
+ try:
+ yield self.submission(submission["id"])
+ except exception.AuthorizationError:
+ pass
+ else:
+ yield submission, _empty
+
+ if not data["after"]:
+ return
+ params["after"] = data["after"]
+
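+    # flatten the nested comment tree; "more" stubs are collected and,
+    # if a link_id is given, expanded via morechildren()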
+ def _flatten(self, comments, link_id=None):
+ extra = []
+ queue = comments["data"]["children"]
+ while queue:
+ comment = queue.pop(0)
+ if comment["kind"] == "more":
+ if link_id:
+ extra.extend(comment["data"]["children"])
+ continue
+ comment = comment["data"]
+ yield comment
+ if comment["replies"]:
+ queue += comment["replies"]["data"]["children"]
+ if link_id and extra:
+ yield from self.morechildren(link_id, extra)
+
+ def _parse_datetime(self, key, default, fmt):
+ ts = self.extractor.config(key, default)
+ if isinstance(ts, str):
+ try:
+ ts = int(datetime.datetime.strptime(ts, fmt).timestamp())
+ except ValueError as exc:
+ self.log.warning("Unable to parse '%s': %s", key, exc)
+ ts = default
+ return ts
+
+ def _parse_id(self, key, default):
+ sid = self.extractor.config(key)
+ return self._decode(sid.rpartition("_")[2].lower()) if sid else default
+
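+    # reddit IDs are base-36 strings (e.g. "2a00np")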
+ @staticmethod
+ def _decode(sid):
+ return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz")
diff --git a/gallery_dl/extractor/rule34.py b/gallery_dl/extractor/rule34.py
new file mode 100644
index 0000000..de7ef45
--- /dev/null
+++ b/gallery_dl/extractor/rule34.py
@@ -0,0 +1,63 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://rule34.xxx/"""
+
+from . import booru
+
+
+class Rule34Extractor(booru.XmlParserMixin,
+ booru.GelbooruPageMixin,
+ booru.BooruExtractor):
+ """Base class for rule34 extractors"""
+ category = "rule34"
+ api_url = "https://rule34.xxx/index.php"
+ post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
+ pool_url = "https://rule34.xxx/index.php?page=pool&s=show&id={}"
+ page_limit = 4000
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params.update({"page": "dapi", "s": "post", "q": "index"})
+
+
+class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
+ """Extractor for images from rule34.xxx based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
+ r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+ test = ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "pattern": r"https?://([^.]+\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
+ "count": 1,
+ })
+
+
+class Rule34PoolExtractor(booru.GelbooruPoolMixin, Rule34Extractor):
+ """Extractor for image-pools from rule34.xxx"""
+ pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
+ r"\?page=pool&s=show&id=(?P<pool>\d+)")
+ test = ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
+ "count": 3,
+ })
+
+
+class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
+ """Extractor for single images from rule34.xxx"""
+ pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
+ r"\?page=post&s=view&id=(?P<post>\d+)")
+ test = ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "danraku",
+ "tags_character": "kashima_(kantai_collection)",
+ "tags_copyright": "kantai_collection",
+ "tags_general": str,
+ "tags_metadata": str,
+ },
+ })
diff --git a/gallery_dl/extractor/safebooru.py b/gallery_dl/extractor/safebooru.py
new file mode 100644
index 0000000..f5f058c
--- /dev/null
+++ b/gallery_dl/extractor/safebooru.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://safebooru.org/"""
+
+from . import booru
+
+
+class SafebooruExtractor(booru.XmlParserMixin,
+ booru.GelbooruPageMixin,
+ booru.BooruExtractor):
+ """Base class for safebooru extractors"""
+ category = "safebooru"
+ api_url = "https://safebooru.org/index.php"
+ post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
+ pool_url = "https://safebooru.org/index.php?page=pool&s=show&id={}"
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.params.update({"page": "dapi", "s": "post", "q": "index"})
+
+
+class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
+ """Extractor for images from safebooru.org based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
+ r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+ test = ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+ "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
+ "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
+ })
+
+
+class SafebooruPoolExtractor(booru.GelbooruPoolMixin, SafebooruExtractor):
+ """Extractor for image-pools from safebooru.org"""
+ pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
+ r"\?page=pool&s=show&id=(?P<pool>\d+)")
+ test = ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
+ "count": 5,
+ })
+
+
+class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
+ """Extractor for single images from safebooru.org"""
+ pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
+ r"\?page=post&s=view&id=(?P<post>\d+)")
+ test = ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
+ "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
+ "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "kawanakajima",
+ "tags_character": "heath_ledger ronald_mcdonald the_joker",
+ "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+ "tags_general": str,
+ },
+ })
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
new file mode 100644
index 0000000..012cb8b
--- /dev/null
+++ b/gallery_dl/extractor/sankaku.py
@@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://chan.sankakucomplex.com/"""
+
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text, util, exception
+from ..cache import cache
+import collections
+import random
+import time
+import re
+
+
+class SankakuExtractor(SharedConfigMixin, Extractor):
+ """Base class for sankaku extractors"""
+ basecategory = "booru"
+ category = "sankaku"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
+ cookienames = ("login", "pass_hash")
+ cookiedomain = "chan.sankakucomplex.com"
+ subdomain = "chan"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.root = "https://" + self.cookiedomain
+ self.logged_in = True
+ self.start_page = 1
+ self.start_post = 0
+ self.extags = self.config("tags", False)
+ self.wait_min = self.config("wait-min", 3.0)
+ self.wait_max = self.config("wait-max", 6.0)
+ if self.wait_max < self.wait_min:
+ self.wait_max = self.wait_min
+
+ def items(self):
+ self.login()
+ data = self.get_metadata()
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for post_id in util.advance(self.get_posts(), self.start_post):
+ self.wait()
+ post = self.get_post_data(post_id)
+ url = post["file_url"]
+ post.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
+ def skip(self, num):
+ self.start_post += num
+ return num
+
+ def get_metadata(self):
+ """Return general metadata"""
+ return {}
+
+ def get_posts(self):
+ """Return an iterable containing all relevant post ids"""
+
+ def get_post_data(self, post_id, extr=text.extract):
+ """Extract metadata of a single post"""
+ url = self.root + "/post/show/" + post_id
+ page = self.request(url, retries=10).text
+
+ tags , pos = extr(page, "<title>", " | ")
+ vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos)
+ vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos)
+ _ , pos = extr(page, "Posted: <", "", pos)
+ created, pos = extr(page, ' title="', '"', pos)
+ rating = extr(page, "<li>Rating: ", "<", pos)[0]
+
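+        # posts without an "Original" link embed the file in an <object> tag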
+ file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
+ if file_url:
+ width , pos = extr(page, '>', 'x', pos)
+ height, pos = extr(page, '', ' ', pos)
+ else:
+ width , pos = extr(page, '<object width=', ' ', pos)
+ height, pos = extr(page, 'height=', '>', pos)
+ file_url = extr(page, '<embed src="', '"', pos)[0]
+
+ data = {
+ "id": text.parse_int(post_id),
+ "md5": file_url.rpartition("/")[2].partition(".")[0],
+ "tags": text.unescape(tags),
+ "vote_average": text.parse_float(vavg),
+ "vote_count": text.parse_int(vcnt),
+ "created_at": created,
+ "rating": (rating or "?")[0].lower(),
+ "file_url": "https:" + text.unescape(file_url),
+ "width": text.parse_int(width),
+ "height": text.parse_int(height),
+ }
+
+ if self.extags:
+ tags = collections.defaultdict(list)
+ tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0]
+ pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
+ for tag_type, tag_name in pattern.findall(tags_html or ""):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ data["tags_" + key] = " ".join(value)
+
+ return data
+
+ def wait(self):
+ """Wait for a randomly chosen amount of seconds"""
+ time.sleep(random.uniform(self.wait_min, self.wait_max))
+
+ def login(self):
+ """Login and set necessary cookies"""
+ if self._check_cookies(self.cookienames):
+ return
+ username, password = self._get_auth_info()
+ if username:
+ cookies = self._login_impl((username, self.subdomain), password)
+ self._update_cookies(cookies)
+ else:
+ self.logged_in = False
+
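+    # cache login cookies for 90 days, keyed by (username, subdomain)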
+ @cache(maxage=90*24*3600, keyarg=1)
+ def _login_impl(self, usertuple, password):
+ username = usertuple[0]
+ self.log.info("Logging in as %s", username)
+ url = self.root + "/user/authenticate"
+ data = {
+ "url": "",
+ "user[name]": username,
+ "user[password]": password,
+ "commit": "Login",
+ }
+ response = self.request(url, method="POST", data=data)
+
+ if not response.history or response.url != self.root + "/user/home":
+ raise exception.AuthenticationError()
+ cookies = response.history[0].cookies
+ return {c: cookies[c] for c in self.cookienames}
+
+
+class SankakuTagExtractor(SankakuExtractor):
+ """Extractor for images from chan.sankakucomplex.com by search-tags"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = r"(?:https?://)?chan\.sankakucomplex\.com/\?([^#]*)"
+ test = (
+ ("https://chan.sankakucomplex.com/?tags=bonocho", {
+ "count": 5,
+ "pattern": r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
+ r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
+ }),
+ # respect 'page' query parameter
+ ("https://chan.sankakucomplex.com/?tags=bonocho&page=2", {
+ "count": 0,
+ }),
+ # respect 'next' query parameter
+ ("https://chan.sankakucomplex.com/?tags=bonocho&next=182284", {
+ "count": 1,
+ }),
+ # error on five or more tags
+ ("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", {
+ "options": (("username", None),),
+ "exception": exception.StopExtraction,
+ }),
+ # match arbitrary query parameters
+ ("https://chan.sankakucomplex.com"
+ "/?tags=marie_rose&page=98&next=3874906&commit=Search"),
+ )
+ per_page = 20
+
+ def __init__(self, match):
+ SankakuExtractor.__init__(self, match)
+ query = text.parse_query(match.group(1))
+ self.tags = text.unquote(query.get("tags", "").replace("+", " "))
+ self.start_page = text.parse_int(query.get("page"), 1)
+ self.next = text.parse_int(query.get("next"), 0)
+
+ def skip(self, num):
+ if self.next:
+ self.start_post += num
+ else:
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+
+ def get_metadata(self):
+ if not self.next:
+ max_page = 50 if self.logged_in else 25
+ if self.start_page > max_page:
+ self.log.info("Traversing from page %d to page %d",
+ max_page, self.start_page)
+ self.start_post += self.per_page * (self.start_page - max_page)
+ self.start_page = max_page
+
+ tags = self.tags.split()
+ if not self.logged_in and len(tags) > 4:
+ self.log.error("Unauthenticated users cannot use "
+ "more than 4 tags at once.")
+ raise exception.StopExtraction()
+ return {"search_tags": " ".join(tags)}
+
+ def get_posts(self):
+ params = {"tags": self.tags}
+
+ if self.next:
+ params["next"] = self.next
+ else:
+ params["page"] = self.start_page
+
+ while True:
+ self.wait()
+ page = self.request(self.root, params=params, retries=10).text
+ pos = page.find("<div id=more-popular-posts-link>") + 1
+
+ ids = list(text.extract_iter(page, '" id=p', '>', pos))
+ if not ids:
+ return
+ yield from ids
+
+ next_qs = text.extract(page, 'next-page-url="/?', '"', pos)[0]
+ next_id = text.parse_query(next_qs).get("next")
+
+ # stop if the same "next" parameter occurs twice in a row (#265)
+ if "next" in params and params["next"] == next_id:
+ return
+
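+            # without a next-page URL, continue from "last seen ID - 1"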
+ params["next"] = next_id or (text.parse_int(ids[-1]) - 1)
+ params["page"] = "2"
+
+
+class SankakuPoolExtractor(SankakuExtractor):
+ """Extractor for image-pools from chan.sankakucomplex.com"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool}")
+ archive_fmt = "p_{pool}_{id}"
+ pattern = r"(?:https?://)?chan\.sankakucomplex\.com/pool/show/(\d+)"
+ test = ("https://chan.sankakucomplex.com/pool/show/90", {
+ "count": 5,
+ })
+ per_page = 24
+
+ def __init__(self, match):
+ SankakuExtractor.__init__(self, match)
+ self.pool_id = match.group(1)
+
+ def skip(self, num):
+ pages, posts = divmod(num, self.per_page)
+ self.start_page += pages
+ self.start_post += posts
+ return num
+
+ def get_metadata(self):
+ return {"pool": self.pool_id}
+
+ def get_posts(self):
+ url = self.root + "/pool/show/" + self.pool_id
+ params = {"page": self.start_page}
+
+ while True:
+ page = self.request(url, params=params, retries=10).text
+ ids = list(text.extract_iter(page, '" id=p', '>'))
+
+ yield from ids
+ if len(ids) < self.per_page:
+ return
+
+ params["page"] += 1
+
+
+class SankakuPostExtractor(SankakuExtractor):
+ """Extractor for single images from chan.sankakucomplex.com"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?chan\.sankakucomplex\.com/post/show/(\d+)"
+ test = ("https://chan.sankakucomplex.com/post/show/360451", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "bonocho",
+ "tags_copyright": "batman_(series) the_dark_knight",
+ "tags_medium": "sketch copyright_name",
+ "tags_studio": "dc_comics",
+ "tags_character": str,
+ "tags_general": str,
+ },
+ })
+
+ def __init__(self, match):
+ SankakuExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def get_posts(self):
+ return (self.post_id,)
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
new file mode 100644
index 0000000..22b2b63
--- /dev/null
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.sankakucomplex.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import re
+
+
+class SankakucomplexExtractor(Extractor):
+ """Base class for sankakucomplex extractors"""
+ category = "sankakucomplex"
+ root = "https://www.sankakucomplex.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+
+class SankakucomplexArticleExtractor(SankakucomplexExtractor):
+ """Extractor for articles on www.sankakucomplex.com"""
+ subcategory = "article"
+ directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{date:%Y%m%d}_{filename}"
+ pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ r"/(\d{4}/\d\d/\d\d/[^/?&#]+)")
+ test = (
+ ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
+ "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
+ "keyword": "4b3b5766b277a5d0acbec90fa8f2343262b07efd",
+ }),
+ ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
+ "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
+ "keyword": "f47a416d680717855bbc3e4f0cd44479f61d9aa4",
+ }),
+ )
+
+ def items(self):
+ url = "{}/{}/?pg=X".format(self.root, self.path)
+ extr = text.extract_from(self.request(url).text)
+ data = {
+ "title" : text.unescape(
+ extr('property="og:title" content="', '"')),
+ "description": text.unescape(
+ extr('property="og:description" content="', '"')),
+ "date" : text.parse_datetime(
+ extr('property="article:published_time" content="', '"')),
+ }
+ imgs = self.images(extr)
+ data["count"] = len(imgs)
+ data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for img in imgs:
+ img.update(data)
+ yield Message.Url, img["url"], img
+
+ def images(self, extr):
+ num = 0
+ imgs = []
+ urls = set()
+ orig = re.compile(r"-\d+x\d+\.")
+
+ extr('<div class="entry-content">', '')
+ while True:
+ url = extr('data-lazy-src="', '"')
+ if not url:
+ return imgs
+ if url in urls:
+ continue
+ if url[0] == "/":
+ url = text.urljoin(self.root, url)
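+            # strip the "-<width>x<height>" thumbnail suffix to get the full-size file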
+ url = orig.sub(".", url)
+ num += 1
+ imgs.append(text.nameext_from_url(url, {
+ "url" : url,
+ "num" : num,
+ }))
+ urls.add(url)
+
+
+class SankakucomplexTagExtractor(SankakucomplexExtractor):
+ """Extractor for sankakucomplex blog articles by tag or author"""
+ subcategory = "tag"
+ pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ r"/((?:tag|category|author)/[^/&?#]+)")
+ test = (
+ ("https://www.sankakucomplex.com/tag/cosplay/", {
+ "range": "1-50",
+ "count": 50,
+ "pattern": SankakucomplexArticleExtractor.pattern,
+ }),
+ ("https://www.sankakucomplex.com/category/anime/"),
+ ("https://www.sankakucomplex.com/author/rift/page/5/"),
+ )
+
+ def items(self):
+ pnum = 1
+ last = None
+ data = {"_extractor": SankakucomplexArticleExtractor}
+
+ yield Message.Version, 1
+ while True:
+ url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
+ response = self.request(url, expect=(404,))
+ if response.status_code == 404:
+ return
+ for url in text.extract_iter(response.text, 'data-direct="', '"'):
+ if url != last:
+ last = url
+ yield Message.Queue, url, data
+ pnum += 1
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
new file mode 100644
index 0000000..f63c999
--- /dev/null
+++ b/gallery_dl/extractor/seiga.py
@@ -0,0 +1,198 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://seiga.nicovideo.jp/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+from ..cache import cache
+
+
+class SeigaExtractor(Extractor):
+ """Base class for seiga extractors"""
+ category = "seiga"
+ archive_fmt = "{image_id}"
+ cookiedomain = ".nicovideo.jp"
+ root = "https://seiga.nicovideo.jp"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.start_image = 0
+
+ def items(self):
+ self.login()
+ images = iter(self.get_images())
+ data = next(images)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for image in util.advance(images, self.start_image):
+ data.update(image)
+ data["extension"] = None
+ yield Message.Url, self.get_image_url(data["image_id"]), data
+
+ def get_images(self):
+ """Return iterable containing metadata and images"""
+
+ def get_image_url(self, image_id):
+ """Get url for an image with id 'image_id'"""
+ url = "{}/image/source/{}".format(self.root, image_id)
+ response = self.request(
+ url, method="HEAD", allow_redirects=False, expect=(404,))
+ if response.status_code == 404:
+ raise exception.NotFoundError("image")
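+        # swap "/o/" for "/priv/" in the redirect target, which appears to
+        # point at the full-size image for logged-in sessions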
+ return response.headers["Location"].replace("/o/", "/priv/", 1)
+
+ def login(self):
+ """Login and set necessary cookies"""
+ if not self._check_cookies(("user_session",)):
+ username, password = self._get_auth_info()
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=7*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "https://account.nicovideo.jp/api/v1/login"
+ data = {"mail_tel": username, "password": password}
+
+ self.request(url, method="POST", data=data)
+ if "user_session" not in self.session.cookies:
+ raise exception.AuthenticationError()
+ del self.session.cookies["nicosid"]
+ return self.session.cookies
+
+
+class SeigaUserExtractor(SeigaExtractor):
+ """Extractor for images of a user from seiga.nicovideo.jp"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "{user[id]}")
+ filename_fmt = "{category}_{user[id]}_{image_id}.{extension}"
+ pattern = (r"(?:https?://)?(?:www\.|seiga\.)?nicovideo\.jp/"
+ r"user/illust/(\d+)(?:\?(?:[^&]+&)*sort=([^&#]+))?")
+ test = (
+ ("https://seiga.nicovideo.jp/user/illust/39537793", {
+ "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+",
+ "count": ">= 4",
+ "keyword": {
+ "user": {
+ "id": 39537793,
+ "message": str,
+ "name": str,
+ },
+ "clips": int,
+ "comments": int,
+ "count": int,
+ "extension": None,
+ "image_id": int,
+ "title": str,
+ "views": int,
+ },
+ }),
+ ("https://seiga.nicovideo.jp/user/illust/79433", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://seiga.nicovideo.jp/user/illust/39537793"
+ "?sort=image_view&target=illust_all"),
+ )
+
+ def __init__(self, match):
+ SeigaExtractor.__init__(self, match)
+ self.user_id, self.order = match.groups()
+ self.start_page = 1
+
+ def skip(self, num):
+ pages, images = divmod(num, 40)
+ self.start_page += pages
+ self.start_image += images
+ return num
+
+ def get_metadata(self, page):
+ """Collect metadata from 'page'"""
+ data = text.extract_all(page, (
+ ("name" , '<img alt="', '"'),
+ ("msg" , '<li class="user_message">', '</li>'),
+ (None , '<span class="target_name">すべて</span>', ''),
+ ("count", '<span class="count ">', '</span>'),
+ ))[0]
+
+ if not data["name"] and "ユーザー情報が取得出来ませんでした" in page:
+ raise exception.NotFoundError("user")
+
+ return {
+ "user": {
+ "id": text.parse_int(self.user_id),
+ "name": data["name"],
+ "message": (data["msg"] or "").strip(),
+ },
+ "count": text.parse_int(data["count"]),
+ }
+
+ def get_images(self):
+ url = "{}/user/illust/{}".format(self.root, self.user_id)
+ params = {"sort": self.order, "page": self.start_page,
+ "target": "illust_all"}
+
+ while True:
+ cnt = 0
+ page = self.request(url, params=params).text
+
+ if params["page"] == self.start_page:
+ yield self.get_metadata(page)
+
+ for info in text.extract_iter(
+ page, '<li class="list_item', '</a></li> '):
+ data = text.extract_all(info, (
+ ("image_id", '/seiga/im', '"'),
+ ("title" , '<li class="title">', '</li>'),
+ ("views" , '</span>', '</li>'),
+ ("comments", '</span>', '</li>'),
+ ("clips" , '</span>', '</li>'),
+ ))[0]
+ for key in ("image_id", "views", "comments", "clips"):
+ data[key] = text.parse_int(data[key])
+ yield data
+ cnt += 1
+
+ if cnt < 40:
+ return
+ params["page"] += 1
+
+
+class SeigaImageExtractor(SeigaExtractor):
+ """Extractor for single images from seiga.nicovideo.jp"""
+ subcategory = "image"
+ filename_fmt = "{category}_{image_id}.{extension}"
+ pattern = (r"(?:https?://)?(?:"
+ r"(?:seiga\.|www\.)?nicovideo\.jp/(?:seiga/im|image/source/)"
+ r"|lohas\.nicoseiga\.jp/(?:thumb|(?:priv|o)/[^/]+/\d+)/)(\d+)")
+ test = (
+ ("https://seiga.nicovideo.jp/seiga/im5977527", {
+ "keyword": "f66ba5de33d4ce2cb57f23bb37e1e847e0771c10",
+ "content": "d9202292012178374d57fb0126f6124387265297",
+ }),
+ ("https://seiga.nicovideo.jp/seiga/im123", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://seiga.nicovideo.jp/image/source/5977527"),
+ ("https://lohas.nicoseiga.jp/thumb/5977527i"),
+ ("https://lohas.nicoseiga.jp/priv"
+ "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
+ ("https://lohas.nicoseiga.jp/o"
+ "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"),
+ )
+
+ def __init__(self, match):
+ SeigaExtractor.__init__(self, match)
+ self.image_id = match.group(1)
+
+ def skip(self, num):
+ self.start_image += num
+ return num
+
+ def get_images(self):
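+        # the first element is the (empty) metadata dict consumed by items()
+        # via next(); the second entry describes the single image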
+ return ({}, {"image_id": text.parse_int(self.image_id)})
diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py
new file mode 100644
index 0000000..736173f
--- /dev/null
+++ b/gallery_dl/extractor/senmanga.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters from from https://raw.senmanga.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class SenmangaChapterExtractor(Extractor):
+ """Extractor for manga-chapters from raw.senmanga.com"""
+ category = "senmanga"
+ subcategory = "chapter"
+ directory_fmt = ("{category}", "{manga}", "{chapter_string}")
+ filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
+ archive_fmt = "{manga}_{chapter_string}_{page}"
+ pattern = r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"
+ test = (
+ ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
+ "url": "5f95140ff511d8497e2ec08fa7267c6bb231faec",
+ "keyword": "705d941a150765edb33cd2707074bd703a93788c",
+ "content": "0e37b1995708ffc175f2e175d91a518e6948c379",
+ }),
+ ("http://raw.senmanga.com/Love-Lab/2016-03/1", {
+ "url": "8347b9f00c14b864dd3c19a1f5ae52adb2ef00de",
+ "keyword": "8a8ab2529ba2edfc83a6b3a8bede1d6c580db7b4",
+ }),
+ )
+ root = "https://raw.senmanga.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ part = match.group(1)
+ self.chapter_url = "{}/{}/".format(self.root, part)
+ self.img_url = "{}/viewer/{}/".format(self.root, part)
+ self.session.headers["Referer"] = self.chapter_url
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["page"] in range(1, data["count"]+1):
+ data["extension"] = None
+ yield Message.Url, self.img_url + str(data["page"]), data
+
+ def metadata(self):
+ """Collect metadata for extractor-job"""
+ page = self.request(self.chapter_url).text
+ self.session.cookies.clear()
+ title, pos = text.extract(page, '<title>', '</title>')
+ count, pos = text.extract(page, '</select> of ', '\n', pos)
+ manga, _, chapter = title.partition(" - Chapter ")
+
+ return {
+ "manga": text.unescape(manga).replace("-", " "),
+ "chapter_string": chapter.partition(" - Page ")[0],
+ "count": text.parse_int(count),
+ "lang": "jp",
+ "language": "Japanese",
+ }
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
new file mode 100644
index 0000000..aa2b16b
--- /dev/null
+++ b/gallery_dl/extractor/sexcom.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.sex.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class SexcomExtractor(Extractor):
+ """Base class for sexcom extractors"""
+ category = "sexcom"
+ directory_fmt = ("{category}")
+ filename_fmt = "{pin_id}{title:? //}.{extension}"
+ archive_fmt = "{pin_id}"
+ root = "https://www.sex.com"
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, self.metadata()
+ for url in self.pins():
+ pin = self._parse_pin(url)
+ yield Message.Url, pin["url"], pin
+
+ def metadata(self):
+ return {}
+
+ def pins(self):
+ return ()
+
+ def _pagination(self, url):
+ while True:
+ extr = text.extract_from(self.request(url).text)
+
+ while True:
+ href = extr('<a class="image_wrapper" href="', '"')
+ if not href:
+ break
+ yield self.root + href
+
+ pager = extr('id="pagenum"', '</div>')
+ url = text.extract(pager, ' href="', '"')[0]
+ if not url:
+ return
+ url = text.urljoin(self.root, url)
+
+ def _parse_pin(self, pin_url):
+ extr = text.extract_from(self.request(pin_url).text)
+ data = {}
+
+ data["thumbnail"] = extr('itemprop="thumbnail" content="', '"')
+ data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower()
+ data["title"] = text.unescape(extr('itemprop="name">' , '<'))
+ data["repins"] = text.parse_int(text.extract(
+ extr('"btn-group"', '</div>'), '"btn btn-primary">' , '<')[0])
+ data["likes"] = text.parse_int(text.extract(
+ extr('"btn-group"', '</div>'), '"btn btn-default">' , '<')[0])
+ data["pin_id"] = text.parse_int(extr('data-id="', '"'))
+
+ if data["type"] == "video":
+ info = extr("player.updateSrc(", ");")
+
+ if info:
+ path = text.extract(info, "src: '", "'")[0]
+ data["filename"] = path.rpartition("/")[2]
+ data["extension"] = "mp4"
+ if "'HD'" in info:
+ path += "/hd"
+ data["url"] = self.root + path
+ else:
+ data["url"] = "ytdl:" + text.extract(
+ extr('<iframe', '>'), ' src="', '"')[0]
+ else:
+ data["url"] = extr(' src="', '"')
+ text.nameext_from_url(data["url"], data)
+
+ data["uploader"] = extr('itemprop="author">', '<')
+ data["date"] = extr('datetime="', '"')
+ data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>'))
+ data["comments"] = text.parse_int(extr('Comments (', ')'))
+
+ return data
+
+
+class SexcomPinExtractor(SexcomExtractor):
+ """Extractor a pinned image or video on www.sex.com"""
+ subcategory = "pin"
+ directory_fmt = ("{category}",)
+ pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)"
+ test = (
+ # picture
+ ("https://www.sex.com/pin/56714360/", {
+ "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86",
+ "keyword": {
+ "comments": int,
+ "date": "2018-10-02T21:18:17-04:00",
+ "extension": "jpg",
+ "filename": "20037816",
+ "likes": int,
+ "pin_id": 56714360,
+ "repins": int,
+ "tags": list,
+ "thumbnail": str,
+ "title": "Pin #56714360",
+ "type": "picture",
+ "uploader": "alguem",
+ "url": str,
+ },
+ }),
+ # gif
+ ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", {
+ "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1",
+ }),
+ # video
+ ("https://www.sex.com/pin/55748381/", {
+ "pattern": "https://www.sex.com/video/stream/776238/hd",
+ }),
+ # pornhub embed
+ ("https://www.sex.com/pin/55847384-very-nicely-animated/", {
+ "pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2",
+ }),
+ )
+
+ def __init__(self, match):
+ SexcomExtractor.__init__(self, match)
+ self.pin_id = match.group(1)
+
+ def pins(self):
+ return ("{}/pin/{}/".format(self.root, self.pin_id),)
+
+
+class SexcomBoardExtractor(SexcomExtractor):
+ """Extractor for pins from a board on www.sex.com"""
+ subcategory = "board"
+ directory_fmt = ("{category}", "{user}", "{board}")
+ pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user"
+ r"/([^/?&#]+)/(?!(?:following|pins|repins|likes)/)([^/?&#]+)")
+ test = ("https://www.sex.com/user/ronin17/exciting-hentai/", {
+ "count": ">= 15",
+ })
+
+ def __init__(self, match):
+ SexcomExtractor.__init__(self, match)
+ self.user, self.board = match.groups()
+
+ def metadata(self):
+ return {
+ "user" : text.unquote(self.user),
+ "board": text.unquote(self.board),
+ }
+
+ def pins(self):
+ url = "{}/user/{}/{}/".format(self.root, self.user, self.board)
+ return self._pagination(url)
+
+
+class SexcomSearchExtractor(SexcomExtractor):
+ """Extractor for search results on www.sex.com"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "search", "{search[query]}")
+ pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:"
+ r"(pic|gif|video)s/([^/?&#]+)|search/(pic|gif|video)s"
+ r")/?(?:\?([^#]+))?)")
+ test = (
+ ("https://www.sex.com/search/pics?query=ecchi", {
+ "range": "1-10",
+ "count": 10,
+ }),
+ ("https://www.sex.com/videos/hentai/", {
+ "range": "1-10",
+ "count": 10,
+ }),
+ )
+
+ def __init__(self, match):
+ SexcomExtractor.__init__(self, match)
+ self.path = match.group(1)
+
+ self.search = text.parse_query(match.group(5))
+ self.search["type"] = match.group(2) or match.group(4)
+ if "query" not in self.search:
+ self.search["query"] = match.group(3) or ""
+
+ def metadata(self):
+ return {"search": self.search}
+
+ def pins(self):
+ url = "{}/{}".format(self.root, self.path)
+ return self._pagination(url)
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
new file mode 100644
index 0000000..35895bb
--- /dev/null
+++ b/gallery_dl/extractor/shopify.py
@@ -0,0 +1,136 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Shopify instances"""
+
+from .common import Extractor, Message, SharedConfigMixin, generate_extractors
+from .. import text
+import time
+import re
+
+
+class ShopifyExtractor(SharedConfigMixin, Extractor):
+ """Base class for Shopify extractors"""
+ basecategory = "shopify"
+ filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"
+ archive_fmt = "{id}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_url = self.root + match.group(1)
+
+ def request(self, url, method="GET", expect=range(400, 500), **kwargs):
+ tries = 0
+ kwargs["expect"] = expect
+ while True:
+ response = Extractor.request(self, url, method, **kwargs)
+ if response.status_code not in (429, 430):
+ return response
+ tries += 1
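+            # exponential backoff: wait 8, 16, 32, ... seconds between retries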
+ waittime = 2 ** (tries + 2)
+ self.log.warning(
+ "HTTP status %s: %s - Waiting for %d seconds",
+ response.status_code, response.reason, waittime)
+ time.sleep(waittime)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ headers = {"X-Requested-With": "XMLHttpRequest"}
+ for url in self.products():
+ response = self.request(url + ".json", headers=headers)
+ if response.status_code >= 400:
+ self.log.warning('Skipping %s ("%d: %s")',
+ url, response.status_code, response.reason)
+ continue
+ product = response.json()["product"]
+ del product["image"]
+
+ for num, image in enumerate(product.pop("images"), 1):
+ text.nameext_from_url(image["src"], image)
+ image.update(data)
+ image["product"] = product
+ image["num"] = num
+ yield Message.Url, image["src"], image
+
+ def metadata(self):
+ """Return general metadata"""
+ return {}
+
+ def products(self):
+ """Return an iterable with all relevant product URLs"""
+
+
+class ShopifyCollectionExtractor(ShopifyExtractor):
+ """Base class for collection extractors for Shopify based sites"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{collection[title]}")
+ pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)"
+
+ def __init__(self, match):
+ ShopifyExtractor.__init__(self, match)
+ self.params = match.group(2)
+
+ def metadata(self):
+ return self.request(self.item_url + ".json").json()
+
+ def products(self):
+ params = text.parse_query(self.params)
+ params["page"] = text.parse_int(params.get("page"), 1)
+ search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+")
+
+ while True:
+ page = self.request(self.item_url, params=params).text
+ urls = search_re.findall(page)
+
+ if not urls:
+ return
+ for path in urls:
+ yield self.root + path
+ params["page"] += 1
+
+
+class ShopifyProductExtractor(ShopifyExtractor):
+ """Base class for product extractors for Shopify based sites"""
+ subcategory = "product"
+ directory_fmt = ("{category}", "Products")
+ pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)"
+
+ def products(self):
+ return (self.item_url,)
+
+
+EXTRACTORS = {
+ "fashionnova": {
+ "root": "https://www.fashionnova.com",
+ "pattern": r"(?:www\.)?fashionnova\.com",
+ "test-product": (
+ ("https://www.fashionnova.com/products/essential-slide-red", {
+ "pattern": r"https?://cdn\.shopify.com/",
+ "count": 3,
+ }),
+ ("https://www.fashionnova.com/collections/flats/products/name"),
+ ),
+ "test-collection": (
+ ("https://www.fashionnova.com/collections/mini-dresses", {
+ "range": "1-20",
+ "count": 20,
+ }),
+ ("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
+ ("https://www.fashionnova.com/collections/mini-dresses#1"),
+ ),
+
+ },
+}
+
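+# generate one product and one collection extractor class per entry in
+# EXTRACTORS (currently just "fashionnova"), based on the Shopify*Extractor
+# classes above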
+generate_extractors(EXTRACTORS, globals(), (
+ ShopifyProductExtractor,
+ ShopifyCollectionExtractor,
+))
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
new file mode 100644
index 0000000..44dc6fe
--- /dev/null
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract hentai-manga from https://www.simply-hentai.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util, exception
+
+
+class SimplyhentaiGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from simply-hentai.com"""
+ category = "simplyhentai"
+ archive_fmt = "{image_id}"
+ pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com"
+ r"(?!/(?:album|gifs?|images?|series)(?:/|$))"
+ r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)")
+ test = (
+ (("https://original-work.simply-hentai.com"
+ "/amazon-no-hiyaku-amazon-elixir"), {
+ "url": "258289249990502c3138719cb89e995a60861e49",
+ "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b",
+ }),
+ ("https://www.simply-hentai.com/notfound", {
+ "exception": exception.GalleryDLException,
+ }),
+ # custom subdomain
+ ("https://pokemon.simply-hentai.com/mao-friends-9bc39"),
+ # www subdomain, two path segments
+ ("https://www.simply-hentai.com/vocaloid/black-magnet"),
+ )
+
+ def __init__(self, match):
+ url = "https://" + match.group(1)
+ GalleryExtractor.__init__(self, match, url)
+ self.session.headers["Referer"] = url
+
+ def metadata(self, page):
+ extr = text.extract
+ title , pos = extr(page, '<meta property="og:title" content="', '"')
+ if not title:
+ raise exception.NotFoundError("gallery")
+ gid , pos = extr(page, '/Album/', '/', pos)
+ series, pos = extr(page, 'box-title">Series</div>', '</div>', pos)
+ lang , pos = extr(page, 'box-title">Language</div>', '</div>', pos)
+ chars , pos = extr(page, 'box-title">Characters</div>', '</div>', pos)
+ tags , pos = extr(page, 'box-title">Tags</div>', '</div>', pos)
+ artist, pos = extr(page, 'box-title">Artists</div>', '</div>', pos)
+ date , pos = extr(page, 'Uploaded', '</div>', pos)
+ lang = text.remove_html(lang) if lang else None
+
+ return {
+ "gallery_id": text.parse_int(gid),
+ "title" : text.unescape(title),
+ "artist" : text.split_html(artist),
+ "parody" : text.split_html(series),
+ "characters": text.split_html(chars),
+ "tags" : text.split_html(tags),
+ "lang" : util.language_to_code(lang),
+ "language" : lang,
+ "date" : text.remove_html(date),
+ }
+
+ def images(self, _):
+ url = self.chapter_url + "/all-pages"
+ headers = {"Accept": "application/json"}
+ images = self.request(url, headers=headers).json()
+ return [
+ (urls["full"], {"image_id": text.parse_int(image_id)})
+ for image_id, urls in sorted(images.items())
+ ]
+
+
+class SimplyhentaiImageExtractor(Extractor):
+ """Extractor for individual images from simply-hentai.com"""
+ category = "simplyhentai"
+ subcategory = "image"
+ directory_fmt = ("{category}", "{type}s")
+ filename_fmt = "{category}_{token}{title:?_//}.{extension}"
+ archive_fmt = "{token}"
+ pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com"
+ r"/(image|gif)/[^/?&#]+)")
+ test = (
+ (("https://www.simply-hentai.com/image"
+ "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), {
+ "url": "0338eb137830ab6f81e5f410d3936ef785d063d9",
+ "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2",
+ }),
+ ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", {
+ "url": "11c060d7ec4dfd0bd105300b6e1fd454674a5af1",
+ "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_url = "https://www." + match.group(1)
+ self.type = match.group(2)
+
+ def items(self):
+ page = self.request(self.page_url).text
+ url_search = 'data-src="' if self.type == "image" else '<source src="'
+
+ title, pos = text.extract(page, '"og:title" content="', '"')
+ descr, pos = text.extract(page, '"og:description" content="', '"', pos)
+ url , pos = text.extract(page, url_search, '"', pos)
+
+ tags = text.extract(descr, " tagged with ", " online for free ")[0]
+ if tags:
+ tags = tags.split(", ")
+ tags[-1] = tags[-1].partition(" ")[2]
+ else:
+ tags = []
+
+ data = text.nameext_from_url(url, {
+ "title": text.unescape(title) if title else "",
+ "tags": tags,
+ "type": self.type,
+ })
+ data["token"] = data["filename"].rpartition("_")[2]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+
+class SimplyhentaiVideoExtractor(Extractor):
+ """Extractor for hentai videos from simply-hentai.com"""
+ category = "simplyhentai"
+ subcategory = "video"
+ directory_fmt = ("{category}", "{type}s")
+ filename_fmt = "{title}{episode:?_//>02}.{extension}"
+ archive_fmt = "{title}_{episode}"
+ pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)"
+ test = (
+ ("https://videos.simply-hentai.com/creamy-pie-episode-02", {
+ "pattern": r"https://www\.googleapis\.com/drive/v3/files"
+ r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+",
+ "keyword": "29d63987fed33f0a9f4b3786d1d71b03d793250a",
+ "count": 1,
+ }),
+ (("https://videos.simply-hentai.com"
+ "/1715-tifa-in-hentai-gang-bang-3d-movie"), {
+ "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0",
+ "keyword": "c561341aa3c6999f615abf1971d28fb2a83da2a7",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_url = "https://" + match.group(1)
+
+ def items(self):
+ page = self.request(self.page_url).text
+
+ title, pos = text.extract(page, "<title>", "</title>")
+ tags , pos = text.extract(page, ">Tags</div>", "</div>", pos)
+ date , pos = text.extract(page, ">Upload Date</div>", "</div>", pos)
+ title = title.rpartition(" - ")[0]
+
+ if "<video" in page:
+ video_url = text.extract(page, '<source src="', '"', pos)[0]
+ episode = 0
+ else:
+ # video url from myhentai.tv embed
+ pos = page.index('<div class="video-frame-container">', pos)
+ embed_url = text.extract(page, 'src="', '"', pos)[0].replace(
+ "embedplayer.php?link=", "embed.php?name=")
+ embed_page = self.request(embed_url).text
+ video_url = text.extract(embed_page, '"file":"', '"')[0]
+ title, _, episode = title.rpartition(" Episode ")
+
+ data = text.nameext_from_url(video_url, {
+ "title": text.unescape(title),
+ "episode": text.parse_int(episode),
+ "tags": text.split_html(tags)[::2],
+ "date": text.remove_html(date),
+ "type": "video",
+ })
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, video_url, data
diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py
new file mode 100644
index 0000000..127cce8
--- /dev/null
+++ b/gallery_dl/extractor/slickpic.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.slickpic.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import time
+
+
+BASE_PATTERN = r"(?:https?://)?([^.]+)\.slickpic\.com"
+
+
+class SlickpicExtractor(Extractor):
+ """Base class for slickpic extractors"""
+ category = "slickpic"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.root = "https://{}.slickpic.com".format(self.user)
+
+
+class SlickpicAlbumExtractor(SlickpicExtractor):
+ """Extractor for albums on slickpic.com"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{user[name]}",
+ "{album[id]} {album[title]}")
+ filename_fmt = "{num:>03}_{id}{title:?_//}.{extension}"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/albums/([^/?&#]+)"
+ test = (
+ ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
+ "url": "58bd94ebc80fd906e9879826970b408d54c6da07",
+ "keyword": "54a9d6f9e42ae43c644aa9316186fb9d9955fe53",
+ }),
+ ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
+ "range": "34",
+ "content": "cec6630e659dc72db1ee1a9a6f3b525189261988",
+ }),
+ )
+
+ def __init__(self, match):
+ SlickpicExtractor.__init__(self, match)
+ self.album = match.group(2)
+
+ def items(self):
+ data = self.metadata()
+ imgs = self.images(data)
+
+ data = {
+ "album": {
+ "id" : text.parse_int(data["aid"]),
+ "title": text.unescape(data["title"]),
+ },
+ "user": {
+ "id" : text.parse_int(data["uid"]),
+ "name": text.unescape(data["user"]),
+ "nick": self.user
+ },
+ "count": len(imgs),
+ }
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, img in enumerate(imgs, 1):
+ url = img["url_rsz"] + "/o/" + img["fname"]
+ img = text.nameext_from_url(img["fname"], {
+ "url" : url,
+ "num" : num,
+ "id" : text.parse_int(img["id"]),
+ "width" : text.parse_int(img["width"]),
+ "height" : text.parse_int(img["height"]),
+ "title" : img["title"],
+ "description": img["descr"],
+ })
+ img.update(data)
+ yield Message.Url, url, img
+
+ def metadata(self):
+ url = "{}/albums/{}/?wallpaper".format(self.root, self.album)
+ extr = text.extract_from(self.request(url).text)
+
+ title = text.unescape(extr("<title>", "</title>"))
+ title, _, user = title.rpartition(" by ")
+
+ return {
+ "title": title,
+ "user" : user,
+ "tk" : extr('tk = "', '"'),
+ "shd" : extr('shd = "', '"'),
+ "aid" : extr('data-aid="', '"', ),
+ "uid" : extr('data-uid="', '"', ),
+ }
+
+ def images(self, data):
+ url = self.root + "/xhr/photo/get/list"
+ data = {
+ "tm" : time.time(),
+ "tk" : data["tk"],
+ "shd" : data["shd"],
+ "aid" : data["aid"],
+ "uid" : data["uid"],
+ "col" : "0",
+ "sys" : self.album,
+ "vw" : "1280",
+ "vh" : "1024",
+ "skey" : "",
+ "viewer": "false",
+ "pub" : "1",
+ "sng" : "0",
+ "whq" : "1",
+ }
+ return self.request(url, method="POST", data=data).json()["list"]
+
+
+class SlickpicUserExtractor(SlickpicExtractor):
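+    """Extractor for albums of a slickpic user"""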
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"(?:/gallery)?/?(?:$|[?#])"
+ test = (
+ ("https://mattcrandall.slickpic.com/gallery/", {
+ "count": ">= 358",
+ "pattern": SlickpicAlbumExtractor.pattern,
+ }),
+ ("https://mattcrandall.slickpic.com/"),
+ )
+
+ def items(self):
+ page = self.request(self.root + "/gallery?viewer").text
+ data = {"_extractor": SlickpicAlbumExtractor}
+ base = self.root + "/albums/"
+
+ yield Message.Version, 1
+ for album in text.extract_iter(page, 'href="' + base, '"'):
+ yield Message.Queue, base + album, data
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
new file mode 100644
index 0000000..30420a8
--- /dev/null
+++ b/gallery_dl/extractor/slideshare.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann, Leonardo Taccari
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.slideshare.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class SlidesharePresentationExtractor(Extractor):
+ """Extractor for images from a presentation on slideshare.net"""
+ category = "slideshare"
+ subcategory = "presentation"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{presentation}-{num:>02}.{extension}"
+ archive_fmt = "{presentation}_{num}"
+ pattern = (r"(?:https?://)?(?:www\.)?slideshare\.net"
+ r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)")
+ test = (
+ (("https://www.slideshare.net"
+ "/Slideshare/get-started-with-slide-share"), {
+ "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
+ "content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc",
+ }),
+ # long title
+ (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
+ "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), {
+ "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7",
+ }),
+ # mobile URL
+ (("https://www.slideshare.net"
+ "/mobile/uqudent/introduction-to-fixed-prosthodontics"), {
+ "url": "59993ad7b0cb93c73011547eedcd02c622649e9d",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user, self.presentation = match.groups()
+
+ def items(self):
+ page = self.request("https://www.slideshare.net/" + self.user +
+ "/" + self.presentation).text
+ data = self.get_job_metadata(page)
+ imgs = self.get_image_urls(page)
+ data["count"] = len(imgs)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ descr, pos = text.extract(
+ page, '<meta name="description" content="', '"')
+ title, pos = text.extract(
+ page, '<span class="j-title-breadcrumb">', '</span>', pos)
+ views, pos = text.extract(
+ page, '<span class="notranslate pippin-data">', 'views<', pos)
+ published, pos = text.extract(
+ page, '<time datetime="', '"', pos)
+ alt_descr, pos = text.extract(
+ page, 'id="slideshow-description-paragraph" class="notranslate">',
+ '</p>', pos)
+
+ if descr.endswith("…") and alt_descr:
+ descr = text.remove_html(alt_descr).strip()
+
+ return {
+ "user": self.user,
+ "presentation": self.presentation,
+ "title": text.unescape(title.strip()),
+ "description": text.unescape(descr),
+ "views": text.parse_int(views.replace(",", "")),
+ "published": published,
+ }
+
+ @staticmethod
+ def get_image_urls(page):
+ """Extract and return a list of all image-urls"""
+ return list(text.extract_iter(page, 'data-full="', '"'))
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
new file mode 100644
index 0000000..80348ae
--- /dev/null
+++ b/gallery_dl/extractor/smugmug.py
@@ -0,0 +1,316 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.smugmug.com/"""
+
+from .common import Extractor, Message
+from .. import text, oauth, exception
+
+BASE_PATTERN = (
+ r"(?:smugmug:(?!album:)(?:https?://)?([^/]+)|"
+ r"(?:https?://)?([^.]+)\.smugmug\.com)")
+
+
+class SmugmugExtractor(Extractor):
+ """Base class for smugmug extractors"""
+ category = "smugmug"
+ filename_fmt = ("{category}_{User[NickName]:?/_/}"
+ "{Image[UploadKey]}_{Image[ImageKey]}.{extension}")
+ empty_user = {
+ "Uri": "",
+ "ResponseLevel": "Public",
+ "Name": "",
+ "NickName": "",
+ "QuickShare": False,
+ "RefTag": "",
+ "ViewPassHint": "",
+ "WebUri": "",
+ "Uris": None,
+ }
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = SmugmugAPI(self)
+ self.videos = self.config("videos", True)
+ self.session = self.api.session
+
+ def _select_format(self, image):
+ details = image["Uris"]["ImageSizeDetails"]
+ media = None
+
+ if self.videos and image["IsVideo"]:
+ fltr = "VideoSize"
+ elif "ImageSizeOriginal" in details:
+ media = details["ImageSizeOriginal"]
+ else:
+ fltr = "ImageSize"
+
+ if not media:
+ sizes = filter(lambda s: s[0].startswith(fltr), details.items())
+ media = max(sizes, key=lambda s: s[1]["Width"])[1]
+ del image["Uris"]
+
+ for key in ("Url", "Width", "Height", "MD5", "Size", "Watermarked",
+ "Bitrate", "Duration"):
+ if key in media:
+ image[key] = media[key]
+ return image["Url"]
+
+
+class SmugmugAlbumExtractor(SmugmugExtractor):
+ """Extractor for smugmug albums"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{User[NickName]}", "{Album[Name]}")
+ archive_fmt = "a_{Album[AlbumKey]}_{Image[ImageKey]}"
+ pattern = r"smugmug:album:([^:]+)$"
+ test = (
+ ("smugmug:album:ddvxpg", {
+ "url": "0429e9bf50ee600674e448934e3882ca1761ae7b",
+ }),
+ # empty
+ ("smugmug:album:SXvjbW", {
+ "count": 0,
+ }),
+ # no "User"
+ ("smugmug:album:6VRT8G", {
+ "url": "c4a0f4c4bfd514b93cbdeb02b3345bf7ef6604df",
+ }),
+ )
+
+ def __init__(self, match):
+ SmugmugExtractor.__init__(self, match)
+ self.album_id = match.group(1)
+
+ def items(self):
+ album = self.api.album(self.album_id, "User")
+ user = album["Uris"].get("User") or self.empty_user.copy()
+
+ del user["Uris"]
+ del album["Uris"]
+ data = {"Album": album, "User": user}
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for image in self.api.album_images(self.album_id, "ImageSizeDetails"):
+ url = self._select_format(image)
+ data["Image"] = image
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class SmugmugImageExtractor(SmugmugExtractor):
+ """Extractor for individual smugmug images"""
+ subcategory = "image"
+ archive_fmt = "{Image[ImageKey]}"
+ pattern = BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#-]+)"
+ test = (
+ ("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", {
+ "url": "78f0bf3516b6d670b7319216bdeccb35942ca4cf",
+ "keyword": "b298ef7ed2b1918263b6a7dc6f56e54401584381",
+ "content": "64a8f69a1d824921eebbdf2420087937adfa45cd",
+ }),
+ # video
+ ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
+ "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee",
+ "keyword": "c708c4b9527a2fb29396c19f7628f9cf4b0b3a39",
+ }),
+ )
+
+ def __init__(self, match):
+ SmugmugExtractor.__init__(self, match)
+ self.image_id = match.group(3)
+
+ def items(self):
+ image = self.api.image(self.image_id, "ImageSizeDetails")
+ url = self._select_format(image)
+
+ data = {"Image": image}
+ text.nameext_from_url(url, data)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, url, data
+
+
+class SmugmugPathExtractor(SmugmugExtractor):
+ """Extractor for smugmug albums from URL paths and users"""
+ subcategory = "path"
+ pattern = BASE_PATTERN + r"((?:/[^/?&#a-fh-mo-z][^/?&#]*)*)/?$"
+ test = (
+ ("https://acapella.smugmug.com/Micro-Macro/Drops/", {
+ "pattern": "smugmug:album:ddvxpg$",
+ }),
+ ("https://acapella.smugmug.com/", {
+ "pattern": SmugmugAlbumExtractor.pattern,
+ "url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68",
+ }),
+ # gallery node without owner
+ ("https://www.smugmug.com/gallery/n-GLCjnD/", {
+ "pattern": "smugmug:album:6VRT8G$",
+ }),
+ # custom domain
+ ("smugmug:www.creativedogportraits.com/PortfolioGallery/", {
+ "pattern": "smugmug:album:txWXzs$",
+ }),
+ ("smugmug:www.creativedogportraits.com/", {
+ "pattern": "smugmug:album:txWXzs$",
+ }),
+ ("smugmug:https://www.creativedogportraits.com/"),
+ )
+
+ def __init__(self, match):
+ SmugmugExtractor.__init__(self, match)
+ self.domain, self.user, self.path = match.groups()
+
+ def items(self):
+ yield Message.Version, 1
+
+ if not self.user:
+ self.user = self.api.site_user(self.domain)["NickName"]
+
+ if self.path:
+ if self.path.startswith("/gallery/n-"):
+ node = self.api.node(self.path[11:])
+ else:
+ data = self.api.user_urlpathlookup(self.user, self.path)
+ node = data["Uris"]["Node"]
+
+ if node["Type"] == "Album":
+ nodes = (node,)
+ elif node["Type"] == "Folder":
+ nodes = self.album_nodes(node)
+ else:
+ nodes = ()
+
+ for node in nodes:
+ album_id = node["Uris"]["Album"].rpartition("/")[2]
+ node["_extractor"] = SmugmugAlbumExtractor
+ yield Message.Queue, "smugmug:album:" + album_id, node
+
+ else:
+ for album in self.api.user_albums(self.user):
+ uri = "smugmug:album:" + album["AlbumKey"]
+ album["_extractor"] = SmugmugAlbumExtractor
+ yield Message.Queue, uri, album
+
+ def album_nodes(self, root):
+ """Yield all descendant album nodes of 'root'"""
+ for node in self.api.node_children(root["NodeID"]):
+ if node["Type"] == "Album":
+ yield node
+ elif node["Type"] == "Folder":
+ yield from self.album_nodes(node)
+
+
+class SmugmugAPI(oauth.OAuth1API):
+ """Minimal interface for the smugmug API v2"""
+ API_DOMAIN = "api.smugmug.com"
+ API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK"
+ API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S"
+ "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq")
+ HEADERS = {"Accept": "application/json"}
+
+ def album(self, album_id, expands=None):
+ return self._expansion("album/" + album_id, expands)
+
+ def image(self, image_id, expands=None):
+ return self._expansion("image/" + image_id, expands)
+
+ def node(self, node_id, expands=None):
+ return self._expansion("node/" + node_id, expands)
+
+ def user(self, username, expands=None):
+ return self._expansion("user/" + username, expands)
+
+ def album_images(self, album_id, expands=None):
+ return self._pagination("album/" + album_id + "!images", expands)
+
+ def node_children(self, node_id, expands=None):
+ return self._pagination("node/" + node_id + "!children", expands)
+
+ def user_albums(self, username, expands=None):
+ return self._pagination("user/" + username + "!albums", expands)
+
+ def site_user(self, domain):
+ return self._call("!siteuser", domain=domain)["Response"]["User"]
+
+ def user_urlpathlookup(self, username, path):
+ endpoint = "user/" + username + "!urlpathlookup"
+ params = {"urlpath": path}
+ return self._expansion(endpoint, "Node", params)
+
+ def _call(self, endpoint, params=None, domain=API_DOMAIN):
+ url = "https://{}/api/v2/{}".format(domain, endpoint)
+ params = params or {}
+ if self.api_key:
+ params["APIKey"] = self.api_key
+ params["_verbosity"] = "1"
+
+ response = self.request(url, params=params, headers=self.HEADERS)
+ data = response.json()
+
+ if 200 <= data["Code"] < 400:
+ return data
+ if data["Code"] == 404:
+ raise exception.NotFoundError()
+ if data["Code"] == 429:
+ self.log.error("Rate limit reached")
+ else:
+ self.log.error("API request failed")
+ self.log.debug(data)
+ raise exception.StopExtraction()
+
+ def _expansion(self, endpoint, expands, params=None):
+ endpoint = self._extend(endpoint, expands)
+ result = self._apply_expansions(self._call(endpoint, params), expands)
+ if not result:
+ raise exception.NotFoundError()
+ return result[0]
+
+ def _pagination(self, endpoint, expands=None):
+ endpoint = self._extend(endpoint, expands)
+ params = {"start": 1, "count": 100}
+
+ while True:
+ data = self._call(endpoint, params)
+ yield from self._apply_expansions(data, expands)
+
+ if "NextPage" not in data["Response"]["Pages"]:
+ return
+ params["start"] += params["count"]
+
+ @staticmethod
+ def _extend(endpoint, expands):
+ if expands:
+ endpoint += "?_expand=" + expands
+ return endpoint
+
+ @staticmethod
+ def _apply_expansions(data, expands):
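+        # resolve each object's "Uris" references against the expanded
+        # objects returned in the response's "Expansions" section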
+
+ def unwrap(response):
+ locator = response["Locator"]
+ return response[locator] if locator in response else []
+
+ objs = unwrap(data["Response"])
+ if not isinstance(objs, list):
+ objs = (objs,)
+
+ if "Expansions" in data:
+ expansions = data["Expansions"]
+ expands = expands.split(",")
+
+ for obj in objs:
+ uris = obj["Uris"]
+
+ for name in expands:
+ if name in uris:
+ uri = uris[name]
+ uris[name] = unwrap(expansions[uri])
+
+ return objs
diff --git a/gallery_dl/extractor/test.py b/gallery_dl/extractor/test.py
new file mode 100644
index 0000000..2f4992c
--- /dev/null
+++ b/gallery_dl/extractor/test.py
@@ -0,0 +1,86 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2017 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Utility extractor to execute tests of other extractors"""
+
+from .common import Extractor, Message
+from .. import extractor, exception
+
+
+class TestExtractor(Extractor):
+ """Extractor to select and run the test URLs of other extractors
+
+ The general form is 'test:<categories>:<subcategories>:<indices>', where
+ <categories> and <subcategories> are comma-separated (sub)category names
+    and <indices> is a comma-separated list of array indices.
+ To select all possible values for a field use the star '*' character or
+ leave the field empty.
+
+ Examples:
+ - test:pixiv
+ run all pixiv tests
+
+ - test:pixiv:user,favorite:0
+ run the first test of the PixivUser- and PixivFavoriteExtractor
+
+ - test:
+ run all tests
+ """
+ category = "test"
+ pattern = r"t(?:est)?:([^:]*)(?::([^:]*)(?::(\*|[\d,]*))?)?$"
+ test = (
+ ("test:pixiv"),
+ ("test:pixiv:user,favorite:0"),
+ ("test:"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ categories, subcategories, indices = match.groups()
+ self.categories = self._split(categories)
+ self.subcategories = self._split(subcategories)
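+        # with no indices given, fall back to the extractor itself: its
+        # __contains__ always returns True, so every test index is selected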
+ self.indices = self._split(indices) or self
+
+ def items(self):
+ extractors = extractor.extractors()
+
+ if self.categories:
+ extractors = [
+ extr for extr in extractors
+ if extr.category in self.categories
+ ]
+
+ if self.subcategories:
+ extractors = [
+ extr for extr in extractors
+ if extr.subcategory in self.subcategories
+ ]
+
+ tests = [
+ test
+ for extr in extractors
+ for index, test in enumerate(extr._get_tests())
+ if str(index) in self.indices
+ ]
+
+ if not tests:
+ raise exception.NotFoundError("test")
+
+ yield Message.Version, 1
+ for test in tests:
+ yield Message.Queue, test[0], {}
+
+ @staticmethod
+ def __contains__(_):
+ return True
+
+ @staticmethod
+ def _split(value):
+ if value and value != "*":
+ return value.split(",")
+ return None
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
new file mode 100644
index 0000000..62a9173
--- /dev/null
+++ b/gallery_dl/extractor/tsumino.py
@@ -0,0 +1,343 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tsumino.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class TsuminoBase():
+ """Base class for tsumino extractors"""
+ category = "tsumino"
+ cookiedomain = "www.tsumino.com"
+ root = "https://www.tsumino.com"
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ else:
+ self.session.cookies.setdefault(
+ "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")
+
+ @cache(maxage=14*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "{}/Account/Login".format(self.root)
+ headers = {"Referer": url}
+ data = {"Username": username, "Password": password}
+
+ response = self.request(url, method="POST", headers=headers, data=data)
+ if not response.history:
+ raise exception.AuthenticationError()
+ return {".aotsumino": response.history[0].cookies[".aotsumino"]}
+
+
+class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
+ """Extractor for image galleries on tsumino.com"""
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+ r"/(?:Book/Info|Read/View)/(\d+)")
+ test = (
+ ("https://www.tsumino.com/Book/Info/40996", {
+ "url": "84bf30a86623039fc87855680fada884dc8a1ddd",
+ "keyword": {
+ "title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
+ "gallery_id": 40996,
+ "date" : "2018 June 29",
+ "count" : 42,
+ "collection": "",
+ "artist" : ["Itou Life"],
+ "group" : ["Itou Life"],
+ "parody" : ["Fate/Grand Order"],
+ "characters": list,
+ "tags" : list,
+ "type" : "Doujinshi",
+ "rating" : float,
+ "uploader" : "sehki",
+ "lang" : "en",
+ "language" : "English",
+ "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
+ },
+ }),
+ ("https://www.tsumino.com/Read/View/45834"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ title = extr('"og:title" content="', '"')
+ title_en, _, title_jp = text.unescape(title).partition("/")
+ title_en = title_en.strip()
+ title_jp = title_jp.strip()
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : title_en or title_jp,
+ "title_en" : title_en,
+ "title_jp" : title_jp,
+ "thumbnail" : extr('"og:image" content="', '"'),
+ "uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
+ "date" : extr('id="Uploaded">', '</div>').strip(),
+ "rating" : text.parse_float(extr(
+ 'id="Rating">', '</div>').partition(" ")[0]),
+ "type" : text.remove_html(extr('id="Category">' , '</div>')),
+ "collection": text.remove_html(extr('id="Collection">', '</div>')),
+ "group" : text.split_html(extr('id="Group">' , '</div>')),
+ "artist" : text.split_html(extr('id="Artist">' , '</div>')),
+ "parody" : text.split_html(extr('id="Parody">' , '</div>')),
+ "characters": text.split_html(extr('id="Character">' , '</div>')),
+ "tags" : text.split_html(extr('id="Tag">' , '</div>')),
+ "language" : "English",
+ "lang" : "en",
+ }
+
+ def images(self, page):
+ url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
+ headers = {"Referer": self.chapter_url}
+ response = self.request(url, headers=headers, expect=(404,))
+
+ if response.status_code == 404:
+ url = "{}/Read/View/{}".format(self.root, self.gallery_id)
+ self.log.error(
+ "Failed to get gallery JSON data. Visit '%s' in a browser "
+ "and solve the CAPTCHA to continue.", url)
+ raise exception.StopExtraction()
+
+ base = self.root + "/Image/Object?name="
+ return [
+ (base + text.quote(name), None)
+ for name in response.json()["reader_page_urls"]
+ ]
+
+
+class TsuminoSearchExtractor(TsuminoBase, Extractor):
+ """Extractor for search results on tsumino.com"""
+ subcategory = "search"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+ r"/(?:Books/?)?#(.+)")
+ test = (
+ ("https://www.tsumino.com/Books#?Character=Reimu+Hakurei", {
+ "pattern": TsuminoGalleryExtractor.pattern,
+ "range": "1-40",
+ "count": 40,
+ }),
+ (("http://www.tsumino.com/Books#~(Tags~(~"
+ "(Type~7~Text~'Reimu*20Hakurei~Exclude~false)~"
+ "(Type~'1~Text~'Pantyhose~Exclude~false)))#"), {
+ "pattern": TsuminoGalleryExtractor.pattern,
+ "count": ">= 3",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for gallery in self.galleries():
+ url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
+ gallery["_extractor"] = TsuminoGalleryExtractor
+ yield Message.Queue, url, gallery
+
+ def galleries(self):
+ """Return all gallery results matching 'self.query'"""
+ url = "{}/Books/Operate".format(self.root)
+ headers = {
+ "Referer": "{}/".format(self.root),
+ "X-Requested-With": "XMLHttpRequest",
+ }
+ data = {
+ "PageNumber": 1,
+ "Text": "",
+ "Sort": "Newest",
+ "List": "0",
+ "Length": "0",
+ "MinimumRating": "0",
+ "ExcludeList": "0",
+ "CompletelyExcludeHated": "false",
+ }
+ data.update(self._parse(self.query))
+
+ while True:
+ info = self.request(
+ url, method="POST", headers=headers, data=data).json()
+
+ for gallery in info["Data"]:
+ yield gallery["Entry"]
+
+ if info["PageNumber"] >= info["PageCount"]:
+ return
+ data["PageNumber"] += 1
+
+ def _parse(self, query):
+ try:
+ if query.startswith("?"):
+ return self._parse_simple(query)
+ return self._parse_jsurl(query)
+ except Exception as exc:
+ self.log.error("Invalid search query: '%s' (%s)", query, exc)
+ raise exception.StopExtraction()
+
+ @staticmethod
+ def _parse_simple(query):
+ """Parse search query with format '?<key>=value>'"""
+ key, _, value = query.partition("=")
+ tag_types = {
+ "Tag": "1",
+ "Category": "2",
+ "Collection": "3",
+ "Group": "4",
+ "Artist": "5",
+ "Parody": "6",
+ "Character": "7",
+ "Uploader": "100",
+ }
+
+ return {
+ "Tags[0][Type]": tag_types[key[1:].capitalize()],
+ "Tags[0][Text]": text.unquote(value).replace("+", " "),
+ "Tags[0][Exclude]": "false",
+ }
+
+ @staticmethod
+ def _parse_jsurl(data):
+ """Parse search query in JSURL format
+
+ Nested lists and dicts are handled in a special way to deal
+ with the way Tsumino expects its parameters -> expand(...)
+
+ Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill))
+ Ref: https://github.com/Sage/jsurl
+ """
+ if not data:
+ return {}
+ i = 0
+ imax = len(data)
+
+ def eat(expected):
+ nonlocal i
+
+ if data[i] != expected:
+ error = "bad JSURL syntax: expected '{}', got {}".format(
+ expected, data[i])
+ raise ValueError(error)
+ i += 1
+
+ def decode():
+ nonlocal i
+
+ beg = i
+ result = ""
+
+ while i < imax:
+ ch = data[i]
+
+ if ch not in "~)*!":
+ i += 1
+
+ elif ch == "*":
+ if beg < i:
+ result += data[beg:i]
+ if data[i + 1] == "*":
+ result += chr(int(data[i+2:i+6], 16))
+ i += 6
+ else:
+ result += chr(int(data[i+1:i+3], 16))
+ i += 3
+ beg = i
+
+ elif ch == "!":
+ if beg < i:
+ result += data[beg:i]
+ result += "$"
+ i += 1
+ beg = i
+
+ else:
+ break
+
+ return result + data[beg:i]
+
+ def parse_one():
+ nonlocal i
+
+ eat('~')
+ result = ""
+ ch = data[i]
+
+ if ch == "(":
+ i += 1
+
+ if data[i] == "~":
+ result = []
+ if data[i+1] == ")":
+ i += 1
+ else:
+ result.append(parse_one())
+ while data[i] == "~":
+ result.append(parse_one())
+
+ else:
+ result = {}
+
+ if data[i] != ")":
+ while True:
+ key = decode()
+ value = parse_one()
+ for ekey, evalue in expand(key, value):
+ result[ekey] = evalue
+ if data[i] != "~":
+ break
+ i += 1
+ eat(")")
+
+ elif ch == "'":
+ i += 1
+ result = decode()
+
+ else:
+ beg = i
+ i += 1
+
+ while i < imax and data[i] not in "~)":
+ i += 1
+
+ sub = data[beg:i]
+ if ch in "0123456789-":
+ fval = float(sub)
+ ival = int(fval)
+ result = ival if ival == fval else fval
+ else:
+ if sub not in ("true", "false", "null"):
+ raise ValueError("bad value keyword: " + sub)
+ result = sub
+
+ return result
+
+ def expand(key, value):
+ if isinstance(value, list):
+ for index, cvalue in enumerate(value):
+ ckey = "{}[{}]".format(key, index)
+ yield from expand(ckey, cvalue)
+ elif isinstance(value, dict):
+ for ckey, cvalue in value.items():
+ ckey = "{}[{}]".format(key, ckey)
+ yield from expand(ckey, cvalue)
+ else:
+ yield key, value
+
+ return parse_one()
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
new file mode 100644
index 0000000..5679cdc
--- /dev/null
+++ b/gallery_dl/extractor/tumblr.py
@@ -0,0 +1,425 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.tumblr.com/"""
+
+from .common import Extractor, Message
+from .. import text, oauth, extractor, exception
+from datetime import datetime, timedelta
+import re
+import time
+
+
+def _original_inline_image(url):
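+    # replace the size suffix with "_1280" to request the largest
+    # variant tumblr offers for inline images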
+ return re.sub(
+ (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+ r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
+ r"https://\1_1280.\2", url
+ )
+
+
+def _original_video(url):
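+    # drop the size suffix from video URLs to get the original file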
+ return re.sub(
+ (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+ r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
+ r"https://\1.\2", url
+ )
+
+
+POST_TYPES = frozenset((
+ "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
+BASE_PATTERN = (
+ r"(?:tumblr:(?:https?://)?([^/]+)|"
+ r"(?:https?://)?([^.]+\.tumblr\.com))")
+
+
+class TumblrExtractor(Extractor):
+ """Base class for tumblr extractors"""
+ category = "tumblr"
+ directory_fmt = ("{category}", "{name}")
+ filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.blog = match.group(1) or match.group(2)
+ self.api = TumblrAPI(self)
+
+ self.types = self._setup_posttypes()
+ self.avatar = self.config("avatar", False)
+ self.inline = self.config("inline", True)
+ self.reblogs = self.config("reblogs", True)
+ self.external = self.config("external", False)
+
+ if len(self.types) == 1:
+ self.api.posts_type = next(iter(self.types))
+ elif not self.types:
+ self.log.warning("no valid post types selected")
+
+ if self.reblogs == "same-blog":
+ self._skip_reblog = self._skip_reblog_same_blog
+
+ def items(self):
+ blog = None
+ yield Message.Version, 1
+
+ for post in self.posts():
+ if post["type"] not in self.types:
+ continue
+ if not blog:
+ blog = self.api.info(self.blog)
+ blog["uuid"] = self.blog
+ yield Message.Directory, blog.copy()
+
+ if self.avatar:
+ url = self.api.avatar(self.blog)
+ yield self._prepare_avatar(url, post.copy(), blog)
+
+ reblog = "reblogged_from_id" in post
+ if reblog and self._skip_reblog(post):
+ continue
+ post["reblogged"] = reblog
+
+ post["blog"] = blog
+ post["date"] = text.parse_timestamp(post["timestamp"])
+ post["num"] = 0
+
+ if "trail" in post:
+ del post["trail"]
+
+ if "photos" in post: # type "photo" or "link"
+ photos = post["photos"]
+ del post["photos"]
+
+ for photo in photos:
+ post["photo"] = photo
+ photo.update(photo["original_size"])
+ del photo["original_size"]
+ del photo["alt_sizes"]
+ yield self._prepare_image(photo["url"], post)
+
+ url = post.get("audio_url") # type: "audio"
+ if url:
+ yield self._prepare(url, post)
+
+ url = post.get("video_url") # type: "video"
+ if url:
+ yield self._prepare(_original_video(url), post)
+
+ if self.inline and "reblog" in post: # inline media
+ # only "chat" posts are missing a "reblog" key in their
+ # API response, but they can't contain images/videos anyway
+ body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
+ for url in re.findall('<img src="([^"]+)"', body):
+ url = _original_inline_image(url)
+ yield self._prepare_image(url, post)
+ for url in re.findall('<source src="([^"]+)"', body):
+ url = _original_video(url)
+ yield self._prepare(url, post)
+
+ if self.external: # external links
+ post["extension"] = None
+ with extractor.blacklist(("tumblr",)):
+ for key in ("permalink_url", "url"):
+ url = post.get(key)
+ if url:
+ yield Message.Queue, url, post
+ break
+
+ def posts(self):
+ """Return an iterable containing all relevant posts"""
+
+ def _setup_posttypes(self):
+ types = self.config("posts", "all")
+
+ if types == "all":
+ return POST_TYPES
+
+ elif not types:
+ return frozenset()
+
+ else:
+ if isinstance(types, str):
+ types = types.split(",")
+ types = frozenset(types)
+
+ invalid = types - POST_TYPES
+ if invalid:
+ types = types & POST_TYPES
+ self.log.warning('invalid post types: "%s"',
+ '", "'.join(sorted(invalid)))
+ return types
+
+ @staticmethod
+ def _prepare(url, post):
+ text.nameext_from_url(url, post)
+ post["num"] += 1
+ post["hash"] = post["filename"].partition("_")[2]
+ return Message.Url, url, post
+
+ @staticmethod
+ def _prepare_image(url, post):
+ text.nameext_from_url(url, post)
+ post["num"] += 1
+
+ parts = post["filename"].split("_")
+ try:
+ post["hash"] = parts[1] if parts[1] != "inline" else parts[2]
+ except IndexError:
+ # filename doesn't follow the usual pattern (#129)
+ post["hash"] = post["filename"]
+
+ return Message.Url, url, post
+
+ @staticmethod
+ def _prepare_avatar(url, post, blog):
+ text.nameext_from_url(url, post)
+ post["num"] = 1
+ post["blog"] = blog
+ post["reblogged"] = False
+ post["type"] = post["id"] = post["hash"] = "avatar"
+ return Message.Url, url, post
+
+ def _skip_reblog(self, _):
+ return not self.reblogs
+
+ def _skip_reblog_same_blog(self, post):
+ return self.blog != post["reblogged_root_uuid"]
+
+
+class TumblrUserExtractor(TumblrExtractor):
+ """Extractor for all images from a tumblr-user"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$"
+ test = (
+ ("http://demo.tumblr.com/", {
+ "pattern": r"https://\d+\.media\.tumblr\.com"
+ r"/tumblr_[^/_]+_\d+\.jpg",
+ "count": 1,
+ "options": (("posts", "photo"),),
+ }),
+ ("http://demo.tumblr.com/", {
+ "pattern": (r"https?://(?:$|"
+ r"\d+\.media\.tumblr\.com/.+_1280\.jpg|"
+ r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"),
+ "count": 3,
+ "options": (("posts", "all"), ("external", True))
+ }),
+ ("https://mikf123-hidden.tumblr.com/", { # dashbord-only
+ "count": 2,
+ "keyword": {"tags": ["test", "hidden"]},
+ }),
+ ("https://mikf123-private.tumblr.com/", { # password protected
+ "count": 2,
+ "keyword": {"tags": ["test", "private"]},
+ }),
+ ("https://mikf123-private-hidden.tumblr.com/", { # both
+ "count": 2,
+ "keyword": {"tags": ["test", "private", "hidden"]},
+ }),
+ ("https://demo.tumblr.com/page/2"),
+ ("https://demo.tumblr.com/archive"),
+ ("tumblr:http://www.b-authentique.com/"),
+ ("tumblr:www.b-authentique.com"),
+ )
+
+ def posts(self):
+ return self.api.posts(self.blog, {})
+
+
+class TumblrPostExtractor(TumblrExtractor):
+ """Extractor for images from a single post on tumblr"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(?:post|image)/(\d+)"
+ test = (
+ ("http://demo.tumblr.com/post/459265350", {
+ "pattern": (r"https://\d+\.media\.tumblr\.com"
+ r"/tumblr_[^/_]+_1280.jpg"),
+ "count": 1,
+ }),
+ ("https://mikf123.tumblr.com/post/167770226574/text-post", {
+ "count": 2,
+ }),
+ ("https://mikf123.tumblr.com/post/181022561719/quote-post", {
+ "count": 1,
+ }),
+ ("https://mikf123.tumblr.com/post/167623351559/link-post", {
+ "count": 2,
+ }),
+ ("https://muyanna.tumblr.com/post/180692431632/answer-post", {
+ "count": 1,
+ }),
+ ("https://mikf123.tumblr.com/post/167633596145/video-post", {
+ "count": 2,
+ }),
+ ("https://mikf123.tumblr.com/post/167770026604/audio-post", {
+ "count": 2,
+ }),
+ ("https://mikf123.tumblr.com/post/172687798174/photo-post", {
+ "count": 4,
+ }),
+ ("https://mikf123.tumblr.com/post/181022380064/chat-post", {
+ "count": 0,
+ }),
+ ("http://pinetre-3.tumblr.com/post/181904381470/via", {
+ "count": 0, # audio post with "null" as URL (#165)
+ }),
+ ("http://ziemniax.tumblr.com/post/109697912859/", {
+ "exception": exception.NotFoundError, # HTML response (#297)
+ }),
+ ("http://demo.tumblr.com/image/459265350"),
+ )
+
+ def __init__(self, match):
+ TumblrExtractor.__init__(self, match)
+ self.post_id = match.group(3)
+ self.reblogs = True
+
+ def posts(self):
+ return self.api.posts(self.blog, {"id": self.post_id})
+
+ @staticmethod
+ def _setup_posttypes():
+ return POST_TYPES
+
+
+class TumblrTagExtractor(TumblrExtractor):
+ """Extractor for images from a tumblr-user by tag"""
+ subcategory = "tag"
+ pattern = BASE_PATTERN + r"/tagged/([^/?&#]+)"
+ test = ("http://demo.tumblr.com/tagged/Times%20Square", {
+ "pattern": (r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg"),
+ "count": 1,
+ })
+
+ def __init__(self, match):
+ TumblrExtractor.__init__(self, match)
+ self.tag = text.unquote(match.group(3))
+
+ def posts(self):
+ return self.api.posts(self.blog, {"tag": self.tag})
+
+
+class TumblrLikesExtractor(TumblrExtractor):
+ """Extractor for images from a tumblr-user's liked posts"""
+ subcategory = "likes"
+ directory_fmt = ("{category}", "{name}", "likes")
+ archive_fmt = "f_{blog[name]}_{id}_{num}"
+ pattern = BASE_PATTERN + r"/likes"
+ test = ("http://mikf123.tumblr.com/likes", {
+ "count": 1,
+ })
+
+ def posts(self):
+ return self.api.likes(self.blog)
+
+
+class TumblrAPI(oauth.OAuth1API):
+ """Minimal interface for the Tumblr API v2"""
+ API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B"
+ API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03"
+ BLOG_CACHE = {}
+
+ def __init__(self, extractor):
+ oauth.OAuth1API.__init__(self, extractor)
+ self.posts_type = None
+
+ def info(self, blog):
+ """Return general information about a blog"""
+ if blog not in self.BLOG_CACHE:
+ self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"]
+ return self.BLOG_CACHE[blog]
+
+ def avatar(self, blog, size="512"):
+ """Retrieve a blog avatar"""
+ if self.api_key:
+ url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}"
+ return url_fmt.format(blog, size, self.api_key)
+ params = {"size": size}
+ data = self._call(blog, "avatar", params, allow_redirects=False)
+ return data["avatar_url"]
+
+ def posts(self, blog, params):
+ """Retrieve published posts"""
+ params.update({"offset": 0, "limit": 50, "reblog_info": "true"})
+ if self.posts_type:
+ params["type"] = self.posts_type
+ while True:
+ data = self._call(blog, "posts", params)
+ self.BLOG_CACHE[blog] = data["blog"]
+ yield from data["posts"]
+ params["offset"] += params["limit"]
+ if params["offset"] >= data["total_posts"]:
+ return
+
+ def likes(self, blog):
+ """Retrieve liked posts"""
+ params = {"limit": 50}
+ while True:
+ posts = self._call(blog, "likes", params)["liked_posts"]
+ if not posts:
+ return
+ yield from posts
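+ # page backwards: use the timestamp of the oldest returned
+ # like as the cursor for the next request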
+ params["before"] = posts[-1]["liked_timestamp"]
+
+ def _call(self, blog, endpoint, params, **kwargs):
+ if self.api_key:
+ params["api_key"] = self.api_key
+ url = "https://api.tumblr.com/v2/blog/{}/{}".format(
+ blog, endpoint)
+
+ response = self.request(url, params=params, **kwargs)
+
+ try:
+ data = response.json()
+ except ValueError:
+ data = response.text
+ status = response.status_code
+ else:
+ status = data["meta"]["status"]
+ if 200 <= status < 400:
+ return data["response"]
+
+ if status == 403:
+ raise exception.AuthorizationError()
+ elif status == 404:
+ raise exception.NotFoundError("user or post")
+ elif status == 429:
+
+ # daily rate limit
+ if response.headers.get("x-ratelimit-perday-remaining") == "0":
+ reset = response.headers.get("x-ratelimit-perday-reset")
+ self.log.error(
+ "Daily API rate limit exceeded: aborting; "
+ "rate limit will reset at %s",
+ self._to_time(reset),
+ )
+ raise exception.StopExtraction()
+
+ # hourly rate limit
+ reset = response.headers.get("x-ratelimit-perhour-reset")
+ if reset:
+ self.log.info(
+ "Hourly API rate limit exceeded; "
+ "waiting until %s for rate limit reset",
+ self._to_time(reset),
+ )
+ time.sleep(int(reset) + 1)
+ return self._call(blog, endpoint, params)
+
+ self.log.error(data)
+ raise exception.StopExtraction()
+
+ @staticmethod
+ def _to_time(reset):
+ try:
+ reset_time = datetime.now() + timedelta(seconds=int(reset))
+ except (ValueError, TypeError):
+ return "?"
+ return reset_time.strftime("%H:%M:%S")
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
new file mode 100644
index 0000000..ad4dc46
--- /dev/null
+++ b/gallery_dl/extractor/twitter.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://twitter.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class TwitterExtractor(Extractor):
+ """Base class for twitter extractors"""
+ category = "twitter"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{tweet_id}_{num}.{extension}"
+ archive_fmt = "{tweet_id}_{retweet_id}_{num}"
+ root = "https://twitter.com"
+ sizes = (":orig", ":large", ":medium", ":small")
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.retweets = self.config("retweets", True)
+ self.videos = self.config("videos", False)
+
+ def items(self):
+ self.login()
+ yield Message.Version, 1
+ yield Message.Directory, self.metadata()
+
+ for tweet in self.tweets():
+ data = self._data_from_tweet(tweet)
+ if not self.retweets and data["retweet_id"]:
+ continue
+
+ images = text.extract_iter(
+ tweet, 'data-image-url="', '"')
+ for data["num"], url in enumerate(images, 1):
+ text.nameext_from_url(url, data)
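+ # build one URL per size suffix, ":orig" first; the extra entries
+ # presumably serve as fallbacks for the downloader (Message.Urllist)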
+ urls = [url + size for size in self.sizes]
+ yield Message.Urllist, urls, data
+
+ if self.videos and "-videoContainer" in tweet:
+ data["num"] = 1
+ url = "ytdl:{}/{}/status/{}".format(
+ self.root, data["user"], data["tweet_id"])
+ yield Message.Url, url, data
+
+ def metadata(self):
+ """Return general metadata"""
+ return {"user": self.user}
+
+ def tweets(self):
+ """Yield HTML content of all relevant tweets"""
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=360*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ page = self.request(self.root + "/login").text
+ pos = page.index('name="authenticity_token"')
+ token = text.extract(page, 'value="', '"', pos-80)[0]
+
+ url = self.root + "/sessions"
+ data = {
+ "session[username_or_email]": username,
+ "session[password]" : password,
+ "authenticity_token" : token,
+ "ui_metrics" : '{"rf":{},"s":""}',
+ "scribe_log" : "",
+ "redirect_after_login" : "",
+ "remember_me" : "1",
+ }
+ response = self.request(url, method="POST", data=data)
+
+ if "/error" in response.url:
+ raise exception.AuthenticationError()
+ return self.session.cookies
+
+ @staticmethod
+ def _data_from_tweet(tweet):
+ extr = text.extract_from(tweet)
+ return {
+ "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')),
+ "retweet_id": text.parse_int(extr('data-retweet-id="', '"')),
+ "retweeter" : extr('data-retweeter="' , '"'),
+ "user" : extr('data-screen-name="', '"'),
+ "username" : extr('data-name="' , '"'),
+ "user_id" : text.parse_int(extr('data-user-id="' , '"')),
+ "date" : text.parse_timestamp(extr('data-time="', '"')),
+ }
+
+ def _tweets_from_api(self, url):
+ params = {
+ "include_available_features": "1",
+ "include_entities": "1",
+ "reset_error_state": "false",
+ "lang": "en",
+ }
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "X-Twitter-Active-User": "yes",
+ "Referer": "{}/{}".format(self.root, self.user)
+ }
+
+ while True:
+ data = self.request(url, params=params, headers=headers).json()
+ if "inner" in data:
+ data = data["inner"]
+
+ for tweet in text.extract_iter(
+ data["items_html"], '<div class="tweet ', '\n</li>'):
+ yield tweet
+
+ if not data["has_more_items"]:
+ return
+
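+ # cursor-based pagination: remember the ID of the last tweet
+ # on this page and stop once it no longer advances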
+ position = text.parse_int(text.extract(
+ tweet, 'data-tweet-id="', '"')[0])
+ if "max_position" in params and position >= params["max_position"]:
+ return
+ params["max_position"] = position
+
+
+class TwitterTimelineExtractor(TwitterExtractor):
+ """Extractor for all images from a user's timeline"""
+ subcategory = "timeline"
+ pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+ r"/([^/?&#]+)/?$")
+ test = ("https://twitter.com/supernaturepics", {
+ "range": "1-40",
+ "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
+ "keyword": "7210d679606240405e0cf62cbc67596e81a7a250",
+ })
+
+ def tweets(self):
+ url = "{}/i/profiles/show/{}/timeline/tweets".format(
+ self.root, self.user)
+ return self._tweets_from_api(url)
+
+
+class TwitterMediaExtractor(TwitterExtractor):
+ """Extractor for all images from a user's Media Tweets"""
+ subcategory = "media"
+ pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+ r"/([^/?&#]+)/media(?!\w)")
+ test = ("https://twitter.com/supernaturepics/media", {
+ "range": "1-40",
+ "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4",
+ })
+
+ def tweets(self):
+ url = "{}/i/profiles/show/{}/media_timeline".format(
+ self.root, self.user)
+ return self._tweets_from_api(url)
+
+
+class TwitterTweetExtractor(TwitterExtractor):
+ """Extractor for images from individual tweets"""
+ subcategory = "tweet"
+ pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com"
+ r"/([^/?&#]+)/status/(\d+)")
+ test = (
+ ("https://twitter.com/supernaturepics/status/604341487988576256", {
+ "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580",
+ "keyword": "1b8afb93cc04a9f44d89173f8facc61c3a6caf91",
+ "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
+ }),
+ # 4 images
+ ("https://twitter.com/perrypumas/status/894001459754180609", {
+ "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6",
+ "keyword": "43d98ab448193f0d4f30aa571a4b6bda9b6a5692",
+ }),
+ # video
+ ("https://twitter.com/perrypumas/status/1065692031626829824", {
+ "options": (("videos", True),),
+ "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+",
+ }),
+ )
+
+ def __init__(self, match):
+ TwitterExtractor.__init__(self, match)
+ self.tweet_id = match.group(2)
+
+ def metadata(self):
+ return {"user": self.user, "tweet_id": self.tweet_id}
+
+ def tweets(self):
+ url = "{}/{}/status/{}".format(self.root, self.user, self.tweet_id)
+ page = self.request(url).text
+ return (text.extract(
+ page, '<div class="tweet ', '<ul class="stats')[0],)
diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py
new file mode 100644
index 0000000..687ce3c
--- /dev/null
+++ b/gallery_dl/extractor/vanillarock.py
@@ -0,0 +1,95 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://vanilla-rock.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class VanillarockExtractor(Extractor):
+ """Base class for vanillarock extractors"""
+ category = "vanillarock"
+ root = "https://vanilla-rock.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+
+class VanillarockPostExtractor(VanillarockExtractor):
+ """Extractor for blogposts on vanilla-rock.com"""
+ subcategory = "post"
+ directory_fmt = ("{category}", "{path}")
+ filename_fmt = "{num:>02}.{extension}"
+ archive_fmt = "{filename}"
+ pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com"
+ r"(/(?!category/|tag/)[^/?&#]+)/?$")
+ test = ("https://vanilla-rock.com/mizuhashi_parsee-5", {
+ "url": "7fb9a4d18d9fa22d7295fee8d94ab5a7a52265dd",
+ "keyword": "b91df99b714e1958d9636748b1c81a07c3ef52c9",
+ })
+
+ def items(self):
+ extr = text.extract_from(self.request(self.root + self.path).text)
+ name = extr("<title>", "</title>")
+
+ imgs = []
+ while True:
+ img = extr('<div class="main-img">', '</div>')
+ if not img:
+ break
+ imgs.append(text.extract(img, 'href="', '"')[0])
+
+ data = {
+ "count": len(imgs),
+ "title": text.unescape(name.rpartition(" | ")[0]),
+ "path" : self.path.strip("/"),
+ "date" : text.parse_datetime(extr(
+ '<div class="date">', '</div>'), "%Y-%m-%d %H:%M"),
+ "tags" : text.split_html(extr(
+ '<div class="cat-tag">', '</div>'))[::2],
+ }
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for data["num"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class VanillarockTagExtractor(VanillarockExtractor):
+ """Extractor for vanillarock blog posts by tag or category"""
+ subcategory = "tag"
+ pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com"
+ r"(/(?:tag|category)/[^?&#]+)")
+ test = (
+ ("https://vanilla-rock.com/tag/%e5%b0%84%e5%91%bd%e4%b8%b8%e6%96%87", {
+ "pattern": VanillarockPostExtractor.pattern,
+ "count": ">= 12",
+ }),
+ (("https://vanilla-rock.com/category/%e4%ba%8c%e6%ac%a1%e3%82%a8%e3%83"
+ "%ad%e7%94%bb%e5%83%8f/%e8%90%8c%e3%81%88%e3%83%bb%e3%82%bd%e3%83%95"
+ "%e3%83%88%e3%82%a8%e3%83%ad"), {
+ "pattern": VanillarockPostExtractor.pattern,
+ "count": 3,
+ }),
+ )
+
+ def items(self):
+ url = self.root + self.path
+ data = {"_extractor": VanillarockPostExtractor}
+
+ yield Message.Version, 1
+ while url:
+ extr = text.extract_from(self.request(url).text)
+ while True:
+ post = extr('<h2 class="entry-title">', '</h2>')
+ if not post:
+ break
+ yield Message.Queue, text.extract(post, 'href="', '"')[0], data
+ url = text.unescape(extr('class="next page-numbers" href="', '"'))
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
new file mode 100644
index 0000000..4326582
--- /dev/null
+++ b/gallery_dl/extractor/wallhaven.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://wallhaven.cc/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class WallhavenExtractor(Extractor):
+ """Base class for wallhaven extractors"""
+ category = "wallhaven"
+ filename_fmt = "{category}_{id}_{resolution}.{extension}"
+ root = "https://wallhaven.cc"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.api = WallhavenAPI(self)
+
+
+class WallhavenSearchExtractor(WallhavenExtractor):
+ """Extractor for search results on wallhaven.cc"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "{search[q]}")
+ archive_fmt = "s_{search[q]}_{id}"
+ pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^/?#]+))?"
+ test = (
+ ("https://wallhaven.cc/search?q=touhou"),
+ (("https://wallhaven.cc/search?q=id%3A87"
+ "&categories=111&purity=100&sorting=date_added&order=asc&page=3"), {
+ "count": 4,
+ "url": "d024bc11895d758b76ffdb0fa85a627e53f072cf",
+ }),
+ )
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, {"search": self.params}
+ for wp in self.api.search(self.params.copy()):
+ wp["search"] = self.params
+ yield Message.Url, wp["url"], wp
+
+
+class WallhavenImageExtractor(WallhavenExtractor):
+ """Extractor for individual wallpaper on wallhaven.cc"""
+ subcategory = "image"
+ archive_fmt = "{id}"
+ pattern = (r"(?:https?://)?(?:wallhaven\.cc/w/|whvn\.cc/"
+ r"|w\.wallhaven\.cc/[a-z]+/\w\w/wallhaven-)(\w+)")
+ test = (
+ ("https://wallhaven.cc/w/01w334", {
+ "pattern": "https://[^.]+.wallhaven.cc/full/01/[^-]+-01w334.jpg",
+ "content": "497212679383a465da1e35bd75873240435085a2",
+ "keyword": {
+ "id" : "01w334",
+ "width" : 1920,
+ "height" : 1200,
+ "resolution" : "1920x1200",
+ "ratio" : 1.6,
+ "colors" : list,
+ "tags" : list,
+ "file_size" : 278799,
+ "file_type" : "image/jpeg",
+ "purity" : "sfw",
+ "short_url" : "https://whvn.cc/01w334",
+ "source" : str,
+ "uploader" : {
+ "group" : "Owner/Developer",
+ "username" : "AksumkA",
+ },
+ "date" : "type:datetime",
+ "wh_category": "anime",
+ "views" : int,
+ "favorites" : int,
+ },
+ }),
+ # NSFW
+ ("https://wallhaven.cc/w/dge6v3", {
+ "url": "e4b802e70483f659d790ad5d0bd316245badf2ec",
+ }),
+ ("https://whvn.cc/01w334"),
+ ("https://w.wallhaven.cc/full/01/wallhaven-01w334.jpg"),
+ )
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.wallpaper_id = match.group(1)
+
+ def items(self):
+ data = self.api.info(self.wallpaper_id)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ yield Message.Url, data["url"], data
+
+
+class WallhavenAPI():
+ """Minimal interface to wallhaven's API"""
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+
+ key = extractor.config("api-key")
+ if key is None:
+ key = "25HYZenXTICjzBZXzFSg98uJtcQVrDs2"
+ extractor.log.debug("Using default API Key")
+ else:
+ extractor.log.debug("Using custom API Key")
+ self.headers = {"X-API-Key": key}
+
+ def info(self, wallpaper_id):
+ url = "https://wallhaven.cc/api/v1/w/" + wallpaper_id
+ return self._update(self._call(url)["data"])
+
+ def search(self, params):
+ url = "https://wallhaven.cc/api/v1/search"
+ while True:
+ data = self._call(url, params)
+ yield from map(self._update, data["data"])
+ if data["meta"]["current_page"] >= data["meta"]["last_page"]:
+ return
+ params["page"] = data["meta"]["current_page"] + 1
+
+ def _call(self, url, params=None):
+ return self.extractor.request(
+ url, headers=self.headers, params=params).json()
+
+ @staticmethod
+ def _update(wp):
+ width, _, height = wp["resolution"].partition("x")
+ wp["url"] = wp.pop("path")
+ if "tags" in wp:
+ wp["tags"] = [t["name"] for t in wp["tags"]]
+ wp["date"] = text.parse_datetime(
+ wp.pop("created_at"), "%Y-%m-%d %H:%M:%S")
+ wp["ratio"] = text.parse_float(wp["ratio"])
+ wp["width"] = wp.pop("dimension_x")
+ wp["height"] = wp.pop("dimension_y")
+ wp["wh_category"] = wp["category"]
+ return text.nameext_from_url(wp["url"], wp)
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
new file mode 100644
index 0000000..d353144
--- /dev/null
+++ b/gallery_dl/extractor/warosu.py
@@ -0,0 +1,108 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://warosu.org/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class WarosuThreadExtractor(Extractor):
+ """Extractor for images from threads on warosu.org"""
+ category = "warosu"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} - {title}")
+ filename_fmt = "{tim}-{filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ pattern = r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"
+ test = (
+ ("https://warosu.org/jp/thread/16656025", {
+ "url": "889d57246ed67e491e5b8f7f124e50ea7991e770",
+ "keyword": "c00ea4c5460c5986994f17bb8416826d42ca57c0",
+ }),
+ ("https://warosu.org/jp/thread/16658073", {
+ "url": "4500cf3184b067424fd9883249bd543c905fbecd",
+ "keyword": "7534edf4ec51891dbf44d775b73fbbefd52eec71",
+ "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c",
+ }),
+ )
+ root = "https://warosu.org"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/{}/thread/{}".format(self.root, self.board, self.thread)
+ page = self.request(url).text
+ data = self.get_metadata(page)
+ posts = self.posts(page)
+
+ if not data["title"]:
+ title = text.remove_html(posts[0]["com"])
+ data["title"] = text.unescape(title)[:50]
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in posts:
+ if "image" in post:
+ for key in ("w", "h", "no", "time", "tim"):
+ post[key] = text.parse_int(post[key])
+ post.update(data)
+ yield Message.Url, post["image"], post
+
+ def get_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ boardname = text.extract(page, "<title>", "</title>")[0]
+ title = text.extract(page, 'filetitle" itemprop="name">', '<')[0]
+ return {
+ "board": self.board,
+ "board_name": boardname.rpartition(" - ")[2],
+ "thread": self.thread,
+ "title": title,
+ }
+
+ def posts(self, page):
+ """Build a list of all post-objects"""
+ page = text.extract(page, '<div class="content">', '<table>')[0]
+ needle = '<table itemscope itemtype="http://schema.org/Comment">'
+ return [self.parse(post) for post in page.split(needle)]
+
+ def parse(self, post):
+ """Build post-object by extracting data from an HTML post"""
+ data = self._extract_post(post)
+ if "<span>File:" in post:
+ self._extract_image(post, data)
+ part = data["image"].rpartition("/")[2]
+ data["tim"], _, data["extension"] = part.partition(".")
+ data["ext"] = "." + data["extension"]
+ return data
+
+ @staticmethod
+ def _extract_post(post):
+ data = text.extract_all(post, (
+ ("no" , 'id="p', '"'),
+ ("name", '<span itemprop="name">', '</span>'),
+ ("time", '<span class="posttime" title="', '000">'),
+ ("now" , '', '<'),
+ ("com" , '<blockquote><p itemprop="text">', '</p></blockquote>'),
+ ))[0]
+ data["com"] = text.unescape(text.remove_html(data["com"].strip()))
+ return data
+
+ @staticmethod
+ def _extract_image(post, data):
+ text.extract_all(post, (
+ ("fsize" , '<span>File: ', ', '),
+ ("w" , '', 'x'),
+ ("h" , '', ', '),
+ ("filename", '', '<'),
+ ("image" , '<br />\n<a href="', '"'),
+ ), 0, data)
+ data["filename"] = text.unquote(data["filename"].rpartition(".")[0])
+ data["image"] = "https:" + data["image"]
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
new file mode 100644
index 0000000..7a4ee8f
--- /dev/null
+++ b/gallery_dl/extractor/weibo.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.weibo.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+class WeiboExtractor(Extractor):
+ category = "weibo"
+ directory_fmt = ("{category}", "{user[screen_name]}")
+ filename_fmt = "{status[id]}_{num:>02}.{extension}"
+ archive_fmt = "{status[id]}_{num}"
+ root = "https://m.weibo.cn"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.retweets = self.config("retweets", True)
+
+ def items(self):
+ first = True
+
+ for status in self.statuses():
+
+ obj = status
+ num = 1
+
+ if first:
+ yield Message.Version, 1
+ yield Message.Directory, status
+ first = False
+
+ while True:
+
+ if "pics" in obj:
+ for image in obj["pics"]:
+ pid = image["pid"]
+ if "large" in image:
+ image = image["large"]
+ data = text.nameext_from_url(image["url"], {
+ "num": num,
+ "pid": pid,
+ "width": text.parse_int(image["geo"]["width"]),
+ "height": text.parse_int(image["geo"]["height"]),
+ "status": status,
+ })
+ yield Message.Url, image["url"], data
+ num += 1
+
+ if "page_info" in obj and "media_info" in obj["page_info"]:
+ info = obj["page_info"]["media_info"]
+ url = info.get("stream_url_hd") or info["stream_url"]
+ data = text.nameext_from_url(url, {
+ "num": num,
+ "url": url,
+ "width": 0,
+ "height": 0,
+ "status": status,
+ })
+ yield Message.Url, url, data
+
+ if self.retweets and "retweeted_status" in obj:
+ obj = obj["retweeted_status"]
+ else:
+ break
+
+ def statuses(self):
+ """Returns an iterable containing all relevant 'status' objects"""
+
+
+class WeiboUserExtractor(WeiboExtractor):
+ """Extractor for all images of a user on weibo.cn"""
+ subcategory = "user"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
+ r"/(?:u|p(?:rofile)?)/(\d+)")
+ test = (
+ ("https://m.weibo.cn/u/2314621010", {
+ "range": "1-30",
+ }),
+ ("https://m.weibo.cn/profile/2314621010"),
+ ("https://m.weibo.cn/p/2304132314621010_-_WEIBO_SECOND_PROFILE_WEIBO"),
+ ("https://www.weibo.com/p/1003062314621010/home"),
+ )
+
+ def __init__(self, match):
+ WeiboExtractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def statuses(self):
+ url = self.root + "/api/container/getIndex"
+ params = {"page": 1, "containerid": "107603" + self.user_id[-10:]}
+
+ while True:
+ data = self.request(url, params=params).json()
+
+ for card in data["data"]["cards"]:
+ if "mblog" in card:
+ yield card["mblog"]
+
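+ # assume the listing is exhausted once a page returns
+ # fewer than 5 cards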
+ if len(data["data"]["cards"]) < 5:
+ return
+ params["page"] += 1
+
+
+class WeiboStatusExtractor(WeiboExtractor):
+ """Extractor for images from a status on weibo.cn"""
+ subcategory = "status"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
+ r"/(?:detail|status|\d+)/(\d+)")
+ test = (
+ ("https://m.weibo.cn/detail/4323047042991618", {
+ "pattern": r"https?://wx\d+.sinaimg.cn/large/\w+.jpg",
+ }),
+ ("https://m.weibo.cn/detail/4339748116375525", {
+ "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_hd",
+ }),
+ ("https://m.weibo.cn/status/4339748116375525"),
+ ("https://m.weibo.cn/5746766133/4339748116375525"),
+ )
+
+ def __init__(self, match):
+ WeiboExtractor.__init__(self, match)
+ self.status_id = match.group(1)
+
+ def statuses(self):
+ url = "{}/detail/{}".format(self.root, self.status_id)
+ page = self.request(url).text
+ data = json.loads(text.extract(
+ page, " var $render_data = [", "][0] || {};")[0])
+ return (data["status"],)
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
new file mode 100644
index 0000000..b9c223c
--- /dev/null
+++ b/gallery_dl/extractor/wikiart.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.wikiart.org/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?wikiart\.org/([a-z]+)"
+
+
+class WikiartExtractor(Extractor):
+ """Base class for wikiart extractors"""
+ category = "wikiart"
+ filename_fmt = "{id}_{title}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://www.wikiart.org"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.lang = match.group(1)
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for painting in self.paintings():
+ url = painting["image"]
+ painting.update(data)
+ yield Message.Url, url, text.nameext_from_url(url, painting)
+
+ def metadata(self):
+ """Return a dict with general metadata"""
+
+ def paintings(self):
+ """Return an iterable containing all relevant 'painting' objects"""
+
+ def _pagination(self, url, extra_params=None, key="Paintings"):
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": url,
+ }
+ params = {
+ "json": "2",
+ "layout": "new",
+ "page": 1,
+ "resultType": "masonry",
+ }
+ if extra_params:
+ params.update(extra_params)
+
+ while True:
+ data = self.request(url, headers=headers, params=params).json()
+ items = data.get(key)
+ if not items:
+ return
+ yield from items
+ params["page"] += 1
+
+
+class WikiartArtistExtractor(WikiartExtractor):
+ """Extractor for an artist's paintings on wikiart.org"""
+ subcategory = "artist"
+ directory_fmt = ("{category}", "{artist[artistName]}")
+ pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)"
+ test = ("https://www.wikiart.org/en/thomas-cole", {
+ "url": "f1eee8158f5b8b7380382ab730a8f53884715c8b",
+ "keyword": "b62678394ce645815963883d5c9642255307225f",
+ })
+
+ def __init__(self, match):
+ WikiartExtractor.__init__(self, match)
+ self.artist = match.group(2)
+
+ def metadata(self):
+ url = "{}/{}/{}?json=2".format(self.root, self.lang, self.artist)
+ return {"artist": self.request(url).json()}
+
+ def paintings(self):
+ url = "{}/{}/{}/mode/all-paintings".format(
+ self.root, self.lang, self.artist)
+ return self._pagination(url)
+
+
+class WikiartArtworksExtractor(WikiartExtractor):
+ """Extractor for artwork collections on wikiart.org"""
+ subcategory = "artworks"
+ directory_fmt = ("{category}", "Artworks by {group!c}", "{type}")
+ pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)"
+ test = ("https://www.wikiart.org/en/paintings-by-media/grisaille", {
+ "url": "f92d55669fa949491c26a5437527adb14b35b8cc",
+ })
+
+ def __init__(self, match):
+ WikiartExtractor.__init__(self, match)
+ self.group = match.group(2)
+ self.type = match.group(3)
+
+ def metadata(self):
+ return {"group": self.group, "type": self.type}
+
+ def paintings(self):
+ url = "{}/{}/paintings-by-{}/{}".format(
+ self.root, self.lang, self.group, self.type)
+ return self._pagination(url)
+
+
+class WikiartArtistsExtractor(WikiartExtractor):
+ """Extractor for artist collections on wikiart.org"""
+ subcategory = "artists"
+ pattern = (BASE_PATTERN + r"/artists-by-([\w-]+)/([\w-]+)")
+ test = ("https://www.wikiart.org/en/artists-by-century/12", {
+ "pattern": WikiartArtistExtractor.pattern,
+ "count": 7,
+ })
+
+ def __init__(self, match):
+ WikiartExtractor.__init__(self, match)
+ self.group = match.group(2)
+ self.type = match.group(3)
+
+ def items(self):
+ url = "{}/{}/App/Search/Artists-by-{}".format(
+ self.root, self.lang, self.group)
+ params = {"json": "3", "searchterm": self.type}
+
+ for artist in self._pagination(url, params, "Artists"):
+ artist["_extractor"] = WikiartArtistExtractor
+ yield Message.Queue, self.root + artist["artistUrl"], artist
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
new file mode 100644
index 0000000..9699806
--- /dev/null
+++ b/gallery_dl/extractor/xhamster.py
@@ -0,0 +1,171 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://xhamster.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)"
+
+
+class XhamsterExtractor(Extractor):
+ """Base class for xhamster extractors"""
+ category = "xhamster"
+ root = "https://xhamster.com"
+
+
+class XhamsterGalleryExtractor(XhamsterExtractor):
+ """Extractor for image galleries on xhamster.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user[name]}",
+ "{gallery[id]} {gallery[title]}")
+ filename_fmt = "{num:>03}_{id}.{extension}"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"(/photos/gallery/[^/?&#]+)"
+ test = (
+ ("https://xhamster.com/photos/gallery/11748968", {
+ "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",
+ "count": ">= 144",
+ "keyword": {
+ "comments": int,
+ "count": int,
+ "favorite": bool,
+ "id": int,
+ "num": int,
+ "height": int,
+ "width": int,
+ "imageURL": str,
+ "pageURL": str,
+ "thumbURL": str,
+ "gallery": {
+ "date": "type:datetime",
+ "description": "",
+ "dislikes": int,
+ "id": 11748968,
+ "likes": int,
+ "tags": ["NON-Porn"],
+ "thumbnail": str,
+ "title": "Make the world better.",
+ "views": int,
+ },
+ "user": {
+ "id": 16874672,
+ "name": "Anonymousrants",
+ "retired": bool,
+ "subscribers": int,
+ "url": "https://xhamster.com/users/anonymousrants",
+ "verified": bool,
+ },
+ },
+ }),
+ ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"),
+ ("https://xhamster.com/photos/gallery/11748968"),
+ ("https://xhamster.one/photos/gallery/11748968"),
+ ("https://xhamster.desi/photos/gallery/11748968"),
+ ("https://en.xhamster.com/photos/gallery/11748968"),
+ )
+
+ def __init__(self, match):
+ XhamsterExtractor.__init__(self, match)
+ self.path = match.group(1)
+ self.data = None
+
+ def items(self):
+ data = self.metadata()
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for num, image in enumerate(self.images(), 1):
+ url = image["imageURL"]
+ image.update(data)
+ image["num"] = num
+ yield Message.Url, url, text.nameext_from_url(url, image)
+
+ def metadata(self):
+ self.data = self._data(self.root + self.path)
+ user = self.data["authorModel"]
+ imgs = self.data["photosGalleryModel"]
+
+ return {
+ "user":
+ {
+ "id" : text.parse_int(user["id"]),
+ "url" : user["pageURL"],
+ "name" : user["name"],
+ "retired" : user["retired"],
+ "verified" : user["verified"],
+ "subscribers": user["subscribers"],
+ },
+ "gallery":
+ {
+ "id" : text.parse_int(imgs["id"]),
+ "tags" : [c["name"] for c in imgs["categories"]],
+ "date" : text.parse_timestamp(imgs["created"]),
+ "views" : text.parse_int(imgs["views"]),
+ "likes" : text.parse_int(imgs["rating"]["likes"]),
+ "dislikes" : text.parse_int(imgs["rating"]["dislikes"]),
+ "title" : imgs["title"],
+ "description": imgs["description"],
+ "thumbnail" : imgs["thumbURL"],
+ },
+ "count": text.parse_int(imgs["quantity"]),
+ }
+
+ def images(self):
+ data = self.data
+ self.data = None
+
+ while True:
+ for image in data["photosGalleryModel"]["photos"]:
+ del image["modelName"]
+ yield image
+
+ pgntn = data["pagination"]
+ if pgntn["active"] == pgntn["maxPage"]:
+ return
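+ # advance to the next page: the template's trailing characters are
+ # assumed to hold the page number and are replaced with "next"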
+ url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"])
+ data = self._data(url)
+
+ def _data(self, url):
+ page = self.request(url).text
+ return json.loads(text.extract(
+ page, "window.initials =", "</script>")[0].rstrip("\n\r;"))
+
+
+class XhamsterUserExtractor(XhamsterExtractor):
+ """Extractor for all galleries of an xhamster user"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/users/([^/?&#]+)(?:/photos)?/?(?:$|[?#])"
+ test = (
+ ("https://xhamster.com/users/nickname68/photos", {
+ "pattern": XhamsterGalleryExtractor.pattern,
+ "count": 50,
+ "range": "1-50",
+ }),
+ ("https://xhamster.com/users/nickname68"),
+ )
+
+ def __init__(self, match):
+ XhamsterExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ url = "{}/users/{}/photos".format(self.root, self.user)
+ data = {"_extractor": XhamsterGalleryExtractor}
+
+ while url:
+ extr = text.extract_from(self.request(url).text)
+ while True:
+ url = extr('thumb-image-container" href="', '"')
+ if not url:
+ break
+ yield Message.Queue, url, data
+ url = extr('data-page="next" href="', '"')
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
new file mode 100644
index 0000000..7eec18b
--- /dev/null
+++ b/gallery_dl/extractor/xvideos.py
@@ -0,0 +1,140 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.xvideos.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+import json
+
+
+class XvideosExtractor(Extractor):
+ """Base class for xvideos extractors"""
+ category = "xvideos"
+ root = "https://www.xvideos.com"
+
+ def get_page(self, url, codes=(403, 404)):
+ response = self.request(url, expect=codes)
+ if response.status_code in codes:
+ raise exception.NotFoundError(self.subcategory)
+ return response.text
+
+
+class XvideosGalleryExtractor(XvideosExtractor):
+ """Extractor for user profile galleries from xvideos.com"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{user[name]}", "{title}")
+ filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
+ archive_fmt = "{gallery_id}_{num}"
+ pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com"
+ r"/profiles/([^/?&#]+)/photos/(\d+)")
+ test = (
+ (("https://www.xvideos.com/profiles"
+ "/pervertedcouple/photos/751031/random_stuff"), {
+ "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7",
+ "keyword": "8d637b372c6231cc4ada92dd5918db5fdbd06520",
+ }),
+ ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ XvideosExtractor.__init__(self, match)
+ self.user, self.gid = match.groups()
+
+ def items(self):
+ url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid)
+ page = self.get_page(url)
+ data = self.get_metadata(page)
+ imgs = self.get_images(page)
+ data["count"] = len(imgs)
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for url in imgs:
+ data["num"] = text.parse_int(url.rsplit("_", 2)[1])
+ data["extension"] = url.rpartition(".")[2]
+ yield Message.Url, url, data
+
+ def get_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ data = text.extract_all(page, (
+ ("userid" , '"id_user":', ','),
+ ("display", '"display":"', '"'),
+ ("title" , '"title":"', '"'),
+ ("descr" , '<small class="mobile-hide">', '</small>'),
+ ("tags" , '<em>Tagged:</em>', '<'),
+ ))[0]
+
+ return {
+ "user": {
+ "id": text.parse_int(data["userid"]),
+ "name": self.user,
+ "display": data["display"],
+ "description": data["descr"].strip(),
+ },
+ "tags": text.unescape(data["tags"] or "").strip().split(", "),
+ "title": text.unescape(data["title"]),
+ "gallery_id": text.parse_int(self.gid),
+ }
+
+ @staticmethod
+ def get_images(page):
+ """Return a list of all image urls for this gallery"""
+ return list(text.extract_iter(
+ page, '<a class="embed-responsive-item" href="', '"'))
+
+
+class XvideosUserExtractor(XvideosExtractor):
+ """Extractor for user profiles from xvideos.com"""
+ subcategory = "user"
+ categorytransfer = True
+ pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com"
+ r"/profiles/([^/?&#]+)/?(?:#.*)?$")
+ test = (
+ ("https://www.xvideos.com/profiles/pervertedcouple", {
+ "url": "a413f3e60d6d3a2de79bd44fa3b7a9c03db4336e",
+ "keyword": "a796760d34732adc7ec52a8feb057515209a2ca6",
+ }),
+ ("https://www.xvideos.com/profiles/niwehrwhernvh", {
+ "exception": exception.NotFoundError,
+ }),
+ ("https://www.xvideos.com/profiles/pervertedcouple#_tabPhotos"),
+ )
+
+ def __init__(self, match):
+ XvideosExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ url = "{}/profiles/{}".format(self.root, self.user)
+ page = self.get_page(url)
+ data = json.loads(text.extract(
+ page, "xv.conf=", ";</script>")[0])["data"]
+
+ if not isinstance(data["galleries"], dict):
+ return
+ if "0" in data["galleries"]:
+ del data["galleries"]["0"]
+
+ galleries = [
+ {
+ "gallery_id": text.parse_int(gid),
+ "title": text.unescape(gdata["title"]),
+ "count": gdata["nb_pics"],
+ "_extractor": XvideosGalleryExtractor,
+ }
+ for gid, gdata in data["galleries"].items()
+ ]
+ galleries.sort(key=lambda x: x["gallery_id"])
+
+ yield Message.Version, 1
+ for gallery in galleries:
+ url = "https://www.xvideos.com/profiles/{}/photos/{}".format(
+ self.user, gallery["gallery_id"])
+ yield Message.Queue, url, gallery
diff --git a/gallery_dl/extractor/yandere.py b/gallery_dl/extractor/yandere.py
new file mode 100644
index 0000000..623e7a8
--- /dev/null
+++ b/gallery_dl/extractor/yandere.py
@@ -0,0 +1,68 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://yande.re/"""
+
+from . import booru
+
+
+class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+ """Base class for yandere extractors"""
+ category = "yandere"
+ api_url = "https://yande.re/post.json"
+ post_url = "https://yande.re/post/show/{}"
+
+
+class YandereTagExtractor(booru.TagMixin, YandereExtractor):
+ """Extractor for images from yande.re based on search-tags"""
+ pattern = (r"(?:https?://)?(?:www\.)?yande\.re"
+ r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)")
+ test = ("https://yande.re/post?tags=ouzoku+armor", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ })
+
+
+class YanderePoolExtractor(booru.PoolMixin, YandereExtractor):
+ """Extractor for image-pools from yande.re"""
+ pattern = r"(?:https?://)?(?:www\.)?yande\.re/pool/show/(?P<pool>\d+)"
+ test = ("https://yande.re/pool/show/318", {
+ "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68",
+ })
+
+
+class YanderePostExtractor(booru.PostMixin, YandereExtractor):
+ """Extractor for single images from yande.re"""
+ pattern = r"(?:https?://)?(?:www\.)?yande\.re/post/show/(?P<post>\d+)"
+ test = ("https://yande.re/post/show/51824", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "sasaki_tamaru",
+ "tags_circle": "softhouse_chara",
+ "tags_copyright": "ouzoku",
+ "tags_general": str,
+ },
+ })
+
+
+class YanderePopularExtractor(booru.MoebooruPopularMixin, YandereExtractor):
+ """Extractor for popular images from yande.re"""
+ pattern = (r"(?:https?://)?(?:www\.)?yande\.re"
+ r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
+ r"(?:\?(?P<query>[^#]*))?")
+ test = (
+ ("https://yande.re/post/popular_by_month?month=6&year=2014", {
+ "count": 40,
+ }),
+ ("https://yande.re/post/popular_recent"),
+ )
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.api_url = "https://yande.re/post/popular_{scale}.json".format(
+ scale=self.scale)
diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py
new file mode 100644
index 0000000..b3c5501
--- /dev/null
+++ b/gallery_dl/extractor/yaplog.py
@@ -0,0 +1,109 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://yaplog.jp/"""
+
+from .common import Extractor, Message, AsynchronousMixin
+from .. import text, util
+
+
+class YaplogExtractor(AsynchronousMixin, Extractor):
+ """Base class for yaplog extractors"""
+ category = "yaplog"
+ root = "https://yaplog.jp"
+ filename_fmt = "{post[id]}_{post[title]}_{id}.{extension}"
+ directory_fmt = ("{category}", "{post[user]}")
+ archive_fmt = "{post[user]}_{id}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for post, urls in self.posts():
+ yield Message.Directory, {"post": post}
+ for num, url in enumerate(urls, 1):
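+ # urls[0] already holds the cached HTML of the post page
+ # (see _parse_post); only fetch the remaining image pages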
+ page = self.request(url).text if num > 1 else url
+ iurl = text.extract(page, '<img src="', '"')[0]
+ iid, _, ext = iurl.rpartition("/")[2].rpartition(".")
+ image = {
+ "url" : iurl,
+ "num" : num,
+ "id" : text.parse_int(iid.partition("_")[0]),
+ "extension": ext,
+ "post" : post,
+ }
+ yield Message.Url, iurl, image
+
+ def posts(self):
+ """Return an iterable with (data, image page URLs) tuples"""
+
+ def _parse_post(self, url):
+ page = self.request(url).text
+ title, pos = text.extract(page, 'class="title">', '<')
+ date , pos = text.extract(page, 'class="date">' , '<', pos)
+ pid , pos = text.extract(page, '/archive/' , '"', pos)
+ prev , pos = text.extract(page, 'class="last"><a href="', '"', pos)
+
+ urls = list(text.extract_iter(page, '<li><a href="', '"', pos))
+ urls[0] = page # cache HTML of first page
+
+ if len(urls) == 24 and text.extract(page, '(1/', ')')[0] != '24':
+ # an /image/ page holds at most 24 image entries
+ # -> search /archive/ page for the rest
+ url = "{}/{}/archive/{}".format(self.root, self.user, pid)
+ page = self.request(url).text
+
+ base = "{}/{}/image/{}/".format(self.root, self.user, pid)
+ for part in util.advance(text.extract_iter(
+ page, base, '"', pos), 24):
+ urls.append(base + part)
+
+ return prev, urls, {
+ "id" : text.parse_int(pid),
+ "title": text.unescape(title[:-3]),
+ "user" : self.user,
+ "date" : date,
+ }
+
+
+class YaplogBlogExtractor(YaplogExtractor):
+ """Extractor for a user's blog on yaplog.jp"""
+ subcategory = "blog"
+ pattern = r"(?:https?://)?(?:www\.)?yaplog\.jp/(\w+)/?(?:$|[?&#])"
+ test = ("https://yaplog.jp/omitakashi3", {
+ "pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/",
+ "count": ">= 2",
+ })
+
+ def posts(self):
+ url = "{}/{}/image/".format(self.root, self.user)
+ while url:
+ url, images, data = self._parse_post(url)
+ yield data, images
+
+
+class YaplogPostExtractor(YaplogExtractor):
+ """Extractor for images from a blog post on yaplog.jp"""
+ subcategory = "post"
+ pattern = (r"(?:https?://)?(?:www\.)?yaplog\.jp"
+ r"/(\w+)/(?:archive|image)/(\d+)")
+ test = ("https://yaplog.jp/imamiami0726/image/1299", {
+ "url": "896cae20fa718735a57e723c48544e830ff31345",
+ "keyword": "f8d8781e61c4c38238a7622d6df6c905f864e5d3",
+ })
+
+ def __init__(self, match):
+ YaplogExtractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def posts(self):
+ url = "{}/{}/image/{}".format(self.root, self.user, self.post_id)
+ _, images, data = self._parse_post(url)
+ return ((data, images),)
diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py
new file mode 100644
index 0000000..0844c40
--- /dev/null
+++ b/gallery_dl/extractor/yuki.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://yuki.la/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class YukiThreadExtractor(Extractor):
+ """Extractor for images from threads on yuki.la"""
+ category = "yuki"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread}{title:? - //}")
+ filename_fmt = "{time}-{filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ pattern = r"(?:https?://)?yuki\.la/([^/?&#]+)/(\d+)"
+ test = (
+ ("https://yuki.la/gd/309639", {
+ "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9",
+ "keyword": "01cbe29ae207a5cb7556bcbd5ed481ecdaf32727",
+ "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
+ }),
+ ("https://yuki.la/a/159767162", {
+ "url": "cd94d0eb646d279c3b7efb9b7898888e5d44fa93",
+ "keyword": "7a4ff90e423c74bd3126fb65d13015decec2fa45",
+ }),
+ # old thread - missing board name in title and multi-line HTML
+ ("https://yuki.la/gif/6877752", {
+ "url": "3dbb2f8453490d002416c5fc2fe95b56c129faf9",
+ "keyword": "563ef4ae80134d845dddaed7ebe56f5fc41056be",
+ }),
+ # even older thread - no thread title
+ ("https://yuki.la/a/9357051", {
+ "url": "010560bf254bd485e48366c3531728bda4b22583",
+ "keyword": "7b736c41e307dcfcb84ef495f29299a6ddd06d67",
+ }),
+ )
+ root = "https://yuki.la"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/{}/{}".format(self.root, self.board, self.thread)
+ page = self.request(url).text
+ data = self.get_metadata(page)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in self.posts(page):
+ if "image" in post:
+ for key in ("w", "h", "no", "time"):
+ post[key] = text.parse_int(post[key])
+ post.update(data)
+ yield Message.Url, post["image"], post
+
+ def get_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ title = text.extract(page, "<title>", "</title>")[0]
+ try:
+ title, boardname, _ = title.rsplit(" - ", 2)
+ except ValueError:
+ title = boardname = ""
+ else:
+ title = title.partition(" - ")[2]
+ if not title:
+ title, boardname = boardname, ""
+ return {
+ "board": self.board,
+ "board_name": boardname,
+ "thread": text.parse_int(self.thread),
+ "title": text.unescape(title),
+ }
+
+ def posts(self, page):
+ """Build a list of all post-objects"""
+ return [
+ self.parse(post) for post in text.extract_iter(
+ page, '<div class="postContainer', '</blockquote>')
+ ]
+
+ def parse(self, post):
+ """Build post-object by extracting data from an HTML post"""
+ data = self._extract_post(post)
+ if 'class="file"' in post:
+ self._extract_image(post, data)
+ part = data["image"].rpartition("/")[2]
+ data["tim"], _, data["extension"] = part.partition(".")
+ data["ext"] = "." + data["extension"]
+ return data
+
+ @staticmethod
+ def _extract_post(post):
+ data, pos = text.extract_all(post, (
+ ("no" , 'id="pc', '"'),
+ ("name", '<span class="name">', '</span>'),
+ ("time", 'data-utc="', '"'),
+ ("now" , '>', ' <'),
+ ))
+ data["com"] = text.unescape(text.remove_html(
+ post[post.index("<blockquote ", pos):].partition(">")[2]))
+ return data
+
+ @staticmethod
+ def _extract_image(post, data):
+ text.extract_all(post, (
+ (None , '>File:', ''),
+ ("fullname", '<a title="', '"'),
+ ("image" , 'href="', '"'),
+ ("filename", '>', '<'),
+ ("fsize" , '(', ', '),
+ ("w" , '', 'x'),
+ ("h" , '', ')'),
+ ), 0, data)
+ filename = data["fullname"] or data["filename"]
+ data["filename"] = text.unescape(filename.rpartition(".")[0])
+ data["image"] = "https:" + data["image"]
+ del data["fullname"]
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
new file mode 100644
index 0000000..667b9b3
--- /dev/null
+++ b/gallery_dl/job.py
@@ -0,0 +1,492 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import sys
+import time
+import logging
+from . import extractor, downloader, postprocessor
+from . import config, text, util, output, exception
+from .extractor.message import Message
+
+
+class Job():
+ """Base class for Job-types"""
+ ulog = None
+
+ def __init__(self, extr, parent=None):
+ if isinstance(extr, str):
+ extr = extractor.find(extr)
+ if not extr:
+ raise exception.NoExtractorError()
+
+ self.extractor = extr
+ extr.log.extractor = extr
+ extr.log.job = self
+ extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)
+
+ # url predicates
+ self.pred_url = self._prepare_predicates(
+ "image", [util.UniquePredicate()], True)
+
+ # queue predicates
+ self.pred_queue = self._prepare_predicates(
+ "chapter", [], False)
+
+ # category transfer
+ if parent and parent.extractor.config(
+ "category-transfer", parent.extractor.categorytransfer):
+ self.extractor.category = parent.extractor.category
+ self.extractor.subcategory = parent.extractor.subcategory
+
+ # user-supplied metadata
+ self.userkwds = self.extractor.config("keywords")
+
+ def run(self):
+ """Execute or run the job"""
+ try:
+ log = self.extractor.log
+ for msg in self.extractor:
+ self.dispatch(msg)
+ except exception.AuthenticationError as exc:
+ msg = str(exc) or "Please provide a valid username/password pair."
+ log.error("Authentication failed: %s", msg)
+ except exception.AuthorizationError:
+ log.error("You do not have permission to access the resource "
+ "at '%s'", self.extractor.url)
+ except exception.NotFoundError as exc:
+ res = str(exc) or "resource (gallery/image/user)"
+ log.error("The %s at '%s' does not exist", res, self.extractor.url)
+ except exception.HttpError as exc:
+ err = exc.args[0]
+ if isinstance(err, Exception):
+ err = "{}: {}".format(err.__class__.__name__, err)
+ log.error("HTTP request failed: %s", err)
+ except exception.FormatError as exc:
+ err, obj = exc.args
+ log.error("Applying %s format string failed: %s: %s",
+ obj, err.__class__.__name__, err)
+ except exception.FilterError as exc:
+ err = exc.args[0]
+ log.error("Evaluating filter expression failed: %s: %s",
+ err.__class__.__name__, err)
+ except exception.StopExtraction:
+ pass
+ except OSError as exc:
+ log.error("Unable to download data: %s: %s",
+ exc.__class__.__name__, exc)
+ log.debug("", exc_info=True)
+ except Exception as exc:
+ log.error(("An unexpected error occurred: %s - %s. "
+ "Please run gallery-dl again with the --verbose flag, "
+ "copy its output and report this issue on "
+ "https://github.com/mikf/gallery-dl/issues ."),
+ exc.__class__.__name__, exc)
+ log.debug("", exc_info=True)
+ self.handle_finalize()
+
+ def dispatch(self, msg):
+ """Call the appropriate message handler"""
+ if msg[0] == Message.Url:
+ _, url, kwds = msg
+ if self.pred_url(url, kwds):
+ self.update_kwdict(kwds)
+ self.handle_url(url, kwds)
+
+ elif msg[0] == Message.Directory:
+ self.update_kwdict(msg[1])
+ self.handle_directory(msg[1])
+
+ elif msg[0] == Message.Queue:
+ _, url, kwds = msg
+ if self.pred_queue(url, kwds):
+ self.handle_queue(url, kwds)
+
+ elif msg[0] == Message.Urllist:
+ _, urls, kwds = msg
+ if self.pred_url(urls[0], kwds):
+ self.update_kwdict(kwds)
+ self.handle_urllist(urls, kwds)
+
+ elif msg[0] == Message.Version:
+ if msg[1] != 1:
+ raise "unsupported message-version ({}, {})".format(
+ self.extractor.category, msg[1]
+ )
+ # TODO: support for multiple message versions
+
+ def handle_url(self, url, keywords):
+ """Handle Message.Url"""
+
+ def handle_urllist(self, urls, keywords):
+ """Handle Message.Urllist"""
+ self.handle_url(urls[0], keywords)
+
+ def handle_directory(self, keywords):
+ """Handle Message.Directory"""
+
+ def handle_queue(self, url, keywords):
+ """Handle Message.Queue"""
+
+ def handle_finalize(self):
+ """Handle job finalization"""
+
+ def update_kwdict(self, kwdict):
+ """Update 'kwdict' with additional metadata"""
+ kwdict["category"] = self.extractor.category
+ kwdict["subcategory"] = self.extractor.subcategory
+ if self.userkwds:
+ kwdict.update(self.userkwds)
+
+ def _prepare_predicates(self, target, predicates, skip=True):
+ pfilter = self.extractor.config(target + "-filter")
+ if pfilter:
+ try:
+ pred = util.FilterPredicate(pfilter, target)
+ except (SyntaxError, ValueError, TypeError) as exc:
+ self.extractor.log.warning(exc)
+ else:
+ predicates.append(pred)
+
+ prange = self.extractor.config(target + "-range")
+ if prange:
+ try:
+ pred = util.RangePredicate(prange)
+ except ValueError as exc:
+ self.extractor.log.warning(
+ "invalid %s range: %s", target, exc)
+ else:
+ if skip and pred.lower > 1 and not pfilter:
+ pred.index += self.extractor.skip(pred.lower - 1)
+ predicates.append(pred)
+
+ return util.build_predicate(predicates)
+
+ def _write_unsupported(self, url):
+ if self.ulog:
+ self.ulog.info(url)
+
+ @staticmethod
+ def _filter(kwdict):
+ """Return a copy of 'kwdict' without "private" entries"""
+ return {k: v for k, v in kwdict.items() if k[0] != "_"}
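+        # e.g. _filter({"id": 1, "_extractor": cls}) == {"id": 1}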
+
+
+class DownloadJob(Job):
+ """Download images into appropriate directory/filename locations"""
+
+ def __init__(self, url, parent=None):
+ Job.__init__(self, url, parent)
+ self.log = logging.getLogger("download")
+ self.pathfmt = None
+ self.archive = None
+ self.sleep = None
+ self.downloaders = {}
+ self.postprocessors = None
+ self.out = output.select()
+
+ def handle_url(self, url, keywords, fallback=None):
+ """Download the resource specified in 'url'"""
+ # prepare download
+ self.pathfmt.set_keywords(keywords)
+
+ if self.postprocessors:
+ for pp in self.postprocessors:
+ pp.prepare(self.pathfmt)
+
+ if self.pathfmt.exists(self.archive):
+ self.handle_skip()
+ return
+
+ if self.sleep:
+ time.sleep(self.sleep)
+
+ # download from URL
+ if not self.download(url):
+
+ # use fallback URLs if available
+ for num, url in enumerate(fallback or (), 1):
+ self.log.info("Trying fallback URL #%d", num)
+ if self.download(url):
+ break
+ else:
+ # download failed
+ self.log.error(
+ "Failed to download %s", self.pathfmt.filename or url)
+ return
+
+ if not self.pathfmt.temppath:
+ self.handle_skip()
+ return
+
+ # run post processors
+ if self.postprocessors:
+ for pp in self.postprocessors:
+ pp.run(self.pathfmt)
+
+ # download succeeded
+ self.pathfmt.finalize()
+ self.out.success(self.pathfmt.path, 0)
+ if self.archive:
+ self.archive.add(keywords)
+ self._skipcnt = 0
+
+ def handle_urllist(self, urls, keywords):
+ """Download the resource specified in 'url'"""
+ fallback = iter(urls)
+ url = next(fallback)
+ self.handle_url(url, keywords, fallback)
+
+ def handle_directory(self, keywords):
+ """Set and create the target directory for downloads"""
+ if not self.pathfmt:
+ self.initialize(keywords)
+ else:
+ self.pathfmt.set_directory(keywords)
+
+ def handle_queue(self, url, keywords):
+ if "_extractor" in keywords:
+ extr = keywords["_extractor"].from_url(url)
+ else:
+ extr = extractor.find(url)
+ if extr:
+ self.__class__(extr, self).run()
+ else:
+ self._write_unsupported(url)
+
+ def handle_finalize(self):
+ if self.postprocessors:
+ for pp in self.postprocessors:
+ pp.finalize()
+
+ def handle_skip(self):
+ self.out.skip(self.pathfmt.path)
+ if self._skipexc:
+ self._skipcnt += 1
+ if self._skipcnt >= self._skipmax:
+ raise self._skipexc()
+
+ def download(self, url):
+ """Download 'url'"""
+ scheme = url.partition(":")[0]
+ downloader = self.get_downloader(scheme)
+ if downloader:
+ return downloader.download(url, self.pathfmt)
+ self._write_unsupported(url)
+ return False
+
+ def get_downloader(self, scheme):
+ """Return a downloader suitable for 'scheme'"""
+ if scheme == "https":
+ scheme = "http"
+ try:
+ return self.downloaders[scheme]
+ except KeyError:
+ pass
+
+ klass = downloader.find(scheme)
+ if klass and config.get(("downloader", scheme, "enabled"), True):
+ instance = klass(self.extractor, self.out)
+ else:
+ instance = None
+ self.log.error("'%s:' URLs are not supported/enabled", scheme)
+ self.downloaders[scheme] = instance
+ return instance
+
+ def initialize(self, keywords=None):
+ """Delayed initialization of PathFormat, etc."""
+ self.pathfmt = util.PathFormat(self.extractor)
+ if keywords:
+ self.pathfmt.set_directory(keywords)
+ self.sleep = self.extractor.config("sleep")
+
+ skip = self.extractor.config("skip", True)
+ if skip:
+ self._skipexc = None
+ if isinstance(skip, str):
+ skip, _, smax = skip.partition(":")
+ if skip == "abort":
+ self._skipexc = exception.StopExtraction
+ elif skip == "exit":
+ self._skipexc = sys.exit
+ self._skipcnt = 0
+ self._skipmax = text.parse_int(smax)
+ else:
+ self.pathfmt.exists = lambda x=None: False
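+        # e.g. "skip": "abort:3" stops extraction after 3 consecutive skips,
+        # while "skip": false disables the exists() check entirely so files
+        # get downloaded again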
+
+ archive = self.extractor.config("archive")
+ if archive:
+ path = util.expand_path(archive)
+ self.archive = util.DownloadArchive(path, self.extractor)
+
+ postprocessors = self.extractor.config("postprocessors")
+ if postprocessors:
+ self.postprocessors = []
+ for pp_dict in postprocessors:
+ whitelist = pp_dict.get("whitelist")
+ blacklist = pp_dict.get("blacklist")
+ if (whitelist and self.extractor.category not in whitelist or
+ blacklist and self.extractor.category in blacklist):
+ continue
+ name = pp_dict.get("name")
+ pp_cls = postprocessor.find(name)
+ if not pp_cls:
+ postprocessor.log.warning("module '%s' not found", name)
+ continue
+ try:
+ pp_obj = pp_cls(self.pathfmt, pp_dict)
+ except Exception as exc:
+ postprocessor.log.error(
+ "'%s' initialization failed: %s: %s",
+ name, exc.__class__.__name__, exc)
+ else:
+ self.postprocessors.append(pp_obj)
+ self.extractor.log.debug(
+ "Active postprocessor modules: %s", self.postprocessors)
+
+
+class SimulationJob(DownloadJob):
+ """Simulate the extraction process without downloading anything"""
+
+ def handle_url(self, url, keywords, fallback=None):
+ self.pathfmt.set_keywords(keywords)
+ self.out.skip(self.pathfmt.path)
+ if self.sleep:
+ time.sleep(self.sleep)
+ if self.archive:
+ self.archive.add(keywords)
+
+ def handle_directory(self, keywords):
+ if not self.pathfmt:
+ self.initialize()
+
+
+class KeywordJob(Job):
+ """Print available keywords"""
+
+ def handle_url(self, url, keywords):
+ print("\nKeywords for filenames and --filter:")
+ print("------------------------------------")
+ self.print_keywords(keywords)
+ raise exception.StopExtraction()
+
+ def handle_directory(self, keywords):
+ print("Keywords for directory names:")
+ print("-----------------------------")
+ self.print_keywords(keywords)
+
+ def handle_queue(self, url, keywords):
+ if not keywords:
+ self.extractor.log.info(
+ "This extractor delegates work to other extractors "
+ "and does not provide any keywords on its own. Try "
+ "'gallery-dl -K \"%s\"' instead.", url)
+ else:
+ print("Keywords for --chapter-filter:")
+ print("------------------------------")
+ self.print_keywords(keywords)
+ if self.extractor.categorytransfer:
+ print()
+ KeywordJob(url, self).run()
+ raise exception.StopExtraction()
+
+ @staticmethod
+ def print_keywords(keywords, prefix=""):
+ """Print key-value pairs with formatting"""
+ suffix = "]" if prefix else ""
+ for key, value in sorted(keywords.items()):
+ if key[0] == "_":
+ continue
+ key = prefix + key + suffix
+
+ if isinstance(value, dict):
+ KeywordJob.print_keywords(value, key + "[")
+
+ elif isinstance(value, list):
+ if value and isinstance(value[0], dict):
+ KeywordJob.print_keywords(value[0], key + "[][")
+ else:
+ print(key, "[]", sep="")
+ for val in value:
+ print(" -", val)
+
+ else:
+ # string or number
+ print(key, "\n ", value, sep="")
+
+
+class UrlJob(Job):
+ """Print download urls"""
+ maxdepth = 1
+
+ def __init__(self, url, parent=None, depth=1):
+ Job.__init__(self, url, parent)
+ self.depth = depth
+ if depth >= self.maxdepth:
+ self.handle_queue = self.handle_url
+
+ @staticmethod
+ def handle_url(url, _):
+ print(url)
+
+ @staticmethod
+ def handle_urllist(urls, _):
+ prefix = ""
+ for url in urls:
+ print(prefix, url, sep="")
+ prefix = "| "
+
+ def handle_queue(self, url, _):
+ try:
+ UrlJob(url, self, self.depth + 1).run()
+ except exception.NoExtractorError:
+ self._write_unsupported(url)
+
+
+class DataJob(Job):
+ """Collect extractor results and dump them"""
+
+ def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True):
+ Job.__init__(self, url, parent)
+ self.file = file
+ self.data = []
+ self.ascii = config.get(("output", "ascii"), ensure_ascii)
+
+ def run(self):
+ # collect data
+ try:
+ for msg in self.extractor:
+ self.dispatch(msg)
+ except exception.StopExtraction:
+ pass
+ except Exception as exc:
+ self.data.append((exc.__class__.__name__, str(exc)))
+ except BaseException:
+ pass
+
+ # convert numbers to string
+ if config.get(("output", "num-to-str"), False):
+ for msg in self.data:
+ util.transform_dict(msg[-1], util.number_to_string)
+
+ # dump to 'file'
+ util.dump_json(self.data, self.file, self.ascii, 2)
+
+ def handle_url(self, url, kwdict):
+ self.data.append((Message.Url, url, self._filter(kwdict)))
+
+ def handle_urllist(self, urls, kwdict):
+ self.data.append((Message.Urllist, list(urls), self._filter(kwdict)))
+
+ def handle_directory(self, kwdict):
+ self.data.append((Message.Directory, self._filter(kwdict)))
+
+ def handle_queue(self, url, kwdict):
+ self.data.append((Message.Queue, url, self._filter(kwdict)))
+
+ def handle_finalize(self):
+ self.file.close()
diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py
new file mode 100644
index 0000000..58126ac
--- /dev/null
+++ b/gallery_dl/oauth.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""OAuth helper functions and classes"""
+
+import hmac
+import time
+import base64
+import random
+import string
+import hashlib
+import urllib.parse
+
+import requests
+import requests.auth
+
+from . import text
+
+
+def nonce(size, alphabet=string.ascii_letters):
+ """Generate a nonce value with 'size' characters"""
+ return "".join(random.choice(alphabet) for _ in range(size))
+
+
+def quote(value, quote=urllib.parse.quote):
+ """Quote 'value' according to the OAuth1.0 standard"""
+ return quote(value, "~")
+
+
+def concat(*args):
+ """Concatenate 'args' as expected by OAuth1.0"""
+ return "&".join(quote(item) for item in args)
+
+
+class OAuth1Session(requests.Session):
+ """Extension to requests.Session to support OAuth 1.0"""
+
+ def __init__(self, consumer_key, consumer_secret,
+ token=None, token_secret=None):
+
+ requests.Session.__init__(self)
+ self.auth = OAuth1Client(
+ consumer_key, consumer_secret,
+ token, token_secret,
+ )
+
+ def rebuild_auth(self, prepared_request, response):
+ if "Authorization" in prepared_request.headers:
+ del prepared_request.headers["Authorization"]
+ prepared_request.prepare_auth(self.auth)
+
+
+class OAuth1Client(requests.auth.AuthBase):
+ """OAuth1.0a authentication"""
+
+ def __init__(self, consumer_key, consumer_secret,
+ token=None, token_secret=None):
+
+ self.consumer_key = consumer_key
+ self.consumer_secret = consumer_secret
+ self.token = token
+ self.token_secret = token_secret
+
+ def __call__(self, request):
+ oauth_params = [
+ ("oauth_consumer_key", self.consumer_key),
+ ("oauth_nonce", nonce(16)),
+ ("oauth_signature_method", "HMAC-SHA1"),
+ ("oauth_timestamp", str(int(time.time()))),
+ ("oauth_version", "1.0"),
+ ]
+ if self.token:
+ oauth_params.append(("oauth_token", self.token))
+
+ signature = self.generate_signature(request, oauth_params)
+ oauth_params.append(("oauth_signature", signature))
+
+ request.headers["Authorization"] = "OAuth " + ",".join(
+ key + '="' + value + '"' for key, value in oauth_params)
+
+ return request
+
+ def generate_signature(self, request, params):
+ """Generate 'oauth_signature' value"""
+ url, _, query = request.url.partition("?")
+
+ params = params.copy()
+ for key, value in text.parse_query(query).items():
+ params.append((quote(key), quote(value)))
+ params.sort()
+ query = "&".join("=".join(item) for item in params)
+
+ message = concat(request.method, url, query).encode()
+ key = concat(self.consumer_secret, self.token_secret or "").encode()
+ signature = hmac.new(key, message, hashlib.sha1).digest()
+
+ return quote(base64.b64encode(signature).decode())
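+        # i.e. HMAC-SHA1 over "<METHOD>&<quoted URL>&<quoted, sorted
+        # oauth_*/query parameters>", keyed with
+        # "<consumer_secret>&<token_secret or ''>" (OAuth 1.0 HMAC-SHA1 scheme)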
+
+
+class OAuth1API():
+ """Base class for OAuth1.0 based API interfaces"""
+ API_KEY = None
+ API_SECRET = None
+
+ def __init__(self, extractor):
+ self.log = extractor.log
+ self.extractor = extractor
+
+ api_key = extractor.config("api-key", self.API_KEY)
+ api_secret = extractor.config("api-secret", self.API_SECRET)
+ token = extractor.config("access-token")
+ token_secret = extractor.config("access-token-secret")
+
+ if api_key and api_secret and token and token_secret:
+ self.log.debug("Using OAuth1.0 authentication")
+ self.session = OAuth1Session(
+ api_key, api_secret, token, token_secret)
+ self.api_key = None
+ else:
+ self.log.debug("Using api_key authentication")
+ self.session = extractor.session
+ self.api_key = api_key
+
+ def request(self, url, method="GET", *, expect=range(400, 500), **kwargs):
+ kwargs["expect"] = expect
+ kwargs["session"] = self.session
+ return self.extractor.request(url, method, **kwargs)
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
new file mode 100644
index 0000000..f23b79d
--- /dev/null
+++ b/gallery_dl/option.py
@@ -0,0 +1,304 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Command line option parsing"""
+
+import argparse
+import logging
+import json
+from . import job, version
+
+
+class ConfigAction(argparse.Action):
+ """Set argparse results as config values"""
+ def __call__(self, parser, namespace, values, option_string=None):
+ namespace.options.append(((self.dest,), values))
+
+
+class ConfigConstAction(argparse.Action):
+ """Set argparse const values as config values"""
+ def __call__(self, parser, namespace, values, option_string=None):
+ namespace.options.append(((self.dest,), self.const))
+
+
+class ParseAction(argparse.Action):
+ """Parse <key>=<value> options and set them as config values"""
+ def __call__(self, parser, namespace, values, option_string=None):
+ key, _, value = values.partition("=")
+ try:
+ value = json.loads(value)
+ except ValueError:
+ pass
+ key = key.split(".")
+ namespace.options.append((key, value))
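+        # e.g. "-o extractor.pixiv.username=foo" appends
+        #      (["extractor", "pixiv", "username"], "foo") to the option list,
+        #      while "-o skip=false" stores the JSON-decoded value False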
+
+
+class Formatter(argparse.HelpFormatter):
+ """Custom HelpFormatter class to customize help output"""
+ def __init__(self, *args, **kwargs):
+ super().__init__(max_help_position=50, *args, **kwargs)
+
+ def _format_action_invocation(self, action):
+ opts = action.option_strings[:]
+ if opts:
+ if action.nargs != 0:
+ args_string = self._format_args(action, "ARG")
+ opts[-1] += " " + args_string
+ return ', '.join(opts)
+ else:
+ return self._metavar_formatter(action, action.dest)(1)[0]
+
+
+def build_parser():
+ """Build and configure an ArgumentParser object"""
+ parser = argparse.ArgumentParser(
+ usage="%(prog)s [OPTION]... URL...",
+ formatter_class=Formatter,
+ add_help=False,
+ )
+
+ general = parser.add_argument_group("General Options")
+ general.add_argument(
+ "-h", "--help",
+ action="help",
+ help="Print this help message and exit",
+ )
+ general.add_argument(
+ "--version",
+ action="version", version=version.__version__,
+ help="Print program version and exit",
+ )
+ general.add_argument(
+ "-d", "--dest",
+ dest="base-directory", metavar="DEST", action=ConfigAction,
+ help="Destination directory",
+ )
+ general.add_argument(
+ "-i", "--input-file",
+ dest="inputfile", metavar="FILE",
+ help="Download URLs found in FILE ('-' for stdin)",
+ )
+ general.add_argument(
+ "--cookies",
+ dest="cookies", metavar="FILE", action=ConfigAction,
+ help="File to load additional cookies from",
+ )
+ general.add_argument(
+ "--proxy",
+ dest="proxy", metavar="URL", action=ConfigAction,
+ help="Use the specified proxy",
+ )
+ general.add_argument(
+ "--clear-cache",
+ dest="clear_cache", action="store_true",
+ help="Delete all cached login sessions, cookies, etc.",
+ )
+
+ output = parser.add_argument_group("Output Options")
+ output.add_argument(
+ "-q", "--quiet",
+ dest="loglevel", default=logging.INFO,
+ action="store_const", const=logging.ERROR,
+ help="Activate quiet mode",
+ )
+ output.add_argument(
+ "-v", "--verbose",
+ dest="loglevel",
+ action="store_const", const=logging.DEBUG,
+ help="Print various debugging information",
+ )
+ output.add_argument(
+ "-g", "--get-urls",
+ dest="list_urls", action="count",
+ help="Print URLs instead of downloading",
+ )
+ output.add_argument(
+ "-j", "--dump-json",
+ dest="jobtype", action="store_const", const=job.DataJob,
+ help="Print JSON information",
+ )
+ output.add_argument(
+ "-s", "--simulate",
+ dest="jobtype", action="store_const", const=job.SimulationJob,
+ help="Simulate data extraction; do not download anything",
+ )
+ output.add_argument(
+ "-K", "--list-keywords",
+ dest="jobtype", action="store_const", const=job.KeywordJob,
+ help=("Print a list of available keywords and example values "
+ "for the given URLs"),
+ )
+ output.add_argument(
+ "--list-modules",
+ dest="list_modules", action="store_true",
+ help="Print a list of available extractor modules",
+ )
+ output.add_argument(
+ "--list-extractors",
+ dest="list_extractors", action="store_true",
+ help=("Print a list of extractor classes "
+ "with description, (sub)category and example URL"),
+ )
+ output.add_argument(
+ "--write-log",
+ dest="logfile", metavar="FILE", action=ConfigAction,
+ help="Write logging output to FILE",
+ )
+ output.add_argument(
+ "--write-unsupported",
+ dest="unsupportedfile", metavar="FILE", action=ConfigAction,
+ help=("Write URLs, which get emitted by other extractors but cannot "
+ "be handled, to FILE"),
+ )
+
+ downloader = parser.add_argument_group("Downloader Options")
+ downloader.add_argument(
+ "-r", "--limit-rate",
+ dest="rate", metavar="RATE", action=ConfigAction,
+ help="Maximum download rate (e.g. 500k or 2.5M)",
+ )
+ downloader.add_argument(
+ "-R", "--retries",
+ dest="retries", metavar="RETRIES", type=int, action=ConfigAction,
+ help="Number of retries (default: 5)",
+ )
+ downloader.add_argument(
+ "--http-timeout",
+ dest="timeout", metavar="SECONDS", type=float, action=ConfigAction,
+ help="Timeout for HTTP connections (defaut: 30.0)",
+ )
+ downloader.add_argument(
+ "--sleep",
+ dest="sleep", metavar="SECONDS", type=float, action=ConfigAction,
+ help="Number of seconds to sleep before each download",
+ )
+ downloader.add_argument(
+ "--no-part",
+ dest="part", nargs=0, action=ConfigConstAction, const=False,
+ help="Do not use .part files",
+ )
+ downloader.add_argument(
+ "--no-check-certificate",
+ dest="verify", nargs=0, action=ConfigConstAction, const=False,
+ help="Disable HTTPS certificate validation",
+ )
+ downloader.add_argument(
+ "--abort-on-skip",
+ dest="skip", nargs=0, action=ConfigConstAction, const="abort",
+ help=("Abort extractor run if a file download would normally be "
+ "skipped, i.e. if a file with the same filename already exists"),
+ )
+
+ configuration = parser.add_argument_group("Configuration Options")
+ configuration.add_argument(
+ "-c", "--config",
+ dest="cfgfiles", metavar="FILE", action="append",
+ help="Additional configuration files",
+ )
+ configuration.add_argument(
+ "--config-yaml",
+ dest="yamlfiles", metavar="FILE", action="append",
+ help=argparse.SUPPRESS,
+ )
+ configuration.add_argument(
+ "-o", "--option",
+ dest="options", metavar="OPT", action=ParseAction, default=[],
+ help="Additional '<key>=<value>' option values",
+ )
+ configuration.add_argument(
+ "--ignore-config",
+ dest="load_config", action="store_false",
+ help="Do not read the default configuration files",
+ )
+
+ authentication = parser.add_argument_group("Authentication Options")
+ authentication.add_argument(
+ "-u", "--username",
+ dest="username", metavar="USER", action=ConfigAction,
+ help="Username to login with",
+ )
+ authentication.add_argument(
+ "-p", "--password",
+ dest="password", metavar="PASS", action=ConfigAction,
+ help="Password belonging to the given username",
+ )
+ authentication.add_argument(
+ "--netrc",
+ dest="netrc", nargs=0, action=ConfigConstAction, const=True,
+ help="Enable .netrc authentication data",
+ )
+
+ selection = parser.add_argument_group("Selection Options")
+ selection.add_argument(
+ "--download-archive",
+ dest="archive", metavar="FILE", action=ConfigAction,
+ help=("Record all downloaded files in the archive file and "
+ "skip downloading any file already in it."),
+ )
+ selection.add_argument(
+ "--range",
+ dest="image-range", metavar="RANGE", action=ConfigAction,
+ help=("Index-range(s) specifying which images to download. "
+ "For example '5-10' or '1,3-5,10-'"),
+ )
+ selection.add_argument(
+ "--chapter-range",
+ dest="chapter-range", metavar="RANGE", action=ConfigAction,
+ help=("Like '--range', but applies to manga-chapters "
+ "and other delegated URLs"),
+ )
+ selection.add_argument(
+ "--filter",
+ dest="image-filter", metavar="EXPR", action=ConfigAction,
+ help=("Python expression controlling which images to download. "
+ "Files for which the expression evaluates to False are ignored. "
+ "Available keys are the filename-specific ones listed by '-K'. "
+ "Example: --filter \"image_width >= 1000 and "
+ "rating in ('s', 'q')\""),
+ )
+ selection.add_argument(
+ "--chapter-filter",
+ dest="chapter-filter", metavar="EXPR", action=ConfigAction,
+ help=("Like '--filter', but applies to manga-chapters "
+ "and other delegated URLs"),
+ )
+
+ postprocessor = parser.add_argument_group("Post-processing Options")
+ postprocessor.add_argument(
+ "--zip",
+ dest="postprocessors",
+ action="append_const", const={"name": "zip"},
+ help="Store downloaded files in a ZIP archive",
+ )
+ postprocessor.add_argument(
+ "--ugoira-conv",
+ dest="postprocessors",
+ action="append_const", const={"name": "ugoira", "ffmpeg-args": (
+ "-c:v", "libvpx", "-crf", "4", "-b:v", "5000k", "-an")},
+ help="Convert Pixiv Ugoira to WebM (requires FFmpeg)",
+ )
+ postprocessor.add_argument(
+ "--write-metadata",
+ dest="postprocessors",
+ action="append_const", const={"name": "metadata"},
+ help="Write metadata to separate JSON files",
+ )
+ postprocessor.add_argument(
+ "--write-tags",
+ dest="postprocessors",
+ action="append_const", const={"name": "metadata", "mode": "tags"},
+ help="Write image tags to separate text files",
+ )
+
+ parser.add_argument(
+ "urls",
+ metavar="URL", nargs="*",
+ help=argparse.SUPPRESS,
+ )
+
+ return parser
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
new file mode 100644
index 0000000..327b69a
--- /dev/null
+++ b/gallery_dl/output.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import os
+import sys
+import shutil
+import logging
+from . import config, util
+
+
+# --------------------------------------------------------------------
+# Logging
+
+LOG_FORMAT = "[{name}][{levelname}] {message}"
+LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S"
+LOG_LEVEL = logging.INFO
+
+
+class Logger(logging.Logger):
+ """Custom logger that includes extractor and job info in log records"""
+ extractor = util.NONE
+ job = util.NONE
+
+ def makeRecord(self, name, level, fn, lno, msg, args, exc_info,
+ func=None, extra=None, sinfo=None,
+ factory=logging._logRecordFactory):
+ rv = factory(name, level, fn, lno, msg, args, exc_info, func, sinfo)
+ rv.extractor = self.extractor
+ rv.job = self.job
+ return rv
+
+
+def initialize_logging(loglevel):
+ """Setup basic logging functionality before configfiles have been loaded"""
+ # convert levelnames to lowercase
+ for level in (10, 20, 30, 40, 50):
+ name = logging.getLevelName(level)
+ logging.addLevelName(level, name.lower())
+
+ # register custom Logging class
+ logging.Logger.manager.setLoggerClass(Logger)
+
+ # setup basic logging to stderr
+ formatter = logging.Formatter(LOG_FORMAT, LOG_FORMAT_DATE, "{")
+ handler = logging.StreamHandler()
+ handler.setFormatter(formatter)
+ handler.setLevel(loglevel)
+ root = logging.getLogger()
+ root.setLevel(logging.NOTSET)
+ root.addHandler(handler)
+
+ return logging.getLogger("gallery-dl")
+
+
+def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
+ """Setup a new logging handler"""
+ opts = config.interpolate(("output", key))
+ if not opts:
+ return None
+ if not isinstance(opts, dict):
+ opts = {"path": opts}
+
+ path = opts.get("path")
+ mode = opts.get("mode", "w")
+ encoding = opts.get("encoding", "utf-8")
+ try:
+ path = util.expand_path(path)
+ handler = logging.FileHandler(path, mode, encoding)
+ except (OSError, ValueError) as exc:
+ logging.getLogger("gallery-dl").warning(
+ "%s: %s", key, exc)
+ return None
+ except TypeError as exc:
+ logging.getLogger("gallery-dl").warning(
+ "%s: missing or invalid path (%s)", key, exc)
+ return None
+
+ level = opts.get("level", lvl)
+ logfmt = opts.get("format", fmt)
+ datefmt = opts.get("format-date", LOG_FORMAT_DATE)
+ formatter = logging.Formatter(logfmt, datefmt, "{")
+ handler.setFormatter(formatter)
+ handler.setLevel(level)
+
+ return handler
+
+
+def configure_logging_handler(key, handler):
+ """Configure a logging handler"""
+ opts = config.interpolate(("output", key))
+ if not opts:
+ return
+ if isinstance(opts, str):
+ opts = {"format": opts}
+ if handler.level == LOG_LEVEL and "level" in opts:
+ handler.setLevel(opts["level"])
+ if "format" in opts or "format-date" in opts:
+ logfmt = opts.get("format", LOG_FORMAT)
+ datefmt = opts.get("format-date", LOG_FORMAT_DATE)
+ formatter = logging.Formatter(logfmt, datefmt, "{")
+ handler.setFormatter(formatter)
+
+
+# --------------------------------------------------------------------
+# Utility functions
+
+def replace_std_streams(errors="replace"):
+ """Replace standard streams and set their error handlers to 'errors'"""
+ for name in ("stdout", "stdin", "stderr"):
+ stream = getattr(sys, name)
+ setattr(sys, name, stream.__class__(
+ stream.buffer,
+ errors=errors,
+ newline=stream.newlines,
+ line_buffering=stream.line_buffering,
+ ))
+
+
+# --------------------------------------------------------------------
+# Downloader output
+
+def select():
+ """Automatically select a suitable output class"""
+ pdict = {
+ "default": PipeOutput,
+ "pipe": PipeOutput,
+ "term": TerminalOutput,
+ "terminal": TerminalOutput,
+ "color": ColorOutput,
+ "null": NullOutput,
+ }
+ omode = config.get(("output", "mode"), "auto").lower()
+ if omode in pdict:
+ return pdict[omode]()
+ elif omode == "auto":
+ if hasattr(sys.stdout, "isatty") and sys.stdout.isatty():
+ return ColorOutput() if ANSI else TerminalOutput()
+ else:
+ return PipeOutput()
+ else:
+ raise Exception("invalid output mode: " + omode)
+
+
+class NullOutput():
+
+ def start(self, path):
+ """Print a message indicating the start of a download"""
+
+ def skip(self, path):
+ """Print a message indicating that a download has been skipped"""
+
+ def success(self, path, tries):
+ """Print a message indicating the completion of a download"""
+
+
+class PipeOutput(NullOutput):
+
+ def skip(self, path):
+ print(CHAR_SKIP, path, sep="", flush=True)
+
+ def success(self, path, tries):
+ print(path, flush=True)
+
+
+class TerminalOutput(NullOutput):
+
+ def __init__(self):
+ self.short = config.get(("output", "shorten"), True)
+ if self.short:
+ self.width = shutil.get_terminal_size().columns - OFFSET
+
+ def start(self, path):
+ print(self.shorten(" " + path), end="", flush=True)
+
+ def skip(self, path):
+ print(self.shorten(CHAR_SKIP + path))
+
+ def success(self, path, tries):
+ print("\r", self.shorten(CHAR_SUCCESS + path), sep="")
+
+ def shorten(self, txt):
+ """Reduce the length of 'txt' to the width of the terminal"""
+ if self.short and len(txt) > self.width:
+ hwidth = self.width // 2 - OFFSET
+ return "".join((
+ txt[:hwidth-1],
+ CHAR_ELLIPSIES,
+ txt[-hwidth-(self.width % 2):]
+ ))
+ return txt
+
+
+class ColorOutput(TerminalOutput):
+
+ def start(self, path):
+ print(self.shorten(path), end="", flush=True)
+
+ def skip(self, path):
+ print("\033[2m", self.shorten(path), "\033[0m", sep="")
+
+ def success(self, path, tries):
+ print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")
+
+
+if os.name == "nt":
+ ANSI = os.environ.get("TERM") == "ANSI"
+ OFFSET = 1
+ CHAR_SKIP = "# "
+ CHAR_SUCCESS = "* "
+ CHAR_ELLIPSIES = "..."
+else:
+ ANSI = True
+ OFFSET = 0
+ CHAR_SKIP = "# "
+ CHAR_SUCCESS = "✔ "
+ CHAR_ELLIPSIES = "…"
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
new file mode 100644
index 0000000..093f8e0
--- /dev/null
+++ b/gallery_dl/postprocessor/__init__.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Post-processing modules"""
+
+import importlib
+import logging
+
+modules = [
+ "classify",
+ "exec",
+ "metadata",
+ "ugoira",
+ "zip",
+]
+
+log = logging.getLogger("postprocessor")
+
+
+def find(name):
+ """Return a postprocessor class with the given name"""
+ try:
+ return _cache[name]
+ except KeyError:
+ klass = None
+ try:
+ if name in modules: # prevent unwanted imports
+ module = importlib.import_module("." + name, __package__)
+ klass = module.__postprocessor__
+ except (ImportError, AttributeError, TypeError):
+ pass
+ _cache[name] = klass
+ return klass
+
+
+# --------------------------------------------------------------------
+# internals
+
+_cache = {}
diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py
new file mode 100644
index 0000000..62460d3
--- /dev/null
+++ b/gallery_dl/postprocessor/classify.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Categorize files by file extension"""
+
+from .common import PostProcessor
+import os
+
+
+class ClassifyPP(PostProcessor):
+
+ DEFAULT_MAPPING = {
+ "Music" : ("mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"),
+ "Video" : ("flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv",
+ "webm", "vob", "wmv"),
+ "Pictures" : ("jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"),
+ "Archives" : ("zip", "rar", "7z", "tar", "gz", "bz2"),
+ }
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+ mapping = options.get("mapping", self.DEFAULT_MAPPING)
+
+ self.mapping = {
+ ext: directory
+ for directory, exts in mapping.items()
+ for ext in exts
+ }
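+        # inverts the mapping, e.g. {"Music": ("mp3",), "Pictures": ("jpg",)}
+        # becomes {"mp3": "Music", "jpg": "Pictures"}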
+
+ def prepare(self, pathfmt):
+ ext = pathfmt.keywords.get("extension")
+
+ if ext in self.mapping:
+ self._dir = pathfmt.realdirectory + os.sep + self.mapping[ext]
+ pathfmt.realpath = self._dir + os.sep + pathfmt.filename
+ else:
+ self._dir = None
+
+ def run(self, pathfmt):
+ if self._dir:
+ os.makedirs(self._dir, exist_ok=True)
+
+
+__postprocessor__ = ClassifyPP
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
new file mode 100644
index 0000000..c642f0f
--- /dev/null
+++ b/gallery_dl/postprocessor/common.py
@@ -0,0 +1,25 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Common classes and constants used by postprocessor modules."""
+
+from . import log
+
+
+class PostProcessor():
+ """Base class for postprocessors"""
+ log = log
+
+ def prepare(self, pathfmt):
+ """ """
+
+ def run(self, pathfmt):
+ """Execute the postprocessor for a file"""
+
+ def finalize(self):
+ """Cleanup"""
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
new file mode 100644
index 0000000..c86b480
--- /dev/null
+++ b/gallery_dl/postprocessor/exec.py
@@ -0,0 +1,43 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Execute processes"""
+
+from .common import PostProcessor
+import subprocess
+
+
+class ExecPP(PostProcessor):
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+
+ try:
+ self.args = options["command"]
+ self.args[0] # test if 'args' is subscriptable
+ except (KeyError, IndexError, TypeError):
+ raise TypeError("option 'command' must be a non-empty list")
+
+ if options.get("async", False):
+ self._exec = subprocess.Popen
+
+ def run(self, pathfmt):
+ self._exec([
+ arg.format_map(pathfmt.keywords)
+ for arg in self.args
+ ])
+
+ def _exec(self, args):
+ retcode = subprocess.Popen(args).wait()
+ if retcode:
+ self.log.warning(
+ "executing '%s' returned non-zero exit status %d",
+ " ".join(args), retcode)
+
+
+__postprocessor__ = ExecPP
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
new file mode 100644
index 0000000..77be9c7
--- /dev/null
+++ b/gallery_dl/postprocessor/metadata.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Write metadata to JSON files"""
+
+from .common import PostProcessor
+from .. import util
+
+
+class MetadataPP(PostProcessor):
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+
+ mode = options.get("mode", "json")
+ ext = "txt"
+
+ if mode == "custom":
+ self.write = self._write_custom
+ self.formatter = util.Formatter(options.get("format"))
+ elif mode == "tags":
+ self.write = self._write_tags
+ else:
+ self.write = self._write_json
+ self.indent = options.get("indent", 4)
+ self.ascii = options.get("ascii", False)
+ ext = "json"
+
+ self.extension = options.get("extension", ext)
+
+ def run(self, pathfmt):
+ path = "{}.{}".format(pathfmt.realpath, self.extension)
+ with open(path, "w", encoding="utf-8") as file:
+ self.write(file, pathfmt)
+
+ def _write_custom(self, file, pathfmt):
+ output = self.formatter.format_map(pathfmt.keywords)
+ file.write(output)
+
+ def _write_tags(self, file, pathfmt):
+ kwds = pathfmt.keywords
+ tags = kwds.get("tags") or kwds.get("tag_string")
+
+ if not tags:
+ return
+
+ if not isinstance(tags, list):
+ taglist = tags.split(", ")
+ if len(taglist) < len(tags) / 16:
+ taglist = tags.split(" ")
+ tags = taglist
+
+ file.write("\n".join(tags))
+ file.write("\n")
+
+ def _write_json(self, file, pathfmt):
+ util.dump_json(pathfmt.keywords, file, self.ascii, self.indent)
+
+
+__postprocessor__ = MetadataPP
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
new file mode 100644
index 0000000..bd8c5ad
--- /dev/null
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Convert pixiv ugoira to webm"""
+
+from .common import PostProcessor
+from .. import util
+import collections
+import subprocess
+import tempfile
+import zipfile
+import os
+
+
+class UgoiraPP(PostProcessor):
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+ self.extension = options.get("extension") or "webm"
+ self.args = options.get("ffmpeg-args") or ()
+ self.twopass = options.get("ffmpeg-twopass", False)
+ self.output = options.get("ffmpeg-output", True)
+ self.delete = not options.get("keep-files", False)
+
+ ffmpeg = options.get("ffmpeg-location")
+ self.ffmpeg = util.expand_path(ffmpeg) if ffmpeg else "ffmpeg"
+
+ rate = options.get("framerate", "auto")
+ if rate != "auto":
+ self.calculate_framerate = lambda _: (None, rate)
+
+ if options.get("libx264-prevent-odd", True):
+ # get last video-codec argument
+ vcodec = None
+ for index, arg in enumerate(self.args):
+ arg, _, stream = arg.partition(":")
+ if arg == "-vcodec" or arg in ("-c", "-codec") and (
+ not stream or stream.partition(":")[0] in ("v", "V")):
+ vcodec = self.args[index + 1]
+ # use filter if libx264/5 is explicitly or implicitly used
+ self.prevent_odd = (
+ vcodec in ("libx264", "libx265") or
+ not vcodec and self.extension.lower() in ("mp4", "mkv"))
+ else:
+ self.prevent_odd = False
+
+ def prepare(self, pathfmt):
+ self._frames = None
+
+ if pathfmt.keywords["extension"] != "zip":
+ return
+
+ if "frames" in pathfmt.keywords:
+ self._frames = pathfmt.keywords["frames"]
+ elif "pixiv_ugoira_frame_data" in pathfmt.keywords:
+ self._frames = pathfmt.keywords["pixiv_ugoira_frame_data"]["data"]
+ else:
+ return
+
+ if self.delete:
+ pathfmt.set_extension(self.extension)
+
+ def run(self, pathfmt):
+ if not self._frames:
+ return
+
+ rate_in, rate_out = self.calculate_framerate(self._frames)
+
+ with tempfile.TemporaryDirectory() as tempdir:
+ # extract frames
+ with zipfile.ZipFile(pathfmt.temppath) as zfile:
+ zfile.extractall(tempdir)
+
+ # write ffconcat file
+ ffconcat = tempdir + "/ffconcat.txt"
+ with open(ffconcat, "w") as file:
+ file.write("ffconcat version 1.0\n")
+ for frame in self._frames:
+ file.write("file '{}'\n".format(frame["file"]))
+ file.write("duration {}\n".format(frame["delay"] / 1000))
+ if self.extension != "gif":
+ # repeat the last frame to prevent it from only being
+ # displayed for a very short amount of time
+ file.write("file '{}'\n".format(self._frames[-1]["file"]))
+
+ # collect command-line arguments
+ args = [self.ffmpeg]
+ if rate_in:
+ args += ["-r", str(rate_in)]
+ args += ["-i", ffconcat]
+ if rate_out:
+ args += ["-r", str(rate_out)]
+ if self.prevent_odd:
+ args += ["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"]
+ if self.args:
+ args += self.args
+ self.log.debug("ffmpeg args: %s", args)
+
+ # invoke ffmpeg
+ pathfmt.set_extension(self.extension)
+ if self.twopass:
+ if "-f" not in args:
+ args += ["-f", self.extension]
+ args += ["-passlogfile", tempdir + "/ffmpeg2pass", "-pass"]
+ self._exec(args + ["1", "-y", os.devnull])
+ self._exec(args + ["2", pathfmt.realpath])
+ else:
+ args.append(pathfmt.realpath)
+ self._exec(args)
+
+ if self.delete:
+ pathfmt.delete = True
+ else:
+ pathfmt.set_extension("zip")
+
+ def _exec(self, args):
+ out = None if self.output else subprocess.DEVNULL
+ return subprocess.Popen(args, stdout=out, stderr=out).wait()
+
+ @staticmethod
+ def calculate_framerate(framelist):
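+        # uniform frame delays -> use them as the input framerate;
+        # otherwise rely on the per-frame durations in the ffconcat file
+        # and only set the output framerate to 1000/<shortest delay>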
+ counter = collections.Counter(frame["delay"] for frame in framelist)
+ fps = "1000/{}".format(min(counter))
+ return (fps, None) if len(counter) == 1 else (None, fps)
+
+
+__postprocessor__ = UgoiraPP
diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py
new file mode 100644
index 0000000..3a0c323
--- /dev/null
+++ b/gallery_dl/postprocessor/zip.py
@@ -0,0 +1,65 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Store files in ZIP archives"""
+
+from .common import PostProcessor
+import zipfile
+import os
+
+
+class ZipPP(PostProcessor):
+
+ COMPRESSION_ALGORITHMS = {
+ "store": zipfile.ZIP_STORED,
+ "zip": zipfile.ZIP_DEFLATED,
+ "bzip2": zipfile.ZIP_BZIP2,
+ "lzma": zipfile.ZIP_LZMA,
+ }
+
+ def __init__(self, pathfmt, options):
+ PostProcessor.__init__(self)
+ self.delete = not options.get("keep-files", False)
+ self.ext = "." + options.get("extension", "zip")
+ algorithm = options.get("compression", "store")
+ if algorithm not in self.COMPRESSION_ALGORITHMS:
+ self.log.warning(
+ "unknown compression algorithm '%s'; falling back to 'store'",
+ algorithm)
+ algorithm = "store"
+
+ self.path = pathfmt.realdirectory
+ self.zfile = zipfile.ZipFile(
+ self.path + self.ext, "a",
+ self.COMPRESSION_ALGORITHMS[algorithm], True)
+
+ def run(self, pathfmt):
+ # 'NameToInfo' is not officially documented, but it's available
+ # for all supported Python versions and using it directly is a lot
+ # better than calling getinfo()
+ if pathfmt.filename not in self.zfile.NameToInfo:
+ self.zfile.write(pathfmt.temppath, pathfmt.filename)
+ pathfmt.delete = self.delete
+
+ def finalize(self):
+ self.zfile.close()
+
+ if self.delete:
+ try:
+ os.rmdir(self.path)
+ except OSError:
+ pass
+
+ if not self.zfile.NameToInfo:
+ try:
+ os.unlink(self.zfile.filename)
+ except OSError:
+ pass
+
+
+__postprocessor__ = ZipPP
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
new file mode 100644
index 0000000..151fa30
--- /dev/null
+++ b/gallery_dl/text.py
@@ -0,0 +1,278 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of functions that work on strings/text"""
+
+import re
+import html
+import os.path
+import datetime
+import urllib.parse
+
+
+INVALID_XML_CHARS = (
+ "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
+ "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
+ "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a",
+ "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
+)
+
+
+def clean_xml(xmldata, repl=""):
+ """Replace/Remove invalid control characters in 'xmldata'"""
+ if not isinstance(xmldata, str):
+ try:
+ xmldata = "".join(xmldata)
+ except TypeError:
+ return ""
+ for char in INVALID_XML_CHARS:
+ if char in xmldata:
+ xmldata = xmldata.replace(char, repl)
+ return xmldata
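+# e.g. clean_xml("foo\x08bar") == "foobar"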
+
+
+def remove_html(txt):
+ """Remove html-tags from a string"""
+ try:
+ return " ".join(re.sub("<[^>]+>", " ", txt).split())
+ except TypeError:
+ return ""
+
+
+def split_html(txt, sep=None):
+ """Split input string by html-tags"""
+ try:
+ return [
+ x.strip() for x in re.split("<[^>]+>", txt)
+ if x and not x.isspace()
+ ]
+ except TypeError:
+ return []
+
+
+def filename_from_url(url):
+ """Extract the last part of an URL to use as a filename"""
+ try:
+ return urllib.parse.urlsplit(url).path.rpartition("/")[2]
+ except (TypeError, AttributeError):
+ return ""
+
+
+def ext_from_url(url):
+ """Extract the filename extension of an URL"""
+ filename = filename_from_url(url)
+ ext = os.path.splitext(filename)[1]
+ return ext[1:].lower()
+
+
+def nameext_from_url(url, data=None):
+ """Extract the last part of an URL and fill 'data' accordingly"""
+ if data is None:
+ data = {}
+ name = unquote(filename_from_url(url))
+ data["filename"], ext = os.path.splitext(name)
+ data["extension"] = ext[1:].lower()
+ return data
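+# e.g. nameext_from_url("https://example.org/path/image.JPG")
+#      == {"filename": "image", "extension": "jpg"}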
+
+
+def clean_path_windows(path):
+ """Remove illegal characters from a path-segment (Windows)"""
+ try:
+ return re.sub(r'[<>:"\\/|?*]', "_", path)
+ except TypeError:
+ return ""
+
+
+def clean_path_posix(path):
+ """Remove illegal characters from a path-segment (Posix)"""
+ try:
+ return path.replace("/", "_")
+ except AttributeError:
+ return ""
+
+
+def extract(txt, begin, end, pos=0):
+ """Extract the text between 'begin' and 'end' from 'txt'
+
+ Args:
+ txt: String to search in
+ begin: First string to be searched for
+ end: Second string to be searched for after 'begin'
+ pos: Starting position for searches in 'txt'
+
+ Returns:
+ The string between the two search-strings 'begin' and 'end' beginning
+ with position 'pos' in 'txt' as well as the position after 'end'.
+
+ If at least one of 'begin' or 'end' is not found, None and the original
+ value of 'pos' is returned
+
+ Examples:
+ extract("abcde", "b", "d") -> "c" , 4
+ extract("abcde", "b", "d", 3) -> None, 3
+ """
+ try:
+ first = txt.index(begin, pos) + len(begin)
+ last = txt.index(end, first)
+ return txt[first:last], last+len(end)
+ except (ValueError, TypeError, AttributeError):
+ return None, pos
+
+
+def rextract(txt, begin, end, pos=-1):
+ try:
+ lbeg = len(begin)
+ first = txt.rindex(begin, 0, pos)
+ last = txt.index(end, first + lbeg)
+ return txt[first + lbeg:last], first
+ except (ValueError, TypeError, AttributeError):
+ return None, pos
+
+
+def extract_all(txt, rules, pos=0, values=None):
+ """Calls extract for each rule and returns the result in a dict"""
+ if values is None:
+ values = {}
+ for key, begin, end in rules:
+ result, pos = extract(txt, begin, end, pos)
+ if key:
+ values[key] = result
+ return values, pos
+
+
+def extract_iter(txt, begin, end, pos=0):
+ """Yield values that would be returned by repeated calls of extract()"""
+ index = txt.index
+ lbeg = len(begin)
+ lend = len(end)
+ try:
+ while True:
+ first = index(begin, pos) + lbeg
+ last = index(end, first)
+ pos = last + lend
+ yield txt[first:last]
+ except (ValueError, TypeError, AttributeError):
+ return
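+# e.g. list(extract_iter("<a>1</a><a>2</a>", "<a>", "</a>")) == ["1", "2"]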
+
+
+def extract_from(txt, pos=0, default=""):
+ """Returns a function object that extracts from 'txt'"""
+ def extr(begin, end, index=txt.index, txt=txt):
+ nonlocal pos
+ try:
+ first = index(begin, pos) + len(begin)
+ last = index(end, first)
+ pos = last + len(end)
+ return txt[first:last]
+ except (ValueError, TypeError, AttributeError):
+ return default
+ return extr
+
+
+def parse_unicode_escapes(txt):
+ """Convert JSON Unicode escapes in 'txt' into actual characters"""
+ if "\\u" in txt:
+ return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+ return txt
+
+
+def _hex_to_char(match):
+ return chr(int(match.group(1), 16))
+
+
+def parse_bytes(value, default=0, suffixes="bkmgtp"):
+ """Convert a bytes-amount ("500k", "2.5M", ...) to int"""
+ try:
+ last = value[-1].lower()
+ except (TypeError, KeyError, IndexError):
+ return default
+
+ if last in suffixes:
+ mul = 1024 ** suffixes.index(last)
+ value = value[:-1]
+ else:
+ mul = 1
+
+ try:
+ return round(float(value) * mul)
+ except ValueError:
+ return default
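+# e.g. parse_bytes("500k") == 512000, parse_bytes("2.5M") == 2621440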
+
+
+def parse_int(value, default=0):
+ """Convert 'value' to int"""
+ if not value:
+ return default
+ try:
+ return int(value)
+ except (ValueError, TypeError):
+ return default
+
+
+def parse_float(value, default=0.0):
+ """Convert 'value' to float"""
+ if not value:
+ return default
+ try:
+ return float(value)
+ except (ValueError, TypeError):
+ return default
+
+
+def parse_query(qs):
+ """Parse a query string into key-value pairs"""
+ result = {}
+ try:
+ for key, value in urllib.parse.parse_qsl(qs):
+ if key not in result:
+ result[key] = value
+ except AttributeError:
+ pass
+ return result
+
+
+def parse_timestamp(ts, default=None):
+ """Create a datetime object from a unix timestamp"""
+ try:
+ return datetime.datetime.utcfromtimestamp(int(ts))
+ except (TypeError, ValueError, OverflowError):
+ return default
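+# e.g. parse_timestamp("1262304000") == datetime.datetime(2010, 1, 1, 0, 0)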
+
+
+def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z"):
+ """Create a datetime object by parsing 'date_string'"""
+ try:
+ if format.endswith("%z") and date_string[-3] == ":":
+ # workaround for Python < 3.7: +00:00 -> +0000
+ ds = date_string[:-3] + date_string[-2:]
+ else:
+ ds = date_string
+ d = datetime.datetime.strptime(ds, format)
+ o = d.utcoffset()
+ if o is not None:
+ d = d.replace(tzinfo=None) - o # convert to naive UTC
+ return d
+ except (TypeError, IndexError, KeyError):
+ return None
+ except (ValueError, OverflowError):
+ return date_string
+
+
+if os.name == "nt":
+ clean_path = clean_path_windows
+else:
+ clean_path = clean_path_posix
+
+
+urljoin = urllib.parse.urljoin
+
+quote = urllib.parse.quote
+unquote = urllib.parse.unquote
+
+escape = html.escape
+unescape = html.unescape
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
new file mode 100644
index 0000000..5c0ae41
--- /dev/null
+++ b/gallery_dl/util.py
@@ -0,0 +1,673 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2017-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Utility functions and classes"""
+
+import re
+import os
+import sys
+import json
+import shutil
+import string
+import _string
+import sqlite3
+import datetime
+import operator
+import itertools
+import urllib.parse
+from . import text, exception
+
+
+def bencode(num, alphabet="0123456789"):
+ """Encode an integer into a base-N encoded string"""
+ data = ""
+ base = len(alphabet)
+ while num:
+ num, remainder = divmod(num, base)
+ data = alphabet[remainder] + data
+ return data
+
+
+def bdecode(data, alphabet="0123456789"):
+ """Decode a base-N encoded string ( N = len(alphabet) )"""
+ num = 0
+ base = len(alphabet)
+ for c in data:
+ num *= base
+ num += alphabet.index(c)
+ return num
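+# e.g. bencode(255, "0123456789abcdef") == "ff"
+#      bdecode("ff", "0123456789abcdef") == 255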
+
+
+def advance(iterable, num):
+ """"Advance the iterable by 'num' steps"""
+ iterator = iter(iterable)
+ next(itertools.islice(iterator, num, num), None)
+ return iterator
+
+
+def raises(obj):
+ """Returns a function that raises 'obj' as exception"""
+ def wrap():
+ raise obj
+ return wrap
+
+
+def combine_dict(a, b):
+ """Recursively combine the contents of 'b' into 'a'"""
+ for key, value in b.items():
+ if key in a and isinstance(value, dict) and isinstance(a[key], dict):
+ combine_dict(a[key], value)
+ else:
+ a[key] = value
+ return a
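+# e.g. combine_dict({"a": {"x": 1}}, {"a": {"y": 2}})
+#      == {"a": {"x": 1, "y": 2}}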
+
+
+def transform_dict(a, func):
+ """Recursively apply 'func' to all values in 'a'"""
+ for key, value in a.items():
+ if isinstance(value, dict):
+ transform_dict(value, func)
+ else:
+ a[key] = func(value)
+
+
+def number_to_string(value, numbers=(int, float)):
+ """Convert numbers (int, float) to string; Return everything else as is."""
+ return str(value) if value.__class__ in numbers else value
+
+
+def to_string(value):
+ """str() with "better" defaults"""
+ if not value:
+ return ""
+ if value.__class__ is list:
+ try:
+ return ", ".join(value)
+ except Exception:
+ return ", ".join(map(str, value))
+ return str(value)
+
+
+def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4):
+ """Serialize 'obj' as JSON and write it to 'fp'"""
+ json.dump(
+ obj, fp,
+ ensure_ascii=ensure_ascii,
+ indent=indent,
+ default=str,
+ sort_keys=True,
+ )
+ fp.write("\n")
+
+
+def expand_path(path):
+ """Expand environment variables and tildes (~)"""
+ if not path:
+ return path
+ if not isinstance(path, str):
+ path = os.path.join(*path)
+ return os.path.expandvars(os.path.expanduser(path))
+
+
+def code_to_language(code, default=None):
+ """Map an ISO 639-1 language code to its actual name"""
+ return CODES.get((code or "").lower(), default)
+
+
+def language_to_code(lang, default=None):
+ """Map a language name to its ISO 639-1 code"""
+ if lang is None:
+ return default
+ lang = lang.capitalize()
+ for code, language in CODES.items():
+ if language == lang:
+ return code
+ return default
+
+
+CODES = {
+ "ar": "Arabic",
+ "bg": "Bulgarian",
+ "ca": "Catalan",
+ "cs": "Czech",
+ "da": "Danish",
+ "de": "German",
+ "el": "Greek",
+ "en": "English",
+ "es": "Spanish",
+ "fi": "Finnish",
+ "fr": "French",
+ "he": "Hebrew",
+ "hu": "Hungarian",
+ "id": "Indonesian",
+ "it": "Italian",
+ "jp": "Japanese",
+ "ko": "Korean",
+ "ms": "Malay",
+ "nl": "Dutch",
+ "no": "Norwegian",
+ "pl": "Polish",
+ "pt": "Portuguese",
+ "ro": "Romanian",
+ "ru": "Russian",
+ "sv": "Swedish",
+ "th": "Thai",
+ "tr": "Turkish",
+ "vi": "Vietnamese",
+ "zh": "Chinese",
+}
+
+SPECIAL_EXTRACTORS = {"oauth", "recursive", "test"}
+
+
+class UniversalNone():
+ """None-style object that supports more operations than None itself"""
+ __slots__ = ()
+
+ def __getattribute__(self, _):
+ return self
+
+ def __getitem__(self, _):
+ return self
+
+ @staticmethod
+ def __bool__():
+ return False
+
+ @staticmethod
+ def __str__():
+ return "None"
+
+ __repr__ = __str__
+
+
+NONE = UniversalNone()
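+# NONE can be dereferenced and indexed arbitrarily without raising,
+# e.g. NONE.extractor["url"] is NONE, bool(NONE) is False, str(NONE) == "None"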
+
+
+def build_predicate(predicates):
+ if not predicates:
+ return lambda url, kwds: True
+ elif len(predicates) == 1:
+ return predicates[0]
+ else:
+ return ChainPredicate(predicates)
+
+
+class RangePredicate():
+ """Predicate; True if the current index is in the given range"""
+ def __init__(self, rangespec):
+ self.ranges = self.optimize_range(self.parse_range(rangespec))
+ self.index = 0
+
+ if self.ranges:
+ self.lower, self.upper = self.ranges[0][0], self.ranges[-1][1]
+ else:
+ self.lower, self.upper = 0, 0
+
+ def __call__(self, url, kwds):
+ self.index += 1
+
+ if self.index > self.upper:
+ raise exception.StopExtraction()
+
+ for lower, upper in self.ranges:
+ if lower <= self.index <= upper:
+ return True
+ return False
+
+ @staticmethod
+ def parse_range(rangespec):
+ """Parse an integer range string and return the resulting ranges
+
+ Examples:
+ parse_range("-2,4,6-8,10-") -> [(1,2), (4,4), (6,8), (10,INTMAX)]
+ parse_range(" - 3 , 4- 4, 2-6") -> [(1,3), (4,4), (2,6)]
+ """
+ ranges = []
+
+ for group in rangespec.split(","):
+ if not group:
+ continue
+ first, sep, last = group.partition("-")
+ if not sep:
+ beg = end = int(first)
+ else:
+ beg = int(first) if first.strip() else 1
+ end = int(last) if last.strip() else sys.maxsize
+ ranges.append((beg, end) if beg <= end else (end, beg))
+
+ return ranges
+
+ @staticmethod
+ def optimize_range(ranges):
+ """Simplify/Combine a parsed list of ranges
+
+ Examples:
+ optimize_range([(2,4), (4,6), (5,8)]) -> [(2,8)]
+ optimize_range([(1,1), (2,2), (3,6), (8,9)]) -> [(1,6), (8,9)]
+ """
+ if len(ranges) <= 1:
+ return ranges
+
+ ranges.sort()
+ riter = iter(ranges)
+ result = []
+
+ beg, end = next(riter)
+ for lower, upper in riter:
+ if lower > end+1:
+ result.append((beg, end))
+ beg, end = lower, upper
+ elif upper > end:
+ end = upper
+ result.append((beg, end))
+ return result
+
+
+class UniquePredicate():
+ """Predicate; True if given URL has not been encountered before"""
+ def __init__(self):
+ self.urls = set()
+
+ def __call__(self, url, kwds):
+ if url.startswith("text:"):
+ return True
+ if url not in self.urls:
+ self.urls.add(url)
+ return True
+ return False
+
+
+class FilterPredicate():
+ """Predicate; True if evaluating the given expression returns True"""
+ globalsdict = {
+ "parse_int": text.parse_int,
+ "urlsplit": urllib.parse.urlsplit,
+ "datetime": datetime.datetime,
+ "abort": raises(exception.StopExtraction()),
+ "re": re,
+ }
+
+ def __init__(self, filterexpr, target="image"):
+ name = "<{} filter>".format(target)
+ self.codeobj = compile(filterexpr, name, "eval")
+
+ def __call__(self, url, kwds):
+ try:
+ return eval(self.codeobj, self.globalsdict, kwds)
+ except exception.GalleryDLException:
+ raise
+ except Exception as exc:
+ raise exception.FilterError(exc)
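+
+# Illustrative example (URL and metadata values assumed):
+#   pred = FilterPredicate("width >= 1000 and extension == 'png'")
+#   pred("https://example.org/a.png", {"width": 1200, "extension": "png"})
+#   -> True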
+
+
+class ChainPredicate():
+ """Predicate; True if all of its predicates return True"""
+ def __init__(self, predicates):
+ self.predicates = predicates
+
+ def __call__(self, url, kwds):
+ for pred in self.predicates:
+ if not pred(url, kwds):
+ return False
+ return True
+
+
+class ExtendedUrl():
+ """URL with attached config key-value pairs"""
+ def __init__(self, url, gconf, lconf):
+ self.value, self.gconfig, self.lconfig = url, gconf, lconf
+
+ def __str__(self):
+ return self.value
+
+
+class Formatter():
+ """Custom, extended version of string.Formatter
+
+ This string formatter implementation is a mostly performance-optimized
+ variant of the original string.Formatter class. Unnecessary features have
+ been removed (positional arguments, unused argument check) and new
+ formatting options have been added.
+
+ Extra Conversions:
+ - "l": calls str.lower on the target value
+ - "u": calls str.upper
+ - "c": calls str.capitalize
+ - "C": calls string.capwords
+ - "U": calls urllib.parse.unquote
+ - "S": calls util.to_string()
+ - Example: {f!l} -> "example"; {f!u} -> "EXAMPLE"
+
+ Extra Format Specifiers:
+ - "?<before>/<after>/":
+ Adds <before> and <after> to the actual value if it evaluates to True.
+ Otherwise the whole replacement field becomes an empty string.
+ Example: {f:?-+/+-/} -> "-+Example+-" (if "f" contains "Example")
+ -> "" (if "f" is None, 0, "")
+
+ - "L<maxlen>/<replacement>/":
+ Replaces the output with <replacement> if its length (in characters)
+ exceeds <maxlen>. Otherwise everything is left as is.
+ Example: {f:L5/too long/} -> "foo" (if "f" is "foo")
+ -> "too long" (if "f" is "foobar")
+
+ - "J<separator>/":
+ Joins elements of a list (or string) using <separator>
+ Example: {f:J - /} -> "a - b - c" (if "f" is ["a", "b", "c"])
+
+ - "R<old>/<new>/":
+ Replaces all occurrences of <old> with <new>
+ Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r")
+ """
+ CONVERSIONS = {
+ "l": str.lower,
+ "u": str.upper,
+ "c": str.capitalize,
+ "C": string.capwords,
+ "U": urllib.parse.unquote,
+ "S": to_string,
+ "s": str,
+ "r": repr,
+ "a": ascii,
+ }
+
+ def __init__(self, format_string, default=None):
+ self.default = default
+ self.result = []
+ self.fields = []
+
+ for literal_text, field_name, format_spec, conversion in \
+ _string.formatter_parser(format_string):
+ if literal_text:
+ self.result.append(literal_text)
+ if field_name:
+ self.fields.append((
+ len(self.result),
+ self._field_access(field_name, format_spec, conversion)
+ ))
+ self.result.append("")
+
+ def format_map(self, kwargs):
+ """Apply 'kwargs' to the initial format_string and return its result"""
+ for index, func in self.fields:
+ self.result[index] = func(kwargs)
+ return "".join(self.result)
+
+ def _field_access(self, field_name, format_spec, conversion):
+ first, rest = _string.formatter_field_name_split(field_name)
+
+ funcs = []
+ for is_attr, key in rest:
+ if is_attr:
+ func = operator.attrgetter
+ elif ":" in key:
+ func = self._slicegetter
+ else:
+ func = operator.itemgetter
+ funcs.append(func(key))
+
+ if conversion:
+ funcs.append(self.CONVERSIONS[conversion])
+
+ if format_spec:
+ if format_spec[0] == "?":
+ func = self._format_optional
+ elif format_spec[0] == "L":
+ func = self._format_maxlen
+ elif format_spec[0] == "J":
+ func = self._format_join
+ elif format_spec[0] == "R":
+ func = self._format_replace
+ else:
+ func = self._format_default
+ fmt = func(format_spec)
+ else:
+ fmt = str
+
+ if funcs:
+ return self._apply(first, funcs, fmt)
+ return self._apply_simple(first, fmt)
+
+ def _apply_simple(self, key, fmt):
+ def wrap(obj):
+ if key in obj:
+ obj = obj[key]
+ else:
+ obj = self.default
+ return fmt(obj)
+ return wrap
+
+ def _apply(self, key, funcs, fmt):
+ def wrap(obj):
+ try:
+ obj = obj[key]
+ for func in funcs:
+ obj = func(obj)
+ except Exception:
+ obj = self.default
+ return fmt(obj)
+ return wrap
+
+ @staticmethod
+ def _slicegetter(key):
+ start, _, stop = key.partition(":")
+ stop, _, step = stop.partition(":")
+ start = int(start) if start else None
+ stop = int(stop) if stop else None
+ step = int(step) if step else None
+ return operator.itemgetter(slice(start, stop, step))
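+
+ # e.g. a replacement field like "{title[0:10]}" reaches this helper with
+ # key "0:10" and becomes operator.itemgetter(slice(0, 10, None))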
+
+ @staticmethod
+ def _format_optional(format_spec):
+ def wrap(obj):
+ if not obj:
+ return ""
+ return before + format(obj, format_spec) + after
+ before, after, format_spec = format_spec.split("/", 2)
+ before = before[1:]
+ return wrap
+
+ @staticmethod
+ def _format_maxlen(format_spec):
+ def wrap(obj):
+ obj = format(obj, format_spec)
+ return obj if len(obj) <= maxlen else replacement
+ maxlen, replacement, format_spec = format_spec.split("/", 2)
+ maxlen = text.parse_int(maxlen[1:])
+ return wrap
+
+ @staticmethod
+ def _format_join(format_spec):
+ def wrap(obj):
+ obj = separator.join(obj)
+ return format(obj, format_spec)
+ separator, _, format_spec = format_spec.partition("/")
+ separator = separator[1:]
+ return wrap
+
+ @staticmethod
+ def _format_replace(format_spec):
+ def wrap(obj):
+ obj = obj.replace(old, new)
+ return format(obj, format_spec)
+ old, new, format_spec = format_spec.split("/", 2)
+ old = old[1:]
+ return wrap
+
+ @staticmethod
+ def _format_default(format_spec):
+ def wrap(obj):
+ return format(obj, format_spec)
+ return wrap
+
+
+class PathFormat():
+
+ def __init__(self, extractor):
+ self.filename_fmt = extractor.config(
+ "filename", extractor.filename_fmt)
+ self.directory_fmt = extractor.config(
+ "directory", extractor.directory_fmt)
+ self.kwdefault = extractor.config("keywords-default")
+
+ try:
+ self.formatter = Formatter(self.filename_fmt, self.kwdefault)
+ except Exception as exc:
+ raise exception.FormatError(exc, "filename")
+
+ self.delete = False
+ self.has_extension = False
+ self.keywords = {}
+ self.filename = ""
+ self.directory = self.realdirectory = ""
+ self.path = self.realpath = self.temppath = ""
+
+ self.basedirectory = expand_path(
+ extractor.config("base-directory", (".", "gallery-dl")))
+ if os.altsep:
+ self.basedirectory = self.basedirectory.replace(os.altsep, os.sep)
+
+ def open(self, mode="wb"):
+ """Open file and return a corresponding file object"""
+ return open(self.temppath, mode)
+
+ def exists(self, archive=None):
+ """Return True if the file exists on disk or in 'archive'"""
+ if (archive and archive.check(self.keywords) or
+ self.has_extension and os.path.exists(self.realpath)):
+ if not self.has_extension:
+ # adjust display name
+ self.set_extension("")
+ if self.path[-1] == ".":
+ self.path = self.path[:-1]
+ return True
+ return False
+
+ def set_directory(self, keywords):
+ """Build directory path and create it if necessary"""
+ try:
+ segments = [
+ text.clean_path(
+ Formatter(segment, self.kwdefault)
+ .format_map(keywords).strip())
+ for segment in self.directory_fmt
+ ]
+ except Exception as exc:
+ raise exception.FormatError(exc, "directory")
+
+ self.directory = os.path.join(
+ self.basedirectory,
+ *segments
+ )
+
+ # remove trailing path separator;
+ # occurs if the last argument to os.path.join() is an empty string
+ if self.directory[-1] == os.sep:
+ self.directory = self.directory[:-1]
+
+ self.realdirectory = self.adjust_path(self.directory)
+ os.makedirs(self.realdirectory, exist_ok=True)
+
+ def set_keywords(self, keywords):
+ """Set filename keywords"""
+ self.keywords = keywords
+ self.temppath = ""
+ self.has_extension = bool(keywords.get("extension"))
+ if self.has_extension:
+ self.build_path()
+
+ def set_extension(self, extension, real=True):
+ """Set the 'extension' keyword"""
+ self.has_extension = real
+ self.keywords["extension"] = extension
+ self.build_path()
+
+ def build_path(self):
+ """Use filename-keywords and directory to build a full path"""
+ try:
+ self.filename = text.clean_path(
+ self.formatter.format_map(self.keywords))
+ except Exception as exc:
+ raise exception.FormatError(exc, "filename")
+
+ filename = os.sep + self.filename
+ self.path = self.directory + filename
+ self.realpath = self.realdirectory + filename
+ if not self.temppath:
+ self.temppath = self.realpath
+
+ def part_enable(self, part_directory=None):
+ """Enable .part file usage"""
+ if self.has_extension:
+ self.temppath += ".part"
+ else:
+ self.set_extension("part", False)
+ if part_directory:
+ self.temppath = os.path.join(
+ part_directory,
+ os.path.basename(self.temppath),
+ )
+
+ def part_size(self):
+ """Return size of .part file"""
+ try:
+ return os.stat(self.temppath).st_size
+ except OSError:
+ pass
+ return 0
+
+ def finalize(self):
+ """Move tempfile to its target location"""
+ if self.delete:
+ self.delete = False
+ os.unlink(self.temppath)
+ return
+
+ if self.temppath == self.realpath:
+ return
+
+ try:
+ os.replace(self.temppath, self.realpath)
+ return
+ except OSError:
+ pass
+
+ shutil.copyfile(self.temppath, self.realpath)
+ os.unlink(self.temppath)
+
+ @staticmethod
+ def adjust_path(path):
+ """Enable longer-than-260-character paths on windows"""
+ return "\\\\?\\" + os.path.abspath(path) if os.name == "nt" else path
+
+
+class DownloadArchive():
+
+ def __init__(self, path, extractor):
+ con = sqlite3.connect(path)
+ con.isolation_level = None
+ self.cursor = con.cursor()
+ self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
+ "(entry PRIMARY KEY) WITHOUT ROWID")
+ self.keygen = (extractor.category + extractor.config(
+ "archive-format", extractor.archive_fmt)
+ ).format_map
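+
+ # Illustrative example (category and format string assumed): with category
+ # "danbooru" and an archive_fmt of "{id}", self.keygen({"id": 123}) would
+ # produce the archive key "danbooru123".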
+
+ def check(self, kwdict):
+ """Return True if item described by 'kwdict' exists in archive"""
+ key = self.keygen(kwdict)
+ self.cursor.execute(
+ "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
+ return self.cursor.fetchone()
+
+ def add(self, kwdict):
+ """Add item described by 'kwdict' to archive"""
+ key = self.keygen(kwdict)
+ self.cursor.execute(
+ "INSERT OR IGNORE INTO archive VALUES (?)", (key,))
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
new file mode 100644
index 0000000..4167bc4
--- /dev/null
+++ b/gallery_dl/version.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2016-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+__version__ = "1.8.7"
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f9f5cd8
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+requests>=2.11.0
diff --git a/scripts/bash_completion.py b/scripts/bash_completion.py
new file mode 100755
index 0000000..69e6a79
--- /dev/null
+++ b/scripts/bash_completion.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Generate bash completion script from gallery-dl's argument parser"""
+
+import util
+from gallery_dl import option
+
+
+TEMPLATE = """_gallery_dl()
+{
+ local cur prev
+ COMPREPLY=()
+ cur="${COMP_WORDS[COMP_CWORD]}"
+ prev="${COMP_WORDS[COMP_CWORD-1]}"
+
+ if [[ "${prev}" =~ ^(%(fileopts)s)$ ]]; then
+ COMPREPLY=( $(compgen -f -- "${cur}") )
+ elif [[ "${prev}" =~ ^(%(diropts)s)$ ]]; then
+ COMPREPLY=( $(compgen -d -- "${cur}") )
+ else
+ COMPREPLY=( $(compgen -W "%(opts)s" -- "${cur}") )
+ fi
+}
+
+complete -F _gallery_dl gallery-dl
+"""
+
+opts = []
+diropts = []
+fileopts = []
+for action in option.build_parser()._actions:
+
+ if action.metavar in ("DEST",):
+ diropts.extend(action.option_strings)
+
+ elif action.metavar in ("FILE", "CFG"):
+ fileopts.extend(action.option_strings)
+
+ for opt in action.option_strings:
+ if opt.startswith("--"):
+ opts.append(opt)
+
+PATH = util.path("gallery-dl.bash_completion")
+with open(PATH, "w", encoding="utf-8") as file:
+ file.write(TEMPLATE % {
+ "opts" : " ".join(opts),
+ "diropts" : "|".join(diropts),
+ "fileopts": "|".join(fileopts),
+ })
diff --git a/scripts/build_testresult_db.py b/scripts/build_testresult_db.py
new file mode 100755
index 0000000..fda9f64
--- /dev/null
+++ b/scripts/build_testresult_db.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""Collect results of extractor unit tests"""
+
+import sys
+import os.path
+import datetime
+
+import util
+from gallery_dl import extractor, job, config
+from test.test_results import setup_test_config
+
+
+# filter test cases
+
+tests = [
+ (idx, extr, url, result)
+
+ for extr in extractor.extractors()
+ if hasattr(extr, "test") and extr.test
+ if len(sys.argv) <= 1 or extr.category in sys.argv
+
+ for idx, (url, result) in enumerate(extr._get_tests())
+ if result
+]
+
+
+# setup target directory
+
+path = util.path("archive", "testdb", str(datetime.date.today()))
+os.makedirs(path, exist_ok=True)
+
+
+for idx, extr, url, result in tests:
+
+ # filename
+ name = "{}-{}-{}.json".format(extr.category, extr.subcategory, idx)
+ print(name)
+
+ # config values
+ setup_test_config()
+
+ if "options" in result:
+ for key, value in result["options"]:
+ config.set(key.split("."), value)
+ if "range" in result:
+ config.set(("image-range",), result["range"])
+ config.set(("chapter-range",), result["range"])
+
+ # write test data
+ try:
+ with open(os.path.join(path, name), "w") as outfile:
+ job.DataJob(url, file=outfile, ensure_ascii=False).run()
+ except KeyboardInterrupt:
+ sys.exit()
diff --git a/scripts/create_test_data.py b/scripts/create_test_data.py
new file mode 100755
index 0000000..14ab0c0
--- /dev/null
+++ b/scripts/create_test_data.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Create testdata for extractor tests"""
+
+import argparse
+
+import util # noqa
+from gallery_dl import extractor
+from test.test_results import ResultJob, setup_test_config
+
+
+TESTDATA_FMT = """
+ test = ("{}", {{
+ "url": "{}",
+ "keyword": "{}",
+ "content": "{}",
+ }})
+"""
+
+TESTDATA_EXCEPTION_FMT = """
+ test = ("{}", {{
+ "exception": exception.{},
+ }})
+"""
+
+
+def main():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--content", action="store_true")
+ parser.add_argument("--recreate", action="store_true")
+ parser.add_argument("urls", nargs="*")
+ args = parser.parse_args()
+
+ if args.recreate:
+ urls = [
+ test[0]
+ for extr in extractor.extractors() if extr.category in args.urls
+ for test in extr.test
+ ]
+ else:
+ urls = args.urls
+
+ setup_test_config()
+
+ for url in urls:
+ tjob = ResultJob(url, content=args.content)
+ try:
+ tjob.run()
+ except Exception as exc:
+ fmt = TESTDATA_EXCEPTION_FMT
+ data = (exc.__class__.__name__,)
+ else:
+ fmt = TESTDATA_FMT
+ data = (tjob.hash_url.hexdigest(),
+ tjob.hash_keyword.hexdigest(),
+ tjob.hash_content.hexdigest())
+ print(tjob.extractor.__class__.__name__)
+ print(fmt.format(url, *data))
+
+
+if __name__ == '__main__':
+ main()
diff --git a/scripts/hook-gallery_dl.py b/scripts/hook-gallery_dl.py
new file mode 100644
index 0000000..d549019
--- /dev/null
+++ b/scripts/hook-gallery_dl.py
@@ -0,0 +1,9 @@
+# -*- coding: utf-8 -*-
+
+from gallery_dl import extractor, downloader, postprocessor
+
+hiddenimports = [
+ package.__name__ + "." + module
+ for package in (extractor, downloader, postprocessor)
+ for module in package.modules
+]
diff --git a/scripts/man.py b/scripts/man.py
new file mode 100755
index 0000000..91608a3
--- /dev/null
+++ b/scripts/man.py
@@ -0,0 +1,304 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Generate man pages"""
+
+import re
+import datetime
+
+import util
+import gallery_dl.option
+import gallery_dl.version
+
+
+def build_gallery_dl_1(path=None):
+
+ OPTS_FMT = """.TP\n.B "{}" {}\n{}"""
+
+ TEMPLATE = r"""
+.TH "GALLERY-DL" "1" "%(date)s" "%(version)s" "gallery-dl Manual"
+.\" disable hyphenation
+.nh
+
+.SH NAME
+gallery-dl \- download image-galleries and -collections
+
+.SH SYNOPSIS
+.B gallery-dl
+[OPTION]... URL...
+
+.SH DESCRIPTION
+.B gallery-dl
+is a command-line program to download image-galleries and -collections
+from several image hosting sites. It is a cross-platform tool
+with many configuration options and powerful filenaming capabilities.
+
+.SH OPTIONS
+%(options)s
+
+.SH EXAMPLES
+.TP
+gallery-dl \f[I]URL\f[]
+Download images from \f[I]URL\f[].
+.TP
+gallery-dl -g -u <username> -p <password> \f[I]URL\f[]
+Print direct URLs from a site that requires authentication.
+.TP
+gallery-dl --filter 'type == "ugoira"' --range '2-4' \f[I]URL\f[]
+Apply filter and range expressions. This will only download
+the second, third, and fourth file whose type value is equal to "ugoira".
+.TP
+gallery-dl r:\f[I]URL\f[]
+Scan \f[I]URL\f[] for other URLs and invoke \f[B]gallery-dl\f[] on them.
+.TP
+gallery-dl oauth:\f[I]SITE\-NAME\f[]
+Gain OAuth authentication tokens for
+.IR deviantart ,
+.IR flickr ,
+.IR reddit ,
+.IR smugmug ", and"
+.IR tumblr .
+
+.SH FILES
+.TP
+.I /etc/gallery-dl.conf
+The system-wide configuration file.
+.TP
+.I ~/.config/gallery-dl/config.json
+Per-user configuration file.
+.TP
+.I ~/.gallery-dl.conf
+Alternate per-user configuration file.
+
+.SH BUGS
+https://github.com/mikf/gallery-dl/issues
+
+.SH AUTHORS
+Mike Fährmann <mike_faehrmann@web.de>
+.br
+and https://github.com/mikf/gallery-dl/graphs/contributors
+
+.SH "SEE ALSO"
+.BR gallery-dl.conf (5)
+"""
+
+ options = []
+ for action in gallery_dl.option.build_parser()._actions:
+ if action.help.startswith("=="):
+ continue
+ options.append(OPTS_FMT.format(
+ ", ".join(action.option_strings).replace("-", r"\-"),
+ r"\f[I]{}\f[]".format(action.metavar) if action.metavar else "",
+ action.help,
+ ))
+
+ if not path:
+ path = util.path("gallery-dl.1")
+ with open(path, "w", encoding="utf-8") as file:
+ file.write(TEMPLATE.lstrip() % {
+ "options": "\n".join(options),
+ "version": gallery_dl.version.__version__,
+ "date" : datetime.datetime.now().strftime("%Y-%m-%d"),
+ })
+
+
+def build_gallery_dl_conf_5(path=None):
+
+ TEMPLATE = r"""
+.TH "GALLERY-DL.CONF" "5" "%(date)s" "%(version)s" "gallery-dl Manual"
+.\" disable hyphenation
+.nh
+.\" disable justification (adjust text to left margin only)
+.ad l
+
+.SH NAME
+gallery-dl.conf \- gallery-dl configuration file
+
+.SH DESCRIPTION
+gallery-dl will search for configuration files in the following places
+every time it is started, unless
+.B --ignore-config
+is specified:
+.PP
+.RS 4
+.nf
+.I /etc/gallery-dl.conf
+.I $HOME/.config/gallery-dl/config.json
+.I $HOME/.gallery-dl.conf
+.fi
+.RE
+.PP
+It is also possible to specify additional configuration files with the
+.B -c/--config
+command-line option or to add further option values with
+.B -o/--option
+as <key>=<value> pairs.
+
+Configuration files are JSON-based and therefore don't allow any ordinary
+comments, but, since unused keys are simply ignored, it is possible to utilize
+those as makeshift comments by setting their values to arbitrary strings.
+
+.SH EXAMPLE
+{
+.RS 4
+"base-directory": "/tmp/",
+.br
+"extractor": {
+.RS 4
+"pixiv": {
+.RS 4
+"directory": ["Pixiv", "Works", "{user[id]}"],
+.br
+"filename": "{id}{num}.{extension}",
+.br
+"username": "foo",
+.br
+"password": "bar"
+.RE
+},
+.br
+"flickr": {
+.RS 4
+"_comment": "OAuth keys for account 'foobar'",
+.br
+"access-token": "0123456789-0123456789abcdef",
+.br
+"access-token-secret": "fedcba9876543210"
+.RE
+}
+.RE
+},
+.br
+"downloader": {
+.RS 4
+"retries": 3,
+.br
+"timeout": 2.5
+.RE
+}
+.RE
+}
+
+%(options)s
+
+.SH BUGS
+https://github.com/mikf/gallery-dl/issues
+
+.SH AUTHORS
+Mike Fährmann <mike_faehrmann@web.de>
+.br
+and https://github.com/mikf/gallery-dl/graphs/contributors
+
+.SH "SEE ALSO"
+.BR gallery-dl (1)
+"""
+
+ sections = parse_docs_configuration()
+ content = []
+
+ for sec_name, section in sections.items():
+ content.append(".SH " + sec_name.upper())
+
+ for opt_name, option in section.items():
+ content.append(".SS " + opt_name)
+
+ for field, text in option.items():
+ if field in ("Type", "Default"):
+ content.append('.IP "{}:" {}'.format(field, len(field)+2))
+ content.append(strip_rst(text))
+ else:
+ content.append('.IP "{}:" 4'.format(field))
+ content.append(strip_rst(text, field != "Example"))
+
+ if not path:
+ path = util.path("gallery-dl.conf.5")
+ with open(path, "w", encoding="utf-8") as file:
+ file.write(TEMPLATE.lstrip() % {
+ "options": "\n".join(content),
+ "version": gallery_dl.version.__version__,
+ "date" : datetime.datetime.now().strftime("%Y-%m-%d"),
+ })
+
+
+def parse_docs_configuration():
+
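+ # Note: as read from the regexes below, docs/configuration.rst is expected
+ # to use "="-underlined section titles, "-"-underlined option names, and
+ # "==== ====" simple-table borders around each option's field table.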
+ doc_path = util.path("docs", "configuration.rst")
+ with open(doc_path, encoding="utf-8") as file:
+ doc_lines = file.readlines()
+
+ sections = {}
+ sec_name = None
+ options = None
+ opt_name = None
+ opt_desc = None
+ name = None
+ last = last2 = None
+ for line in doc_lines:
+
+ # start of new section
+ if re.match(r"^=+$", line):
+ if sec_name and options:
+ sections[sec_name] = options
+ sec_name = last.strip()
+ options = {}
+
+ elif re.match(r"^=+ =+$", line):
+ # start of option table
+ if re.match(r"^-+$", last):
+ opt_name = last2.strip()
+ opt_desc = {}
+ # end of option table
+ elif opt_desc:
+ options[opt_name] = opt_desc
+ opt_name = None
+ name = None
+
+ # inside option table
+ elif opt_name:
+ if line[0].isalpha():
+ name, _, line = line.partition(" ")
+ opt_desc[name] = ""
+ line = line.strip()
+ if line.startswith(("* ", "- ")):
+ line = "\n" + line
+ elif line.startswith("| "):
+ line = line[2:] + "\n.br"
+ opt_desc[name] += line + "\n"
+
+ last2 = last
+ last = line
+ sections[sec_name] = options
+
+ return sections
+
+
+def strip_rst(text, extended=True, *, ITALIC=r"\\f[I]\1\\f[]", REGULAR=r"\1"):
+
+ text = text.replace("\\", "\\\\")
+
+ # ``foo``
+ repl = ITALIC if extended else REGULAR
+ text = re.sub(r"``([^`]+)``", repl, text)
+ # |foo|_
+ text = re.sub(r"\|([^|]+)\|_*", ITALIC, text)
+ # `foo`_
+ text = re.sub(r"`([^`]+)`_+", ITALIC, text)
+ # `foo`
+ text = re.sub(r"`([^`]+)`", REGULAR, text)
+ # foo_
+ text = re.sub(r"([A-Za-z0-9-]+)_+(?=\s)", ITALIC, text)
+ # -------
+ text = re.sub(r"---+", "", text)
+
+ return text
+
+
+if __name__ == "__main__":
+ build_gallery_dl_1()
+ build_gallery_dl_conf_5()
diff --git a/scripts/pyinstaller.py b/scripts/pyinstaller.py
new file mode 100755
index 0000000..879ae50
--- /dev/null
+++ b/scripts/pyinstaller.py
@@ -0,0 +1,18 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""Build a standalone executable using PyInstaller"""
+
+import PyInstaller.__main__
+import util
+
+PyInstaller.__main__.run([
+ "--onefile",
+ "--console",
+ "--name", "gallery-dl." + ("exe" if PyInstaller.is_win else "bin"),
+ "--additional-hooks-dir", util.path("scripts"),
+ "--distpath", util.path("dist"),
+ "--workpath", util.path("build"),
+ "--specpath", util.path("build"),
+ util.path("gallery_dl", "__main__.py"),
+])
diff --git a/scripts/release.sh b/scripts/release.sh
new file mode 100755
index 0000000..ef444e0
--- /dev/null
+++ b/scripts/release.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+set -e
+
+prompt() {
+ echo "root: ${ROOTDIR} old: ${OLDVERSION} - new: ${NEWVERSION}"
+ read -n 1 -r -p "Proceed? [Y/n] " P
+ echo
+ if [ "$P" == y -o "$P" == Y -o -z "$P" ]; then
+ return 0
+ else
+ exit 1
+ fi
+}
+
+cleanup() {
+ cd "${ROOTDIR}"
+ echo Removing old build directory
+
+ if [ -d ./build ]; then
+ rm -rf ./build
+ fi
+}
+
+update() {
+ cd "${ROOTDIR}"
+ echo Updating version to ${NEWVERSION}
+
+ sed -i "s#\"${PYVERSION}\"#\"${NEWVERSION}\"#" "gallery_dl/version.py"
+ sed -i "s#v${OLDVERSION}#v${NEWVERSION}#" "${README}"
+}
+
+update-dev() {
+ cd "${ROOTDIR}"
+
+ IFS="." read MAJOR MINOR BUILD <<< "${NEWVERSION}"
+ BUILD=$((BUILD+1))
+ # update version to -dev
+ sed -i "s#\"${NEWVERSION}\"#\"${MAJOR}.${MINOR}.${BUILD}-dev\"#" "gallery_dl/version.py"
+ # add 'unreleased' line to changelog
+ sed -i "2i\\\n## Unreleased" "${CHANGELOG}"
+
+ git add "gallery_dl/version.py" "${CHANGELOG}"
+}
+
+build-python() {
+ cd "${ROOTDIR}"
+ echo Building bdist_wheel and sdist
+
+ python setup.py bdist_wheel sdist
+}
+
+build-linux() {
+ cd "${ROOTDIR}"
+ echo Building Linux executable
+
+ make executable
+}
+
+build-windows() {
+ cd "${ROOTDIR}/dist"
+ echo Building Windows executable
+
+ # remove old executable
+ rm -f "gallery-dl.exe"
+
+ # build windows exe in vm
+ ln -fs "${ROOTDIR}" /tmp/
+ vmstart "Windows 7" &
+ disown
+ while [ ! -e "gallery-dl.exe" ] ; do
+ sleep 5
+ done
+ sleep 2
+
+ # check exe version
+ OUTPUT="$(wine gallery-dl.exe --version)"
+ if [[ ! "${OUTPUT%?}" == "${NEWVERSION}" ]]; then
+ echo "exe version mismatch: ${OUTPUT} != ${NEWVERSION}"
+ exit 3
+ fi
+}
+
+sign() {
+ cd "${ROOTDIR}/dist"
+ echo Signing files
+
+ gpg --detach-sign --armor gallery_dl-${NEWVERSION}-py3-none-any.whl
+ gpg --detach-sign --armor gallery_dl-${NEWVERSION}.tar.gz
+ gpg --detach-sign --yes gallery-dl.exe
+ gpg --detach-sign --yes gallery-dl.bin
+}
+
+changelog() {
+ cd "${ROOTDIR}"
+ echo Updating "${CHANGELOG}"
+
+ # - replace "#NN" with link to actual issue
+ # - insert new version and date
+ sed -i \
+ -e "s*\([( ]\)#\([0-9]\+\)*\1[#\2](https://github.com/mikf/gallery-dl/issues/\2)*g" \
+ -e "s*^## [Uu]nreleased*## ${NEWVERSION} - $(date +%Y-%m-%d)*" \
+ "${CHANGELOG}"
+}
+
+supportedsites() {
+ cd "${ROOTDIR}"
+ echo Checking if "${SUPPORTEDSITES}" is up to date
+
+ ./scripts/supportedsites.py
+ if ! git diff --quiet "${SUPPORTEDSITES}"; then
+ echo "updated ${SUPPORTEDSITES} contains changes"
+ exit 4
+ fi
+}
+
+git-upload() {
+ cd "${ROOTDIR}"
+ echo Pushing changes to github
+
+ git add "gallery_dl/version.py" "${README}" "${CHANGELOG}"
+ git commit -S -m "release version ${NEWVERSION}"
+ git tag -s -m "version ${NEWVERSION}" "v${NEWVERSION}"
+ git push
+ git push origin "v${NEWVERSION}"
+}
+
+pypi-upload() {
+ cd "${ROOTDIR}/dist"
+ echo Uploading to PyPI
+
+ twine upload gallery_dl-${NEWVERSION}*
+}
+
+
+ROOTDIR="$(realpath "$(dirname "$0")/..")/"
+README="README.rst"
+CHANGELOG="CHANGELOG.md"
+SUPPORTEDSITES="./docs/supportedsites.rst"
+
+LASTTAG="$(git describe --abbrev=0 --tags)"
+OLDVERSION="${LASTTAG#v}"
+PYVERSION="$(python -c "import gallery_dl as g; print(g.__version__)")"
+
+if [[ "$1" ]]; then
+ NEWVERSION="$1"
+else
+ NEWVERSION="${PYVERSION%-dev}"
+fi
+
+if [[ ! $NEWVERSION =~ [0-9]+\.[0-9]+\.[0-9]+(-[a-z]+(\.[0-9]+)?)?$ ]]; then
+ echo "invalid version: $NEWVERSION"
+ exit 2
+fi
+
+
+prompt
+supportedsites
+cleanup
+update
+build-python
+build-linux
+build-windows
+sign
+changelog
+git-upload
+pypi-upload
+update-dev
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
new file mode 100755
index 0000000..334671e
--- /dev/null
+++ b/scripts/run_tests.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+
+TESTS_CORE=(config cookies downloader extractor oauth text util)
+TESTS_RESULTS=(results)
+
+
+# select tests
+case "${1:-${GALLERYDL_TESTS:-core}}" in
+ core) TESTS=( ${TESTS_CORE[@]} );;
+ results) TESTS=( ${TESTS_RESULTS[@]} );;
+ *) TESTS=( );;
+esac
+
+
+# transform each array element to test_###.py
+TESTS=( ${TESTS[@]/#/test_} )
+TESTS=( ${TESTS[@]/%/.py} )
+
+
+# run 'nosetests' with selected tests
+# (or all tests if ${TESTS} is empty)
+nosetests --verbose -w "${DIR}/../test" ${TESTS[@]}
diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py
new file mode 100755
index 0000000..f326617
--- /dev/null
+++ b/scripts/supportedsites.py
@@ -0,0 +1,264 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""Generate a reStructuredText document with all supported sites"""
+
+import sys
+import collections
+
+import util
+from gallery_dl import extractor
+
+
+CATEGORY_MAP = {
+ "2chan" : "Futaba Channel",
+ "35photo" : "35PHOTO",
+ "archivedmoe" : "Archived.Moe",
+ "archiveofsins" : "Archive of Sins",
+ "artstation" : "ArtStation",
+ "b4k" : "arch.b4k.co",
+ "bobx" : "BobX",
+ "deviantart" : "DeviantArt",
+ "dokireader" : "Doki Reader",
+ "dynastyscans" : "Dynasty Reader",
+ "e621" : "e621",
+ "exhentai" : "ExHentai",
+ "fallenangels" : "Fallen Angels Scans",
+ "fashionnova" : "Fashion Nova",
+ "hbrowse" : "HBrowse",
+ "hentai2read" : "Hentai2Read",
+ "hentaicafe" : "Hentai Cafe",
+ "hentaifoundry" : "Hentai Foundry",
+ "hentaifox" : "HentaiFox",
+ "hentaihere" : "HentaiHere",
+ "hitomi" : "Hitomi.la",
+ "idolcomplex" : "Idol Complex",
+ "imagebam" : "ImageBam",
+ "imagefap" : "ImageFap",
+ "imgbox" : "imgbox",
+ "imgth" : "imgth",
+ "imgur" : "imgur",
+ "jaiminisbox" : "Jaimini's Box",
+ "kireicake" : "Kirei Cake",
+ "kissmanga" : "KissManga",
+ "livedoor" : "livedoor Blog",
+ "mangadex" : "MangaDex",
+ "mangafox" : "Manga Fox",
+ "mangahere" : "Manga Here",
+ "mangapark" : "MangaPark",
+ "mangastream" : "Manga Stream",
+ "myportfolio" : "Adobe Portfolio",
+ "nhentai" : "nhentai",
+ "nijie" : "nijie",
+ "nsfwalbum" : "NSFWalbum.com",
+ "nyafuu" : "Nyafuu Archive",
+ "paheal" : "rule #34",
+ "powermanga" : "PowerManga",
+ "readcomiconline": "Read Comic Online",
+ "rbt" : "RebeccaBlackTech",
+ "rule34" : "Rule 34",
+ "sankaku" : "Sankaku Channel",
+ "sankakucomplex" : "Sankaku Complex",
+ "seaotterscans" : "Sea Otter Scans",
+ "seiga" : "Niconico Seiga",
+ "senmanga" : "Sen Manga",
+ "sensescans" : "Sense-Scans",
+ "sexcom" : "Sex.com",
+ "simplyhentai" : "Simply Hentai",
+ "slickpic" : "SlickPic",
+ "slideshare" : "SlideShare",
+ "smugmug" : "SmugMug",
+ "thebarchive" : "The /b/ Archive",
+ "vanillarock" : "もえぴりあ",
+ "wikiart" : "WikiArt.org",
+ "worldthree" : "World Three",
+ "xhamster" : "xHamster",
+ "xvideos" : "XVideos",
+ "yaplog" : "yaplog!",
+ "yuki" : "yuki.la 4chan archive",
+}
+
+SUBCATEGORY_MAP = {
+ "artwork": "Artwork Listings",
+ "artists": "",
+ "doujin" : "Doujin",
+ "gallery": "Galleries",
+ "image" : "individual Images",
+ "issue" : "Comic-Issues",
+ "manga" : "Manga",
+ "me" : "pixiv.me Links",
+ "media" : "Media Timelines",
+ "path" : "Images from Users and Folders",
+ "pinit" : "pin.it Links",
+ "popular": "Popular Images",
+ "recent" : "Recent Images",
+ "search" : "Search Results",
+ "stash" : "Sta.sh",
+ "status" : "Images from Statuses",
+ "tag" : "Tag-Searches",
+ "user" : "Images from Users",
+ "work" : "Individual Images",
+ "related-pin" : "related Pins",
+ "related-board": "",
+}
+
+AUTH_MAP = {
+ "danbooru" : "Optional",
+ "deviantart" : "Optional (OAuth)",
+ "exhentai" : "Optional",
+ "flickr" : "Optional (OAuth)",
+ "idolcomplex": "Optional",
+ "luscious" : "Optional",
+ "mangoxo" : "Optional",
+ "nijie" : "Required",
+ "pixiv" : "Required",
+ "reddit" : "Optional (OAuth)",
+ "sankaku" : "Optional",
+ "seiga" : "Required",
+ "smugmug" : "Optional (OAuth)",
+ "tsumino" : "Optional",
+ "tumblr" : "Optional (OAuth)",
+ "twitter" : "Optional",
+}
+
+IGNORE_LIST = (
+ "directlink",
+ "oauth",
+ "recursive",
+ "test",
+)
+
+
+def domain(cls):
+ """Return the web-domain related to an extractor class"""
+ url = sys.modules[cls.__module__].__doc__.split()[-1]
+ if url.startswith("http"):
+ return url
+
+ if hasattr(cls, "root") and cls.root:
+ return cls.root + "/"
+
+ if hasattr(cls, "https"):
+ scheme = "https" if cls.https else "http"
+ netloc = cls.__doc__.split()[-1]
+ return "{}://{}/".format(scheme, netloc)
+
+ test = next(cls._get_tests(), None)
+ if test:
+ url = test[0]
+ return url[:url.find("/", 8)+1]
+
+ return ""
+
+
+def category_text(cls):
+ """Return a human-readable representation of a category"""
+ c = cls.category
+ return CATEGORY_MAP.get(c) or c.capitalize()
+
+
+def subcategory_text(cls):
+ """Return a human-readable representation of a subcategory"""
+ sc = cls.subcategory
+ if sc in SUBCATEGORY_MAP:
+ return SUBCATEGORY_MAP[sc]
+ sc = sc.capitalize()
+ return sc if sc.endswith("s") else sc + "s"
+
+
+def category_key(cls):
+ """Generate sorting keys by category"""
+ key = category_text(cls).lower()
+ if cls.__module__.endswith(".imagehosts"):
+ key = "zz" + key
+ return key
+
+
+def subcategory_key(cls):
+ """Generate sorting keys by subcategory"""
+ if cls.subcategory in ("user", "issue"):
+ return "A"
+ return cls.subcategory
+
+
+def build_extractor_list():
+ """Generate a sorted list of lists of extractor classes"""
+ extractors = collections.defaultdict(list)
+
+ # get lists of extractor classes grouped by category
+ for extr in extractor.extractors():
+ if not extr.category or extr.category in IGNORE_LIST:
+ continue
+ extractors[extr.category].append(extr)
+
+ # sort extractor lists with the same category
+ for extrlist in extractors.values():
+ extrlist.sort(key=subcategory_key)
+
+ # sort lists by category
+ return sorted(
+ extractors.values(),
+ key=lambda lst: category_key(lst[0]),
+ )
+
+
+# define table columns
+COLUMNS = (
+ ("Site", 20,
+ lambda x: category_text(x[0])),
+ ("URL" , 35,
+ lambda x: domain(x[0])),
+ ("Capabilities", 50,
+ lambda x: ", ".join(subcategory_text(extr) for extr in x
+ if subcategory_text(extr))),
+ ("Authentication", 16,
+ lambda x: AUTH_MAP.get(x[0].category, "")),
+)
+
+
+def write_output(fobj, columns, extractors):
+
+ def pad(output, col, category=None):
+ size = col[1]
+ output = output if isinstance(output, str) else col[2](output)
+
+ if len(output) > size:
+ sub = "|{}-{}|".format(category, col[0][0])
+ subs.append((sub, output))
+ output = sub
+
+ return output + " " * (size - len(output))
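+
+ # e.g. an overlong "Capabilities" cell for an assumed category "example"
+ # would be replaced by the placeholder "|example-C|" and later emitted as
+ # an RST substitution: ".. |example-C| replace:: <original cell text>"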
+
+ w = fobj.write
+ subs = []
+
+ # caption
+ w("Supported Sites\n")
+ w("===============\n")
+
+ # table head
+ sep = " ".join("=" * c[1] for c in columns) + "\n"
+ w(sep)
+ w(" ".join(pad(c[0], c) for c in columns).strip() + "\n")
+ w(sep)
+
+ # table body
+ for lst in extractors:
+ w(" ".join(
+ pad(col[2](lst), col, lst[0].category)
+ for col in columns
+ ).strip())
+ w("\n")
+
+ # table bottom
+ w(sep)
+ w("\n")
+
+ # substitutions
+ for sub, value in subs:
+ w(".. {} replace:: {}\n".format(sub, value))
+
+
+outfile = sys.argv[1] if len(sys.argv) > 1 else "supportedsites.rst"
+with open(util.path("docs", outfile), "w") as file:
+ write_output(file, COLUMNS, build_extractor_list())
diff --git a/scripts/util.py b/scripts/util.py
new file mode 100644
index 0000000..bfbd6cb
--- /dev/null
+++ b/scripts/util.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import os.path
+
+ROOTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.insert(0, os.path.realpath(ROOTDIR))
+
+
+def path(*segments, join=os.path.join):
+ return join(ROOTDIR, *segments)
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..c8d5cea
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,3 @@
+[flake8]
+exclude = gallery_dl/__init__.py,gallery_dl/__main__.py,setup.py,build,scripts,archive
+ignore = E203,E226,W504
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..8299811
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals, print_function
+
+import sys
+import os.path
+import warnings
+
+if sys.hexversion < 0x3040000:
+ sys.exit("Python 3.4+ required")
+
+try:
+ from setuptools import setup
+ has_setuptools = True
+except ImportError:
+ from distutils.core import setup
+ has_setuptools = False
+
+
+def read(fname):
+ path = os.path.join(os.path.dirname(__file__), fname)
+ with open(path, encoding="utf-8") as file:
+ return file.read()
+
+def check_file(fname):
+ if os.path.exists(fname):
+ return True
+ warnings.warn(
+ "Not including file '{}' since it is not present. "
+ "Run 'make' to build all automatically generated files.".format(fname)
+ )
+ return False
+
+
+# get version without importing the package
+exec(read("gallery_dl/version.py"))
+
+DESCRIPTION = ("Command-line program to download image-galleries and "
+ "-collections from several image hosting sites")
+LONG_DESCRIPTION = read("README.rst")
+
+if "py2exe" in sys.argv:
+ try:
+ import py2exe
+ except ImportError:
+ sys.exit("Error importing 'py2exe'")
+ params = {
+ "console": [{
+ "script": "./gallery_dl/__main__.py",
+ "dest_base": "gallery-dl",
+ "version": __version__,
+ "description": DESCRIPTION,
+ "comments": LONG_DESCRIPTION,
+ "product_name": "gallery-dl",
+ "product_version": __version__,
+ }],
+ "options": {"py2exe": {
+ "bundle_files": 0,
+ "compressed": 1,
+ "optimize": 1,
+ "dist_dir": ".",
+ "packages": ["gallery_dl"],
+ "dll_excludes": ["w9xpopen.exe"],
+ }},
+ "zipfile": None,
+ }
+elif has_setuptools:
+ params = {
+ "entry_points": {
+ "console_scripts": [
+ "gallery-dl = gallery_dl:main"
+ ]
+ }
+ }
+else:
+ params = {
+ "scripts": ["bin/gallery-dl"]
+ }
+
+data_files = [
+ (path, [f for f in files if check_file(f)])
+ for (path, files) in [
+ ('etc/bash_completion.d', ['gallery-dl.bash_completion']),
+ ('share/man/man1' , ['gallery-dl.1']),
+ ('share/man/man5' , ['gallery-dl.conf.5']),
+ ]
+]
+
+
+setup(
+ name="gallery_dl",
+ version=__version__,
+ description=DESCRIPTION,
+ long_description=LONG_DESCRIPTION,
+ url="https://github.com/mikf/gallery-dl",
+ download_url="https://github.com/mikf/gallery-dl/releases/latest",
+ author="Mike Fährmann",
+ author_email="mike_faehrmann@web.de",
+ maintainer="Mike Fährmann",
+ maintainer_email="mike_faehrmann@web.de",
+ license="GPLv2",
+ python_requires=">=3.4",
+ install_requires=[
+ "requests>=2.11.0",
+ ],
+ packages=[
+ "gallery_dl",
+ "gallery_dl.extractor",
+ "gallery_dl.downloader",
+ "gallery_dl.postprocessor",
+ ],
+ data_files=data_files,
+ keywords="image gallery downloader crawler scraper",
+ classifiers=[
+ "Development Status :: 5 - Production/Stable",
+ "Environment :: Console",
+ "Intended Audience :: End Users/Desktop",
+ "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
+ "Operating System :: Microsoft :: Windows",
+ "Operating System :: POSIX",
+ "Operating System :: MacOS",
+ "Programming Language :: Python :: 3.4",
+ "Programming Language :: Python :: 3.5",
+ "Programming Language :: Python :: 3.6",
+ "Programming Language :: Python :: 3.7",
+ "Programming Language :: Python :: 3 :: Only",
+ "Topic :: Internet :: WWW/HTTP",
+ "Topic :: Multimedia :: Graphics",
+ "Topic :: Utilities",
+ ],
+ test_suite="test",
+ **params
+)
diff --git a/snap/local/launchers/gallery-dl-launch b/snap/local/launchers/gallery-dl-launch
new file mode 100755
index 0000000..908f303
--- /dev/null
+++ b/snap/local/launchers/gallery-dl-launch
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# This is the maintenance launcher for the snap; make any runtime environment changes needed for the snap to work here. You may also insert security confinement/deprecation/obsoletion notices of the snap here.
+
+set \
+ -o errexit \
+ -o errtrace \
+ -o nounset \
+ -o pipefail
+
+# Use user's real home directory for canonical configuration path access
+declare REALHOME="$(
+ getent passwd "${USER}" \
+ | cut --delimiter=: --fields=6
+)"
+HOME="${REALHOME}"
+
+if ! test -f "${SNAP_USER_COMMON}"/marker_disable_interface_warning; then
+ # Warn if the `removable-media` interface isn't connected
+ if ! ls /media &>/dev/null; then
+ printf -- \
+ "It seems that this snap isn't connected to the \`removable-media\` security confinement interface. If you want to save the files under \`/media\`, \`/run/media\`, or \`/mnt\` directories you need to connect this snap to the \`removable-media\` interface by running the following command in a terminal:\\n\\n sudo snap connect %s:removable-media\\n\\n" \
+ "${SNAP_NAME}" \
+ >&2
+ printf -- \
+ "To disable this warning create an empty file at the following path:\\n\\n %s/marker_disable_interface_warning\\n\\n" \
+ "${SNAP_USER_COMMON}" \
+ >&2
+ fi
+fi
+
+# Finally run the next part of the command chain
+exec "${@}"
diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml
new file mode 100644
index 0000000..f8e26fa
--- /dev/null
+++ b/snap/snapcraft.yaml
@@ -0,0 +1,110 @@
+%YAML 1.1
+---
+# Snapcraft Recipe for gallery-dl
+# ------------------------------
+# This file is in the YAML data serialization format:
+# http://yaml.org
+# For the specification of this file format, refer to the following documentation:
+# * The snapcraft format
+# https://docs.snapcraft.io/the-snapcraft-format/8337
+# * Snap Documentation
+# https://docs.snapcraft.io
+# * Topics under the doc category in the Snapcraft Forum
+# https://forum.snapcraft.io/c/doc
+# For support refer to the snapcraft section in the Snapcraft Forum:
+# https://forum.snapcraft.io/c/snapcraft
+name: gallery-dl
+license: GPL-2.0
+base: core
+summary: Download image-galleries and -collections from several image hosting sites
+description: |
+ `gallery-dl` is a command-line program to download image-galleries and -collections from several image hosting sites (see [Supported Sites][1]). It is a cross-platform tool with many configuration options and powerful filenaming capabilities.
+
+ [1]: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
+
+adopt-info: gallery-dl
+confinement: strict
+grade: stable
+
+plugs:
+ # For `xdg-open` command access for opening OAuth authentication webpages
+ desktop:
+
+ # Storage access
+ home:
+ removable-media: # Non-A/C
+
+ # Network access
+ network:
+
+ # For the network service receiving OAuth callback tokens
+ network-bind:
+
+ # Configuration access
+ config-gallery-dl:
+ interface: personal-files
+ read:
+ - $HOME/.config/gallery-dl
+ - $HOME/.gallery-dl.conf
+ etc-gallery-dl:
+ interface: system-files
+ read:
+ - /etc/gallery-dl.conf
+
+parts:
+ # Launcher programs to fix problems at runtime
+ launchers:
+ source: snap/local/launchers
+ plugin: dump
+ organize:
+ '*': bin/
+
+ # Check out the tagged release revision if it isn’t promoted to the stable channel
+ # https://forum.snapcraft.io/t/selective-checkout-check-out-the-tagged-release-revision-if-it-isnt-promoted-to-the-stable-channel/10617
+ selective-checkout:
+ plugin: nil
+ build-packages:
+ - git
+ stage-snaps:
+ - selective-checkout
+ prime:
+ - -*
+
+ gallery-dl:
+ after:
+ - selective-checkout
+
+ source: .
+ override-pull: |
+ snapcraftctl pull
+ $SNAPCRAFT_STAGE/scriptlets/selective-checkout
+
+ plugin: python
+ build-packages:
+ - make
+ python-packages:
+ - youtube_dl
+ override-build: |
+ # build manpages and bash completion
+ make man completion
+
+ snapcraftctl build
+
+ ffmpeg:
+ plugin: nil
+ stage-packages:
+ - ffmpeg
+
+apps:
+ gallery-dl:
+ adapter: full
+ command-chain:
+ - bin/gallery-dl-launch
+ command: bin/gallery-dl
+ completer: etc/bash_completion.d/gallery-dl.bash_completion
+ environment:
+ LANG: C.UTF-8
+ LC_ALL: C.UTF-8
+
+ # Satisfy FFmpeg's libpulsecommon dependency
+ LD_LIBRARY_PATH: $LD_LIBRARY_PATH:$SNAP/usr/lib/$SNAPCRAFT_ARCH_TRIPLET/pulseaudio
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/test/__init__.py
diff --git a/test/test_config.py b/test/test_config.py
new file mode 100644
index 0000000..8cdb3da
--- /dev/null
+++ b/test/test_config.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2017 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import gallery_dl.config as config
+import os
+import tempfile
+
+
+class TestConfig(unittest.TestCase):
+
+ def setUp(self):
+ fd, self._configfile = tempfile.mkstemp()
+ with os.fdopen(fd, "w") as file:
+ file.write('{"a": "1", "b": {"a": 2, "c": "text"}}')
+ config.load((self._configfile,))
+
+ def tearDown(self):
+ config.clear()
+ os.remove(self._configfile)
+
+ def test_get(self):
+ self.assertEqual(config.get(["a"]), "1")
+ self.assertEqual(config.get(["b", "c"]), "text")
+ self.assertEqual(config.get(["d"]), None)
+ self.assertEqual(config.get(["e", "f", "g"], 123), 123)
+
+ def test_interpolate(self):
+ self.assertEqual(config.interpolate(["a"]), "1")
+ self.assertEqual(config.interpolate(["b", "a"]), "1")
+ self.assertEqual(config.interpolate(["b", "c"], "2"), "text")
+ self.assertEqual(config.interpolate(["b", "d"], "2"), "2")
+ config.set(["d"], 123)
+ self.assertEqual(config.interpolate(["b", "d"], "2"), 123)
+ self.assertEqual(config.interpolate(["d", "d"], "2"), 123)
+
+ def test_set(self):
+ config.set(["b", "c"], [1, 2, 3])
+ config.set(["e", "f", "g"], value=234)
+ self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
+ self.assertEqual(config.get(["e", "f", "g"]), 234)
+
+ def test_setdefault(self):
+ config.setdefault(["b", "c"], [1, 2, 3])
+ config.setdefault(["e", "f", "g"], value=234)
+ self.assertEqual(config.get(["b", "c"]), "text")
+ self.assertEqual(config.get(["e", "f", "g"]), 234)
+
+ def test_unset(self):
+ config.unset(["a"])
+ config.unset(["b", "c"])
+ config.unset(["c", "d"])
+ self.assertEqual(config.get(["a"]), None)
+ self.assertEqual(config.get(["b", "a"]), 2)
+ self.assertEqual(config.get(["b", "c"]), None)
+
+ def test_apply(self):
+ options = (
+ (["b", "c"], [1, 2, 3]),
+ (["e", "f", "g"], 234),
+ )
+
+ self.assertEqual(config.get(["b", "c"]), "text")
+ self.assertEqual(config.get(["e", "f", "g"]), None)
+
+ with config.apply(options):
+ self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
+ self.assertEqual(config.get(["e", "f", "g"]), 234)
+
+ self.assertEqual(config.get(["b", "c"]), "text")
+ self.assertEqual(config.get(["e", "f", "g"]), None)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_cookies.py b/test/test_cookies.py
new file mode 100644
index 0000000..a786df6
--- /dev/null
+++ b/test/test_cookies.py
@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2017 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+from unittest import mock
+
+import logging
+import tempfile
+import http.cookiejar
+from os.path import join
+
+import gallery_dl.config as config
+import gallery_dl.extractor as extractor
+
+CKEY = ("cookies",)
+
+
+class TestCookiejar(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ cls.path = tempfile.TemporaryDirectory()
+
+ cls.cookiefile = join(cls.path.name, "cookies.txt")
+ with open(cls.cookiefile, "w") as file:
+ file.write("""# HTTP Cookie File
+.example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE
+""")
+
+ cls.invalid_cookiefile = join(cls.path.name, "invalid.txt")
+ with open(cls.invalid_cookiefile, "w") as file:
+ file.write("""# asd
+.example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE
+""")
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.path.cleanup()
+ config.clear()
+
+ def test_cookiefile(self):
+ config.set(CKEY, self.cookiefile)
+
+ cookies = extractor.find("test:").session.cookies
+ self.assertEqual(len(cookies), 1)
+
+ cookie = next(iter(cookies))
+ self.assertEqual(cookie.domain, ".example.org")
+ self.assertEqual(cookie.path , "/")
+ self.assertEqual(cookie.name , "NAME")
+ self.assertEqual(cookie.value , "VALUE")
+
+ def test_invalid_cookiefile(self):
+ self._test_warning(self.invalid_cookiefile, http.cookiejar.LoadError)
+
+ def test_invalid_filename(self):
+ self._test_warning(join(self.path.name, "nothing"), FileNotFoundError)
+
+ def _test_warning(self, filename, exc):
+ config.set(CKEY, filename)
+ log = logging.getLogger("test")
+ with mock.patch.object(log, "warning") as mock_warning:
+ cookies = extractor.find("test:").session.cookies
+ self.assertEqual(len(cookies), 0)
+ self.assertEqual(mock_warning.call_count, 1)
+ self.assertEqual(mock_warning.call_args[0][0], "cookies: %s")
+ self.assertIsInstance(mock_warning.call_args[0][1], exc)
+
+
+class TestCookiedict(unittest.TestCase):
+
+ def setUp(self):
+ self.cdict = {"NAME1": "VALUE1", "NAME2": "VALUE2"}
+ config.set(CKEY, self.cdict)
+
+ def tearDown(self):
+ config.clear()
+
+ def test_dict(self):
+ cookies = extractor.find("test:").session.cookies
+ self.assertEqual(len(cookies), len(self.cdict))
+ self.assertEqual(sorted(cookies.keys()), sorted(self.cdict.keys()))
+ self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values()))
+
+ def test_domain(self):
+ for category in ["exhentai", "nijie", "sankaku", "seiga"]:
+ extr = _get_extractor(category)
+ cookies = extr.session.cookies
+ for key in self.cdict:
+ self.assertTrue(key in cookies)
+ for c in cookies:
+ self.assertEqual(c.domain, extr.cookiedomain)
+
+
+class TestCookieLogin(unittest.TestCase):
+
+ def tearDown(self):
+ config.clear()
+
+ def test_cookie_login(self):
+ extr_cookies = {
+ "exhentai": ("ipb_member_id", "ipb_pass_hash"),
+ "nijie" : ("nemail", "nlogin"),
+ "sankaku" : ("login", "pass_hash"),
+ "seiga" : ("user_session",),
+ }
+ for category, cookienames in extr_cookies.items():
+ cookies = {name: "value" for name in cookienames}
+ config.set(CKEY, cookies)
+ extr = _get_extractor(category)
+ with mock.patch.object(extr, "_login_impl") as mock_login:
+ extr.login()
+ mock_login.assert_not_called()
+
+
+def _get_extractor(category):
+ for extr in extractor.extractors():
+ if extr.category == category and hasattr(extr, "_login_impl"):
+ url = next(extr._get_tests())[0]
+ return extr.from_url(url)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/test/test_downloader.py b/test/test_downloader.py
new file mode 100644
index 0000000..3f301b0
--- /dev/null
+++ b/test/test_downloader.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import re
+import base64
+import os.path
+import tempfile
+import unittest
+import threading
+import http.server
+
+import gallery_dl.downloader as downloader
+import gallery_dl.extractor as extractor
+import gallery_dl.config as config
+from gallery_dl.downloader.common import DownloaderBase
+from gallery_dl.output import NullOutput
+from gallery_dl.util import PathFormat
+
+
+class TestDownloaderBase(unittest.TestCase):
+
+ @classmethod
+ def setUpClass(cls):
+ cls.extractor = extractor.find("test:")
+ cls.dir = tempfile.TemporaryDirectory()
+ cls.fnum = 0
+ config.set(("base-directory",), cls.dir.name)
+
+ @classmethod
+ def tearDownClass(cls):
+ cls.dir.cleanup()
+ config.clear()
+
+ @classmethod
+ def _prepare_destination(cls, content=None, part=True, extension=None):
+ name = "file-{}".format(cls.fnum)
+ cls.fnum += 1
+
+ kwdict = {
+ "category": "test",
+ "subcategory": "test",
+ "filename": name,
+ "extension": extension,
+ }
+ pathfmt = PathFormat(cls.extractor)
+ pathfmt.set_directory(kwdict)
+ pathfmt.set_keywords(kwdict)
+
+ if content:
+ mode = "w" + ("b" if isinstance(content, bytes) else "")
+ with pathfmt.open(mode) as file:
+ file.write(content)
+
+ return pathfmt
+
+ def _run_test(self, url, input, output,
+ extension, expected_extension=None):
+ pathfmt = self._prepare_destination(input, extension=extension)
+ success = self.downloader.download(url, pathfmt)
+
+ # test successful download
+ self.assertTrue(success, "downloading '{}' failed".format(url))
+
+ # test content
+ mode = "r" + ("b" if isinstance(output, bytes) else "")
+ with pathfmt.open(mode) as file:
+ content = file.read()
+ self.assertEqual(content, output)
+
+ # test filename extension
+ self.assertEqual(
+ pathfmt.keywords["extension"],
+ expected_extension,
+ )
+ self.assertEqual(
+ os.path.splitext(pathfmt.realpath)[1][1:],
+ expected_extension,
+ )
+
+
+class TestHTTPDownloader(TestDownloaderBase):
+
+ @classmethod
+ def setUpClass(cls):
+ TestDownloaderBase.setUpClass()
+ cls.downloader = downloader.find("http")(cls.extractor, NullOutput())
+
+ port = 8088
+ cls.address = "http://127.0.0.1:{}".format(port)
+ cls._jpg = cls.address + "/image.jpg"
+ cls._png = cls.address + "/image.png"
+ cls._gif = cls.address + "/image.gif"
+
+ server = http.server.HTTPServer(("", port), HttpRequestHandler)
+ threading.Thread(target=server.serve_forever, daemon=True).start()
+
+ def test_http_download(self):
+ self._run_test(self._jpg, None, DATA_JPG, "jpg", "jpg")
+ self._run_test(self._png, None, DATA_PNG, "png", "png")
+ self._run_test(self._gif, None, DATA_GIF, "gif", "gif")
+
+ def test_http_offset(self):
+ self._run_test(self._jpg, DATA_JPG[:123], DATA_JPG, "jpg", "jpg")
+ self._run_test(self._png, DATA_PNG[:12] , DATA_PNG, "png", "png")
+ self._run_test(self._gif, DATA_GIF[:1] , DATA_GIF, "gif", "gif")
+
+ def test_http_extension(self):
+ self._run_test(self._jpg, None, DATA_JPG, None, "jpg")
+ self._run_test(self._png, None, DATA_PNG, None, "png")
+ self._run_test(self._gif, None, DATA_GIF, None, "gif")
+
+ def test_http_adjust_extension(self):
+ self._run_test(self._jpg, None, DATA_JPG, "png", "jpg")
+ self._run_test(self._png, None, DATA_PNG, "gif", "png")
+ self._run_test(self._gif, None, DATA_GIF, "jpg", "gif")
+
+
+class TestTextDownloader(TestDownloaderBase):
+
+ @classmethod
+ def setUpClass(cls):
+ TestDownloaderBase.setUpClass()
+ cls.downloader = downloader.find("text")(cls.extractor, NullOutput())
+
+ def test_text_download(self):
+ self._run_test("text:foobar", None, "foobar", "txt", "txt")
+
+ def test_text_offset(self):
+ self._run_test("text:foobar", "foo", "foobar", "txt", "txt")
+
+ def test_text_extension(self):
+ self._run_test("text:foobar", None, "foobar", None, "txt")
+
+ def test_text_empty(self):
+ self._run_test("text:", None, "", "txt", "txt")
+
+
+class FakeDownloader(DownloaderBase):
+ scheme = "fake"
+
+ def __init__(self, extractor, output):
+ DownloaderBase.__init__(self, extractor, output)
+
+ def connect(self, url, offset):
+ pass
+
+ def receive(self, file):
+ pass
+
+ def reset(self):
+ pass
+
+ def get_extension(self):
+ pass
+
+ @staticmethod
+ def _check_extension(file, pathfmt):
+ pass
+
+
+class HttpRequestHandler(http.server.BaseHTTPRequestHandler):
+
+ def do_GET(self):
+ if self.path == "/image.jpg":
+ content_type = "image/jpeg"
+ output = DATA_JPG
+ elif self.path == "/image.png":
+ content_type = "image/png"
+ output = DATA_PNG
+ elif self.path == "/image.gif":
+ content_type = "image/gif"
+ output = DATA_GIF
+ else:
+ self.send_response(404)
+ self.wfile.write(self.path.encode())
+ return
+
+ headers = {
+ "Content-Type": content_type,
+ "Content-Length": len(output),
+ }
+
+ if "Range" in self.headers:
+ status = 206
+
+ match = re.match(r"bytes=(\d+)-", self.headers["Range"])
+ start = int(match.group(1))
+
+ headers["Content-Range"] = "bytes {}-{}/{}".format(
+ start, len(output)-1, len(output))
+ output = output[start:]
+ else:
+ status = 200
+
+ self.send_response(status)
+ for key, value in headers.items():
+ self.send_header(key, value)
+ self.end_headers()
+ self.wfile.write(output)
+
+
+DATA_JPG = base64.standard_b64decode("""
+/9j/4AAQSkZJRgABAQEASABIAAD/2wBD
+AAEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
+AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
+AQEBAQEBAQEBAQEBAQEBAQH/2wBDAQEB
+AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
+AQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEB
+AQEBAQEBAQEBAQEBAQH/wAARCAABAAED
+AREAAhEBAxEB/8QAFAABAAAAAAAAAAAA
+AAAAAAAACv/EABQQAQAAAAAAAAAAAAAA
+AAAAAAD/xAAUAQEAAAAAAAAAAAAAAAAA
+AAAA/8QAFBEBAAAAAAAAAAAAAAAAAAAA
+AP/aAAwDAQACEQMRAD8AfwD/2Q==""")
+
+
+DATA_PNG = base64.standard_b64decode("""
+iVBORw0KGgoAAAANSUhEUgAAAAEAAAAB
+CAAAAAA6fptVAAAACklEQVQIHWP4DwAB
+AQEANl9ngAAAAABJRU5ErkJggg==""")
+
+
+DATA_GIF = base64.standard_b64decode("""
+R0lGODdhAQABAIAAAP///////ywAAAAA
+AQABAAACAkQBADs=""")
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/test/test_extractor.py b/test/test_extractor.py
new file mode 100644
index 0000000..fa0709b
--- /dev/null
+++ b/test/test_extractor.py
@@ -0,0 +1,186 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2018-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import sys
+import unittest
+import string
+
+from gallery_dl import extractor
+from gallery_dl.extractor.common import Extractor, Message
+from gallery_dl.extractor.directlink import DirectlinkExtractor as DLExtractor
+
+
+class FakeExtractor(Extractor):
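+ """Dummy extractor for 'fake:' URIs, used to test extractor registration and lookup"""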
+ category = "fake"
+ subcategory = "test"
+ pattern = "fake:"
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Url, "text:foobar", {}
+
+
+class TestExtractor(unittest.TestCase):
+ VALID_URIS = (
+ "https://example.org/file.jpg",
+ "tumblr:foobar",
+ "oauth:flickr",
+ "test:pixiv:",
+ "recursive:https://example.org/document.html",
+ )
+
+ def setUp(self):
+ extractor._cache.clear()
+ extractor._module_iter = iter(extractor.modules)
+
+ def test_find(self):
+ for uri in self.VALID_URIS:
+ result = extractor.find(uri)
+ self.assertIsInstance(result, Extractor, uri)
+
+ for not_found in ("", "/tmp/file.ext"):
+ self.assertIsNone(extractor.find(not_found))
+
+ for invalid in (None, [], {}, 123, b"test:"):
+ with self.assertRaises(TypeError):
+ extractor.find(invalid)
+
+ def test_add(self):
+ uri = "fake:foobar"
+ self.assertIsNone(extractor.find(uri))
+
+ extractor.add(FakeExtractor)
+ self.assertIsInstance(extractor.find(uri), FakeExtractor)
+
+ def test_add_module(self):
+ uri = "fake:foobar"
+ self.assertIsNone(extractor.find(uri))
+
+ classes = extractor.add_module(sys.modules[__name__])
+ self.assertEqual(len(classes), 1)
+ self.assertEqual(classes[0].pattern, FakeExtractor.pattern)
+ self.assertEqual(classes[0], FakeExtractor)
+ self.assertIsInstance(extractor.find(uri), FakeExtractor)
+
+ def test_blacklist(self):
+ link_uri = "https://example.org/file.jpg"
+ test_uri = "test:"
+ fake_uri = "fake:"
+
+ self.assertIsInstance(extractor.find(link_uri), DLExtractor)
+ self.assertIsInstance(extractor.find(test_uri), Extractor)
+ self.assertIsNone(extractor.find(fake_uri))
+
+ with extractor.blacklist(["directlink"]):
+ self.assertIsNone(extractor.find(link_uri))
+ self.assertIsInstance(extractor.find(test_uri), Extractor)
+ self.assertIsNone(extractor.find(fake_uri))
+
+ with extractor.blacklist([], [DLExtractor, FakeExtractor]):
+ self.assertIsNone(extractor.find(link_uri))
+ self.assertIsInstance(extractor.find(test_uri), Extractor)
+ self.assertIsNone(extractor.find(fake_uri))
+
+ with extractor.blacklist(["test"], [DLExtractor]):
+ self.assertIsNone(extractor.find(link_uri))
+ self.assertIsNone(extractor.find(test_uri))
+ self.assertIsNone(extractor.find(fake_uri))
+
+ def test_from_url(self):
+ for uri in self.VALID_URIS:
+ cls = extractor.find(uri).__class__
+ extr = cls.from_url(uri)
+ self.assertIs(type(extr), cls)
+ self.assertIsInstance(extr, Extractor)
+
+ for not_found in ("", "/tmp/file.ext"):
+ self.assertIsNone(FakeExtractor.from_url(not_found))
+
+ for invalid in (None, [], {}, 123, b"test:"):
+ with self.assertRaises(TypeError):
+ FakeExtractor.from_url(invalid)
+
+ def test_unique_pattern_matches(self):
+ test_urls = []
+
+ # collect testcase URLs
+ for extr in extractor.extractors():
+ for testcase in extr._get_tests():
+ test_urls.append((testcase[0], extr))
+
+ # iterate over all testcase URLs
+ for url, extr1 in test_urls:
+ matches = []
+
+ # ... and apply all regex patterns to each one
+ for extr2 in extractor._cache:
+
+ # skip DirectlinkExtractor pattern if it isn't tested
+ if extr1 != DLExtractor and extr2 == DLExtractor:
+ continue
+
+ match = extr2.pattern.match(url)
+ if match:
+ matches.append(match)
+
+ # fail unless exactly one pattern matched
+ if len(matches) > 1:
+ msg = "'{}' gets matched by more than one pattern:".format(url)
+ for match in matches:
+ msg += "\n- "
+ msg += match.re.pattern
+ self.fail(msg)
+
+ if len(matches) < 1:
+ msg = "'{}' isn't matched by any pattern".format(url)
+ self.fail(msg)
+
+ def test_docstrings(self):
+ """ensure docstring uniqueness"""
+ for extr1 in extractor.extractors():
+ for extr2 in extractor.extractors():
+ if extr1 != extr2 and extr1.__doc__ and extr2.__doc__:
+ self.assertNotEqual(
+ extr1.__doc__,
+ extr2.__doc__,
+ "{} <-> {}".format(extr1, extr2),
+ )
+
+ def test_names(self):
+ """Ensure extractor classes are named CategorySubcategoryExtractor"""
+ def capitalize(c):
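+ # convert hyphenated categories to CamelCase and drop dots before capitalizing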
+ if "-" in c:
+ return string.capwords(c.replace("-", " ")).replace(" ", "")
+ if "." in c:
+ c = c.replace(".", "")
+ return c.capitalize()
+
+ mapping = {
+ "2chan" : "futaba",
+ "3dbooru": "threedeebooru",
+ "4chan" : "fourchan",
+ "4plebs" : "fourplebs",
+ "8chan" : "infinitychan",
+ "oauth" : None,
+ }
+
+ for extr in extractor.extractors():
+ category = mapping.get(extr.category, extr.category)
+ if category:
+ expected = "{}{}Extractor".format(
+ capitalize(category),
+ capitalize(extr.subcategory),
+ )
+ if expected[0].isdigit():
+ expected = "_" + expected
+ self.assertEqual(expected, extr.__name__)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/test/test_oauth.py b/test/test_oauth.py
new file mode 100644
index 0000000..2ce5b43
--- /dev/null
+++ b/test/test_oauth.py
@@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+
+from gallery_dl import oauth, text
+
+TESTSERVER = "http://term.ie/oauth/example"
+CONSUMER_KEY = "key"
+CONSUMER_SECRET = "secret"
+REQUEST_TOKEN = "requestkey"
+REQUEST_TOKEN_SECRET = "requestsecret"
+ACCESS_TOKEN = "accesskey"
+ACCESS_TOKEN_SECRET = "accesssecret"
+
+
+class TestOAuthSession(unittest.TestCase):
+
+ def test_concat(self):
+ concat = oauth.concat
+
+ self.assertEqual(concat(), "")
+ self.assertEqual(concat("str"), "str")
+ self.assertEqual(concat("str1", "str2"), "str1&str2")
+
+ self.assertEqual(concat("&", "?/"), "%26&%3F%2F")
+ self.assertEqual(
+ concat("GET", "http://example.org/", "foo=bar&baz=a"),
+ "GET&http%3A%2F%2Fexample.org%2F&foo%3Dbar%26baz%3Da"
+ )
+
+ def test_nonce(self, size=16):
+ nonce_values = set(oauth.nonce(size) for _ in range(size))
+
+ # uniqueness
+ self.assertEqual(len(nonce_values), size)
+
+ # length
+ for nonce in nonce_values:
+ self.assertEqual(len(nonce), size)
+
+ def test_quote(self):
+ quote = oauth.quote
+
+ reserved = ",;:!\"§$%&/(){}[]=?`´+*'äöü"
+ unreserved = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789-._~")
+
+ for char in unreserved:
+ self.assertEqual(quote(char), char)
+
+ for char in reserved:
+ quoted = quote(char)
+ quoted_hex = quoted.replace("%", "")
+ self.assertTrue(quoted.startswith("%"))
+ self.assertTrue(len(quoted) >= 3)
+ self.assertEqual(quoted_hex.upper(), quoted_hex)
+
+ def test_request_token(self):
+ response = self._oauth_request(
+ "/request_token.php", {})
+ expected = "oauth_token=requestkey&oauth_token_secret=requestsecret"
+ self.assertEqual(response, expected, msg=response)
+
+ data = text.parse_query(response)
+ self.assertEqual(data["oauth_token"], REQUEST_TOKEN)
+ self.assertEqual(data["oauth_token_secret"], REQUEST_TOKEN_SECRET)
+
+ def test_access_token(self):
+ response = self._oauth_request(
+ "/access_token.php", {}, REQUEST_TOKEN, REQUEST_TOKEN_SECRET)
+ expected = "oauth_token=accesskey&oauth_token_secret=accesssecret"
+ self.assertEqual(response, expected, msg=response)
+
+ data = text.parse_query(response)
+ self.assertEqual(data["oauth_token"], ACCESS_TOKEN)
+ self.assertEqual(data["oauth_token_secret"], ACCESS_TOKEN_SECRET)
+
+ def test_authenticated_call(self):
+ params = {"method": "foo", "a": "äöüß/?&#", "äöüß/?&#": "a"}
+ response = self._oauth_request(
+ "/echo_api.php", params, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+ self.assertEqual(text.parse_query(response), params)
+
+ def _oauth_request(self, endpoint, params=None,
+ oauth_token=None, oauth_token_secret=None):
+ session = oauth.OAuth1Session(
+ CONSUMER_KEY, CONSUMER_SECRET,
+ oauth_token, oauth_token_secret,
+ )
+ url = TESTSERVER + endpoint
+ return session.get(url, params=params).text
+
+
+if __name__ == "__main__":
+ unittest.main(warnings="ignore")
diff --git a/test/test_results.py b/test/test_results.py
new file mode 100644
index 0000000..8f03f03
--- /dev/null
+++ b/test/test_results.py
@@ -0,0 +1,344 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import os
+import sys
+import re
+import json
+import hashlib
+import unittest
+from gallery_dl import extractor, util, job, config, exception
+
+
+# these don't work on Travis CI
+TRAVIS_SKIP = {
+ "exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie", "bobx",
+ "archivedmoe", "archiveofsins", "thebarchive", "fireden", "4plebs",
+ "sankaku", "idolcomplex", "mangahere", "readcomiconline", "mangadex",
+ "sankakucomplex",
+}
+
+# temporary issues, etc.
+BROKEN = {
+ "komikcast",
+ "mangapark",
+}
+
+
+class TestExtractorResults(unittest.TestCase):
+
+ def setUp(self):
+ setup_test_config()
+
+ def tearDown(self):
+ config.clear()
+
+ @classmethod
+ def setUpClass(cls):
+ cls._skipped = []
+
+ @classmethod
+ def tearDownClass(cls):
+ if cls._skipped:
+ print("\n\nSkipped tests:")
+ for url, exc in cls._skipped:
+ print('- {} ("{}")'.format(url, exc))
+
+ def _run_test(self, extr, url, result):
+ if result:
+ if "options" in result:
+ for key, value in result["options"]:
+ config.set(key.split("."), value)
+ if "range" in result:
+ config.set(("image-range",), result["range"])
+ config.set(("chapter-range",), result["range"])
+ content = "content" in result
+ else:
+ content = False
+
+ tjob = ResultJob(url, content=content)
+ self.assertEqual(extr, tjob.extractor.__class__)
+
+ if not result:
+ return
+ if "exception" in result:
+ with self.assertRaises(result["exception"]):
+ tjob.run()
+ return
+ try:
+ tjob.run()
+ except exception.StopExtraction:
+ pass
+ except exception.HttpError as exc:
+ exc = str(exc)
+ if re.match(r"5\d\d: ", exc) or \
+ re.search(r"\bRead timed out\b", exc):
+ self._skipped.append((url, exc))
+ self.skipTest(exc)
+ raise
+
+ # test archive-id uniqueness
+ self.assertEqual(len(set(tjob.list_archive)), len(tjob.list_archive))
+
+ # test '_extractor' entries
+ if tjob.queue:
+ for url, kwdict in zip(tjob.list_url, tjob.list_keyword):
+ if "_extractor" in kwdict:
+ extr = kwdict["_extractor"].from_url(url)
+ self.assertIsInstance(extr, kwdict["_extractor"])
+ self.assertEqual(extr.url, url)
+
+ # test extraction results
+ if "url" in result:
+ self.assertEqual(result["url"], tjob.hash_url.hexdigest())
+
+ if "content" in result:
+ self.assertEqual(result["content"], tjob.hash_content.hexdigest())
+
+ if "keyword" in result:
+ keyword = result["keyword"]
+ if isinstance(keyword, dict):
+ for kwdict in tjob.list_keyword:
+ self._test_kwdict(kwdict, keyword)
+ else: # assume SHA1 hash
+ self.assertEqual(keyword, tjob.hash_keyword.hexdigest())
+
+ if "count" in result:
+ count = result["count"]
+ if isinstance(count, str):
+ self.assertRegex(count, r"^ *(==|!=|<|<=|>|>=) *\d+ *$")
+ expr = "{} {}".format(len(tjob.list_url), count)
+ self.assertTrue(eval(expr), msg=expr)
+ else: # assume integer
+ self.assertEqual(len(tjob.list_url), count)
+
+ if "pattern" in result:
+ self.assertGreater(len(tjob.list_url), 0)
+ for url in tjob.list_url:
+ self.assertRegex(url, result["pattern"])
+
+ def _test_kwdict(self, kwdict, tests):
+ for key, test in tests.items():
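+ # keys prefixed with '?' are optional and only checked if present in kwdict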
+ if key.startswith("?"):
+ key = key[1:]
+ if key not in kwdict:
+ continue
+ self.assertIn(key, kwdict)
+ value = kwdict[key]
+
+ if isinstance(test, dict):
+ self._test_kwdict(value, test)
+ elif isinstance(test, type):
+ self.assertIsInstance(value, test, msg=key)
+ elif isinstance(test, str):
+ if test.startswith("re:"):
+ self.assertRegex(value, test[3:], msg=key)
+ elif test.startswith("type:"):
+ self.assertEqual(type(value).__name__, test[5:], msg=key)
+ else:
+ self.assertEqual(value, test, msg=key)
+ else:
+ self.assertEqual(value, test, msg=key)
+
+
+class ResultJob(job.DownloadJob):
+ """Generate test-results for extractor runs"""
+
+ def __init__(self, url, parent=None, content=False):
+ job.DownloadJob.__init__(self, url, parent)
+ self.queue = False
+ self.content = content
+ self.list_url = []
+ self.list_keyword = []
+ self.list_archive = []
+ self.hash_url = hashlib.sha1()
+ self.hash_keyword = hashlib.sha1()
+ self.hash_archive = hashlib.sha1()
+ self.hash_content = hashlib.sha1()
+ if content:
+ self.fileobj = TestPathfmt(self.hash_content)
+ self.get_downloader("http")._check_extension = lambda a, b: None
+
+ self.format_directory = TestFormatter(
+ "".join(self.extractor.directory_fmt))
+ self.format_filename = TestFormatter(self.extractor.filename_fmt)
+
+ def run(self):
+ for msg in self.extractor:
+ self.dispatch(msg)
+
+ def handle_url(self, url, keywords, fallback=None):
+ self.update_url(url)
+ self.update_keyword(keywords)
+ self.update_archive(keywords)
+ self.update_content(url)
+ self.format_filename.format_map(keywords)
+
+ def handle_directory(self, keywords):
+ self.update_keyword(keywords, False)
+ self.format_directory.format_map(keywords)
+
+ def handle_queue(self, url, keywords):
+ self.queue = True
+ self.update_url(url)
+ self.update_keyword(keywords)
+
+ def update_url(self, url):
+ self.list_url.append(url)
+ self.hash_url.update(url.encode())
+
+ def update_keyword(self, kwdict, to_list=True):
+ if to_list:
+ self.list_keyword.append(kwdict)
+ kwdict = self._filter(kwdict)
+ self.hash_keyword.update(
+ json.dumps(kwdict, sort_keys=True, default=str).encode())
+
+ def update_archive(self, kwdict):
+ archive_id = self.extractor.archive_fmt.format_map(kwdict)
+ self.list_archive.append(archive_id)
+ self.hash_archive.update(archive_id.encode())
+
+ def update_content(self, url):
+ if self.content:
+ scheme = url.partition(":")[0]
+ self.get_downloader(scheme).download(url, self.fileobj)
+
+
+class TestPathfmt():
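+ """Path-format stand-in that feeds written content into a hash object instead of a file"""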
+
+ def __init__(self, hashobj):
+ self.hashobj = hashobj
+ self.path = ""
+ self.size = 0
+ self.has_extension = True
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, *args):
+ pass
+
+ def open(self, mode):
+ self.size = 0
+ return self
+
+ def write(self, content):
+ """Update SHA1 hash"""
+ self.size += len(content)
+ self.hashobj.update(content)
+
+ def tell(self):
+ return self.size
+
+ def part_size(self):
+ return 0
+
+
+class TestFormatter(util.Formatter):
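+ """Formatter that renders 'extension' keys and optional format fields as empty strings"""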
+
+ @staticmethod
+ def _noop(_):
+ return ""
+
+ def _apply_simple(self, key, fmt):
+ if key == "extension" or "._format_optional." in repr(fmt):
+ return self._noop
+
+ def wrap(obj):
+ return fmt(obj[key])
+ return wrap
+
+ def _apply(self, key, funcs, fmt):
+ if key == "extension" or "._format_optional." in repr(fmt):
+ return self._noop
+
+ def wrap(obj):
+ obj = obj[key]
+ for func in funcs:
+ obj = func(obj)
+ return fmt(obj)
+ return wrap
+
+
+def setup_test_config():
+ name = "gallerydl"
+ email = "gallerydl@openaliasbox.org"
+
+ config.clear()
+ config.set(("cache", "file"), ":memory:")
+ config.set(("downloader", "part"), False)
+ config.set(("extractor", "timeout"), 60)
+ config.set(("extractor", "username"), name)
+ config.set(("extractor", "password"), name)
+ config.set(("extractor", "nijie", "username"), email)
+ config.set(("extractor", "seiga", "username"), email)
+ config.set(("extractor", "danbooru", "username"), None)
+ config.set(("extractor", "twitter" , "username"), None)
+ config.set(("extractor", "mangoxo" , "password"), "VZ8DL3983u")
+
+ config.set(("extractor", "deviantart", "client-id"), "7777")
+ config.set(("extractor", "deviantart", "client-secret"),
+ "ff14994c744d9208e5caeec7aab4a026")
+
+ config.set(("extractor", "tumblr", "api-key"),
+ "0cXoHfIqVzMQcc3HESZSNsVlulGxEXGDTTZCDrRrjaa0jmuTc6")
+ config.set(("extractor", "tumblr", "api-secret"),
+ "6wxAK2HwrXdedn7VIoZWxGqVhZ8JdYKDLjiQjL46MLqGuEtyVj")
+ config.set(("extractor", "tumblr", "access-token"),
+ "N613fPV6tOZQnyn0ERTuoEZn0mEqG8m2K8M3ClSJdEHZJuqFdG")
+ config.set(("extractor", "tumblr", "access-token-secret"),
+ "sgOA7ZTT4FBXdOGGVV331sSp0jHYp4yMDRslbhaQf7CaS71i4O")
+
+
+def generate_tests():
+ """Dynamically generate extractor unittests"""
+ def _generate_test(extr, tcase):
+ def test(self):
+ url, result = tcase
+ print("\n", url, sep="")
+ self._run_test(extr, url, result)
+ return test
+
+ # enable selective testing for direct calls
+ if __name__ == '__main__' and len(sys.argv) > 1:
+ if sys.argv[1].lower() == "all":
+ fltr = lambda c, bc: True # noqa: E731
+ elif sys.argv[1].lower() == "broken":
+ fltr = lambda c, bc: c in BROKEN # noqa: E731
+ else:
+ argv = sys.argv[1:]
+ fltr = lambda c, bc: c in argv or bc in argv # noqa: E731
+ del sys.argv[1:]
+ else:
+ skip = set(BROKEN)
+ if "CI" in os.environ and "TRAVIS" in os.environ:
+ skip |= set(TRAVIS_SKIP)
+ if skip:
+ print("skipping:", ", ".join(skip))
+ fltr = lambda c, bc: c not in skip # noqa: E731
+
+ # filter available extractor classes
+ extractors = [
+ extr for extr in extractor.extractors()
+ if fltr(extr.category, getattr(extr, "basecategory", None))
+ ]
+
+ # add 'test_...' methods
+ for extr in extractors:
+ name = "test_" + extr.__name__ + "_"
+ for num, tcase in enumerate(extr._get_tests(), 1):
+ test = _generate_test(extr, tcase)
+ test.__name__ = name + str(num)
+ setattr(TestExtractorResults, test.__name__, test)
+
+
+generate_tests()
+if __name__ == '__main__':
+ unittest.main(warnings='ignore')
diff --git a/test/test_text.py b/test/test_text.py
new file mode 100644
index 0000000..405acd3
--- /dev/null
+++ b/test/test_text.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2018 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import datetime
+
+from gallery_dl import text
+
+
+INVALID = ((), [], {}, None, 1, 2.3)
+INVALID_ALT = ((), [], {}, None, "")
+
+
+class TestText(unittest.TestCase):
+
+ def test_clean_xml(self, f=text.clean_xml):
+ # standard usage
+ self.assertEqual(f(""), "")
+ self.assertEqual(f("foo"), "foo")
+ self.assertEqual(f("\tfoo\nbar\r"), "\tfoo\nbar\r")
+ self.assertEqual(f("<foo>\ab\ba\fr\v</foo>"), "<foo>bar</foo>")
+
+ # 'repl' argument
+ repl = "#"
+ self.assertEqual(f("", repl), "")
+ self.assertEqual(f("foo", repl), "foo")
+ self.assertEqual(f("\tfoo\nbar\r", repl), "\tfoo\nbar\r")
+ self.assertEqual(
+ f("<foo>\ab\ba\fr\v</foo>", repl), "<foo>#b#a#r#</foo>")
+
+ # removal of all illegal control characters
+ value = "".join(chr(x) for x in range(32))
+ self.assertEqual(f(value), "\t\n\r")
+
+ # 'invalid' arguments
+ for value in INVALID:
+ self.assertEqual(f(value), "")
+
+ def test_remove_html(self, f=text.remove_html):
+ result = "Hello World."
+
+ # standard usage
+ self.assertEqual(f(""), "")
+ self.assertEqual(f("Hello World."), result)
+ self.assertEqual(f(" Hello World. "), result)
+ self.assertEqual(f("Hello<br/>World."), result)
+ self.assertEqual(
+ f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
+
+ # empty HTML
+ self.assertEqual(f("<div></div>"), "")
+ self.assertEqual(f(" <div> </div> "), "")
+
+ # malformed HTML
+ self.assertEqual(f("<div</div>"), "")
+ self.assertEqual(f("<div<Hello World.</div>"), "")
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), "")
+
+ def test_split_html(self, f=text.split_html):
+ result = ["Hello", "World."]
+ empty = []
+
+ # standard usage
+ self.assertEqual(f(""), empty)
+ self.assertEqual(f("Hello World."), ["Hello World."])
+ self.assertEqual(f(" Hello World. "), ["Hello World."])
+ self.assertEqual(f("Hello<br/>World."), result)
+ self.assertEqual(f(" Hello <br/> World. "), result)
+ self.assertEqual(
+ f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
+
+ # empty HTML
+ self.assertEqual(f("<div></div>"), empty)
+ self.assertEqual(f(" <div> </div> "), empty)
+
+ # malformed HTML
+ self.assertEqual(f("<div</div>"), empty)
+ self.assertEqual(f("<div<Hello World.</div>"), empty)
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), empty)
+
+ def test_filename_from_url(self, f=text.filename_from_url):
+ result = "filename.ext"
+
+ # standard usage
+ self.assertEqual(f(""), "")
+ self.assertEqual(f("filename.ext"), result)
+ self.assertEqual(f("/filename.ext"), result)
+ self.assertEqual(f("example.org/filename.ext"), result)
+ self.assertEqual(f("http://example.org/v2/filename.ext"), result)
+ self.assertEqual(
+ f("http://example.org/v2/filename.ext?param=value#frag"), result)
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), "")
+
+ def test_ext_from_url(self, f=text.ext_from_url):
+ result = "ext"
+
+ # standard usage
+ self.assertEqual(f(""), "")
+ self.assertEqual(f("filename.ext"), result)
+ self.assertEqual(f("/filename.ext"), result)
+ self.assertEqual(f("example.org/filename.ext"), result)
+ self.assertEqual(f("http://example.org/v2/filename.ext"), result)
+ self.assertEqual(
+ f("http://example.org/v2/filename.ext?param=value#frag"), result)
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), "")
+
+ def test_nameext_from_url(self, f=text.nameext_from_url):
+ empty = {"filename": "", "extension": ""}
+ result = {"filename": "filename", "extension": "ext"}
+
+ # standard usage
+ self.assertEqual(f(""), empty)
+ self.assertEqual(f("filename.ext"), result)
+ self.assertEqual(f("/filename.ext"), result)
+ self.assertEqual(f("example.org/filename.ext"), result)
+ self.assertEqual(f("http://example.org/v2/filename.ext"), result)
+ self.assertEqual(
+ f("http://example.org/v2/filename.ext?param=value#frag"), result)
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), empty)
+
+ def test_clean_path_windows(self, f=text.clean_path_windows):
+ self.assertEqual(f(""), "")
+ self.assertEqual(f("foo"), "foo")
+ self.assertEqual(f("foo/bar"), "foo_bar")
+ self.assertEqual(f("foo<>:\"\\/|?*bar"), "foo_________bar")
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), "")
+
+ def test_clean_path_posix(self, f=text.clean_path_posix):
+ self.assertEqual(f(""), "")
+ self.assertEqual(f("foo"), "foo")
+ self.assertEqual(f("foo/bar"), "foo_bar")
+ self.assertEqual(f("foo<>:\"\\/|?*bar"), "foo<>:\"\\_|?*bar")
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), "")
+
+ def test_extract(self, f=text.extract):
+ txt = "<a><b>"
+ self.assertEqual(f(txt, "<", ">"), ("a" , 3))
+ self.assertEqual(f(txt, "X", ">"), (None, 0))
+ self.assertEqual(f(txt, "<", "X"), (None, 0))
+
+ # 'pos' argument
+ for i in range(1, 4):
+ self.assertEqual(f(txt, "<", ">", i), ("b", 6))
+ for i in range(4, 10):
+ self.assertEqual(f(txt, "<", ">", i), (None, i))
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value, "<" , ">") , (None, 0))
+ self.assertEqual(f(txt , value, ">") , (None, 0))
+ self.assertEqual(f(txt , "<" , value), (None, 0))
+
+ def test_rextract(self, f=text.rextract):
+ txt = "<a><b>"
+ self.assertEqual(f(txt, "<", ">"), ("b" , 3))
+ self.assertEqual(f(txt, "X", ">"), (None, -1))
+ self.assertEqual(f(txt, "<", "X"), (None, -1))
+
+ # 'pos' argument
+ for i in range(10, 3, -1):
+ self.assertEqual(f(txt, "<", ">", i), ("b", 3))
+ for i in range(3, 0, -1):
+ self.assertEqual(f(txt, "<", ">", i), ("a", 0))
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value, "<" , ">") , (None, -1))
+ self.assertEqual(f(txt , value, ">") , (None, -1))
+ self.assertEqual(f(txt , "<" , value), (None, -1))
+
+ def test_extract_all(self, f=text.extract_all):
+ txt = "[c][b][a]: xyz! [d][e"
+
+ self.assertEqual(
+ f(txt, ()), ({}, 0))
+ self.assertEqual(
+ f(txt, (("C", "[", "]"), ("B", "[", "]"), ("A", "[", "]"))),
+ ({"A": "a", "B": "b", "C": "c"}, 9),
+ )
+
+ # 'None' as field name
+ self.assertEqual(
+ f(txt, ((None, "[", "]"), (None, "[", "]"), ("A", "[", "]"))),
+ ({"A": "a"}, 9),
+ )
+ self.assertEqual(
+ f(txt, ((None, "[", "]"), (None, "[", "]"), (None, "[", "]"))),
+ ({}, 9),
+ )
+
+ # failed matches
+ self.assertEqual(
+ f(txt, (("C", "[", "]"), ("X", "X", "X"), ("B", "[", "]"))),
+ ({"B": "b", "C": "c", "X": None}, 6),
+ )
+
+ # 'pos' argument
+ self.assertEqual(
+ f(txt, (("B", "[", "]"), ("A", "[", "]")), pos=1),
+ ({"A": "a", "B": "b"}, 9),
+ )
+
+ # 'values' argument
+ self.assertEqual(
+ f(txt, (("C", "[", "]"),), values={"A": "a", "B": "b"}),
+ ({"A": "a", "B": "b", "C": "c"}, 3),
+ )
+
+ vdict = {}
+ rdict, pos = f(txt, (), values=vdict)
+ self.assertIs(vdict, rdict)
+
+ def test_extract_iter(self, f=text.extract_iter):
+ txt = "[c][b][a]: xyz! [d][e"
+
+ def g(*args):
+ return list(f(*args))
+
+ self.assertEqual(
+ g("", "[", "]"), [])
+ self.assertEqual(
+ g("[a]", "[", "]"), ["a"])
+ self.assertEqual(
+ g(txt, "[", "]"), ["c", "b", "a", "d"])
+ self.assertEqual(
+ g(txt, "X", "X"), [])
+ self.assertEqual(
+ g(txt, "[", "]", 6), ["a", "d"])
+
+ def test_extract_from(self, f=text.extract_from):
+ txt = "[c][b][a]: xyz! [d][e"
+
+ e = f(txt)
+ self.assertEqual(e("[", "]"), "c")
+ self.assertEqual(e("[", "]"), "b")
+ self.assertEqual(e("[", "]"), "a")
+ self.assertEqual(e("[", "]"), "d")
+ self.assertEqual(e("[", "]"), "")
+ self.assertEqual(e("[", "]"), "")
+
+ e = f(txt, pos=6, default="END")
+ self.assertEqual(e("[", "]"), "a")
+ self.assertEqual(e("[", "]"), "d")
+ self.assertEqual(e("[", "]"), "END")
+ self.assertEqual(e("[", "]"), "END")
+
+ def test_parse_unicode_escapes(self, f=text.parse_unicode_escapes):
+ self.assertEqual(f(""), "")
+ self.assertEqual(f("foobar"), "foobar")
+ self.assertEqual(f("foo’bar"), "foo’bar")
+ self.assertEqual(f("foo\\u2019bar"), "foo’bar")
+ self.assertEqual(f("foo\\u201bar"), "foo‛ar")
+ self.assertEqual(f("foo\\u201zar"), "foo\\u201zar")
+ self.assertEqual(
+ f("\\u2018foo\\u2019\\u2020bar\\u00ff"),
+ "‘foo’†barÿ",
+ )
+
+ def test_parse_bytes(self, f=text.parse_bytes):
+ self.assertEqual(f("0"), 0)
+ self.assertEqual(f("50"), 50)
+ self.assertEqual(f("50k"), 50 * 1024**1)
+ self.assertEqual(f("50m"), 50 * 1024**2)
+ self.assertEqual(f("50g"), 50 * 1024**3)
+ self.assertEqual(f("50t"), 50 * 1024**4)
+ self.assertEqual(f("50p"), 50 * 1024**5)
+
+ # fractions
+ self.assertEqual(f("123.456"), 123)
+ self.assertEqual(f("123.567"), 124)
+ self.assertEqual(f("0.5M"), round(0.5 * 1024**2))
+
+ # invalid arguments
+ for value in INVALID_ALT:
+ self.assertEqual(f(value), 0)
+ self.assertEqual(f("NaN"), 0)
+ self.assertEqual(f("invalid"), 0)
+ self.assertEqual(f(" 123 kb "), 0)
+
+ def test_parse_int(self, f=text.parse_int):
+ self.assertEqual(f(0), 0)
+ self.assertEqual(f("0"), 0)
+ self.assertEqual(f(123), 123)
+ self.assertEqual(f("123"), 123)
+
+ # invalid arguments
+ for value in INVALID_ALT:
+ self.assertEqual(f(value), 0)
+ self.assertEqual(f("123.456"), 0)
+ self.assertEqual(f("zzz"), 0)
+ self.assertEqual(f([1, 2, 3]), 0)
+ self.assertEqual(f({1: 2, 3: 4}), 0)
+
+ # 'default' argument
+ default = "default"
+ for value in INVALID_ALT:
+ self.assertEqual(f(value, default), default)
+ self.assertEqual(f("zzz", default), default)
+
+ def test_parse_float(self, f=text.parse_float):
+ self.assertEqual(f(0), 0.0)
+ self.assertEqual(f("0"), 0.0)
+ self.assertEqual(f(123), 123.0)
+ self.assertEqual(f("123"), 123.0)
+ self.assertEqual(f(123.456), 123.456)
+ self.assertEqual(f("123.456"), 123.456)
+
+ # invalid arguments
+ for value in INVALID_ALT:
+ self.assertEqual(f(value), 0.0)
+ self.assertEqual(f("zzz"), 0.0)
+ self.assertEqual(f([1, 2, 3]), 0.0)
+ self.assertEqual(f({1: 2, 3: 4}), 0.0)
+
+ # 'default' argument
+ default = "default"
+ for value in INVALID_ALT:
+ self.assertEqual(f(value, default), default)
+ self.assertEqual(f("zzz", default), default)
+
+ def test_parse_query(self, f=text.parse_query):
+ # standard usage
+ self.assertEqual(f(""), {})
+ self.assertEqual(f("foo=1"), {"foo": "1"})
+ self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"})
+
+ # missing value
+ self.assertEqual(f("bar"), {})
+ self.assertEqual(f("foo=1&bar"), {"foo": "1"})
+ self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"})
+
+ # keys with identical names
+ self.assertEqual(f("foo=1&foo=2"), {"foo": "1"})
+ self.assertEqual(
+ f("foo=1&bar=2&foo=3&bar=4"),
+ {"foo": "1", "bar": "2"},
+ )
+
+ # invalid arguments
+ for value in INVALID:
+ self.assertEqual(f(value), {})
+
+ def test_parse_timestamp(self, f=text.parse_timestamp):
+ null = datetime.datetime.utcfromtimestamp(0)
+ value = datetime.datetime.utcfromtimestamp(1555816235)
+
+ self.assertEqual(f(0) , null)
+ self.assertEqual(f("0") , null)
+ self.assertEqual(f(1555816235) , value)
+ self.assertEqual(f("1555816235"), value)
+
+ for value in INVALID_ALT:
+ self.assertEqual(f(value), None)
+ self.assertEqual(f(value, "foo"), "foo")
+
+ def test_parse_datetime(self, f=text.parse_datetime):
+ null = datetime.datetime.utcfromtimestamp(0)
+
+ self.assertEqual(f("1970-01-01T00:00:00+00:00"), null)
+ self.assertEqual(f("1970-01-01T00:00:00+0000") , null)
+ self.assertEqual(f("1970.01.01", "%Y.%m.%d") , null)
+
+ self.assertEqual(
+ f("2019-05-07T21:25:02+09:00"),
+ datetime.datetime(2019, 5, 7, 12, 25, 2),
+ )
+ self.assertEqual(
+ f("2019-05-07T21:25:02+0900"),
+ datetime.datetime(2019, 5, 7, 12, 25, 2),
+ )
+ self.assertEqual(
+ f("2019-05-07 21:25:02"),
+ "2019-05-07 21:25:02",
+ )
+
+ for value in INVALID:
+ self.assertEqual(f(value), None)
+ self.assertEqual(f("1970.01.01"), "1970.01.01")
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_util.py b/test/test_util.py
new file mode 100644
index 0000000..815b2d8
--- /dev/null
+++ b/test/test_util.py
@@ -0,0 +1,395 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import sys
+import random
+import string
+
+from gallery_dl import util, text, exception
+
+
+class TestRange(unittest.TestCase):
+
+ def test_parse_range(self, f=util.RangePredicate.parse_range):
+ self.assertEqual(
+ f(""),
+ [])
+ self.assertEqual(
+ f("1-2"),
+ [(1, 2)])
+ self.assertEqual(
+ f("-"),
+ [(1, sys.maxsize)])
+ self.assertEqual(
+ f("-2,4,6-8,10-"),
+ [(1, 2), (4, 4), (6, 8), (10, sys.maxsize)])
+ self.assertEqual(
+ f(" - 3 , 4- 4, 2-6"),
+ [(1, 3), (4, 4), (2, 6)])
+
+ def test_optimize_range(self, f=util.RangePredicate.optimize_range):
+ self.assertEqual(
+ f([]),
+ [])
+ self.assertEqual(
+ f([(2, 4)]),
+ [(2, 4)])
+ self.assertEqual(
+ f([(2, 4), (6, 8), (10, 12)]),
+ [(2, 4), (6, 8), (10, 12)])
+ self.assertEqual(
+ f([(2, 4), (4, 6), (5, 8)]),
+ [(2, 8)])
+ self.assertEqual(
+ f([(1, 1), (2, 2), (3, 6), (8, 9)]),
+ [(1, 6), (8, 9)])
+
+
+class TestPredicate(unittest.TestCase):
+
+ def test_range_predicate(self):
+ dummy = None
+
+ pred = util.RangePredicate(" - 3 , 4- 4, 2-6")
+ for i in range(6):
+ self.assertTrue(pred(dummy, dummy))
+ with self.assertRaises(exception.StopExtraction):
+ bool(pred(dummy, dummy))
+
+ pred = util.RangePredicate("1, 3, 5")
+ self.assertTrue(pred(dummy, dummy))
+ self.assertFalse(pred(dummy, dummy))
+ self.assertTrue(pred(dummy, dummy))
+ self.assertFalse(pred(dummy, dummy))
+ self.assertTrue(pred(dummy, dummy))
+ with self.assertRaises(exception.StopExtraction):
+ bool(pred(dummy, dummy))
+
+ pred = util.RangePredicate("")
+ with self.assertRaises(exception.StopExtraction):
+ bool(pred(dummy, dummy))
+
+ def test_unique_predicate(self):
+ dummy = None
+ pred = util.UniquePredicate()
+
+ # no duplicates
+ self.assertTrue(pred("1", dummy))
+ self.assertTrue(pred("2", dummy))
+ self.assertFalse(pred("1", dummy))
+ self.assertFalse(pred("2", dummy))
+ self.assertTrue(pred("3", dummy))
+ self.assertFalse(pred("3", dummy))
+
+ # duplicates for "text:"
+ self.assertTrue(pred("text:123", dummy))
+ self.assertTrue(pred("text:123", dummy))
+ self.assertTrue(pred("text:123", dummy))
+
+ def test_filter_predicate(self):
+ url = ""
+
+ pred = util.FilterPredicate("a < 3")
+ self.assertTrue(pred(url, {"a": 2}))
+ self.assertFalse(pred(url, {"a": 3}))
+
+ with self.assertRaises(SyntaxError):
+ util.FilterPredicate("(")
+
+ with self.assertRaises(exception.FilterError):
+ util.FilterPredicate("a > 1")(url, {"a": None})
+
+ with self.assertRaises(exception.FilterError):
+ util.FilterPredicate("b > 1")(url, {"a": 2})
+
+ def test_build_predicate(self):
+ pred = util.build_predicate([])
+ self.assertIsInstance(pred, type(lambda: True))
+
+ pred = util.build_predicate([util.UniquePredicate()])
+ self.assertIsInstance(pred, util.UniquePredicate)
+
+ pred = util.build_predicate([util.UniquePredicate(),
+ util.UniquePredicate()])
+ self.assertIsInstance(pred, util.ChainPredicate)
+
+
+class TestISO639_1(unittest.TestCase):
+
+ def test_code_to_language(self):
+ d = "default"
+ self._run_test(util.code_to_language, {
+ ("en",): "English",
+ ("FR",): "French",
+ ("xx",): None,
+ ("" ,): None,
+ (None,): None,
+ ("en", d): "English",
+ ("FR", d): "French",
+ ("xx", d): d,
+ ("" , d): d,
+ (None, d): d,
+ })
+
+ def test_language_to_code(self):
+ d = "default"
+ self._run_test(util.language_to_code, {
+ ("English",): "en",
+ ("fRENch",): "fr",
+ ("xx",): None,
+ ("" ,): None,
+ (None,): None,
+ ("English", d): "en",
+ ("fRENch", d): "fr",
+ ("xx", d): d,
+ ("" , d): d,
+ (None, d): d,
+ })
+
+ def _run_test(self, func, tests):
+ for args, result in tests.items():
+ self.assertEqual(func(*args), result)
+
+
+class TestFormatter(unittest.TestCase):
+
+ kwdict = {
+ "a": "hElLo wOrLd",
+ "b": "äöü",
+ "l": ["a", "b", "c"],
+ "n": None,
+ "u": "%27%3C%20/%20%3E%27",
+ "name": "Name",
+ "title1": "Title",
+ "title2": "",
+ "title3": None,
+ "title4": 0,
+ }
+
+ def test_conversions(self):
+ self._run_test("{a!l}", "hello world")
+ self._run_test("{a!u}", "HELLO WORLD")
+ self._run_test("{a!c}", "Hello world")
+ self._run_test("{a!C}", "Hello World")
+ self._run_test("{a!U}", self.kwdict["a"])
+ self._run_test("{u!U}", "'< / >'")
+ self._run_test("{a!s}", self.kwdict["a"])
+ self._run_test("{a!r}", "'" + self.kwdict["a"] + "'")
+ self._run_test("{a!a}", "'" + self.kwdict["a"] + "'")
+ self._run_test("{b!a}", "'\\xe4\\xf6\\xfc'")
+ self._run_test("{a!S}", self.kwdict["a"])
+ self._run_test("{l!S}", "a, b, c")
+ self._run_test("{n!S}", "")
+ with self.assertRaises(KeyError):
+ self._run_test("{a!q}", "hello world")
+
+ def test_optional(self):
+ self._run_test("{name}{title1}", "NameTitle")
+ self._run_test("{name}{title1:?//}", "NameTitle")
+ self._run_test("{name}{title1:? **/''/}", "Name **Title''")
+
+ self._run_test("{name}{title2}", "Name")
+ self._run_test("{name}{title2:?//}", "Name")
+ self._run_test("{name}{title2:? **/''/}", "Name")
+
+ self._run_test("{name}{title3}", "NameNone")
+ self._run_test("{name}{title3:?//}", "Name")
+ self._run_test("{name}{title3:? **/''/}", "Name")
+
+ self._run_test("{name}{title4}", "Name0")
+ self._run_test("{name}{title4:?//}", "Name")
+ self._run_test("{name}{title4:? **/''/}", "Name")
+
+ def test_missing(self):
+ replacement = "None"
+
+ self._run_test("{missing}", replacement)
+ self._run_test("{missing.attr}", replacement)
+ self._run_test("{missing[key]}", replacement)
+ self._run_test("{missing:?a//}", "")
+
+ self._run_test("{name[missing]}", replacement)
+ self._run_test("{name[missing].attr}", replacement)
+ self._run_test("{name[missing][key]}", replacement)
+ self._run_test("{name[missing]:?a//}", "")
+
+ def test_missing_custom_default(self):
+ replacement = default = "foobar"
+ self._run_test("{missing}" , replacement, default)
+ self._run_test("{missing.attr}", replacement, default)
+ self._run_test("{missing[key]}", replacement, default)
+ self._run_test("{missing:?a//}", "a" + default, default)
+
+ def test_slicing(self):
+ v = self.kwdict["a"]
+ self._run_test("{a[1:10]}" , v[1:10])
+ self._run_test("{a[-10:-1]}", v[-10:-1])
+ self._run_test("{a[5:]}" , v[5:])
+ self._run_test("{a[50:]}", v[50:])
+ self._run_test("{a[:5]}" , v[:5])
+ self._run_test("{a[:50]}", v[:50])
+ self._run_test("{a[:]}" , v)
+ self._run_test("{a[1:10:2]}" , v[1:10:2])
+ self._run_test("{a[-10:-1:2]}", v[-10:-1:2])
+ self._run_test("{a[5::2]}" , v[5::2])
+ self._run_test("{a[50::2]}", v[50::2])
+ self._run_test("{a[:5:2]}" , v[:5:2])
+ self._run_test("{a[:50:2]}", v[:50:2])
+ self._run_test("{a[::]}" , v)
+
+ def test_maxlen(self):
+ v = self.kwdict["a"]
+ self._run_test("{a:L5/foo/}" , "foo")
+ self._run_test("{a:L50/foo/}", v)
+ self._run_test("{a:L50/foo/>50}", " " * 39 + v)
+ self._run_test("{a:L50/foo/>51}", "foo")
+ self._run_test("{a:Lab/foo/}", "foo")
+
+ def test_join(self):
+ self._run_test("{l:J}" , "abc")
+ self._run_test("{l:J,}" , "a,b,c")
+ self._run_test("{l:J,/}" , "a,b,c")
+ self._run_test("{l:J,/>20}" , " a,b,c")
+ self._run_test("{l:J - }" , "a - b - c")
+ self._run_test("{l:J - /}" , "a - b - c")
+ self._run_test("{l:J - />20}", " a - b - c")
+
+ self._run_test("{a:J/}" , self.kwdict["a"])
+ self._run_test("{a:J, /}" , ", ".join(self.kwdict["a"]))
+
+ def test_replace(self):
+ self._run_test("{a:Rh/C/}" , "CElLo wOrLd")
+ self._run_test("{a!l:Rh/C/}", "Cello world")
+ self._run_test("{a!u:Rh/C/}", "HELLO WORLD")
+
+ self._run_test("{a!l:Rl/_/}", "he__o wor_d")
+ self._run_test("{a!l:Rl//}" , "heo word")
+ self._run_test("{name:Rame/othing/}", "Nothing")
+
+ def _run_test(self, format_string, result, default=None):
+ formatter = util.Formatter(format_string, default)
+ output = formatter.format_map(self.kwdict)
+ self.assertEqual(output, result, format_string)
+
+
+class TestOther(unittest.TestCase):
+
+ def test_bencode(self):
+ self.assertEqual(util.bencode(0), "")
+ self.assertEqual(util.bencode(123), "123")
+ self.assertEqual(util.bencode(123, "01"), "1111011")
+ self.assertEqual(util.bencode(123, "BA"), "AAAABAA")
+
+ def test_bdecode(self):
+ self.assertEqual(util.bdecode(""), 0)
+ self.assertEqual(util.bdecode("123"), 123)
+ self.assertEqual(util.bdecode("1111011", "01"), 123)
+ self.assertEqual(util.bdecode("AAAABAA", "BA"), 123)
+
+ def test_bencode_bdecode(self):
+ for _ in range(100):
+ value = random.randint(0, 1000000)
+ for alphabet in ("01", "0123456789", string.ascii_letters):
+ result = util.bdecode(util.bencode(value, alphabet), alphabet)
+ self.assertEqual(result, value)
+
+ def test_advance(self):
+ items = range(5)
+
+ self.assertCountEqual(
+ util.advance(items, 0), items)
+ self.assertCountEqual(
+ util.advance(items, 3), range(3, 5))
+ self.assertCountEqual(
+ util.advance(items, 9), [])
+ self.assertCountEqual(
+ util.advance(util.advance(items, 1), 2), range(3, 5))
+
+ def test_raises(self):
+ func = util.raises(Exception())
+ with self.assertRaises(Exception):
+ func()
+
+ func = util.raises(ValueError(1))
+ with self.assertRaises(ValueError):
+ func()
+ with self.assertRaises(ValueError):
+ func()
+ with self.assertRaises(ValueError):
+ func()
+
+ def test_combine_dict(self):
+ self.assertEqual(
+ util.combine_dict({}, {}),
+ {})
+ self.assertEqual(
+ util.combine_dict({1: 1, 2: 2}, {2: 4, 4: 8}),
+ {1: 1, 2: 4, 4: 8})
+ self.assertEqual(
+ util.combine_dict(
+ {1: {11: 22, 12: 24}, 2: {13: 26, 14: 28}},
+ {1: {11: 33, 13: 39}, 2: "str"}),
+ {1: {11: 33, 12: 24, 13: 39}, 2: "str"})
+ self.assertEqual(
+ util.combine_dict(
+ {1: {2: {3: {4: {"1": "a", "2": "b"}}}}},
+ {1: {2: {3: {4: {"1": "A", "3": "C"}}}}}),
+ {1: {2: {3: {4: {"1": "A", "2": "b", "3": "C"}}}}})
+
+ def test_transform_dict(self):
+ d = {}
+ util.transform_dict(d, str)
+ self.assertEqual(d, {})
+
+ d = {1: 123, 2: "123", 3: True, 4: None}
+ util.transform_dict(d, str)
+ self.assertEqual(
+ d, {1: "123", 2: "123", 3: "True", 4: "None"})
+
+ d = {1: 123, 2: "123", 3: "foo", 4: {11: 321, 12: "321", 13: "bar"}}
+ util.transform_dict(d, text.parse_int)
+ self.assertEqual(
+ d, {1: 123, 2: 123, 3: 0, 4: {11: 321, 12: 321, 13: 0}})
+
+ def test_number_to_string(self, f=util.number_to_string):
+ self.assertEqual(f(1) , "1")
+ self.assertEqual(f(1.0) , "1.0")
+ self.assertEqual(f("1.0") , "1.0")
+ self.assertEqual(f([1]) , [1])
+ self.assertEqual(f({1: 2}), {1: 2})
+ self.assertEqual(f(True) , True)
+ self.assertEqual(f(None) , None)
+
+ def test_to_string(self, f=util.to_string):
+ self.assertEqual(f(1) , "1")
+ self.assertEqual(f(1.0) , "1.0")
+ self.assertEqual(f("1.0"), "1.0")
+
+ self.assertEqual(f("") , "")
+ self.assertEqual(f(None) , "")
+ self.assertEqual(f(0) , "")
+
+ self.assertEqual(f(["a"]), "a")
+ self.assertEqual(f([1]) , "1")
+ self.assertEqual(f(["a", "b", "c"]), "a, b, c")
+ self.assertEqual(f([1, 2, 3]), "1, 2, 3")
+
+ def test_universal_none(self):
+ obj = util.NONE
+
+ self.assertFalse(obj)
+ self.assertEqual(str(obj), str(None))
+ self.assertEqual(repr(obj), repr(None))
+ self.assertIs(obj.attr, obj)
+ self.assertIs(obj["key"], obj)
+
+
+if __name__ == '__main__':
+ unittest.main()