author    Unit 193 <unit193@unit193.net>  2021-03-13 16:26:30 -0500
committer Unit 193 <unit193@unit193.net>  2021-03-13 16:26:30 -0500
commit    3201d77a148367d739862b4f07868a76eaeb7cb1 (patch)
tree      78b8d71633ec000672a84ad0bbbddd0513ae2d30
parent    fc83315c164afd74734adf27e0f7fec2011904aa (diff)

New upstream version 1.17.0 (tag: upstream/1.17.0)
-rw-r--r--  CHANGELOG.md                               |   37
-rw-r--r--  PKG-INFO                                   |   73
-rw-r--r--  README.rst                                 |   71
-rw-r--r--  data/completion/_gallery-dl                |    5
-rw-r--r--  data/completion/gallery-dl                 |    2
-rw-r--r--  data/man/gallery-dl.1                      |   10
-rw-r--r--  data/man/gallery-dl.conf.5                 |  166
-rw-r--r--  docs/gallery-dl.conf                       |  164
-rw-r--r--  gallery_dl.egg-info/PKG-INFO               |   73
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt            |    6
-rw-r--r--  gallery_dl/__init__.py                     |   25
-rw-r--r--  gallery_dl/cloudflare.py                   |  201
-rw-r--r--  gallery_dl/downloader/__init__.py          |   16
-rw-r--r--  gallery_dl/downloader/http.py              |    8
-rw-r--r--  gallery_dl/downloader/ytdl.py              |   16
-rw-r--r--  gallery_dl/extractor/500px.py              |   16
-rw-r--r--  gallery_dl/extractor/__init__.py           |   11
-rw-r--r--  gallery_dl/extractor/booru.py              |  201
-rw-r--r--  gallery_dl/extractor/common.py             |  293
-rw-r--r--  gallery_dl/extractor/cyberdrop.py          |   58
-rw-r--r--  gallery_dl/extractor/deviantart.py         |   10
-rw-r--r--  gallery_dl/extractor/erome.py              |   15
-rw-r--r--  gallery_dl/extractor/exhentai.py           |  116
-rw-r--r--  gallery_dl/extractor/foolfuuka.py          |  232
-rw-r--r--  gallery_dl/extractor/foolslide.py          |  190
-rw-r--r--  gallery_dl/extractor/gelbooru.py           |   14
-rw-r--r--  gallery_dl/extractor/gelbooru_v01.py       |  143
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py       |  194
-rw-r--r--  gallery_dl/extractor/hentaicafe.py         |  103
-rw-r--r--  gallery_dl/extractor/hentainexus.py        |   10
-rw-r--r--  gallery_dl/extractor/idolcomplex.py        |   15
-rw-r--r--  gallery_dl/extractor/imgur.py              |    2
-rw-r--r--  gallery_dl/extractor/instagram.py          |  144
-rw-r--r--  gallery_dl/extractor/komikcast.py          |    2
-rw-r--r--  gallery_dl/extractor/mangadex.py           |    8
-rw-r--r--  gallery_dl/extractor/mastodon.py           |  216
-rw-r--r--  gallery_dl/extractor/message.py            |    4
-rw-r--r--  gallery_dl/extractor/moebooru.py           |  245
-rw-r--r--  gallery_dl/extractor/naverwebtoon.py       |  128
-rw-r--r--  gallery_dl/extractor/oauth.py              |   80
-rw-r--r--  gallery_dl/extractor/patreon.py            |    9
-rw-r--r--  gallery_dl/extractor/pixiv.py              |    5
-rw-r--r--  gallery_dl/extractor/reactor.py            |   23
-rw-r--r--  gallery_dl/extractor/readcomiconline.py    |    5
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py     |   11
-rw-r--r--  gallery_dl/extractor/shopify.py            |   79
-rw-r--r--  gallery_dl/extractor/tumblrgallery.py      |  149
-rw-r--r--  gallery_dl/extractor/twitter.py            |   80
-rw-r--r--  gallery_dl/extractor/unsplash.py           |    6
-rw-r--r--  gallery_dl/extractor/wallhaven.py          |  146
-rw-r--r--  gallery_dl/job.py                          |   71
-rw-r--r--  gallery_dl/option.py                       |   16
-rw-r--r--  gallery_dl/postprocessor/__init__.py       |   14
-rw-r--r--  gallery_dl/postprocessor/exec.py           |    6
-rw-r--r--  gallery_dl/postprocessor/metadata.py       |    6
-rw-r--r--  gallery_dl/util.py                         |   44
-rw-r--r--  gallery_dl/version.py                      |    2
-rw-r--r--  test/test_downloader.py                    |    8
-rw-r--r--  test/test_extractor.py                     |    4
-rw-r--r--  test/test_postprocessor.py                 |    4
-rw-r--r--  test/test_results.py                       |   12
-rw-r--r--  test/test_util.py                          |   12
62 files changed, 2396 insertions(+), 1639 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 893b944..ef4148a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,42 @@
# Changelog
+## 1.17.0 - 2021-03-05
+### Additions
+- [cyberdrop] add support for `https://cyberdrop.me/` ([#1328](https://github.com/mikf/gallery-dl/issues/1328))
+- [exhentai] add `metadata` option; extract more metadata from gallery pages ([#1325](https://github.com/mikf/gallery-dl/issues/1325))
+- [hentaicafe] add `search` and `tag` extractors ([#1345](https://github.com/mikf/gallery-dl/issues/1345))
+- [hentainexus] add `original` option ([#1322](https://github.com/mikf/gallery-dl/issues/1322))
+- [instagram] support `/user/reels/` URLs ([#1329](https://github.com/mikf/gallery-dl/issues/1329))
+- [naverwebtoon] add support for `https://comic.naver.com/` ([#1331](https://github.com/mikf/gallery-dl/issues/1331))
+- [pixiv] add `translated-tags` option ([#1354](https://github.com/mikf/gallery-dl/issues/1354))
+- [tbib] add support for `https://tbib.org/` ([#473](https://github.com/mikf/gallery-dl/issues/473), [#1082](https://github.com/mikf/gallery-dl/issues/1082))
+- [tumblrgallery] add support for `https://tumblrgallery.xyz/` ([#1298](https://github.com/mikf/gallery-dl/issues/1298))
+- [twitter] add extractor for followed users ([#1337](https://github.com/mikf/gallery-dl/issues/1337))
+- [twitter] add option to download all media from conversations ([#1319](https://github.com/mikf/gallery-dl/issues/1319))
+- [wallhaven] add `collections` extractor ([#1351](https://github.com/mikf/gallery-dl/issues/1351))
+- [snap] allow access to user's .netrc for site authentication ([#1352](https://github.com/mikf/gallery-dl/issues/1352))
+- add extractors for Gelbooru v0.1 sites ([#234](https://github.com/mikf/gallery-dl/issues/234), [#426](https://github.com/mikf/gallery-dl/issues/426), [#473](https://github.com/mikf/gallery-dl/issues/473), [#767](https://github.com/mikf/gallery-dl/issues/767), [#1238](https://github.com/mikf/gallery-dl/issues/1238))
+- add `-E/--extractor-info` command-line option ([#875](https://github.com/mikf/gallery-dl/issues/875))
+- add GitHub Actions workflow for building standalone executables ([#1312](https://github.com/mikf/gallery-dl/issues/1312))
+- add `browser` and `headers` options ([#1117](https://github.com/mikf/gallery-dl/issues/1117))
+- add option to use different youtube-dl forks ([#1330](https://github.com/mikf/gallery-dl/issues/1330))
+- support using multiple input files at once ([#1353](https://github.com/mikf/gallery-dl/issues/1353))
+### Changes
+- [deviantart] extend `extra` option to also download embedded DeviantArt posts.
+- [exhentai] rename metadata fields to match API results ([#1325](https://github.com/mikf/gallery-dl/issues/1325))
+- [mangadex] use `api.mangadex.org` as default API server
+- [mastodon] cache OAuth tokens ([#616](https://github.com/mikf/gallery-dl/issues/616))
+- replace `wait-min` and `wait-max` with `sleep-request`
+### Fixes
+- [500px] skip unavailable photos ([#1335](https://github.com/mikf/gallery-dl/issues/1335))
+- [komikcast] fix extraction
+- [readcomiconline] download high quality image versions ([#1347](https://github.com/mikf/gallery-dl/issues/1347))
+- [twitter] update GraphQL endpoints
+- fix crash when `base-directory` is an empty string ([#1339](https://github.com/mikf/gallery-dl/issues/1339))
+### Removals
+- remove support for formerly deprecated options
+- remove `cloudflare` module
+
## 1.16.5 - 2021-02-14
### Additions
- [behance] support `video` modules ([#1282](https://github.com/mikf/gallery-dl/issues/1282))
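
Most of the 1.17.0 additions above are new configuration knobs. Below is a minimal sketch of a config exercising several of them at once — the option names come from this release's documentation, while the target path is merely one of gallery-dl's default config locations:

    # Hedged sketch: writes a gallery-dl JSON config using options added or
    # changed in 1.17.0; adjust values and path to taste.
    import json
    import os

    config = {
        "extractor": {
            "browser": "firefox",                  # emulate browser headers/ciphers
            "sleep-request": 5.0,                  # replaces wait-min/wait-max
            "pixiv": {"translated-tags": True},
            "twitter": {"conversations": True},
            "hentainexus": {"original": True},
        },
        "downloader": {
            "http": {"headers": {"Referer": "https://example.org/"}},
            "ytdl": {"module": "youtube_dl"},      # or another youtube-dl fork
        },
    }

    path = os.path.expanduser("~/.config/gallery-dl/config.json")
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as file:
        json.dump(config, file, indent=4)
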
diff --git a/PKG-INFO b/PKG-INFO
index a89521e..7a9a43a 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.16.5
+Version: 1.17.0
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -21,6 +21,8 @@ Description: ==========
|pypi| |build| |gitter|
+ .. contents::
+
Dependencies
============
@@ -38,6 +40,7 @@ Description: ==========
Installation
============
+
Pip
---
@@ -57,48 +60,26 @@ Description: ==========
Note: Windows users should use :code:`py -3` instead of :code:`python3`.
- | It is advised to use the latest version of pip_,
- including the essential packages :code:`setuptools` and :code:`wheel`.
- | To ensure that these packages are up-to-date, run
+ It is advised to use the latest version of pip_,
+ including the essential packages :code:`setuptools` and :code:`wheel`.
+ To ensure these packages are up-to-date, run
.. code:: bash
$ python3 -m pip install --upgrade pip setuptools wheel
- From Source
- -----------
-
- Get the code by either
-
- * Downloading a stable_ or dev_ archive and unpacking it
- * Or via :code:`git clone https://github.com/mikf/gallery-dl.git`
-
- Navigate into the respective directory and run the :code:`setup.py` file.
-
- .. code:: bash
-
- $ wget https://github.com/mikf/gallery-dl/archive/master.tar.gz
- $ tar -xf master.tar.gz
- # or
- $ git clone https://github.com/mikf/gallery-dl.git
-
- $ cd gallery-dl*
- $ python3 setup.py install
-
-
Standalone Executable
---------------------
- Download a standalone executable file,
- put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
- and run it inside a command prompt (like ``cmd.exe``).
+ Prebuilt executable files with a Python interpreter and
+ required Python packages included are available for
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.5/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.5/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__
- These executables include a Python interpreter
- and all required Python packages.
+ | Executables built from the latest commit can be found at
+ | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
Snap
@@ -110,6 +91,7 @@ Description: ==========
$ snap install gallery-dl
+
Chocolatey
----------
@@ -119,15 +101,17 @@ Description: ==========
$ choco install gallery-dl
+
Scoop
- ----------
+ -----
- Apart from Chocolatey, *gallery-dl* is also available in Scoop_ "main" bucket for Windows users.
+ *gallery-dl* is also available in the Scoop_ "main" bucket for Windows users:
.. code:: powershell
$ scoop install gallery-dl
+
Usage
=====
@@ -232,9 +216,10 @@ Description: ==========
``e621``,
``exhentai``,
``idolcomplex``,
+ ``imgbb``,
``inkbunny``,
``instagram``,
- ``luscious``,
+ ``mangoxo``,
``pinterest``,
``sankaku``,
``subscribestar``,
@@ -264,6 +249,7 @@ Description: ==========
$ gallery-dl -u <username> -p <password> URL
$ gallery-dl -o username=<username> -o password=<password> URL
+
Cookies
-------
@@ -307,12 +293,14 @@ Description: ==========
$ gallery-dl --cookies "$HOME/path/to/cookies.txt" URL
+
OAuth
-----
*gallery-dl* supports user authentication via OAuth_ for
- ``deviantart``, ``flickr``, ``reddit``, ``smugmug`` and ``tumblr``.
- This is entirely optional, but grants *gallery-dl* the ability
+ ``deviantart``, ``flickr``, ``reddit``, ``smugmug``, ``tumblr``,
+ and ``mastodon`` instances.
+ This is mostly optional, but grants *gallery-dl* the ability
to issue requests on your account's behalf and enables it to access resources
which would otherwise be unavailable to a public user.
@@ -327,13 +315,20 @@ Description: ==========
access to *gallery-dl*. Authorize it and you will be shown one or more
"tokens", which should be added to your configuration file.
+ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
+ ``oauth:mastodon:<instance>`` as argument. For example:
+
+ .. code:: bash
+
+ $ gallery-dl oauth:mastodon:pawoo.net
+ $ gallery-dl oauth:mastodon:https://mastodon.social/
+
+
.. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.5.tar.gz
- .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
diff --git a/README.rst b/README.rst
index cec53a4..20ed222 100644
--- a/README.rst
+++ b/README.rst
@@ -10,6 +10,8 @@ and powerful filenaming capabilities.
|pypi| |build| |gitter|
+.. contents::
+
Dependencies
============
@@ -27,6 +29,7 @@ Optional
Installation
============
+
Pip
---
@@ -46,48 +49,26 @@ pip_ as well:
Note: Windows users should use :code:`py -3` instead of :code:`python3`.
-| It is advised to use the latest version of pip_,
- including the essential packages :code:`setuptools` and :code:`wheel`.
-| To ensure that these packages are up-to-date, run
+It is advised to use the latest version of pip_,
+including the essential packages :code:`setuptools` and :code:`wheel`.
+To ensure these packages are up-to-date, run
.. code:: bash
$ python3 -m pip install --upgrade pip setuptools wheel
-From Source
------------
-
-Get the code by either
-
-* Downloading a stable_ or dev_ archive and unpacking it
-* Or via :code:`git clone https://github.com/mikf/gallery-dl.git`
-
-Navigate into the respective directory and run the :code:`setup.py` file.
-
-.. code:: bash
-
- $ wget https://github.com/mikf/gallery-dl/archive/master.tar.gz
- $ tar -xf master.tar.gz
- # or
- $ git clone https://github.com/mikf/gallery-dl.git
-
- $ cd gallery-dl*
- $ python3 setup.py install
-
-
Standalone Executable
---------------------
-Download a standalone executable file,
-put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
-and run it inside a command prompt (like ``cmd.exe``).
+Prebuilt executable files with a Python interpreter and
+required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.5/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.5/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__
-These executables include a Python interpreter
-and all required Python packages.
+| Executables built from the latest commit can be found at
+| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
Snap
@@ -99,6 +80,7 @@ Linux users that are using a distro that is supported by Snapd_ can install *gal
$ snap install gallery-dl
+
Chocolatey
----------
@@ -108,15 +90,17 @@ Windows users that have Chocolatey_ installed can install *gallery-dl* from the
$ choco install gallery-dl
+
Scoop
-----------
+-----
-Apart from Chocolatey, *gallery-dl* is also available in Scoop_ "main" bucket for Windows users.
+*gallery-dl* is also available in the Scoop_ "main" bucket for Windows users:
.. code:: powershell
$ scoop install gallery-dl
+
Usage
=====
@@ -221,9 +205,10 @@ and optional for
``e621``,
``exhentai``,
``idolcomplex``,
+``imgbb``,
``inkbunny``,
``instagram``,
-``luscious``,
+``mangoxo``,
``pinterest``,
``sankaku``,
``subscribestar``,
@@ -253,6 +238,7 @@ or you can provide them directly via the
$ gallery-dl -u <username> -p <password> URL
$ gallery-dl -o username=<username> -o password=<password> URL
+
Cookies
-------
@@ -296,12 +282,14 @@ the :code:`--cookies` command-line option:
$ gallery-dl --cookies "$HOME/path/to/cookies.txt" URL
+
OAuth
-----
*gallery-dl* supports user authentication via OAuth_ for
-``deviantart``, ``flickr``, ``reddit``, ``smugmug`` and ``tumblr``.
-This is entirely optional, but grants *gallery-dl* the ability
+``deviantart``, ``flickr``, ``reddit``, ``smugmug``, ``tumblr``,
+and ``mastodon`` instances.
+This is mostly optional, but grants *gallery-dl* the ability
to issue requests on your account's behalf and enables it to access resources
which would otherwise be unavailable to a public user.
@@ -316,13 +304,20 @@ You will be sent to the site's authorization page and asked to grant read
access to *gallery-dl*. Authorize it and you will be shown one or more
"tokens", which should be added to your configuration file.
+To authenticate with a ``mastodon`` instance, run *gallery-dl* with
+``oauth:mastodon:<instance>`` as argument. For example:
+
+.. code:: bash
+
+ $ gallery-dl oauth:mastodon:pawoo.net
+ $ gallery-dl oauth:mastodon:https://mastodon.social/
+
+
.. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.5.tar.gz
-.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl
index f134f63..76afd8a 100644
--- a/data/completion/_gallery-dl
+++ b/data/completion/_gallery-dl
@@ -8,16 +8,17 @@ _arguments -C -S \
{-h,--help}'[Print this help message and exit]' \
--version'[Print program version and exit]' \
{-d,--dest}'[Destination directory]':'<dest>':_files \
-{-i,--input-file}'[Download URLs found in FILE ("-" for stdin)]':'<file>':_files \
+{-i,--input-file}'[Download URLs found in FILE ("-" for stdin). More than one --input-file can be specified]':'<file>':_files \
--cookies'[File to load additional cookies from]':'<file>':_files \
--proxy'[Use the specified proxy]':'<url>' \
--clear-cache'[Delete all cached login sessions, cookies, etc.]' \
{-q,--quiet}'[Activate quiet mode]' \
{-v,--verbose}'[Print various debugging information]' \
{-g,--get-urls}'[Print URLs instead of downloading]' \
--G'[==SUPPRESS==]' \
+{-G,--resolve-urls}'[Print URLs instead of downloading; resolve intermediary URLs]' \
{-j,--dump-json}'[Print JSON information]' \
{-s,--simulate}'[Simulate data extraction; do not download anything]' \
+{-E,--extractor-info}'[Print extractor defaults and settings]' \
{-K,--list-keywords}'[Print a list of available keywords and example values for the given URLs]' \
--list-modules'[Print a list of available extractor modules]' \
--list-extractors'[Print a list of extractor classes with description, (sub)category and example URL]' \
diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl
index 19cb39f..9a3a63e 100644
--- a/data/completion/gallery-dl
+++ b/data/completion/gallery-dl
@@ -10,7 +10,7 @@ _gallery_dl()
elif [[ "${prev}" =~ ^(-d|--dest)$ ]]; then
COMPREPLY=( $(compgen -d -- "${cur}") )
else
- COMPREPLY=( $(compgen -W "--help --version --dest --input-file --cookies --proxy --clear-cache --quiet --verbose --get-urls --dump-json --simulate --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --abort --http-timeout --sleep --filesize-min --filesize-max --no-part --no-skip --no-mtime --no-download --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --write-metadata --write-tags --mtime-from-date --exec --exec-after" -- "${cur}") )
+ COMPREPLY=( $(compgen -W "--help --version --dest --input-file --cookies --proxy --clear-cache --quiet --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --list-modules --list-extractors --write-log --write-unsupported --write-pages --limit-rate --retries --abort --http-timeout --sleep --filesize-min --filesize-max --no-part --no-skip --no-mtime --no-download --no-check-certificate --config --config-yaml --option --ignore-config --username --password --netrc --download-archive --range --chapter-range --filter --chapter-filter --zip --ugoira-conv --ugoira-conv-lossless --write-metadata --write-tags --mtime-from-date --exec --exec-after" -- "${cur}") )
fi
}
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 02639b8..c420d9b 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2021-02-14" "1.16.5" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2021-03-05" "1.17.0" "gallery-dl Manual"
.\" disable hyphenation
.nh
@@ -27,7 +27,7 @@ Print program version and exit
Destination directory
.TP
.B "\-i, \-\-input\-file" \f[I]FILE\f[]
-Download URLs found in FILE ('-' for stdin)
+Download URLs found in FILE ('-' for stdin). More than one --input-file can be specified
.TP
.B "\-\-cookies" \f[I]FILE\f[]
File to load additional cookies from
@@ -47,12 +47,18 @@ Print various debugging information
.B "\-g, \-\-get\-urls"
Print URLs instead of downloading
.TP
+.B "\-G, \-\-resolve\-urls"
+Print URLs instead of downloading; resolve intermediary URLs
+.TP
.B "\-j, \-\-dump\-json"
Print JSON information
.TP
.B "\-s, \-\-simulate"
Simulate data extraction; do not download anything
.TP
+.B "\-E, \-\-extractor\-info"
+Print extractor defaults and settings
+.TP
.B "\-K, \-\-list\-keywords"
Print a list of available keywords and example values for the given URLs
.TP
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 408cb61..c0629bb 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2021-02-14" "1.16.5" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2021-03-05" "1.17.0" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -338,11 +338,13 @@ and optional for
.br
* \f[I]idolcomplex\f[]
.br
+* \f[I]imgbb\f[]
+.br
* \f[I]inkbunny\f[]
.br
* \f[I]instagram\f[]
.br
-* \f[I]luscious\f[]
+* \f[I]mangoxo\f[]
.br
* \f[I]pinterest\f[]
.br
@@ -451,7 +453,7 @@ otherwise \f[I]http://\f[] is assumed.
\f[I]string\f[]
.IP "Default:" 9
-\f[I]"Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"\f[]
+\f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"\f[]
.IP "Description:" 4
User-Agent header value to be used for HTTP requests.
@@ -460,6 +462,28 @@ Note: This option has no effect on pixiv extractors,
as these need specific values to function correctly.
+.SS extractor.*.browser
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"firefox"\f[] for \f[I]patreon\f[], \f[I]null\f[] everywhere else
+
+.IP "Example:" 4
+.br
+* "chrome:macos"
+
+.IP "Description:" 4
+Try to emulate a real browser (\f[I]firefox\f[] or \f[I]chrome\f[])
+by using their default HTTP headers and TLS ciphers for HTTP requests.
+
+Optionally, the operating system used in the \f[I]User-Agent\f[] header can be
+specified after a \f[I]:\f[] (\f[I]windows\f[], \f[I]linux\f[], or \f[I]macos\f[]).
+
+Note: \f[I]requests\f[] and \f[I]urllib3\f[] only support HTTP/1.1, while a real
+browser would use HTTP/2.
+
+
.SS extractor.*.keywords
.IP "Type:" 6
\f[I]object\f[]
@@ -839,7 +863,7 @@ See \f[I]Filters\f[] for details.
\f[I]false\f[]
.IP "Description:" 4
-Download extra Sta.sh resources from
+Download embedded Deviations and Sta.sh resources from
description texts and journals.
Note: Enabling this option also enables deviantart.metadata_.
@@ -1037,31 +1061,30 @@ If this value is an \f[I]integer\f[], it gets used as the limit maximum
instead of the value listed on \f[I]https://e-hentai.org/home.php\f[]
-.SS extractor.exhentai.original
+.SS extractor.exhentai.metadata
.IP "Type:" 6
\f[I]bool\f[]
.IP "Default:" 9
-\f[I]true\f[]
+\f[I]false\f[]
.IP "Description:" 4
-Download full-sized original images if available.
+Load extended gallery metadata from the
+\f[I]API\f[].
+Adds \f[I]archiver_key\f[], \f[I]posted\f[], and \f[I]torrents\f[].
+Makes \f[I]date\f[] and \f[I]filesize\f[] more precise.
-.SS extractor.exhentai.wait-min & .wait-max
+
+.SS extractor.exhentai.original
.IP "Type:" 6
-\f[I]float\f[]
+\f[I]bool\f[]
.IP "Default:" 9
-\f[I]3.0\f[] and \f[I]6.0\f[]
+\f[I]true\f[]
.IP "Description:" 4
-Minimum and maximum wait time in seconds between each image
-
-ExHentai detects and blocks automated downloaders.
-*gallery-dl* waits a randomly selected number of
-seconds between \f[I]wait-min\f[] and \f[I]wait-max\f[] after
-each image to prevent getting blocked.
+Download full-sized original images if available.
.SS extractor.flickr.access-token & .access-token-secret
@@ -1142,20 +1165,6 @@ Possible values are
You can use \f[I]"all"\f[] instead of listing all values separately.
-.SS extractor.gelbooru.api
-.IP "Type:" 6
-\f[I]bool\f[]
-
-.IP "Default:" 9
-\f[I]true\f[]
-
-.IP "Description:" 4
-Enable use of Gelbooru's API.
-
-Set this value to false if the API has been disabled to switch
-to manual information extraction.
-
-
.SS extractor.gfycat.format
.IP "Type:" 6
\f[I]string\f[]
@@ -1192,6 +1201,17 @@ Possible values are
You can use \f[I]"all"\f[] instead of listing all values separately.
+.SS extractor.hentainexus.original
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Download original files instead of WebP versions.
+
+
.SS extractor.hitomi.metadata
.IP "Type:" 6
\f[I]bool\f[]
@@ -1285,6 +1305,17 @@ If the selected format is not available,
the first in the list gets chosen (usually mp3).
+.SS extractor.mangadex.api-server
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"https://api.mangadex.org"\f[]
+
+.IP "Description:" 4
+The server to use for API requests.
+
+
.SS extractor.newgrounds.flash
.IP "Type:" 6
\f[I]bool\f[]
@@ -1444,6 +1475,17 @@ Download user avatars.
Also download related artworks.
+.SS extractor.pixiv.translated-tags
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Provide translated \f[I]tags\f[].
+
+
.SS extractor.pixiv.ugoira
.IP "Type:" 6
\f[I]bool\f[]
@@ -1472,18 +1514,6 @@ to watchable videos. (Example__)
Also search Plurk comments for URLs.
-.SS extractor.reactor.wait-min & .wait-max
-.IP "Type:" 6
-\f[I]float\f[]
-
-.IP "Default:" 9
-\f[I]3.0\f[] and \f[I]6.0\f[]
-
-.IP "Description:" 4
-Minimum and maximum wait time in seconds between HTTP requests
-during the extraction process.
-
-
.SS extractor.readcomiconline.captcha
.IP "Type:" 6
\f[I]string\f[]
@@ -1755,6 +1785,18 @@ You can use \f[I]"all"\f[] instead of listing all types separately.
Fetch media from \f[I]Cards\f[].
+.SS extractor.twitter.conversations
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Fetch media from all Tweets and replies in a \f[I]conversation
+<https://help.twitter.com/en/using-twitter/twitter-conversations>\f[].
+
+
.SS extractor.twitter.quoted
.IP "Type:" 6
\f[I]bool\f[]
@@ -2064,6 +2106,17 @@ Check the file headers of \f[I]jpg\f[], \f[I]png\f[], and \f[I]gif\f[] files
and adjust their filename extensions if they do not match.
+.SS downloader.http.headers
+.IP "Type:" 6
+\f[I]object\f[]
+
+.IP "Example:" 4
+{"Accept": "image/webp,*/*", "Referer": "https://example.org/"}
+
+.IP "Description:" 4
+Additional HTTP headers to send when downloading files.
+
+
.SS downloader.ytdl.format
.IP "Type:" 6
\f[I]string\f[]
@@ -2103,6 +2156,17 @@ Note: Set \f[I]quiet\f[] and \f[I]no_warnings\f[] in
\f[I]downloader.ytdl.raw-options\f[] to \f[I]true\f[] to suppress all output.
+.SS downloader.ytdl.module
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"youtube_dl"\f[]
+
+.IP "Description:" 4
+Name of the youtube-dl Python module to import.
+
+
.SS downloader.ytdl.outtmpl
.IP "Type:" 6
\f[I]string\f[]
@@ -2687,24 +2751,6 @@ Set this option to \f[I]null\f[] or an invalid path to disable
this cache.
-.SS ciphers
-.IP "Type:" 6
-\f[I]bool\f[] or \f[I]string\f[]
-
-.IP "Default:" 9
-\f[I]true\f[]
-
-.IP "Description:" 4
-.br
-* \f[I]true\f[]: Update urllib3's default cipher list
-.br
-* \f[I]false\f[]: Leave the default cipher list as is
-.br
-* Any \f[I]string\f[]: Replace urllib3's default ciphers with these
-(See \f[I]SSLContext.set_ciphers()\f[]
-for details)
-
-
.SS pyopenssl
.IP "Type:" 6
\f[I]bool\f[]
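
The new extractor.*.browser option documented above swaps in a real browser's default request headers (and TLS ciphers). A rough standalone illustration of the header side of that idea — the header values here are illustrative stand-ins, not gallery-dl's actual lists, which live in gallery_dl/extractor/common.py:

    # Rough sketch only: as the man page notes, requests/urllib3 still speak
    # HTTP/1.1, so header emulation is the main lever available here.
    import requests

    FIREFOX_LIKE_HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) "
                      "Gecko/20100101 Firefox/78.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,"
                  "image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
    }

    session = requests.Session()
    session.headers.update(FIREFOX_LIKE_HEADERS)
    response = session.get("https://example.org/")
    print(response.status_code)
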
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index bc9999b..acf60c7 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -2,17 +2,34 @@
"extractor":
{
"base-directory": "./gallery-dl/",
+ "parent-directory": false,
"postprocessors": null,
"archive": null,
"cookies": null,
- "cookies-update": false,
+ "cookies-update": true,
"proxy": null,
"skip": true,
+
+ "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0",
+ "retries": 4,
+ "timeout": 30.0,
+ "verify": true,
+
"sleep": 0,
+ "sleep-request": 0,
+ "sleep-extractor": 0,
+
"path-restrict": "auto",
"path-replace": "_",
"path-remove": "\\u0000-\\u001f\\u007f",
- "user-agent": "Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0",
+ "extension-map": {
+ "jpeg": "jpg",
+ "jpe" : "jpg",
+ "jfif": "jpg",
+ "jif" : "jpg",
+ "jfi" : "jpg"
+ },
+
"artstation":
{
@@ -21,7 +38,8 @@
"aryion":
{
"username": null,
- "password": null
+ "password": null,
+ "recursive": true
},
"blogger":
{
@@ -33,11 +51,17 @@
"password": null,
"ugoira": false
},
+ "derpibooru":
+ {
+ "api-key": null,
+ "filter": 56027
+ },
"deviantart":
{
"extra": false,
"flat": true,
"folders": false,
+ "include": "gallery",
"journals": "html",
"mature": true,
"metadata": false,
@@ -45,29 +69,43 @@
"quality": 100,
"wait-min": 0
},
+ "e621":
+ {
+ "username": null,
+ "password": null
+ },
"exhentai":
{
"username": null,
"password": null,
"domain": "auto",
"limits": true,
+ "metadata": false,
"original": true,
- "wait-min": 3.0,
- "wait-max": 6.0
+ "sleep-request": 5.0
},
"flickr":
{
"videos": true,
"size-max": null
},
- "gelbooru":
+ "furaffinity":
{
- "api": true
+ "descriptions": "text",
+ "include": "gallery"
},
"gfycat":
{
"format": "mp4"
},
+ "hentaifoundry":
+ {
+ "include": "pictures"
+ },
+ "hentainexus":
+ {
+ "original": true
+ },
"hitomi":
{
"metadata": true
@@ -76,40 +114,81 @@
{
"username": null,
"password": null,
- "wait-min": 3.0,
- "wait-max": 6.0
+ "sleep-request": 5.0
+ },
+ "imgbb":
+ {
+ "username": null,
+ "password": null
},
"imgur":
{
"mp4": true
},
+ "inkbunny":
+ {
+ "username": null,
+ "password": null,
+ "orderby": "create_datetime"
+ },
"instagram":
{
- "highlights": false,
+ "username": null,
+ "password": null,
+ "include": "posts",
+ "sleep-request": 5.0,
"videos": true
},
- "nijie":
+ "khinsider":
+ {
+ "format": "mp3"
+ },
+ "mangadex":
+ {
+ "api-server": "https://api.mangadex.org"
+ },
+ "mangoxo":
{
"username": null,
"password": null
},
+ "newgrounds":
+ {
+ "username": null,
+ "password": null,
+ "flash": true,
+ "include": "art"
+ },
+ "nijie":
+ {
+ "username": null,
+ "password": null,
+ "include": "illustration,doujin"
+ },
"oauth":
{
"browser": true,
"cache": true,
"port": 6414
},
+ "pillowfort":
+ {
+ "reblogs": false
+ },
+ "pinterest":
+ {
+ "sections": true,
+ "videos": true
+ },
"pixiv":
{
- "username": null,
- "password": null,
"avatar": false,
+ "translated-tags": false,
"ugoira": true
},
"reactor":
{
- "wait-min": 3.0,
- "wait-max": 6.0
+ "sleep-request": 5.0
},
"reddit":
{
@@ -128,11 +207,35 @@
{
"format": "mp4"
},
+ "sankakucomplex":
+ {
+ "embeds": false,
+ "videos": true
+ },
+ "sankaku":
+ {
+ "username": null,
+ "password": null
+ },
+ "smugmug":
+ {
+ "videos": true
+ },
"seiga":
{
"username": null,
"password": null
},
+ "subscribestar":
+ {
+ "username": null,
+ "password": null
+ },
+ "tsumino":
+ {
+ "username": null,
+ "password": null
+ },
"tumblr":
{
"avatar": false,
@@ -143,12 +246,20 @@
},
"twitter":
{
+ "username": null,
+ "password": null,
+ "cards": false,
+ "conversations": false,
"quoted": true,
"replies": true,
"retweets": true,
"twitpic": false,
"videos": true
},
+ "unsplash":
+ {
+ "format": "raw"
+ },
"vsco":
{
"videos": true
@@ -157,6 +268,10 @@
{
"api-key": null
},
+ "weasyl":
+ {
+ "api-key": null
+ },
"weibo":
{
"retweets": true,
@@ -172,29 +287,28 @@
{
"filesize-min": null,
"filesize-max": null,
+ "mtime": true,
"part": true,
"part-directory": null,
+ "rate": null,
+ "retries": 4,
+ "timeout": 30.0,
+ "verify": true,
"http":
{
"adjust-extensions": true,
- "mtime": true,
- "rate": null,
- "retries": 4,
- "timeout": 30.0,
- "verify": true
+ "headers": null
},
"ytdl":
{
"format": null,
"forward-cookies": false,
- "mtime": true,
+ "logging": true,
+ "module": "youtube_dl",
"outtmpl": null,
- "rate": null,
- "retries": 4,
- "timeout": 30.0,
- "verify": true
+ "raw-options": null
}
},
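
The default config above now uses sleep-request where older versions used wait-min/wait-max pairs. A hypothetical migration helper (not shipped with gallery-dl) that rewrites an existing JSON config, taking the midpoint of the old range as the new fixed delay:

    import json

    def migrate(section):
        """Recursively replace wait-min/wait-max with sleep-request."""
        if isinstance(section, dict):
            if "wait-min" in section or "wait-max" in section:
                lo = section.pop("wait-min", 3.0)   # old documented defaults
                hi = section.pop("wait-max", 6.0)
                section["sleep-request"] = (lo + hi) / 2
            for value in section.values():
                migrate(value)

    with open("gallery-dl.conf", encoding="utf-8") as file:
        config = json.load(file)
    migrate(config)
    with open("gallery-dl.conf", "w", encoding="utf-8") as file:
        json.dump(config, file, indent=4)
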
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index b87c59d..fbf67fe 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.16.5
+Version: 1.17.0
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -21,6 +21,8 @@ Description: ==========
|pypi| |build| |gitter|
+ .. contents::
+
Dependencies
============
@@ -38,6 +40,7 @@ Description: ==========
Installation
============
+
Pip
---
@@ -57,48 +60,26 @@ Description: ==========
Note: Windows users should use :code:`py -3` instead of :code:`python3`.
- | It is advised to use the latest version of pip_,
- including the essential packages :code:`setuptools` and :code:`wheel`.
- | To ensure that these packages are up-to-date, run
+ It is advised to use the latest version of pip_,
+ including the essential packages :code:`setuptools` and :code:`wheel`.
+ To ensure these packages are up-to-date, run
.. code:: bash
$ python3 -m pip install --upgrade pip setuptools wheel
- From Source
- -----------
-
- Get the code by either
-
- * Downloading a stable_ or dev_ archive and unpacking it
- * Or via :code:`git clone https://github.com/mikf/gallery-dl.git`
-
- Navigate into the respective directory and run the :code:`setup.py` file.
-
- .. code:: bash
-
- $ wget https://github.com/mikf/gallery-dl/archive/master.tar.gz
- $ tar -xf master.tar.gz
- # or
- $ git clone https://github.com/mikf/gallery-dl.git
-
- $ cd gallery-dl*
- $ python3 setup.py install
-
-
Standalone Executable
---------------------
- Download a standalone executable file,
- put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
- and run it inside a command prompt (like ``cmd.exe``).
+ Prebuilt executable files with a Python interpreter and
+ required Python packages included are available for
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.16.5/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.16.5/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__
- These executables include a Python interpreter
- and all required Python packages.
+ | Executables built from the latest commit can be found at
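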
+ | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
Snap
@@ -110,6 +91,7 @@ Description: ==========
$ snap install gallery-dl
+
Chocolatey
----------
@@ -119,15 +101,17 @@ Description: ==========
$ choco install gallery-dl
+
Scoop
- ----------
+ -----
- Apart from Chocolatey, *gallery-dl* is also available in Scoop_ "main" bucket for Windows users.
+ *gallery-dl* is also available in the Scoop_ "main" bucket for Windows users:
.. code:: powershell
$ scoop install gallery-dl
+
Usage
=====
@@ -232,9 +216,10 @@ Description: ==========
``e621``,
``exhentai``,
``idolcomplex``,
+ ``imgbb``,
``inkbunny``,
``instagram``,
- ``luscious``,
+ ``mangoxo``,
``pinterest``,
``sankaku``,
``subscribestar``,
@@ -264,6 +249,7 @@ Description: ==========
$ gallery-dl -u <username> -p <password> URL
$ gallery-dl -o username=<username> -o password=<password> URL
+
Cookies
-------
@@ -307,12 +293,14 @@ Description: ==========
$ gallery-dl --cookies "$HOME/path/to/cookies.txt" URL
+
OAuth
-----
*gallery-dl* supports user authentication via OAuth_ for
- ``deviantart``, ``flickr``, ``reddit``, ``smugmug`` and ``tumblr``.
- This is entirely optional, but grants *gallery-dl* the ability
+ ``deviantart``, ``flickr``, ``reddit``, ``smugmug``, ``tumblr``,
+ and ``mastodon`` instances.
+ This is mostly optional, but grants *gallery-dl* the ability
to issue requests on your account's behalf and enables it to access resources
which would otherwise be unavailable to a public user.
@@ -327,13 +315,20 @@ Description: ==========
access to *gallery-dl*. Authorize it and you will be shown one or more
"tokens", which should be added to your configuration file.
+ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
+ ``oauth:mastodon:<instance>`` as argument. For example:
+
+ .. code:: bash
+
+ $ gallery-dl oauth:mastodon:pawoo.net
+ $ gallery-dl oauth:mastodon:https://mastodon.social/
+
+
.. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.16.5.tar.gz
- .. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 066ac90..89ae8ed 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -13,7 +13,6 @@ docs/gallery-dl.conf
gallery_dl/__init__.py
gallery_dl/__main__.py
gallery_dl/cache.py
-gallery_dl/cloudflare.py
gallery_dl/config.py
gallery_dl/exception.py
gallery_dl/job.py
@@ -50,6 +49,7 @@ gallery_dl/extractor/behance.py
gallery_dl/extractor/blogger.py
gallery_dl/extractor/booru.py
gallery_dl/extractor/common.py
+gallery_dl/extractor/cyberdrop.py
gallery_dl/extractor/danbooru.py
gallery_dl/extractor/derpibooru.py
gallery_dl/extractor/deviantart.py
@@ -65,6 +65,8 @@ gallery_dl/extractor/foolslide.py
gallery_dl/extractor/furaffinity.py
gallery_dl/extractor/fuskator.py
gallery_dl/extractor/gelbooru.py
+gallery_dl/extractor/gelbooru_v01.py
+gallery_dl/extractor/gelbooru_v02.py
gallery_dl/extractor/gfycat.py
gallery_dl/extractor/hbrowse.py
gallery_dl/extractor/hentai2read.py
@@ -110,6 +112,7 @@ gallery_dl/extractor/moebooru.py
gallery_dl/extractor/myhentaigallery.py
gallery_dl/extractor/myportfolio.py
gallery_dl/extractor/naver.py
+gallery_dl/extractor/naverwebtoon.py
gallery_dl/extractor/newgrounds.py
gallery_dl/extractor/ngomik.py
gallery_dl/extractor/nhentai.py
@@ -149,6 +152,7 @@ gallery_dl/extractor/subscribestar.py
gallery_dl/extractor/test.py
gallery_dl/extractor/tsumino.py
gallery_dl/extractor/tumblr.py
+gallery_dl/extractor/tumblrgallery.py
gallery_dl/extractor/twitter.py
gallery_dl/extractor/unsplash.py
gallery_dl/extractor/vanillarock.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 6c2c713..c1f80b6 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -196,7 +196,7 @@ def main():
cnt, "entry" if cnt == 1 else "entries", cache._path(),
)
else:
- if not args.urls and not args.inputfile:
+ if not args.urls and not args.inputfiles:
parser.error(
"The following arguments are required: URL\n"
"Use 'gallery-dl --help' to get a list of all options.")
@@ -208,18 +208,19 @@ def main():
jobtype = args.jobtype or job.DownloadJob
urls = args.urls
- if args.inputfile:
- try:
- if args.inputfile == "-":
- if sys.stdin:
- urls += parse_inputfile(sys.stdin, log)
+ if args.inputfiles:
+ for inputfile in args.inputfiles:
+ try:
+ if inputfile == "-":
+ if sys.stdin:
+ urls += parse_inputfile(sys.stdin, log)
+ else:
+ log.warning("input file: stdin is not readable")
else:
- log.warning("input file: stdin is not readable")
- else:
- with open(args.inputfile, encoding="utf-8") as file:
- urls += parse_inputfile(file, log)
- except OSError as exc:
- log.warning("input file: %s", exc)
+ with open(inputfile, encoding="utf-8") as file:
+ urls += parse_inputfile(file, log)
+ except OSError as exc:
+ log.warning("input file: %s", exc)
# unsupported file logging handler
handler = output.setup_logging_handler(
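
The rewrite above turns the single --input-file branch into a loop over every given file, so e.g. gallery-dl -i batch1.txt -i batch2.txt -i - now reads both files plus stdin. The same logic, condensed into a standalone helper for illustration (parse_inputfile and log are passed in rather than reimplemented here):

    import sys

    def collect_urls(inputfiles, parse_inputfile, log):
        """Gather URLs from several input files, '-' meaning stdin."""
        urls = []
        for inputfile in inputfiles:
            try:
                if inputfile == "-":
                    if sys.stdin:
                        urls += parse_inputfile(sys.stdin, log)
                    else:
                        log.warning("input file: stdin is not readable")
                else:
                    with open(inputfile, encoding="utf-8") as file:
                        urls += parse_inputfile(file, log)
            except OSError as exc:
                log.warning("input file: %s", exc)   # skip unreadable files
        return urls
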
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
deleted file mode 100644
index 0f49d61..0000000
--- a/gallery_dl/cloudflare.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2020 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Methods to access sites behind Cloudflare protection"""
-
-import time
-import operator
-import collections
-import urllib.parse
-from xml.etree import ElementTree
-from . import text
-from .cache import memcache
-
-
-def is_challenge(response):
- return (response.status_code == 503 and
- response.headers.get("Server", "").startswith("cloudflare") and
- b"jschl-answer" in response.content)
-
-
-def is_captcha(response):
- return (response.status_code == 403 and
- b'name="captcha-bypass"' in response.content)
-
-
-def solve_challenge(session, response, kwargs):
- """Solve Cloudflare challenge and get cfclearance cookie"""
- parsed = urllib.parse.urlsplit(response.url)
- root = parsed.scheme + "://" + parsed.netloc
- page = response.text
-
- cf_kwargs = {}
- headers = cf_kwargs["headers"] = collections.OrderedDict()
- params = cf_kwargs["data"] = collections.OrderedDict()
- headers["Referer"] = response.url
-
- form = text.extract(page, 'id="challenge-form"', '</form>')[0]
- for element in ElementTree.fromstring(
- "<f>" + form + "</f>").findall("input"):
- name = element.attrib.get("name")
- if not name:
- continue
- if name == "jschl_answer":
- try:
- value = solve_js_challenge(page, parsed.netloc)
- except Exception:
- return response, None, None
- else:
- value = element.attrib.get("value")
- params[name] = value
-
- try:
- params = {"ray": text.extract(page, '?ray=', '"')[0]}
-
- url = root + "/cdn-cgi/images/trace/jschal/nojs/transparent.gif"
- session.request("GET", url, params=params)
-
- url = root + "/cdn-cgi/images/trace/jschal/js/nocookie/transparent.gif"
- session.request("GET", url, params=params)
- except Exception:
- pass
-
- time.sleep(4)
- url = root + text.unescape(text.extract(page, 'action="', '"')[0])
- cf_response = session.request("POST", url, **cf_kwargs)
-
- if cf_response.history:
- initial_response = cf_response.history[0]
- else:
- initial_response = cf_response
-
- cookies = {
- cookie.name: cookie.value
- for cookie in initial_response.cookies
- }
-
- if not cookies:
- import logging
- log = logging.getLogger("cloudflare")
- log.debug("Headers:\n%s", initial_response.headers)
- log.debug("Content:\n%s", initial_response.text)
- return cf_response, None, None
-
- domain = next(iter(initial_response.cookies)).domain
- cookies["__cfduid"] = response.cookies.get("__cfduid", "")
- return cf_response, domain, cookies
-
-
-def solve_js_challenge(page, netloc):
- """Evaluate JS challenge in 'page' to get 'jschl_answer' value"""
-
- # build variable name
- # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
- data, pos = text.extract_all(page, (
- ('var' , ',f, ', '='),
- ('key' , '"' , '"'),
- ('expr', ':' , '}'),
- ))
- variable = "{}.{}".format(data["var"], data["key"])
- vlength = len(variable)
-
- k = text.extract(page, "k = '", "'")[0]
-
- # evaluate the initial expression
- solution = evaluate_expression(data["expr"], page, netloc)
-
- # iterator over all remaining expressions
- # and combine their values in 'solution'
- expressions = text.extract(
- page, "'challenge-form');", "f.submit();", pos)[0]
- for expr in expressions.split(";")[1:]:
-
- if expr.startswith(variable):
- # select arithmetc function based on operator (+/-/*)
- func = OPERATORS[expr[vlength]]
- # evaluate the rest of the expression
- value = evaluate_expression(expr[vlength+2:], page, netloc, k)
- # combine expression value with our current solution
- solution = func(solution, value)
-
- elif expr.startswith("a.value"):
- if "t.length)" in expr:
- # add length of hostname
- solution += len(netloc)
- if ".toFixed(" in expr:
- # trim solution to 10 decimal places
- solution = "{:.10f}".format(solution)
- return solution
-
- elif expr.startswith("k+="):
- k += str(evaluate_expression(expr[3:], page, netloc))
-
-
-def evaluate_expression(expr, page, netloc, k=""):
- """Evaluate a single Javascript expression for the challenge"""
-
- if expr.startswith("function(p)"):
- # get HTML element with ID k and evaluate the expression inside
- # 'eval(eval("document.getElementById(k).innerHTML"))'
- expr = text.extract(page, 'id="'+k+'"', '<')[0]
- return evaluate_expression(expr.partition(">")[2], page, netloc)
-
- if "/" in expr:
- # split the expression in numerator and denominator subexpressions,
- # evaluate them separately,
- # and return their fraction-result
- num, _, denom = expr.partition("/")
- num = evaluate_expression(num, page, netloc)
- denom = evaluate_expression(denom, page, netloc)
- return num / denom
-
- if "function(p)" in expr:
- # split initial expression and function code
- initial, _, func = expr.partition("function(p)")
- # evaluate said expression
- initial = evaluate_expression(initial, page, netloc)
- # get function argument and use it as index into 'netloc'
- index = evaluate_expression(func[func.index("}")+1:], page, netloc)
- return initial + ord(netloc[int(index)])
-
- # iterate over all subexpressions,
- # evaluate them,
- # and accumulate their values in 'result'
- result = ""
- for subexpr in expr.strip("+()").split(")+("):
- value = 0
- for part in subexpr.split("+"):
- if "-" in part:
- p1, _, p2 = part.partition("-")
- value += VALUES[p1] - VALUES[p2]
- else:
- value += VALUES[part]
- result += str(value)
- return int(result)
-
-
-OPERATORS = {
- "+": operator.add,
- "-": operator.sub,
- "*": operator.mul,
-}
-
-
-VALUES = {
- "": 0,
- "!": 1,
- "[]": 0,
- "!![]": 1,
- "(!![]": 1,
- "(!![])": 1,
-}
-
-
-@memcache(keyarg=0)
-def cookies(category):
- return None
diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py
index 6fb09e1..e1b936e 100644
--- a/gallery_dl/downloader/__init__.py
+++ b/gallery_dl/downloader/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,8 +8,6 @@
"""Downloader modules"""
-import importlib
-
modules = [
"http",
"text",
@@ -24,22 +22,22 @@ def find(scheme):
except KeyError:
pass
- klass = None
+ cls = None
if scheme == "https":
scheme = "http"
if scheme in modules: # prevent unwanted imports
try:
- module = importlib.import_module("." + scheme, __package__)
+ module = __import__(scheme, globals(), None, (), 1)
except ImportError:
pass
else:
- klass = module.__downloader__
+ cls = module.__downloader__
if scheme == "http":
- _cache["http"] = _cache["https"] = klass
+ _cache["http"] = _cache["https"] = cls
else:
- _cache[scheme] = klass
- return klass
+ _cache[scheme] = cls
+ return cls
# --------------------------------------------------------------------
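
find() above now resolves a scheme's downloader with the builtin __import__ (a relative import, level=1) instead of importlib, and memoizes the result — including failed lookups — in a module-level dict. A standalone sketch of the same memoized-lookup pattern, using absolute imports and stdlib modules as stand-ins for downloader modules:

    _cache = {}
    _allowed = {"json", "csv"}          # stand-ins for the `modules` whitelist

    def find(name):
        try:
            return _cache[name]         # fast path; may be a cached None
        except KeyError:
            pass
        module = None
        if name in _allowed:            # prevent unwanted imports
            try:
                module = __import__(name)
            except ImportError:
                pass
        _cache[name] = module
        return module

    print(find("json"))                 # imports on first call
    print(find("json"))                 # dictionary hit on second call
    print(find("pickle"))               # not whitelisted -> None, also cached
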
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 8d72dc2..bc42d7c 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2020 Mike Fährmann
+# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -31,6 +31,7 @@ class HttpDownloader(DownloaderBase):
self.downloading = False
self.adjust_extension = self.config("adjust-extensions", True)
+ self.headers = self.config("headers")
self.minsize = self.config("filesize-min")
self.maxsize = self.config("filesize-max")
self.retries = self.config("retries", extractor._retries)
@@ -93,13 +94,16 @@ class HttpDownloader(DownloaderBase):
time.sleep(tries)
tries += 1
- headers = {}
+ headers = {"Accept": "*/*"}
file_header = None
# check for .part file
file_size = pathfmt.part_size()
if file_size:
headers["Range"] = "bytes={}-".format(file_size)
+ # general headers
+ if self.headers:
+ headers.update(self.headers)
# file-specific headers
extra = pathfmt.kwdict.get("_http_headers")
if extra:
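
The hunk above establishes a clear header precedence for file downloads: a default Accept header, then a Range header when resuming a .part file, then the configured downloader.http.headers, then per-file _http_headers from the extractor. A small sketch of that layering (later updates win):

    def build_headers(part_size=0, configured=None, file_specific=None):
        headers = {"Accept": "*/*"}                      # new default
        if part_size:                                    # resume a .part file
            headers["Range"] = "bytes={}-".format(part_size)
        if configured:                                   # downloader.http.headers
            headers.update(configured)
        if file_specific:                                # kwdict["_http_headers"]
            headers.update(file_specific)
        return headers

    print(build_headers(
        part_size=2048,
        configured={"Referer": "https://example.org/"},
        file_specific={"Accept": "image/webp,*/*"},
    ))
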
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 8086b5d..e116188 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,6 @@
"""Downloader module for URLs requiring youtube-dl support"""
-from youtube_dl import YoutubeDL, DEFAULT_OUTTMPL
from .common import DownloaderBase
from .. import text
import os
@@ -16,8 +15,14 @@ import os
class YoutubeDLDownloader(DownloaderBase):
scheme = "ytdl"
+ module = None
def __init__(self, job):
+ module = self.module
+ if not module:
+ module_name = self.config("module") or "youtube_dl"
+ module = YoutubeDLDownloader.module = __import__(module_name)
+
DownloaderBase.__init__(self, job)
extractor = job.extractor
@@ -42,10 +47,11 @@ class YoutubeDLDownloader(DownloaderBase):
options["logger"] = self.log
self.forward_cookies = self.config("forward-cookies", False)
- outtmpl = self.config("outtmpl")
- self.outtmpl = DEFAULT_OUTTMPL if outtmpl == "default" else outtmpl
+ self.outtmpl = self.config("outtmpl")
+ if self.outtmpl == "default":
+ self.outtmpl = module.DEFAULT_OUTTMPL
- self.ytdl = YoutubeDL(options)
+ self.ytdl = module.YoutubeDL(options)
def download(self, url, pathfmt):
if self.forward_cookies:
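
With the change above, the youtube-dl module is no longer imported at load time; the name from downloader.ytdl.module is imported on first use and cached on the class, so any fork exposing the same YoutubeDL interface can be dropped in. A condensed sketch of that lazy, class-cached import (assumes the named package is installed):

    class YoutubeDLDownloader:
        module = None                        # shared across instances

        def __init__(self, module_name=None):
            module = self.module
            if not module:                   # first instantiation imports once
                module = YoutubeDLDownloader.module = __import__(
                    module_name or "youtube_dl")
            self.ytdl = module.YoutubeDL({"quiet": True})

    # e.g. YoutubeDLDownloader("youtube_dl"), or the name of a compatible fork
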
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 81b11fd..aa0e8ad 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -50,6 +50,8 @@ class _500pxExtractor(Extractor):
def _extend(self, edges):
"""Extend photos with additional metadata and higher resolution URLs"""
+ ids = [str(edge["node"]["legacyId"]) for edge in edges]
+
url = "https://api.500px.com/v1/photos"
params = {
"expanded_user_info" : "true",
@@ -62,14 +64,14 @@ class _500pxExtractor(Extractor):
"liked_by" : "1",
"following_sample" : "100",
"image_size" : "4096",
- "ids" : ",".join(
- str(edge["node"]["legacyId"]) for edge in edges),
+ "ids" : ",".join(ids),
}
- data = self._request_api(url, params)["photos"]
+ photos = self._request_api(url, params)["photos"]
return [
- data[str(edge["node"]["legacyId"])]
- for edge in edges
+ photos[pid] for pid in ids
+ if pid in photos or
+ self.log.warning("Unable to fetch photo %s", pid)
]
def _request_api(self, url, params, csrf_token=None):
@@ -142,6 +144,10 @@ class _500pxGalleryExtractor(_500pxExtractor):
"user": dict,
},
}),
+ # unavailable photos (#1335)
+ ("https://500px.com/p/Light_Expression_Photography/galleries/street", {
+ "count": 0,
+ }),
("https://500px.com/fashvamp/galleries/lera"),
)
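
The rewritten comprehension above leans on a short-circuit trick to skip unavailable photos: Logger.warning() returns None, so "pid in photos or log.warning(...)" keeps an item when it exists and otherwise logs a warning while filtering it out. A runnable demonstration:

    import logging

    logging.basicConfig(level=logging.WARNING)
    log = logging.getLogger("500px")

    photos = {"1": "photo-1", "3": "photo-3"}
    ids = ["1", "2", "3"]

    available = [
        photos[pid] for pid in ids
        if pid in photos or log.warning("Unable to fetch photo %s", pid)
    ]
    print(available)        # ['photo-1', 'photo-3'], with a warning about 2
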
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 923a78b..57794d0 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -7,7 +7,6 @@
# published by the Free Software Foundation.
import re
-import importlib
modules = [
"2chan",
@@ -23,6 +22,7 @@ modules = [
"bcy",
"behance",
"blogger",
+ "cyberdrop",
"danbooru",
"derpibooru",
"deviantart",
@@ -35,6 +35,8 @@ modules = [
"furaffinity",
"fuskator",
"gelbooru",
+ "gelbooru_v01",
+ "gelbooru_v02",
"gfycat",
"hbrowse",
"hentai2read",
@@ -76,6 +78,7 @@ modules = [
"myhentaigallery",
"myportfolio",
"naver",
+ "naverwebtoon",
"newgrounds",
"ngomik",
"nhentai",
@@ -111,6 +114,7 @@ modules = [
"subscribestar",
"tsumino",
"tumblr",
+ "tumblrgallery",
"twitter",
"unsplash",
"vanillarock",
@@ -182,11 +186,12 @@ def _list_classes():
"""Yield all available extractor classes"""
yield from _cache
+ globals_ = globals()
for module_name in _module_iter:
- module = importlib.import_module("."+module_name, __package__)
+ module = __import__(module_name, globals_, None, (), 1)
yield from add_module(module)
- globals()["_list_classes"] = lambda : _cache
+ globals_["_list_classes"] = lambda : _cache
def _get_classes(module):
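
_list_classes() above memoizes itself: once every module has been imported, it rebinds its own global name to a lambda that returns the cache, so later calls never touch the import machinery again. A standalone sketch of the self-replacement trick (the real function also yields previously cached classes first):

    _cache = []

    def _list_classes():
        for name in ("alpha", "beta"):           # stand-ins for extractor modules
            cls = "Extractor_" + name            # stand-in for imported classes
            _cache.append(cls)
            yield cls
        globals()["_list_classes"] = lambda: _cache   # replace self when done

    print(list(_list_classes()))   # performs the "imports", fills the cache
    print(list(_list_classes()))   # pure cache lookup via the new binding
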
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 64cde80..c3cf3f7 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,16 +8,12 @@
"""Extractors for *booru sites"""
-from .common import Extractor, Message, generate_extractors
-from .. import text, util, exception
-
-from xml.etree import ElementTree
-import collections
+from .common import BaseExtractor, Message
+from .. import text
import operator
-import re
-class BooruExtractor(Extractor):
+class BooruExtractor(BaseExtractor):
"""Base class for *booru extractors"""
basecategory = "booru"
filename_fmt = "{category}_{id}_{md5}.{extension}"
@@ -66,191 +62,8 @@ class BooruExtractor(Extractor):
_file_url = operator.itemgetter("file_url")
- @staticmethod
- def _prepare(post):
- post["date"] = text.parse_datetime(
- post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ def _prepare(self, post):
+ """Prepare the 'post's metadata"""
def _extended_tags(self, post, page=None):
- if not page:
- url = "{}/index.php?page=post&s=view&id={}".format(
- self.root, post["id"])
- page = self.request(url).text
- html = text.extract(page, '<ul id="tag-', '</ul>')[0]
- if html:
- tags = collections.defaultdict(list)
- pattern = re.compile(
- r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
- for tag_type, tag_name in pattern.findall(html):
- tags[tag_type].append(text.unquote(tag_name))
- for key, value in tags.items():
- post["tags_" + key] = " ".join(value)
-
- def _api_request(self, params):
- url = self.root + "/index.php?page=dapi&s=post&q=index"
- return ElementTree.fromstring(self.request(url, params=params).text)
-
- def _pagination(self, params):
- params["pid"] = self.page_start
- params["limit"] = self.per_page
-
- while True:
- root = self._api_request(params)
- for post in root:
- yield post.attrib
-
- if len(root) < self.per_page:
- return
- params["pid"] += 1
-
-
-class BooruPostExtractor(BooruExtractor):
- subcategory = "post"
- archive_fmt = "{id}"
- pattern_fmt = r"/index\.php\?page=post&s=view&id=(\d+)"
-
- def __init__(self, match):
- BooruExtractor.__init__(self, match)
- self.post_id = match.group(1)
-
- def posts(self):
- return self._pagination({"id": self.post_id})
-
-
-class BooruTagExtractor(BooruExtractor):
- subcategory = "tag"
- directory_fmt = ("{category}", "{search_tags}")
- archive_fmt = "t_{search_tags}_{id}"
- pattern_fmt = r"/index\.php\?page=post&s=list&tags=([^&#]+)"
-
- def __init__(self, match):
- BooruExtractor.__init__(self, match)
- self.tags = text.unquote(match.group(1).replace("+", " "))
-
- def metadata(self):
- return {"search_tags": self.tags}
-
- def posts(self):
- return self._pagination({"tags" : self.tags})
-
-
-class BooruPoolExtractor(BooruExtractor):
- subcategory = "pool"
- directory_fmt = ("{category}", "pool", "{pool}")
- archive_fmt = "p_{pool}_{id}"
- pattern_fmt = r"/index\.php\?page=pool&s=show&id=(\d+)"
-
- def __init__(self, match):
- BooruExtractor.__init__(self, match)
- self.pool_id = match.group(1)
- self.post_ids = ()
-
- def skip(self, num):
- self.page_start += num
- return num
-
- def metadata(self):
- url = "{}/index.php?page=pool&s=show&id={}".format(
- self.root, self.pool_id)
- page = self.request(url).text
-
- name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
- if not name:
- raise exception.NotFoundError("pool")
- self.post_ids = text.extract_iter(
- page, 'class="thumb" id="p', '"', pos)
-
- return {
- "pool": text.parse_int(self.pool_id),
- "pool_name": text.unescape(name),
- }
-
- def posts(self):
- params = {}
- for params["id"] in util.advance(self.post_ids, self.page_start):
- for post in self._api_request(params):
- yield post.attrib
-
-
-EXTRACTORS = {
- "rule34": {
- "root": "https://rule34.xxx",
- "test-tag": (
- ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
- "count": 1,
- }),
- ),
- "test-pool": (
- ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
- "count": 3,
- }),
- ),
- "test-post": (
- ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
- "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "danraku",
- "tags_character": "kashima_(kantai_collection)",
- "tags_copyright": "kantai_collection",
- "tags_general": str,
- "tags_metadata": str,
- },
- }),
- ),
- },
- "safebooru": {
- "root": "https://safebooru.org",
- "test-tag": (
- ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
- "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
- "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
- }),
- ),
- "test-pool": (
- ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
- "count": 5,
- }),
- ),
- "test-post": (
- ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
- "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
- "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "kawanakajima",
- "tags_character": "heath_ledger ronald_mcdonald the_joker",
- "tags_copyright": "dc_comics mcdonald's the_dark_knight",
- "tags_general": str,
- },
- }),
- ),
- },
- "realbooru": {
- "root": "https://realbooru.com",
- "test-tag": (
- ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
- "count": ">= 64",
- }),
- ),
- "test-pool": (
- ("https://realbooru.com/index.php?page=pool&s=show&id=1", {
- "count": 3,
- }),
- ),
- "test-post": (
- ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
- "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
- "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
- }),
- ),
- },
-}
-
-generate_extractors(EXTRACTORS, globals(), (
- BooruTagExtractor,
- BooruPoolExtractor,
- BooruPostExtractor,
-))
+ """Generate extended tag information"""
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 15cc776..e9b9718 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2020 Mike Fährmann
+# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,6 +9,7 @@
"""Common classes and constants used by extractor modules."""
import re
+import ssl
import time
import netrc
import queue
@@ -16,8 +17,9 @@ import logging
import datetime
import requests
import threading
+from requests.adapters import HTTPAdapter
from .message import Message
-from .. import config, text, util, exception, cloudflare
+from .. import config, text, util, exception
class Extractor():
@@ -30,6 +32,7 @@ class Extractor():
filename_fmt = "{filename}.{extension}"
archive_fmt = ""
cookiedomain = ""
+ browser = None
root = ""
test = None
request_interval = 0.0
@@ -37,15 +40,15 @@ class Extractor():
request_timestamp = 0.0
def __init__(self, match):
- self.session = requests.Session()
self.log = logging.getLogger(self.category)
self.url = match.string
- self._cookiefile = None
- self._cookiejar = self.session.cookies
+ if self.basecategory:
+ self.config = self._config_shared
+ self.config_accumulate = self._config_shared_accumulate
+ self._cfgpath = ("extractor", self.category, self.subcategory)
self._parentdir = ""
- self._cfgpath = ("extractor", self.category, self.subcategory)
self._write_pages = self.config("write-pages", False)
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
@@ -58,11 +61,7 @@ class Extractor():
if self.request_interval < self.request_interval_min:
self.request_interval = self.request_interval_min
- if self.basecategory:
- self.config = self._config_shared
- self.config_accumulate = self._config_shared_accumulate
-
- self._init_headers()
+ self._init_session()
self._init_cookies()
self._init_proxies()
@@ -140,21 +139,20 @@ class Extractor():
if notfound and code == 404:
raise exception.NotFoundError(notfound)
- reason = response.reason
- if cloudflare.is_challenge(response):
- self.log.info("Solving Cloudflare challenge")
- response, domain, cookies = cloudflare.solve_challenge(
- session, response, kwargs)
- if cookies:
- cloudflare.cookies.update(
- self.category, (domain, cookies))
- return response
- if cloudflare.is_captcha(response):
- self.log.warning("Cloudflare CAPTCHA")
-
- msg = "'{} {}' for '{}'".format(code, reason, url)
+ msg = "'{} {}' for '{}'".format(code, response.reason, url)
+ server = response.headers.get("Server")
+ if server and server.startswith("cloudflare"):
+ if code == 503 and \
+ b"jschl-answer" in response.content:
+ self.log.warning("Cloudflare IUAM challenge")
+ break
+ if code == 403 and \
+ b'name="captcha-bypass"' in response.content:
+ self.log.warning("Cloudflare CAPTCHA")
+ break
if code < 500 and code != 429 and code != 430:
break
+
finally:
Extractor.request_timestamp = time.time()
@@ -212,19 +210,46 @@ class Extractor():
return username, password
- def _init_headers(self):
- """Initialize HTTP headers for the 'session' object"""
- headers = self.session.headers
+ def _init_session(self):
+ self.session = session = requests.Session()
+ headers = session.headers
headers.clear()
- headers["User-Agent"] = self.config(
- "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) "
- "Gecko/20100101 Firefox/68.0"))
- headers["Accept"] = "*/*"
- headers["Accept-Language"] = "en-US,en;q=0.5"
- headers["Accept-Encoding"] = "gzip, deflate"
- headers["Connection"] = "keep-alive"
- headers["Upgrade-Insecure-Requests"] = "1"
+ browser = self.config("browser") or self.browser
+ if browser:
+ browser, _, platform = browser.lower().partition(":")
+
+ if not platform or platform == "auto":
+ platform = ("Windows NT 10.0; Win64; x64"
+ if util.WINDOWS else "X11; Linux x86_64")
+ elif platform == "windows":
+ platform = "Windows NT 10.0; Win64; x64"
+ elif platform == "linux":
+ platform = "X11; Linux x86_64"
+ elif platform == "macos":
+ platform = "Macintosh; Intel Mac OS X 11.2"
+
+ if browser == "chrome":
+ _emulate_browser_chrome(session, platform)
+ else:
+ _emulate_browser_firefox(session, platform)
+ else:
+ headers["User-Agent"] = self.config("user-agent", (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
+ "rv:78.0) Gecko/20100101 Firefox/78.0"))
+ headers["Accept"] = "*/*"
+ headers["Accept-Language"] = "en-US,en;q=0.5"
+ headers["Accept-Encoding"] = "gzip, deflate"
+
+ custom_headers = self.config("headers")
+ if custom_headers:
+ headers.update(custom_headers)
+
+ ciphers = self.config("ciphers")
+ if ciphers:
+ if isinstance(ciphers, list):
+ ciphers = ":".join(ciphers)
+ session.mount("https://", HTTPSAdapter(ciphers))
def _init_proxies(self):
"""Update the session's proxy map"""
@@ -242,6 +267,8 @@ class Extractor():
def _init_cookies(self):
"""Populate the session's cookiejar"""
+ self._cookiefile = None
+ self._cookiejar = self.session.cookies
if self.cookiedomain is None:
return
@@ -264,11 +291,6 @@ class Extractor():
"expected 'dict' or 'str' value for 'cookies' option, "
"got '%s' (%s)", cookies.__class__.__name__, cookies)
- cookies = cloudflare.cookies(self.category)
- if cookies:
- domain, cookies = cookies
- self._update_cookies_dict(cookies, domain)
-
def _store_cookies(self):
"""Store the session's cookiejar in a cookies.txt file"""
if self._cookiefile and self.config("cookies-update", True):
@@ -527,46 +549,126 @@ class AsynchronousMixin():
messages.put(None)
-def generate_extractors(extractor_data, symtable, classes):
- """Dynamically generate Extractor classes"""
- extractors = config.get(("extractor",), classes[0].basecategory)
- ckey = extractor_data.get("_ckey")
- prev = None
-
- if extractors:
- extractor_data.update(extractors)
-
- for category, info in extractor_data.items():
-
- if not isinstance(info, dict) or "root" not in info:
- continue
-
- root = info["root"]
- domain = root[root.index(":") + 3:]
- pattern = info.get("pattern") or re.escape(domain)
- name = (info.get("name") or category).capitalize()
-
- for cls in classes:
-
- class Extr(cls):
- pass
- Extr.__module__ = cls.__module__
- Extr.__name__ = Extr.__qualname__ = \
- name + cls.subcategory.capitalize() + "Extractor"
- Extr.__doc__ = \
- "Extractor for " + cls.subcategory + "s from " + domain
- Extr.category = category
- Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt
- Extr.test = info.get("test-" + cls.subcategory)
- Extr.root = root
+class BaseExtractor(Extractor):
+ instances = ()
- if "extra" in info:
- for key, value in info["extra"].items():
- setattr(Extr, key, value)
- if prev and ckey:
- setattr(Extr, ckey, prev)
+ def __init__(self, match):
+ if not self.category:
+ for index, group in enumerate(match.groups()):
+ if group is not None:
+ self.category, self.root = self.instances[index]
+ break
+ Extractor.__init__(self, match)
- symtable[Extr.__name__] = prev = Extr
+ @classmethod
+ def update(cls, instances):
+ extra_instances = config.get(("extractor",), cls.basecategory)
+ if extra_instances:
+ for category, info in extra_instances.items():
+ if isinstance(info, dict) and "root" in info:
+ instances[category] = info
+
+ pattern_list = []
+ instance_list = cls.instances = []
+ for category, info in instances.items():
+ root = info["root"].rstrip("/")
+ instance_list.append((category, root))
+
+ pattern = info.get("pattern")
+ if not pattern:
+ pattern = re.escape(root[root.index(":") + 3:])
+ pattern_list.append(pattern + "()")
+
+ return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")"
+
+
+class HTTPSAdapter(HTTPAdapter):
+
+ def __init__(self, ciphers):
+ context = self.ssl_context = ssl.create_default_context()
+ context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
+ ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
+ context.set_ecdh_curve("prime256v1")
+ context.set_ciphers(ciphers)
+ HTTPAdapter.__init__(self)
+
+ def init_poolmanager(self, *args, **kwargs):
+ kwargs["ssl_context"] = self.ssl_context
+ return HTTPAdapter.init_poolmanager(self, *args, **kwargs)
+
+ def proxy_manager_for(self, *args, **kwargs):
+ kwargs["ssl_context"] = self.ssl_context
+ return HTTPAdapter.proxy_manager_for(self, *args, **kwargs)
+
+
+def _emulate_browser_firefox(session, platform):
+ headers = session.headers
+ headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:78.0) "
+ "Gecko/20100101 Firefox/78.0")
+ headers["Accept"] = ("text/html,application/xhtml+xml,"
+ "application/xml;q=0.9,image/webp,*/*;q=0.8")
+ headers["Accept-Language"] = "en-US,en;q=0.5"
+ headers["Accept-Encoding"] = "gzip, deflate"
+ headers["Referer"] = None
+ headers["Upgrade-Insecure-Requests"] = "1"
+ headers["Cookie"] = None
+
+ session.mount("https://", HTTPSAdapter(
+ "TLS_AES_128_GCM_SHA256:"
+ "TLS_CHACHA20_POLY1305_SHA256:"
+ "TLS_AES_256_GCM_SHA384:"
+ "ECDHE-ECDSA-AES128-GCM-SHA256:"
+ "ECDHE-RSA-AES128-GCM-SHA256:"
+ "ECDHE-ECDSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-CHACHA20-POLY1305:"
+ "ECDHE-ECDSA-AES256-GCM-SHA384:"
+ "ECDHE-RSA-AES256-GCM-SHA384:"
+ "ECDHE-ECDSA-AES256-SHA:"
+ "ECDHE-ECDSA-AES128-SHA:"
+ "ECDHE-RSA-AES128-SHA:"
+ "ECDHE-RSA-AES256-SHA:"
+ "DHE-RSA-AES128-SHA:"
+ "DHE-RSA-AES256-SHA:"
+ "AES128-SHA:"
+ "AES256-SHA:"
+ "DES-CBC3-SHA"
+ ))
+
+
+def _emulate_browser_chrome(session, platform):
+ if platform.startswith("Macintosh"):
+ platform = platform.replace(".", "_") + "_0"
+
+ headers = session.headers
+ headers["Upgrade-Insecure-Requests"] = "1"
+ headers["User-Agent"] = (
+ "Mozilla/5.0 (" + platform + ") AppleWebKit/537.36 "
+ "(KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36")
+ headers["Accept"] = ("text/html,application/xhtml+xml,application/xml;"
+ "q=0.9,image/webp,image/apng,*/*;q=0.8")
+ headers["Referer"] = None
+ headers["Accept-Encoding"] = "gzip, deflate"
+ headers["Accept-Language"] = "en-US,en;q=0.9"
+ headers["Cookie"] = None
+
+ session.mount("https://", HTTPSAdapter(
+ "TLS_AES_128_GCM_SHA256:"
+ "TLS_AES_256_GCM_SHA384:"
+ "TLS_CHACHA20_POLY1305_SHA256:"
+ "ECDHE-ECDSA-AES128-GCM-SHA256:"
+ "ECDHE-RSA-AES128-GCM-SHA256:"
+ "ECDHE-ECDSA-AES256-GCM-SHA384:"
+ "ECDHE-RSA-AES256-GCM-SHA384:"
+ "ECDHE-ECDSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-CHACHA20-POLY1305:"
+ "ECDHE-RSA-AES128-SHA:"
+ "ECDHE-RSA-AES256-SHA:"
+ "AES128-GCM-SHA256:"
+ "AES256-GCM-SHA384:"
+ "AES128-SHA:"
+ "AES256-SHA:"
+ "DES-CBC3-SHA"
+ ))
# Undo automatic pyOpenSSL injection by requests
@@ -578,38 +680,3 @@ if not pyopenssl:
except ImportError:
pass
del pyopenssl
-
-
-# Replace urllib3's default cipher list to avoid Cloudflare CAPTCHAs
-ciphers = config.get((), "ciphers", True)
-if ciphers:
-
- if ciphers is True:
- ciphers = (
- # Firefox's list
- "TLS_AES_128_GCM_SHA256:"
- "TLS_CHACHA20_POLY1305_SHA256:"
- "TLS_AES_256_GCM_SHA384:"
- "ECDHE-ECDSA-AES128-GCM-SHA256:"
- "ECDHE-RSA-AES128-GCM-SHA256:"
- "ECDHE-ECDSA-CHACHA20-POLY1305:"
- "ECDHE-RSA-CHACHA20-POLY1305:"
- "ECDHE-ECDSA-AES256-GCM-SHA384:"
- "ECDHE-RSA-AES256-GCM-SHA384:"
- "ECDHE-ECDSA-AES256-SHA:"
- "ECDHE-ECDSA-AES128-SHA:"
- "ECDHE-RSA-AES128-SHA:"
- "ECDHE-RSA-AES256-SHA:"
- "DHE-RSA-AES128-SHA:"
- "DHE-RSA-AES256-SHA:"
- "AES128-SHA:"
- "AES256-SHA:"
- "DES-CBC3-SHA"
- )
- elif isinstance(ciphers, list):
- ciphers = ":".join(ciphers)
-
- from requests.packages.urllib3.util import ssl_ # noqa
- ssl_.DEFAULT_CIPHERS = ciphers
- del ssl_
-del ciphers
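
Previously this module swapped urllib3's global `DEFAULT_CIPHERS` at import time; the new `HTTPSAdapter` scopes the cipher list to one mounted `requests` session instead, so different extractors can present different TLS fingerprints. A minimal standalone sketch of the same technique, using only the stdlib `ssl` module and `requests` (cipher string shortened for readability):

import ssl
import requests
from requests.adapters import HTTPAdapter

class CipherAdapter(HTTPAdapter):
    """HTTPS transport with a pinned cipher list and TLS >= 1.2"""

    def __init__(self, ciphers):
        context = self.ssl_context = ssl.create_default_context()
        context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
                            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
        context.set_ciphers(ciphers)
        HTTPAdapter.__init__(self)

    def init_poolmanager(self, *args, **kwargs):
        kwargs["ssl_context"] = self.ssl_context
        return HTTPAdapter.init_poolmanager(self, *args, **kwargs)

session = requests.Session()
session.mount("https://", CipherAdapter("ECDHE-RSA-AES128-GCM-SHA256:"
                                        "ECDHE-RSA-AES256-GCM-SHA384"))
print(session.get("https://example.org").status_code)
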
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
new file mode 100644
index 0000000..a057b84
--- /dev/null
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://cyberdrop.me/"""
+
+from .common import Extractor, Message
+from .. import text
+import base64
+
+
+class CyberdropAlbumExtractor(Extractor):
+ category = "cyberdrop"
+ subcategory = "album"
+ root = "https://cyberdrop.me"
+ directory_fmt = ("{category}", "{album_id} {album_name}")
+ archive_fmt = "{album_id}_{id}"
+ pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.me/a/([^/?#]+)"
+ test = ("https://cyberdrop.me/a/keKRjm4t", {
+ "pattern": r"https://f\.cyberdrop\.cc/.*\.[a-z]+$",
+ "keyword": {
+ "album_id": "keKRjm4t",
+ "album_name": "Fate (SFW)",
+ "album_size": 150069254,
+ "count": 62,
+ "date": "dt:2020-06-18 13:14:20",
+ "description": "",
+ "id": r"re:\w{8}",
+ },
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.album_id = match.group(1)
+
+ def items(self):
+ url = self.root + "/a/" + self.album_id
+ extr = text.extract_from(self.request(url).text)
+ extr("const albumData = {", "")
+
+ data = {
+ "album_id" : self.album_id,
+ "album_name" : extr("name: '", "'"),
+ "date" : text.parse_timestamp(extr("timestamp: ", ",")),
+ "album_size" : text.parse_int(extr("totalSize: ", ",")),
+ "description": extr("description: `", "`"),
+ }
+ files = extr("fl: '", "'").split(",")
+ data["count"] = len(files)
+
+ yield Message.Directory, data
+ for file_b64 in files:
+ file = base64.b64decode(file_b64.encode()).decode()
+ text.nameext_from_url(file, data)
+ data["filename"], _, data["id"] = data["filename"].rpartition("-")
+ yield Message.Url, "https://f.cyberdrop.cc/" + file, data
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 2eb3b28..47286b7 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -78,6 +78,10 @@ class DeviantartExtractor(Extractor):
else:
self.user = profile["user"]["username"]
+ if self.extra:
+ finditer_stash = DeviantartStashExtractor.pattern.finditer
+ finditer_deviation = DeviantartDeviationExtractor.pattern.finditer
+
yield Message.Version, 1
for deviation in self.deviations():
if isinstance(deviation, tuple):
@@ -134,10 +138,14 @@ class DeviantartExtractor(Extractor):
if self.extra:
txt = (deviation.get("description", "") +
deviation.get("_journal", ""))
- for match in DeviantartStashExtractor.pattern.finditer(txt):
+ for match in finditer_stash(txt):
url = text.ensure_http_scheme(match.group(0))
deviation["_extractor"] = DeviantartStashExtractor
yield Message.Queue, url, deviation
+ for match in finditer_deviation(txt):
+ url = text.ensure_http_scheme(match.group(0))
+ deviation["_extractor"] = DeviantartDeviationExtractor
+ yield Message.Queue, url, deviation
def deviations(self):
"""Return an iterable containing all relevant Deviation-objects"""
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 1c6ebb4..842de7e 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -50,7 +50,8 @@ class EromeExtractor(Extractor):
for data["num"], group in enumerate(util.advance(groups, 1), 1):
url = (text.extract(group, '<source src="', '"')[0] or
text.extract(group, 'data-src="', '"')[0])
- yield Message.Url, url, text.nameext_from_url(url, data)
+ if url:
+ yield Message.Url, url, text.nameext_from_url(url, data)
def albums(self):
return ()
@@ -84,14 +85,14 @@ class EromeAlbumExtractor(EromeExtractor):
"""Extractor for albums on erome.com"""
subcategory = "album"
pattern = BASE_PATTERN + r"/a/(\w+)"
- test = ("https://www.erome.com/a/UHUX1B73", {
- "pattern": r"https://s\d+\.erome\.com/342/UHUX1B73/\w+",
- "count": 5,
+ test = ("https://www.erome.com/a/KandxY7y", {
+ "pattern": r"https://s\d+\.erome\.com/355/KandxY7y/\w+",
+ "count": 26,
"keyword": {
- "album_id": "UHUX1B73",
+ "album_id": "KandxY7y",
"num": int,
- "title": "Ryan Ryans",
- "user": "gutiquq",
+ "title": "Therealbrittfitt",
+ "user": "pokow",
},
})
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 4ead3fb..5a7de23 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2020 Mike Fährmann
+# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,24 +12,22 @@ from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
-import random
-import time
import math
-
BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
class ExhentaiExtractor(Extractor):
"""Base class for exhentai extractors"""
category = "exhentai"
- directory_fmt = ("{category}", "{gallery_id} {title[:247]}")
+ directory_fmt = ("{category}", "{gid} {title[:247]}")
filename_fmt = (
- "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}")
- archive_fmt = "{gallery_id}_{num}"
+ "{gid}_{num:>04}_{image_token}_{filename}.{extension}")
+ archive_fmt = "{gid}_{num}"
cookienames = ("ipb_member_id", "ipb_pass_hash")
cookiedomain = ".exhentai.org"
root = "https://exhentai.org"
+ request_interval = 5.0
LIMIT = False
@@ -47,8 +45,6 @@ class ExhentaiExtractor(Extractor):
Extractor.__init__(self, match)
self.limits = self.config("limits", True)
self.original = self.config("original", True)
- self.wait_min = self.config("wait-min", 3)
- self.wait_max = self.config("wait-max", 6)
if type(self.limits) is int:
self._limit_max = self.limits
@@ -57,8 +53,6 @@ class ExhentaiExtractor(Extractor):
self._limit_max = 0
self._remaining = 0
- if self.wait_max < self.wait_min:
- self.wait_max = self.wait_min
self.session.headers["Referer"] = self.root + "/"
if version != "ex":
self.session.cookies.set("nw", "1", domain=self.cookiedomain)
@@ -70,14 +64,6 @@ class ExhentaiExtractor(Extractor):
raise exception.AuthorizationError()
return response
- def wait(self, waittime=None):
- """Wait for a randomly chosen amount of seconds"""
- if not waittime:
- waittime = random.uniform(self.wait_min, self.wait_max)
- else:
- waittime = random.uniform(waittime * 0.66, waittime * 1.33)
- time.sleep(waittime)
-
def login(self):
"""Login and set necessary cookies"""
if self.LIMIT:
@@ -132,7 +118,39 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
r"|/s/([\da-f]{10})/(\d+)-(\d+))")
test = (
("https://exhentai.org/g/1200119/d55c44d3d0/", {
- "keyword": "199db053b4ccab94463b459e1cfe079df8cdcdd1",
+ "keyword": {
+ "cost": int,
+ "date": "dt:2018-03-18 20:15:00",
+ "eh_category": "Non-H",
+ "expunged": False,
+ "favorites": "17",
+ "filecount": "4",
+ "filesize": 1488978,
+ "gid": 1200119,
+ "height": int,
+ "image_token": "re:[0-9a-f]{10}",
+ "lang": "jp",
+ "language": "Japanese",
+ "parent": "",
+ "rating": r"re:\d\.\d+",
+ "size": int,
+ "tags": [
+ "parody:komi-san wa komyushou desu.",
+ "character:shouko komi",
+ "group:seventh lowlife",
+ "sample",
+ ],
+ "thumb": "https://exhentai.org/t/ce/0a/ce0a5bcb583229a9b07c0f8"
+ "3bcb1630ab1350640-624622-736-1036-jpg_250.jpg",
+ "title": "C93 [Seventh_Lowlife] Komi-san ha Tokidoki Daitan de"
+ "su (Komi-san wa Komyushou desu) [Sample]",
+ "title_jpn": "(C93) [Comiketjack (わ!)] 古見さんは、時々大胆"
+ "です。 (古見さんは、コミュ症です。) [見本]",
+ "token": "d55c44d3d0",
+ "torrentcount": "0",
+ "uploader": "klorpa",
+ "width": int,
+ },
"content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
}),
("https://exhentai.org/g/960461/4f0e369d82/", {
@@ -169,7 +187,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.log.error("Failed to extract initial image token")
self.log.debug("Page content:\n%s", gpage)
return
- self.wait()
ipage = self._image_page()
else:
ipage = self._image_page()
@@ -179,13 +196,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.log.debug("Page content:\n%s", ipage)
return
self.gallery_token = part.split("/")[1]
- self.wait()
gpage = self._gallery_page()
data = self.get_metadata(gpage)
- self.count = data["count"]
-
- yield Message.Version, 1
+ self.count = text.parse_int(data["filecount"])
yield Message.Directory, data
images = itertools.chain(
@@ -196,39 +210,64 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self._check_limits(data)
if "/fullimg.php" in url:
data["extension"] = ""
- self.wait(self.wait_max / 4)
yield Message.Url, url, data
def get_metadata(self, page):
"""Extract gallery metadata"""
+ data = self.metadata_from_page(page)
+ if self.config("metadata", False):
+ data.update(self.metadata_from_api())
+ data["date"] = text.parse_timestamp(data["posted"])
+ return data
+
+ def metadata_from_page(self, page):
extr = text.extract_from(page)
data = {
- "gallery_id" : self.gallery_id,
- "gallery_token": self.gallery_token,
+ "gid" : self.gallery_id,
+ "token" : self.gallery_token,
+ "thumb" : extr("background:transparent url(", ")"),
"title" : text.unescape(extr('<h1 id="gn">', '</h1>')),
- "title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')),
+ "title_jpn" : text.unescape(extr('<h1 id="gj">', '</h1>')),
+ "_" : extr('<div id="gdc"><div class="cs ct', '"'),
+ "eh_category" : extr('>', '<'),
+ "uploader" : text.unquote(extr('/uploader/', '"')),
"date" : text.parse_datetime(extr(
'>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
"parent" : extr(
'>Parent:</td><td class="gdt2"><a href="', '"'),
- "visible" : extr(
+ "expunged" : "Yes" != extr(
'>Visible:</td><td class="gdt2">', '<'),
- "language" : extr(
- '>Language:</td><td class="gdt2">', ' '),
- "gallery_size" : text.parse_bytes(extr(
+ "language" : extr('>Language:</td><td class="gdt2">', ' '),
+ "filesize" : text.parse_bytes(extr(
'>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
- "count" : text.parse_int(extr(
- '>Length:</td><td class="gdt2">', ' ')),
+ "filecount" : extr('>Length:</td><td class="gdt2">', ' '),
+ "favorites" : extr('id="favcount">', ' '),
+ "rating" : extr(">Average: ", "<"),
+ "torrentcount" : extr('>Torrent Download (', ')'),
}
data["lang"] = util.language_to_code(data["language"])
data["tags"] = [
- text.unquote(tag)
+ text.unquote(tag.replace("+", " "))
for tag in text.extract_iter(page, 'hentai.org/tag/', '"')
]
return data
+ def metadata_from_api(self):
+ url = self.root + "/api.php"
+ data = {
+ "method": "gdata",
+ "gidlist": ((self.gallery_id, self.gallery_token),),
+ "namespace": 1,
+ }
+
+ data = self.request(url, method="POST", json=data).json()
+ if "error" in data:
+ raise exception.StopExtraction(data["error"])
+
+ return data["gmetadata"][0]
+
def image_from_page(self, page):
"""Get image url and data from webpage"""
pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
@@ -267,7 +306,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"showkey": self.key["show"],
}
for request["page"] in range(self.image_num + 1, self.count + 1):
- self.wait()
page = self.request(api_url, method="POST", json=request).json()
imgkey = nextkey
nextkey, pos = text.extract(page["i3"], "'", "'")
@@ -317,7 +355,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
return page
def _check_limits(self, data):
- if not self._remaining or data["num"] % 20 == 0:
+ if not self._remaining or data["num"] % 25 == 0:
self._update_limits()
self._remaining -= data["cost"]
@@ -400,7 +438,6 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
def items(self):
self.login()
- yield Message.Version, 1
data = {"_extractor": ExhentaiGalleryExtractor}
while True:
@@ -417,7 +454,6 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
if 'class="ptdd">&gt;<' in page or ">No hits found</p>" in page:
return
self.params["page"] += 1
- self.wait()
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
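
`metadata_from_api` posts a `gdata` call to the gallery site's own `/api.php`; the same request shape also works against the public endpoint, which is the documented way to resolve a gallery id/token pair into full metadata. A sketch using the id and token from the first test case above (the `api.e-hentai.org` host is assumed from the public API documentation, not from this diff):

import requests

payload = {
    "method": "gdata",
    "gidlist": [[1200119, "d55c44d3d0"]],
    "namespace": 1,
}
data = requests.post("https://api.e-hentai.org/api.php", json=payload).json()
if "error" in data:
    raise RuntimeError(data["error"])

meta = data["gmetadata"][0]
print(meta["title"], meta["filecount"], meta["posted"])
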
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 319ebe2..0bcec2b 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -8,21 +8,21 @@
"""Extractors for 4chan archives based on FoolFuuka"""
-from .common import Extractor, Message, generate_extractors
+from .common import BaseExtractor, Message
from .. import text
import itertools
-class FoolfuukaExtractor(Extractor):
+class FoolfuukaExtractor(BaseExtractor):
"""Base extractor for FoolFuuka based boards/archives"""
basecategory = "foolfuuka"
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
external = "default"
def __init__(self, match):
- Extractor.__init__(self, match)
+ BaseExtractor.__init__(self, match)
self.session.headers["Referer"] = self.root
- if self.external == "direct":
+ if self.category == "b4k":
self.remote = self._remote_direct
def items(self):
@@ -43,7 +43,7 @@ class FoolfuukaExtractor(Extractor):
yield Message.Url, url, post
def metadata(self):
- """ """
+ """Return general metadata"""
def posts(self):
"""Return an iterable with all relevant posts"""
@@ -59,16 +59,90 @@ class FoolfuukaExtractor(Extractor):
return media["remote_media_link"]
+BASE_PATTERN = FoolfuukaExtractor.update({
+ "4plebs": {
+ "root": "https://archive.4plebs.org",
+ "pattern": r"(?:archive\.)?4plebs\.org",
+ },
+ "archivedmoe": {
+ "root": "https://archived.moe",
+ },
+ "archiveofsins": {
+ "root": "https://archiveofsins.com",
+ "pattern": r"(?:www\.)?archiveofsins\.com",
+ },
+ "b4k": {
+ "root": "https://arch.b4k.co",
+ },
+ "desuarchive": {
+ "root": "https://desuarchive.org",
+ },
+ "fireden": {
+ "root": "https://boards.fireden.net",
+ },
+ "nyafuu": {
+ "root": "https://archive.nyafuu.org",
+ "pattern": r"(?:archive\.)?nyafuu\.org",
+ },
+ "rbt": {
+ "root": "https://rbt.asia",
+ "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
+ },
+ "thebarchive": {
+ "root": "https://thebarchive.com",
+ "pattern": r"thebarchive\.com",
+ },
+})
+
+
class FoolfuukaThreadExtractor(FoolfuukaExtractor):
"""Base extractor for threads on FoolFuuka based boards/archives"""
subcategory = "thread"
directory_fmt = ("{category}", "{board[shortname]}",
"{thread_num}{title:? - //}")
- pattern_fmt = r"/([^/?#]+)/thread/(\d+)"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)"
+ test = (
+ ("https://archive.4plebs.org/tg/thread/54059290", {
+ "url": "07452944164b602502b02b24521f8cee5c484d2a",
+ }),
+ ("https://archived.moe/gd/thread/309639/", {
+ "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
+ "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
+ }),
+ ("https://archived.moe/a/thread/159767162/", {
+ "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
+ }),
+ ("https://archiveofsins.com/h/thread/4668813/", {
+ "url": "f612d287087e10a228ef69517cf811539db9a102",
+ "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
+ }),
+ ("https://arch.b4k.co/meta/thread/196/", {
+ "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a",
+ }),
+ ("https://desuarchive.org/a/thread/159542679/", {
+ "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
+ }),
+ ("https://boards.fireden.net/sci/thread/11264294/", {
+ "url": "3adfe181ee86a8c23021c705f623b3657a9b0a43",
+ }),
+ ("https://archive.nyafuu.org/c/thread/2849220/", {
+ "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
+ }),
+ ("https://rbt.asia/g/thread/61487650/", {
+ "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ }),
+ ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
+ "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ }),
+ ("https://thebarchive.com/b/thread/739772332/", {
+ "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
+ }),
+ )
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
- self.board, self.thread = match.groups()
+ self.board = match.group(match.lastindex-1)
+ self.thread = match.group(match.lastindex)
self.data = None
def metadata(self):
@@ -78,23 +152,34 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
return self.data["op"]
def posts(self):
+ op = (self.data["op"],)
posts = self.data.get("posts")
if posts:
posts = list(posts.values())
posts.sort(key=lambda p: p["timestamp"])
- else:
- posts = ()
- return itertools.chain((self.data["op"],), posts)
+ return itertools.chain(op, posts)
+ return op
class FoolfuukaBoardExtractor(FoolfuukaExtractor):
"""Base extractor for FoolFuuka based boards/archives"""
subcategory = "board"
- pattern_fmt = r"/([^/?#]+)/\d*$"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$"
+ test = (
+ ("https://archive.4plebs.org/tg/"),
+ ("https://archived.moe/gd/"),
+ ("https://archiveofsins.com/h/"),
+ ("https://arch.b4k.co/meta/"),
+ ("https://desuarchive.org/a/"),
+ ("https://boards.fireden.net/sci/"),
+ ("https://archive.nyafuu.org/c/"),
+ ("https://rbt.asia/g/"),
+ ("https://thebarchive.com/b/"),
+ )
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
- self.board = match.group(1)
+ self.board = match.group(match.lastindex)
def items(self):
index_base = "{}/_/api/chan/index/?board={}&page=".format(
@@ -113,7 +198,7 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
for num, thread in threads.items():
thread["url"] = thread_base + format(num)
- thread["_extractor"] = self.childclass
+ thread["_extractor"] = FoolfuukaThreadExtractor
yield Message.Queue, thread["url"], thread
@@ -121,15 +206,24 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
"""Base extractor for search results on FoolFuuka based boards/archives"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search}")
- pattern_fmt = r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
request_interval = 1.0
+ test = (
+ ("https://archive.4plebs.org/_/search/text/test/"),
+ ("https://archived.moe/_/search/text/test/"),
+ ("https://archiveofsins.com/_/search/text/test/"),
+ ("https://archiveofsins.com/_/search/text/test/"),
+ ("https://desuarchive.org/_/search/text/test/"),
+ ("https://boards.fireden.net/_/search/text/test/"),
+ ("https://archive.nyafuu.org/_/search/text/test/"),
+ ("https://rbt.asia/_/search/text/test/"),
+ ("https://thebarchive.com/_/search/text/test/"),
+ )
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
- board, search = match.groups()
-
self.params = params = {}
- args = search.split("/")
+ args = match.group(match.lastindex).split("/")
key = None
for arg in args:
@@ -138,6 +232,8 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
key = None
else:
key = arg
+
+ board = match.group(match.lastindex-1)
if board != "_":
params["boards"] = board
@@ -170,105 +266,3 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
if len(posts) <= 3:
return
params["page"] += 1
-
-
-EXTRACTORS = {
- "4plebs": {
- "name": "_4plebs",
- "root": "https://archive.4plebs.org",
- "pattern": r"(?:archive\.)?4plebs\.org",
- "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", {
- "url": "07452944164b602502b02b24521f8cee5c484d2a",
- }),
- "test-board": ("https://archive.4plebs.org/tg/",),
- "test-search": ("https://archive.4plebs.org/_/search/text/test/",),
- },
- "archivedmoe": {
- "root": "https://archived.moe",
- "test-thread": (
- ("https://archived.moe/gd/thread/309639/", {
- "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
- "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
- }),
- ("https://archived.moe/a/thread/159767162/", {
- "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
- }),
- ),
- "test-board": ("https://archived.moe/gd/",),
- "test-search": ("https://archived.moe/_/search/text/test/",),
- },
- "archiveofsins": {
- "root": "https://archiveofsins.com",
- "pattern": r"(?:www\.)?archiveofsins\.com",
- "test-thread": ("https://archiveofsins.com/h/thread/4668813/", {
- "url": "f612d287087e10a228ef69517cf811539db9a102",
- "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
- }),
- "test-board": ("https://archiveofsins.com/h/",),
- "test-search": ("https://archiveofsins.com/_/search/text/test/",),
- },
- "b4k": {
- "root": "https://arch.b4k.co",
- "extra": {"external": "direct"},
- "test-thread": ("https://arch.b4k.co/meta/thread/196/", {
- "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a",
- }),
- "test-board": ("https://arch.b4k.co/meta/",),
- "test-search": ("https://arch.b4k.co/_/search/text/test/",),
- },
- "desuarchive": {
- "root": "https://desuarchive.org",
- "test-thread": ("https://desuarchive.org/a/thread/159542679/", {
- "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
- }),
- "test-board": ("https://desuarchive.org/a/",),
- "test-search": ("https://desuarchive.org/_/search/text/test/",),
- },
- "fireden": {
- "root": "https://boards.fireden.net",
- "test-thread": ("https://boards.fireden.net/sci/thread/11264294/", {
- "url": "3adfe181ee86a8c23021c705f623b3657a9b0a43",
- }),
- "test-board": ("https://boards.fireden.net/sci/",),
- "test-search": ("https://boards.fireden.net/_/search/text/test/",),
- },
- "nyafuu": {
- "root": "https://archive.nyafuu.org",
- "pattern": r"(?:archive\.)?nyafuu\.org",
- "test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", {
- "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
- }),
- "test-board": ("https://archive.nyafuu.org/c/",),
- "test-search": ("https://archive.nyafuu.org/_/search/text/test/",),
- },
- "rbt": {
- "root": "https://rbt.asia",
- "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
- "test-thread": (
- ("https://rbt.asia/g/thread/61487650/", {
- "url": "61896d9d9a2edb556b619000a308a984307b6d30",
- }),
- ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
- "url": "61896d9d9a2edb556b619000a308a984307b6d30",
- }),
- ),
- "test-board": ("https://rbt.asia/g/",),
- "test-search": ("https://rbt.asia/_/search/text/test/",),
- },
- "thebarchive": {
- "root": "https://thebarchive.com",
- "pattern": r"thebarchive\.com",
- "test-thread": ("https://thebarchive.com/b/thread/739772332/", {
- "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
- }),
- "test-board": ("https://thebarchive.com/b/",),
- "test-search": ("https://thebarchive.com/_/search/text/test/",),
- },
- "_ckey": "childclass",
-}
-
-generate_extractors(EXTRACTORS, globals(), (
- FoolfuukaThreadExtractor,
- FoolfuukaBoardExtractor,
- FoolfuukaSearchExtractor,
-))
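
The switch from `generate_extractors()` to `BaseExtractor.update()` replaces per-site generated classes with one combined pattern: `update()` appends an empty capture group `()` to each instance's domain pattern, so the position of the non-`None` group identifies which site matched, while `match.lastindex` still points at the trailing board/thread groups. The mechanism in miniature, with two of the instances above:

import re

instances = [("4plebs", "archive.4plebs.org"),
             ("desuarchive", "desuarchive.org")]
pattern = re.compile(
    r"(?:https?://)?(?:" +
    "|".join(re.escape(domain) + "()" for _, domain in instances) +
    r")/([^/?#]+)/thread/(\d+)")

match = pattern.match("https://desuarchive.org/a/thread/159542679")
# the empty group of the matching alternative captured "" (not None)
index = next(i for i, g in enumerate(match.groups()) if g is not None)
category, domain = instances[index]

print(category)                            # desuarchive
print(match.group(match.lastindex - 1))    # a  (board)
print(match.group(match.lastindex))        # 159542679  (thread)
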
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index db5e250..f8664e7 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2020 Mike Fährmann
+# Copyright 2016-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,23 +8,21 @@
"""Extractors for FoOlSlide based sites"""
-from .common import (
- Extractor,
- ChapterExtractor,
- MangaExtractor,
- Message,
- generate_extractors,
-)
+from .common import BaseExtractor, Message
from .. import text, util
import json
-class FoolslideBase():
+class FoolslideExtractor(BaseExtractor):
"""Base class for FoOlSlide extractors"""
basecategory = "foolslide"
+ def __init__(self, match):
+ BaseExtractor.__init__(self, match)
+ self.gallery_url = self.root + match.group(match.lastindex)
+
def request(self, url):
- return Extractor.request(
+ return BaseExtractor.request(
self, url, encoding="utf-8", method="POST", data={"adult": "true"})
@staticmethod
@@ -40,12 +38,53 @@ class FoolslideBase():
return data
-class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
+BASE_PATTERN = FoolslideExtractor.update({
+ "dokireader": {
+ "root": "https://kobato.hologfx.com/reader",
+ },
+ "kireicake": {
+ "root": "https://reader.kireicake.com",
+ },
+ "powermanga": {
+ "root": "https://read.powermanga.org",
+ "pattern": r"read(?:er)?\.powermanga\.org",
+ },
+ "sensescans": {
+ "root": "https://sensescans.com/reader",
+ "pattern": r"(?:(?:www\.)?sensescans\.com/reader"
+ r"|reader\.sensescans\.com)",
+ },
+})
+
+
+class FoolslideChapterExtractor(FoolslideExtractor):
"""Base class for chapter extractors for FoOlSlide based sites"""
+ subcategory = "chapter"
directory_fmt = ("{category}", "{manga}", "{chapter_string}")
+ filename_fmt = (
+ "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}")
archive_fmt = "{id}"
- pattern_fmt = r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
- decode = "default"
+ pattern = BASE_PATTERN + r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
+ test = (
+ (("https://kobato.hologfx.com/reader/read/"
+ "hitoribocchi_no_oo_seikatsu/en/3/34"), {
+ "keyword": "6e719ac86f0c6dab89390dd7e507e678459e0dbc",
+ }),
+ ("https://reader.kireicake.com/read/wonderland/en/1/1/", {
+ "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e",
+ "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6",
+ }),
+ (("https://read.powermanga.org"
+ "/read/one_piece_digital_colour_comics/en/0/75/"), {
+ "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384",
+ "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe",
+ }),
+ ("https://sensescans.com/reader/read/ao_no_orchestra/en/0/26/", {
+ "url": "bbd428dc578f5055e9f86ad635b510386cd317cd",
+ "keyword": "083ef6f8831c84127fe4096fa340a249be9d1424",
+ }),
+ ("https://reader.sensescans.com/read/ao_no_orchestra/en/0/26/"),
+ )
def items(self):
page = self.request(self.gallery_url).text
@@ -83,9 +122,51 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
return json.loads(text.extract(page, "var pages = ", ";")[0])
-class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
+class FoolslideMangaExtractor(FoolslideExtractor):
"""Base class for manga extractors for FoOlSlide based sites"""
- pattern_fmt = r"(/series/[^/?#]+)"
+ subcategory = "manga"
+ categorytransfer = True
+ pattern = BASE_PATTERN + r"(/series/[^/?#]+)"
+ test = (
+ (("https://kobato.hologfx.com/reader/series/"
+ "boku_ha_ohimesama_ni_narenai/"), {
+ "url": "1c1f5a7258ce4f631f5fc32be548d78a6a57990d",
+ "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995",
+ }),
+ ("https://reader.kireicake.com/series/wonderland/", {
+ "url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
+ "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
+ }),
+ (("https://read.powermanga.org"
+ "/series/one_piece_digital_colour_comics/"), {
+ "count": ">= 1",
+ "keyword": {
+ "chapter": int,
+ "chapter_minor": str,
+ "chapter_string": str,
+ "group": "PowerManga",
+ "lang": "en",
+ "language": "English",
+ "manga": "One Piece Digital Colour Comics",
+ "title": str,
+ "volume": int,
+ },
+ }),
+ ("https://sensescans.com/reader/series/yotsubato/", {
+ "count": ">= 3",
+ }),
+ )
+
+ def items(self):
+ page = self.request(self.gallery_url).text
+
+ chapters = self.chapters(page)
+ if not self.config("chapter-reverse", False):
+ chapters.reverse()
+
+ for chapter, data in chapters:
+ data["_extractor"] = FoolslideChapterExtractor
+ yield Message.Queue, chapter, data
def chapters(self, page):
extr = text.extract_from(page)
@@ -103,82 +184,3 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
"chapter_string": extr('title="', '"'),
"group" : extr('title="', '"'),
})))
-
-
-EXTRACTORS = {
- "dokireader": {
- "root": "https://kobato.hologfx.com/reader",
- "test-chapter":
- (("https://kobato.hologfx.com/reader/read/"
- "hitoribocchi_no_oo_seikatsu/en/3/34"), {
- "keyword": "6e719ac86f0c6dab89390dd7e507e678459e0dbc",
- }),
- "test-manga":
- (("https://kobato.hologfx.com/reader/series/"
- "boku_ha_ohimesama_ni_narenai/"), {
- "url": "1c1f5a7258ce4f631f5fc32be548d78a6a57990d",
- "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995",
- }),
- },
- "kireicake": {
- "root": "https://reader.kireicake.com",
- "test-chapter":
- ("https://reader.kireicake.com/read/wonderland/en/1/1/", {
- "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e",
- "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6",
- }),
- "test-manga":
- ("https://reader.kireicake.com/series/wonderland/", {
- "url": "d067b649af1cc88fa8c8b698fde04a10909fd169",
- "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb",
- }),
- },
- "powermanga": {
- "root": "https://read.powermanga.org",
- "pattern": r"read(?:er)?\.powermanga\.org",
- "test-chapter":
- (("https://read.powermanga.org"
- "/read/one_piece_digital_colour_comics/en/0/75/"), {
- "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384",
- "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe",
- }),
- "test-manga":
- (("https://read.powermanga.org"
- "/series/one_piece_digital_colour_comics/"), {
- "count": ">= 1",
- "keyword": {
- "chapter": int,
- "chapter_minor": str,
- "chapter_string": str,
- "group": "PowerManga",
- "lang": "en",
- "language": "English",
- "manga": "One Piece Digital Colour Comics",
- "title": str,
- "volume": int,
- },
- }),
- },
- "sensescans": {
- "root": "https://sensescans.com/reader",
- "pattern": r"(?:(?:www\.)?sensescans\.com/reader"
- r"|reader\.sensescans\.com)",
- "test-chapter": (
- ("https://sensescans.com/reader/read/ao_no_orchestra/en/0/26/", {
- "url": "bbd428dc578f5055e9f86ad635b510386cd317cd",
- "keyword": "083ef6f8831c84127fe4096fa340a249be9d1424",
- }),
- ("https://reader.sensescans.com/read/ao_no_orchestra/en/0/26/"),
- ),
- "test-manga":
- ("https://sensescans.com/reader/series/yotsubato/", {
- "count": ">= 3",
- }),
- },
- "_ckey": "chapterclass",
-}
-
-generate_extractors(EXTRACTORS, globals(), (
- FoolslideChapterExtractor,
- FoolslideMangaExtractor,
-))
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 7a28e9c..92d27a9 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2020 Mike Fährmann
+# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,13 +8,14 @@
"""Extractors for https://gelbooru.com/"""
-from . import booru
+from . import gelbooru_v02
from .. import text, exception
class GelbooruBase():
"""Base class for gelbooru extractors"""
category = "gelbooru"
+ basecategory = "booru"
root = "https://gelbooru.com"
@staticmethod
@@ -27,7 +28,8 @@ class GelbooruBase():
return url
-class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):
+class GelbooruTagExtractor(GelbooruBase,
+ gelbooru_v02.GelbooruV02TagExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
@@ -42,7 +44,8 @@ class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor):
)
-class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
+class GelbooruPoolExtractor(GelbooruBase,
+ gelbooru_v02.GelbooruV02PoolExtractor):
"""Extractor for image-pools from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(?P<pool>\d+)")
@@ -72,7 +75,8 @@ class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor):
}
-class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor):
+class GelbooruPostExtractor(GelbooruBase,
+ gelbooru_v02.GelbooruV02PostExtractor):
"""Extractor for single images from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
new file mode 100644
index 0000000..0935998
--- /dev/null
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Gelbooru v0.1 sites"""
+
+from . import booru
+from .. import text
+
+
+class GelbooruV01Extractor(booru.BooruExtractor):
+ basecategory = "gelbooru_v01"
+ per_page = 20
+
+ def _parse_post(self, post_id):
+ url = "{}/index.php?page=post&s=view&id={}".format(
+ self.root, post_id)
+ page = self.request(url).text
+
+ post = text.extract_all(page, (
+ ("created_at", 'Posted: ', ' <'),
+ ("uploader" , 'By: ', ' <'),
+ ("width" , 'Size: ', 'x'),
+ ("height" , '', ' <'),
+ ("source" , 'Source: <a href="', '"'),
+ ("rating" , 'Rating: ', '<'),
+ ("score" , 'Score: ', ' <'),
+ ("file_url" , '<img alt="img" src="', '"'),
+ ("tags" , 'id="tags" name="tags" cols="40" rows="5">', '<'),
+ ))[0]
+
+ post["id"] = post_id
+ post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+ post["rating"] = (post["rating"] or "?")[0].lower()
+ post["tags"] = text.unescape(post["tags"])
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%Y-%m-%d %H:%M:%S")
+
+ return post
+
+
+BASE_PATTERN = GelbooruV01Extractor.update({
+ "thecollection" : {"root": "https://the-collection.booru.org"},
+ "illusioncardsbooru": {"root": "https://illusioncards.booru.org"},
+ "allgirlbooru" : {"root": "https://allgirl.booru.org"},
+})
+
+
+class GelbooruV01TagExtractor(GelbooruV01Extractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
+ test = (
+ (("https://the-collection.booru.org"
+ "/index.php?page=post&s=list&tags=parody"), {
+ "range": "1-25",
+ "count": 25,
+ }),
+ (("https://illusioncards.booru.org"
+ "/index.php?page=post&s=list&tags=koikatsu"), {
+ "range": "1-25",
+ "count": 25,
+ }),
+ ("https://allgirl.booru.org/index.php?page=post&s=list&tags=dress", {
+ "range": "1-25",
+ "count": 25,
+ }),
+ )
+
+ def __init__(self, match):
+ GelbooruV01Extractor.__init__(self, match)
+ self.tags = match.group(match.lastindex)
+
+ def metadata(self):
+ return {"search_tags": text.unquote(self.tags.replace("+", " "))}
+
+ def posts(self):
+ url = "{}/index.php?page=post&s=list&tags={}&pid=".format(
+ self.root, self.tags)
+ pid = self.page_start
+
+ while True:
+ page = self.request(url + str(pid)).text
+
+ cnt = 0
+ for post_id in text.extract_iter(
+ page, 'class="thumb"><a id="p', '"'):
+ yield self._parse_post(post_id)
+ cnt += 1
+
+ if cnt < self.per_page:
+ return
+ pid += self.per_page
+
+
+class GelbooruV01PostExtractor(GelbooruV01Extractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
+ test = (
+ (("https://the-collection.booru.org"
+ "/index.php?page=post&s=view&id=100520"), {
+ "url": "0329ac8588bb93cf242ca0edbe3e995b4ba554e8",
+ "content": "1e585874e7b874f7937df1060dd1517fef2f4dfb",
+ }),
+ (("https://illusioncards.booru.org"
+ "/index.php?page=post&s=view&id=82746"), {
+ "url": "3f9cd2fadf78869b90bc5422f27b48f1af0e0909",
+ "content": "159e60b92d05597bd1bb63510c2c3e4a4bada1dc",
+ }),
+ ("https://allgirl.booru.org/index.php?page=post&s=view&id=107213", {
+ "url": "b416800d2d2b072f80d3b37cfca9cb806fb25d51",
+ "content": "3e3c65e0854a988696e11adf0de52f8fa90a51c7",
+ "keyword": {
+ "created_at": "2021-02-13 16:27:39",
+ "date": "dt:2021-02-13 16:27:39",
+ "file_url": "https://img.booru.org/allgirl//images/107"
+ "/2aaa0438d58fc7baa75a53b4a9621bb89a9d3fdb.jpg",
+ "height": "1200",
+ "id": "107213",
+ "md5": "2aaa0438d58fc7baa75a53b4a9621bb89a9d3fdb",
+ "rating": "s",
+ "score": str,
+ "source": None,
+ "tags": "blush dress green_eyes green_hair hatsune_miku "
+ "long_hair twintails vocaloid",
+ "uploader": "Honochi31",
+ "width": "1600"
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ GelbooruV01Extractor.__init__(self, match)
+ self.post_id = match.group(match.lastindex)
+
+ def posts(self):
+ return (self._parse_post(self.post_id),)
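
The v0.1 sites offer no JSON or XML API, so `_parse_post` scrapes the HTML post page with `text.extract_all`, which walks the page once and pulls each field from between its `(begin, end)` delimiters in document order. The core primitive behaves roughly like this reduced version (the real helper lives in `gallery_dl.text`):

def extract(txt, begin, end, pos=0):
    """Return the substring between 'begin' and 'end', plus the new offset"""
    try:
        first = txt.index(begin, pos) + len(begin)
        last = txt.index(end, first)
        return txt[first:last], last + len(end)
    except ValueError:
        return None, pos

page = "Posted: 2021-02-13 16:27:39 <br>By: Honochi31 <br>"
created_at, pos = extract(page, "Posted: ", " <")
uploader, pos = extract(page, "By: ", " <", pos)
print(created_at, uploader)   # 2021-02-13 16:27:39 Honochi31
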
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
new file mode 100644
index 0000000..51fb478
--- /dev/null
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -0,0 +1,194 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Gelbooru v0.2 sites"""
+
+from . import booru
+from .. import text, util, exception
+
+from xml.etree import ElementTree
+import collections
+import re
+
+
+class GelbooruV02Extractor(booru.BooruExtractor):
+ basecategory = "gelbooru_v02"
+
+ def _api_request(self, params):
+ url = self.root + "/index.php?page=dapi&s=post&q=index"
+ return ElementTree.fromstring(self.request(url, params=params).text)
+
+ def _pagination(self, params):
+ params["pid"] = self.page_start
+ params["limit"] = self.per_page
+
+ while True:
+ root = self._api_request(params)
+ for post in root:
+ yield post.attrib
+
+ if len(root) < self.per_page:
+ return
+ params["pid"] += 1
+
+ @staticmethod
+ def _prepare(post):
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+
+ def _extended_tags(self, post, page=None):
+ if not page:
+ url = "{}/index.php?page=post&s=view&id={}".format(
+ self.root, post["id"])
+ page = self.request(url).text
+ html = text.extract(page, '<ul id="tag-', '</ul>')[0]
+ if html:
+ tags = collections.defaultdict(list)
+ pattern = re.compile(
+ r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S)
+ for tag_type, tag_name in pattern.findall(html):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
+
+
+BASE_PATTERN = GelbooruV02Extractor.update({
+ "realbooru": {"root": "https://realbooru.com"},
+ "rule34" : {"root": "https://rule34.xxx"},
+ "safebooru": {"root": "https://safebooru.org"},
+ "tbib" : {"root": "https://tbib.org"},
+})
+
+
+class GelbooruV02TagExtractor(GelbooruV02Extractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
+ test = (
+ ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg",
+ "count": 1,
+ }),
+ ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", {
+ "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
+ "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
+ }),
+ ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
+ "count": ">= 64",
+ }),
+ ("https://tbib.org/index.php?page=post&s=list&tags=yuyaiyaui", {
+ "count": ">= 120",
+ }),
+ )
+
+ def __init__(self, match):
+ GelbooruV02Extractor.__init__(self, match)
+ tags = match.group(match.lastindex)
+ self.tags = text.unquote(tags.replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.tags}
+
+ def posts(self):
+ return self._pagination({"tags" : self.tags})
+
+
+class GelbooruV02PoolExtractor(GelbooruV02Extractor):
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool}")
+ archive_fmt = "p_{pool}_{id}"
+ pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)"
+ test = (
+ ("https://rule34.xxx/index.php?page=pool&s=show&id=179", {
+ "count": 3,
+ }),
+ ("https://safebooru.org/index.php?page=pool&s=show&id=11", {
+ "count": 5,
+ }),
+ ("https://realbooru.com/index.php?page=pool&s=show&id=1", {
+ "count": 3,
+ }),
+ )
+
+ def __init__(self, match):
+ GelbooruV02Extractor.__init__(self, match)
+ self.pool_id = match.group(match.lastindex)
+ self.post_ids = ()
+
+ def skip(self, num):
+ self.page_start += num
+ return num
+
+ def metadata(self):
+ url = "{}/index.php?page=pool&s=show&id={}".format(
+ self.root, self.pool_id)
+ page = self.request(url).text
+
+ name, pos = text.extract(page, "<h4>Pool: ", "</h4>")
+ if not name:
+ raise exception.NotFoundError("pool")
+ self.post_ids = text.extract_iter(
+ page, 'class="thumb" id="p', '"', pos)
+
+ return {
+ "pool": text.parse_int(self.pool_id),
+ "pool_name": text.unescape(name),
+ }
+
+ def posts(self):
+ params = {}
+ for params["id"] in util.advance(self.post_ids, self.page_start):
+ for post in self._api_request(params):
+ yield post.attrib
+
+
+class GelbooruV02PostExtractor(GelbooruV02Extractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)"
+ test = (
+ ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", {
+ "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "danraku",
+ "tags_character": "kashima_(kantai_collection)",
+ "tags_copyright": "kantai_collection",
+ "tags_general": str,
+ "tags_metadata": str,
+ },
+ }),
+ ("https://safebooru.org/index.php?page=post&s=view&id=1169132", {
+ "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
+ "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "kawanakajima",
+ "tags_character": "heath_ledger ronald_mcdonald the_joker",
+ "tags_copyright": "dc_comics mcdonald's the_dark_knight",
+ "tags_general": str,
+ },
+ }),
+ ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
+ "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
+ "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
+ }),
+ ("https://tbib.org/index.php?page=post&s=view&id=9233957", {
+ "url": "5a6ebe07bfff8e6d27f7c30b5480f27abcb577d2",
+ "content": "1c3831b6fbaa4686e3c79035b5d98460b1c85c43",
+ }),
+ )
+
+ def __init__(self, match):
+ GelbooruV02Extractor.__init__(self, match)
+ self.post_id = match.group(match.lastindex)
+
+ def posts(self):
+ return self._pagination({"id": self.post_id})
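
All Gelbooru-v0.2 instances expose the same `dapi` endpoint that `_api_request` targets: post metadata arrives as XML attributes on `<post>` elements, and `_pagination` stops once a page returns fewer than `per_page` entries. One page fetched by hand, against the Safebooru root and tag used in the tests:

import requests
from xml.etree import ElementTree

params = {"page": "dapi", "s": "post", "q": "index",
          "tags": "bonocho", "pid": 0, "limit": 100}
root = ElementTree.fromstring(
    requests.get("https://safebooru.org/index.php", params=params).text)

for post in root:                 # each <post> element is one image
    attrs = post.attrib
    print(attrs["id"], attrs["file_url"])
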
diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py
index 462d3e9..aa79b67 100644
--- a/gallery_dl/extractor/hentaicafe.py
+++ b/gallery_dl/extractor/hentaicafe.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,23 +10,46 @@
from . import foolslide
from .. import text
-from .common import Extractor
+from .common import Extractor, Message
from ..cache import memcache
import re
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai\.cafe"
-class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
- """Extractor for manga-chapters from hentai.cafe"""
+
+class HentaicafeBase():
+ """Base class for hentaicafe extractors"""
category = "hentaicafe"
+ root = "https://hentai.cafe"
+
+ def _pagination(self, urlfmt):
+ data = {"_extractor": HentaicafeMangaExtractor}
+ pnum = text.parse_int(self.page_start, 1)
+
+ while True:
+ page = self.request(urlfmt(pnum)).text
+
+ for entry in text.extract_iter(
+ page, 'class="entry-featured', 'title="'):
+ url = text.extract(entry, 'href="', '"')[0]
+ if url:
+ yield Message.Queue, url, data
+
+ if '>&#x2192;<' not in page:
+ return
+ pnum += 1
+
+
+class HentaicafeChapterExtractor(HentaicafeBase,
+ foolslide.FoolslideChapterExtractor):
+ """Extractor for manga-chapters from hentai.cafe"""
directory_fmt = ("{category}", "{manga}")
filename_fmt = "c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}"
- pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe"
- r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
+ pattern = BASE_PATTERN + r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
"url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
"keyword": "6913608267d883c82b887303b9ced13821188329",
})
- root = "https://hentai.cafe"
def metadata(self, page):
info = text.unescape(text.extract(page, '<title>', '</title>')[0])
@@ -43,11 +66,10 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
return {"artist": (), "tags": ()}
-class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
+class HentaicafeMangaExtractor(HentaicafeBase,
+ foolslide.FoolslideMangaExtractor):
"""Extractor for manga from hentai.cafe"""
- category = "hentaicafe"
- pattern = (r"(?:https?://)?" + r"(?:www\.)?hentai\.cafe"
- r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?#]+)/?$")
+ pattern = BASE_PATTERN + r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?#]+)/?$"
test = (
# single chapter
("https://hentai.cafe/hazuki-yuuto-summer-blues/", {
@@ -71,13 +93,20 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
}),
)
- root = "https://hentai.cafe"
- reverse = False
- request = Extractor.request
- chapterclass = HentaicafeChapterExtractor
+
+ def items(self):
+ page = Extractor.request(self, self.gallery_url).text
+
+ chapters = self.chapters(page)
+ if self.config("chapter-reverse", False):
+ chapters.reverse()
+
+ for chapter, data in chapters:
+ data["_extractor"] = HentaicafeChapterExtractor
+ yield Message.Queue, chapter, data
def chapters(self, page):
- if "/manga/series/" in self.manga_url:
+ if "/manga/series/" in self.gallery_url:
chapters = foolslide.FoolslideMangaExtractor.chapters(self, page)
chapters.reverse()
return chapters
@@ -100,3 +129,45 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
for url in re.findall(
r'<a +class="x-btn[^"]*" +href="([^"]+)"', page)
]
+
+
+class HentaicafeSearchExtractor(HentaicafeBase, Extractor):
+ """Extractor for hentaicafe search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/(?:page/(\d+)/?)?\?s=([^&#]+)"
+ test = ("https://hentai.cafe/?s=benimura", {
+ "pattern": HentaicafeMangaExtractor.pattern,
+ "count": ">= 10",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_start, self.search = match.groups()
+
+ def items(self):
+ fmt = "{}/page/{}?s={}".format
+ return self._pagination(lambda pnum: fmt(self.root, pnum, self.search))
+
+
+class HentaicafeTagExtractor(HentaicafeBase, Extractor):
+ """Extractor for hentaicafe tag/artist searches"""
+ subcategory = "tag"
+ pattern = (BASE_PATTERN +
+ r"/hc\.fyi/(tag|artist|category)/([^/?#]+)(?:/page/(\d+))?")
+ test = (
+ ("https://hentai.cafe/hc.fyi/tag/vanilla"),
+ ("https://hentai.cafe/hc.fyi/category/book/page/5"),
+ ("https://hentai.cafe/hc.fyi/artist/benimura-karu", {
+ "pattern": HentaicafeMangaExtractor.pattern,
+ "count": ">= 10",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.type, self.search, self.page_start = match.groups()
+
+ def items(self):
+ fmt = "{}/hc.fyi/{}/{}/page/{}".format
+ return self._pagination(
+ lambda pnum: fmt(self.root, self.type, self.search, pnum))
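Note: both new extractors only hand `_pagination` a callable mapping a page number to a URL; the helper stops once the right-arrow entity (`&#x2192;`) linking to the next page disappears. A reduced, self-contained sketch of that loop (markers copied from the extractor, fetching kept deliberately simple):

    import re
    import urllib.request

    def paginate(urlfmt):
        """Yield featured-entry URLs page by page until no 'next' arrow remains."""
        pnum = 1
        while True:
            with urllib.request.urlopen(urlfmt(pnum)) as response:
                page = response.read().decode()
            yield from re.findall(
                r'class="entry-featured.*?href="([^"]+)"', page, re.S)
            if '>&#x2192;<' not in page:   # right arrow links to the next page
                return
            pnum += 1

    urls = list(paginate(lambda n: "https://hentai.cafe/page/%d?s=benimura" % n))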
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index 6d3ed74..6c1879c 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -63,14 +63,18 @@ class HentainexusGalleryExtractor(GalleryExtractor):
data = json.loads(self._decode(text.extract(
page, 'initReader("', '"')[0]))
+ headers = None
+ if not self.config("original", True):
+ headers = {"_http_headers": {"Accept": "image/webp,*/*"}}
+
pages = data.get("pages")
if pages:
- return [(page, None) for page in pages]
+ return [(page, headers) for page in pages]
base = data["b"] + data["r"]
gid = data["i"]
return [
- ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), None)
+ ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), headers)
for page in data["f"]
]
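Note: the new `original` option makes every page request advertise WebP through an injected `Accept` header; the `_http_headers` dict is picked up by the HTTP downloader and merged into the request. Equivalent configuration, written as a Python literal here for consistency with the surrounding code (in `gallery-dl.conf` this would be the corresponding JSON):

    # sketch: "original": false requests re-encoded WebP instead of originals
    CONFIG = {
        "extractor": {
            "hentainexus": {
                "original": False,
            },
        },
    }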
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index 16fe0a0..3d4bcfb 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -13,8 +13,6 @@ from .common import Message
from ..cache import cache
from .. import text, util, exception
import collections
-import random
-import time
import re
@@ -24,6 +22,7 @@ class IdolcomplexExtractor(SankakuExtractor):
cookienames = ("login", "pass_hash")
cookiedomain = "idol.sankakucomplex.com"
root = "https://" + cookiedomain
+ request_interval = 5.0
def __init__(self, match):
SankakuExtractor.__init__(self, match)
@@ -31,17 +30,12 @@ class IdolcomplexExtractor(SankakuExtractor):
self.start_page = 1
self.start_post = 0
self.extags = self.config("tags", False)
- self.wait_min = self.config("wait-min", 3.0)
- self.wait_max = self.config("wait-max", 6.0)
- if self.wait_max < self.wait_min:
- self.wait_max = self.wait_min
def items(self):
self.login()
data = self.metadata()
for post_id in util.advance(self.post_ids(), self.start_post):
- self.wait()
post = self._parse_post(post_id)
url = post["file_url"]
post.update(data)
@@ -130,10 +124,6 @@ class IdolcomplexExtractor(SankakuExtractor):
return data
- def wait(self):
- """Wait for a randomly chosen amount of seconds"""
- time.sleep(random.uniform(self.wait_min, self.wait_max))
-
class IdolcomplexTagExtractor(IdolcomplexExtractor):
"""Extractor for images from idol.sankakucomplex.com by search-tags"""
@@ -192,7 +182,6 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
params["page"] = self.start_page
while True:
- self.wait()
page = self.request(self.root, params=params, retries=10).text
pos = page.find("<div id=more-popular-posts-link>") + 1
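Note: the hand-rolled `wait()`/`wait-min`/`wait-max` machinery is retired in favor of the declarative `request_interval = 5.0`, which the shared request code applies between HTTP calls. A minimal sketch of such a fixed-interval throttle, assuming a monotonic clock:

    import time

    class Throttle:
        """Enforce a minimum delay between consecutive requests (sketch)."""

        def __init__(self, interval):
            self.interval = interval
            self._last = 0.0

        def wait(self):
            remaining = self._last + self.interval - time.monotonic()
            if remaining > 0.0:
                time.sleep(remaining)
            self._last = time.monotonic()

    throttle = Throttle(5.0)  # idolcomplex now waits 5 seconds per request
    throttle.wait()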
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index ae4e606..f6e8f2d 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -166,8 +166,6 @@ class ImgurAlbumExtractor(ImgurExtractor):
"privacy" : "private",
"score" : int,
"title" : "138",
- "topic" : "",
- "topic_id" : 0,
"upvote_count" : int,
"url" : "https://imgur.com/a/TcBmP",
"view_count" : int,
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 84018a9..81355ce 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -46,10 +46,10 @@ class InstagramExtractor(Extractor):
for post in self.posts():
- if post["__typename"] == "GraphReel":
- post = self._parse_reel(post["id"])
+ if "__typename" in post:
+ post = self._parse_post_graphql(post)
else:
- post = self._parse_post(post)
+ post = self._parse_post_reel(post)
post.update(data)
files = post.pop("_files")
@@ -85,21 +85,19 @@ class InstagramExtractor(Extractor):
return response
- def _api_request(self, endpoint, params):
- url = "https://i.instagram.com/api/" + endpoint
- headers = {
+ def _request_api(self, endpoint, **kwargs):
+ url = "https://i.instagram.com/api" + endpoint
+ kwargs["headers"] = {
"X-CSRFToken" : self.csrf_token,
"X-IG-App-ID" : "936619743392459",
"X-IG-WWW-Claim": self.www_claim,
}
- cookies = {
+ kwargs["cookies"] = {
"csrftoken": self.csrf_token,
}
- return self.request(
- url, params=params, headers=headers, cookies=cookies,
- ).json()
+ return self.request(url, **kwargs).json()
- def _graphql_request(self, query_hash, variables):
+ def _request_graphql(self, query_hash, variables):
url = self.root + "/graphql/query/"
params = {
"query_hash": query_hash,
@@ -162,7 +160,7 @@ class InstagramExtractor(Extractor):
for key in ("sessionid", "mid", "csrftoken")
}
- def _parse_post(self, post):
+ def _parse_post_graphql(self, post):
if post.get("is_video") and "video_url" not in post:
url = "{}/tv/{}/".format(self.root, post["shortcode"])
post = self._extract_post_page(url)
@@ -230,27 +228,31 @@ class InstagramExtractor(Extractor):
return data
- def _parse_reel(self, reel_id):
- params = {"reel_ids": reel_id}
- data = self._api_request("v1/feed/reels_media/", params)
- if not data["reels_media"]:
- raise exception.NotFoundError("reel")
- reel = data["reels_media"][0]
-
- reel_id = reel_id.rpartition(":")[2]
- owner = reel["user"]
+ def _parse_post_reel(self, post):
- data = {
- "expires" : text.parse_timestamp(reel.get("expiring_at")),
- "owner_id" : owner["pk"],
- "username" : owner.get("username"),
- "fullname" : owner.get("full_name"),
- "post_id" : reel_id,
- "post_shortcode": self._shortcode_from_id(reel_id),
- }
+ if "media" in post:
+ media = post["media"]
+ owner = media["user"]
+ post["items"] = (media,)
+ data = {
+ "post_id" : media["pk"],
+ "post_shortcode": self._shortcode_from_id(media["pk"]),
+ }
+ else:
+ reel_id = str(post["id"]).rpartition(":")[2]
+ owner = post["user"]
+ data = {
+ "expires" : text.parse_timestamp(post.get("expiring_at")),
+ "post_id" : reel_id,
+ "post_shortcode": self._shortcode_from_id(reel_id),
+ }
+ data["owner_id"] = owner["pk"]
+ data["username"] = owner.get("username")
+ data["fullname"] = owner.get("full_name")
data["_files"] = files = []
- for num, item in enumerate(reel["items"], 1):
+
+ for num, item in enumerate(post["items"], 1):
image = item["image_versions2"]["candidates"][0]
@@ -337,7 +339,7 @@ class InstagramExtractor(Extractor):
}
return user[key]
- def _pagination(self, query_hash, variables, data):
+ def _pagination_graphql(self, query_hash, variables, data):
while True:
for edge in data["edges"]:
yield edge["node"]
@@ -352,9 +354,19 @@ class InstagramExtractor(Extractor):
variables["after"] = self._cursor = info["end_cursor"]
self.log.debug("Cursor: %s", self._cursor)
- data = next(iter(self._graphql_request(
+ data = next(iter(self._request_graphql(
query_hash, variables)["user"].values()))
+ def _pagination_api(self, endpoint, params):
+ while True:
+ data = self._request_api(endpoint, method="POST", data=params)
+ yield from data["items"]
+
+ info = data["paging_info"]
+ if not info["more_available"]:
+ return
+ params["max_id"] = info["max_id"]
+
class InstagramUserExtractor(InstagramExtractor):
"""Extractor for an Instagram user profile"""
@@ -366,13 +378,6 @@ class InstagramUserExtractor(InstagramExtractor):
)
def items(self):
- if self.config("highlights"):
- self.log.warning("'highlights' is deprecated, "
- "use '\"include\": \"…,highlights\"' instead")
- default = ("highlights", "posts")
- else:
- default = ("posts",)
-
base = "{}/{}/".format(self.root, self.item)
stories = "{}/stories/{}/".format(self.root, self.item)
return self._dispatch_extractors((
@@ -380,7 +385,7 @@ class InstagramUserExtractor(InstagramExtractor):
(InstagramHighlightsExtractor, base + "highlights/"),
(InstagramPostsExtractor , base + "posts/"),
(InstagramChannelExtractor , base + "channel/"),
- ), default)
+ ), ("posts",))
class InstagramPostsExtractor(InstagramExtractor):
@@ -399,7 +404,7 @@ class InstagramPostsExtractor(InstagramExtractor):
query_hash = "003056d32c2554def87228bc3fd9668a"
variables = {"id": user["id"], "first": 50}
edge = self._get_edge_data(user, "edge_owner_to_timeline_media")
- return self._pagination(query_hash, variables, edge)
+ return self._pagination_graphql(query_hash, variables, edge)
class InstagramChannelExtractor(InstagramExtractor):
@@ -418,7 +423,7 @@ class InstagramChannelExtractor(InstagramExtractor):
query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
variables = {"id": user["id"], "first": 50}
edge = self._get_edge_data(user, "edge_felix_video_timeline")
- return self._pagination(query_hash, variables, edge)
+ return self._pagination_graphql(query_hash, variables, edge)
class InstagramSavedExtractor(InstagramExtractor):
@@ -434,7 +439,7 @@ class InstagramSavedExtractor(InstagramExtractor):
query_hash = "2ce1d673055b99250e93b6f88f878fde"
variables = {"id": user["id"], "first": 50}
edge = self._get_edge_data(user, "edge_saved_media")
- return self._pagination(query_hash, variables, edge)
+ return self._pagination_graphql(query_hash, variables, edge)
class InstagramTagExtractor(InstagramExtractor):
@@ -458,9 +463,9 @@ class InstagramTagExtractor(InstagramExtractor):
query_hash = "9b498c08113f1e09617a1703c22b2f32"
variables = {"tag_name": hashtag["name"], "first": 50}
edge = self._get_edge_data(hashtag, "edge_hashtag_to_media")
- return self._pagination(query_hash, variables, edge)
+ return self._pagination_graphql(query_hash, variables, edge)
- def _pagination(self, query_hash, variables, data):
+ def _pagination_graphql(self, query_hash, variables, data):
while True:
for edge in data["edges"]:
yield edge["node"]
@@ -471,7 +476,7 @@ class InstagramTagExtractor(InstagramExtractor):
variables["after"] = self._cursor = info["end_cursor"]
self.log.debug("Cursor: %s", self._cursor)
- data = self._graphql_request(
+ data = self._request_graphql(
query_hash, variables)["hashtag"]["edge_hashtag_to_media"]
@@ -582,7 +587,7 @@ class InstagramPostExtractor(InstagramExtractor):
)
def posts(self):
- query_hash = "a9441f24ac73000fa17fe6e6da11d59d"
+ query_hash = "2c4c2e343a8f64c625ba02b2aa12c7f8"
variables = {
"shortcode" : self.item,
"child_comment_count" : 3,
@@ -590,7 +595,7 @@ class InstagramPostExtractor(InstagramExtractor):
"parent_comment_count" : 24,
"has_threaded_comments": True
}
- data = self._graphql_request(query_hash, variables)
+ data = self._request_graphql(query_hash, variables)
media = data.get("shortcode_media")
if not media:
raise exception.NotFoundError("post")
@@ -626,7 +631,9 @@ class InstagramStoriesExtractor(InstagramExtractor):
return ()
reel_id = user["id"]
- return ({"__typename": "GraphReel", "id": reel_id},)
+ endpoint = "/v1/feed/reels_media/"
+ params = {"reel_ids": reel_id}
+ return self._request_api(endpoint, params=params)["reels"].values()
class InstagramHighlightsExtractor(InstagramExtractor):
@@ -649,12 +656,35 @@ class InstagramHighlightsExtractor(InstagramExtractor):
"include_highlight_reels": True,
"include_live_status": True,
}
- data = self._graphql_request(query_hash, variables)
+ data = self._request_graphql(query_hash, variables)
+ edges = data["user"]["edge_highlight_reels"]["edges"]
+ if not edges:
+ return ()
+
+ reel_ids = ["highlight:" + edge["node"]["id"] for edge in edges]
+ endpoint = "/v1/feed/reels_media/?reel_ids=" + \
+ "&reel_ids=".join(text.quote(rid) for rid in reel_ids)
+ reels = self._request_api(endpoint)["reels"]
+ return [reels[rid] for rid in reel_ids]
+
+
+class InstagramReelsExtractor(InstagramExtractor):
+ """Extractor for an Instagram user's reels"""
+ subcategory = "reels"
+ pattern = USER_PATTERN + r"/reels"
+ test = ("https://www.instagram.com/instagram/reels/", {
+ "range": "40-60",
+ "count": ">= 20",
+ })
- return [
- {
- "__typename": "GraphReel",
- "id" : "highlight:" + edge["node"]["id"],
- }
- for edge in data["user"]["edge_highlight_reels"]["edges"]
- ]
+ def posts(self):
+ url = "{}/{}/".format(self.root, self.item)
+ user = self._extract_profile_page(url)
+
+ endpoint = "/v1/clips/user/"
+ data = {
+ "target_user_id": user["id"],
+ "page_size" : "50",
+ }
+
+ return self._pagination_api(endpoint, data)
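Note: `_pagination_api` introduces cursor-based paging for the private API: POST the current parameters, yield `items`, and carry `paging_info["max_id"]` forward while `more_available` is set. A self-contained sketch of the loop against a stubbed endpoint:

    def paginate_api(call, params):
        """max_id cursor pagination, mirroring _pagination_api (sketch)."""
        while True:
            data = call(params)          # one POST to the API endpoint
            yield from data["items"]
            info = data["paging_info"]
            if not info["more_available"]:
                return
            params["max_id"] = info["max_id"]   # resume token for the next page

    responses = iter((
        {"items": [1, 2], "paging_info": {"more_available": True, "max_id": "A"}},
        {"items": [3], "paging_info": {"more_available": False, "max_id": None}},
    ))
    assert list(paginate_api(lambda p: next(responses), {})) == [1, 2, 3]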
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index b54afb7..8a4e413 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -69,7 +69,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
@staticmethod
def images(page):
readerarea = text.extract(
- page, '<div id="readerarea">', '<div class="navig">')[0]
+ page, '<div id="readerarea"', '<div class="navig')[0]
return [
(text.unescape(url), None)
for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index d59e5bb..6a88d58 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -17,6 +17,7 @@ class MangadexExtractor(Extractor):
"""Base class for mangadex extractors"""
category = "mangadex"
root = "https://mangadex.org"
+ api_root = "https://api.mangadex.org"
# mangadex-to-iso639-1 codes
iso639_map = {
@@ -28,7 +29,10 @@ class MangadexExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.api_root = self.config("api-server") or "https://mangadex.org/api"
+
+ server = self.config("api-server")
+ if server is not None:
+ self.api_root = server.rstrip("/")
def chapter_data(self, chapter_id):
"""Request API results for 'chapter_id'"""
@@ -177,7 +181,7 @@ class MangadexMangaExtractor(MangadexExtractor):
def chapters(self):
"""Return a sorted list of chapter-metadata dicts"""
- manga = self.manga_data(self.manga_id)
+ manga = self.manga_data(int(self.manga_id))
results = []
for cdata in self.manga_chapters(self.manga_id):
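Note: the API root now defaults to the public `https://api.mangadex.org`, and an `api-server` override is normalized with `rstrip("/")` so later concatenation cannot produce double slashes. The effect, as a tiny sketch:

    def api_root(configured, default="https://api.mangadex.org"):
        """Resolve the MangaDex API root the way the extractor now does (sketch)."""
        return configured.rstrip("/") if configured is not None else default

    assert api_root(None) == "https://api.mangadex.org"
    assert api_root("https://example.org/api/") == "https://example.org/api"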
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 0e063d5..daa3d65 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,35 +8,25 @@
"""Extractors for mastodon instances"""
-from .common import Extractor, Message
-from .. import text, util, config, exception
-import re
+from .common import BaseExtractor, Message
+from .. import text, exception
+from ..cache import cache
-class MastodonExtractor(Extractor):
+class MastodonExtractor(BaseExtractor):
"""Base class for mastodon extractors"""
basecategory = "mastodon"
directory_fmt = ("mastodon", "{instance}", "{account[username]}")
filename_fmt = "{category}_{id}_{media[id]}.{extension}"
archive_fmt = "{media[id]}"
cookiedomain = None
- instance = None
- root = None
def __init__(self, match):
- Extractor.__init__(self, match)
- self.api = MastodonAPI(self)
-
- def config(self, key, default=None):
- return config.interpolate_common(
- ("extractor",), (
- (self.category, self.subcategory),
- (self.basecategory, self.instance, self.subcategory),
- ), key, default,
- )
+ BaseExtractor.__init__(self, match)
+ self.instance = self.root.partition("://")[2]
+ self.item = match.group(match.lastindex)
def items(self):
- yield Message.Version, 1
for status in self.statuses():
attachments = status["media_attachments"]
if attachments:
@@ -60,34 +50,81 @@ class MastodonExtractor(Extractor):
status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
+INSTANCES = {
+ "mastodon.social": {
+ "root" : "https://mastodon.social",
+ "access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
+ "client-id" : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
+ "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
+ },
+ "pawoo": {
+ "root" : "https://pawoo.net",
+ "access-token" : "c12c9d275050bce0dc92169a28db09d7"
+ "0d62d0a75a8525953098c167eacd3668",
+ "client-id" : "978a25f843ec01e53d09be2c290cd75c"
+ "782bc3b7fdbd7ea4164b9f3c3780c8ff",
+ "client-secret": "9208e3d4a7997032cf4f1b0e12e5df38"
+ "8428ef1fadb446dcfeb4f5ed6872d97b",
+ },
+ "baraag": {
+ "root" : "https://baraag.net",
+ "access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
+ "client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
+ "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
+ }
+}
+
+BASE_PATTERN = MastodonExtractor.update(INSTANCES)
+
+
class MastodonUserExtractor(MastodonExtractor):
"""Extractor for all images of an account/user"""
subcategory = "user"
-
- def __init__(self, match):
- MastodonExtractor.__init__(self, match)
- self.account_name = match.group(1)
+ pattern = BASE_PATTERN + r"/@([^/?#]+)(?:/media)?/?$"
+ test = (
+ ("https://mastodon.social/@jk", {
+ "pattern": r"https://files.mastodon.social/media_attachments"
+ r"/files/(\d+/){3,}original/\w+",
+ "range": "1-60",
+ "count": 60,
+ }),
+ ("https://pawoo.net/@yoru_nine/", {
+ "range": "1-60",
+ "count": 60,
+ }),
+ ("https://baraag.net/@pumpkinnsfw"),
+ )
def statuses(self):
- handle = "@{}@{}".format(self.account_name, self.instance)
- for account in self.api.account_search(handle, 1):
- if account["username"] == self.account_name:
+ api = MastodonAPI(self)
+ username = self.item
+ handle = "@{}@{}".format(username, self.instance)
+ for account in api.account_search(handle, 1):
+ if account["username"] == username:
break
else:
raise exception.NotFoundError("account")
- return self.api.account_statuses(account["id"])
+ return api.account_statuses(account["id"])
class MastodonStatusExtractor(MastodonExtractor):
"""Extractor for images from a status"""
subcategory = "status"
-
- def __init__(self, match):
- MastodonExtractor.__init__(self, match)
- self.status_id = match.group(1)
+ pattern = BASE_PATTERN + r"/@[^/?#]+/(\d+)"
+ test = (
+ ("https://mastodon.social/@jk/103794036899778366", {
+ "count": 4,
+ }),
+ ("https://pawoo.net/@yoru_nine/105038878897832922", {
+ "content": "b52e807f8ab548d6f896b09218ece01eba83987a",
+ }),
+ ("https://baraag.net/@pumpkinnsfw/104364170556898443", {
+ "content": "67748c1b828c58ad60d0fe5729b59fb29c872244",
+ }),
+ )
def statuses(self):
- return (self.api.status(self.status_id),)
+ return (MastodonAPI(self).status(self.item),)
class MastodonAPI():
@@ -97,35 +134,46 @@ class MastodonAPI():
https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md
"""
- def __init__(self, extractor, access_token=None):
+ def __init__(self, extractor):
self.root = extractor.root
self.extractor = extractor
+ access_token = extractor.config("access-token")
+ if access_token is None or access_token == "cache":
+ access_token = _access_token_cache(extractor.instance)
if not access_token:
- access_token = extractor.config(
- "access-token", extractor.access_token)
- self.headers = {"Authorization": "Bearer {}".format(access_token)}
+ try:
+ access_token = INSTANCES[extractor.category]["access-token"]
+ except (KeyError, TypeError):
+ raise exception.StopExtraction(
+ "Missing access token.\n"
+ "Run 'gallery-dl oauth:mastodon:%s' to obtain one.",
+ extractor.instance)
+
+ self.headers = {"Authorization": "Bearer " + access_token}
def account_search(self, query, limit=40):
"""Search for content"""
+ endpoint = "/v1/accounts/search"
params = {"q": query, "limit": limit}
- return self._call("accounts/search", params).json()
+ return self._call(endpoint, params).json()
def account_statuses(self, account_id):
"""Get an account's statuses"""
- endpoint = "accounts/{}/statuses".format(account_id)
+ endpoint = "/v1/accounts/{}/statuses".format(account_id)
params = {"only_media": "1"}
return self._pagination(endpoint, params)
def status(self, status_id):
- """Fetch a Status"""
- return self._call("statuses/" + status_id).json()
+ """Fetch a status"""
+ endpoint = "/v1/statuses/" + status_id
+ return self._call(endpoint).json()
def _call(self, endpoint, params=None):
if endpoint.startswith("http"):
url = endpoint
else:
- url = "{}/api/v1/{}".format(self.root, endpoint)
+ url = self.root + "/api" + endpoint
while True:
response = self.extractor.request(
@@ -145,7 +193,7 @@ class MastodonAPI():
raise exception.StopExtraction(response.json().get("error"))
def _pagination(self, endpoint, params):
- url = "{}/api/v1/{}".format(self.root, endpoint)
+ url = endpoint
while url:
response = self._call(url, params)
yield from response.json()
@@ -156,86 +204,6 @@ class MastodonAPI():
url = url["url"]
-def generate_extractors():
- """Dynamically generate Extractor classes for Mastodon instances"""
-
- symtable = globals()
- extractors = config.get(("extractor",), "mastodon")
- if extractors:
- util.combine_dict(EXTRACTORS, extractors)
- config.set(("extractor",), "mastodon", EXTRACTORS)
-
- for instance, info in EXTRACTORS.items():
-
- if not isinstance(info, dict):
- continue
-
- category = info.get("category") or instance.replace(".", "")
- root = info.get("root") or "https://" + instance
- name = (info.get("name") or category).capitalize()
- token = info.get("access-token")
- pattern = info.get("pattern") or re.escape(instance)
-
- class Extr(MastodonUserExtractor):
- pass
-
- Extr.__name__ = Extr.__qualname__ = name + "UserExtractor"
- Extr.__doc__ = "Extractor for all images of a user on " + instance
- Extr.category = category
- Extr.instance = instance
- Extr.pattern = (r"(?:https?://)?" + pattern +
- r"/@([^/?#]+)(?:/media)?/?$")
- Extr.test = info.get("test-user")
- Extr.root = root
- Extr.access_token = token
- symtable[Extr.__name__] = Extr
-
- class Extr(MastodonStatusExtractor):
- pass
-
- Extr.__name__ = Extr.__qualname__ = name + "StatusExtractor"
- Extr.__doc__ = "Extractor for images from a status on " + instance
- Extr.category = category
- Extr.instance = instance
- Extr.pattern = r"(?:https?://)?" + pattern + r"/@[^/?#]+/(\d+)"
- Extr.test = info.get("test-status")
- Extr.root = root
- Extr.access_token = token
- symtable[Extr.__name__] = Extr
-
-
-EXTRACTORS = {
- "mastodon.social": {
- "category" : "mastodon.social",
- "access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
- "client-id" : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
- "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
- "test-user" : ("https://mastodon.social/@jk", {
- "pattern": r"https://files.mastodon.social/media_attachments"
- r"/files/(\d+/){3,}original/\w+",
- "range": "1-60",
- "count": 60,
- }),
- "test-status" : ("https://mastodon.social/@jk/103794036899778366", {
- "count": 4,
- }),
- },
- "pawoo.net": {
- "category" : "pawoo",
- "access-token" : "c12c9d275050bce0dc92169a28db09d7"
- "0d62d0a75a8525953098c167eacd3668",
- "client-id" : "978a25f843ec01e53d09be2c290cd75c"
- "782bc3b7fdbd7ea4164b9f3c3780c8ff",
- "client-secret": "9208e3d4a7997032cf4f1b0e12e5df38"
- "8428ef1fadb446dcfeb4f5ed6872d97b",
- },
- "baraag.net": {
- "category" : "baraag",
- "access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
- "client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
- "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
- },
-}
-
-
-generate_extractors()
+@cache(maxage=100*365*24*3600, keyarg=0)
+def _access_token_cache(instance):
+ return None
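Note: access-token resolution for Mastodon instances now follows a fixed order: explicit `access-token` from the configuration, then the cached token written by `gallery-dl oauth:mastodon:<instance>`, then the built-in defaults in `INSTANCES`, and a hard stop otherwise. A sketch of that cascade, with hypothetical stand-ins for the config and cache lookups:

    def resolve_token(config_token, cached_token, instance_info):
        """Access-token lookup order used by MastodonAPI (sketch)."""
        token = config_token
        if token is None or token == "cache":
            token = cached_token   # written by 'gallery-dl oauth:mastodon:...'
        if not token:
            token = (instance_info or {}).get("access-token")
        if not token:
            raise RuntimeError("Missing access token")
        return token

    assert resolve_token("abc", None, None) == "abc"
    assert resolve_token("cache", "xyz", None) == "xyz"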
diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py
index 7bf0084..d5c2554 100644
--- a/gallery_dl/extractor/message.py
+++ b/gallery_dl/extractor/message.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -52,4 +52,4 @@ class Message():
# Cookies = 5
Queue = 6
# Urllist = 7
- Metadata = 8
+ # Metadata = 8
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 0ac55cd..df77110 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020 Mike Fährmann
+# Copyright 2020-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,6 @@
"""Extractors for Moebooru based sites"""
-from .common import generate_extractors
from .booru import BooruExtractor
from .. import text
@@ -52,15 +51,93 @@ class MoebooruExtractor(BooruExtractor):
params["page"] += 1
+BASE_PATTERN = MoebooruExtractor.update({
+ "yandere": {
+ "root": "https://yande.re",
+ },
+ "konachan": {
+ "root": "https://konachan.com",
+ "pattern": r"konachan\.(?:com|net)",
+ },
+ "hypnohub": {
+ "root": "https://hypnohub.net",
+ },
+ "sakugabooru": {
+ "root": "https://www.sakugabooru.com",
+ "pattern": r"(?:www\.)?sakugabooru\.com",
+ },
+ "lolibooru": {
+ "root": "https://lolibooru.moe",
+ },
+})
+
+
+class MoebooruPostExtractor(MoebooruExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/post/show/(\d+)"
+ test = (
+ ("https://yande.re/post/show/51824", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "sasaki_tamaru",
+ "tags_circle": "softhouse_chara",
+ "tags_copyright": "ouzoku",
+ "tags_general": str,
+ },
+ }),
+ ("https://konachan.com/post/show/205189", {
+ "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
+ "options": (("tags", True),),
+ "keyword": {
+ "tags_artist": "patata",
+ "tags_character": "clownpiece",
+ "tags_copyright": "touhou",
+ "tags_general": str,
+ },
+ }),
+ ("https://konachan.net/post/show/205189"),
+ ("https://hypnohub.net/post/show/73964", {
+ "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
+ }),
+ ("https://www.sakugabooru.com/post/show/125570"),
+ ("https://lolibooru.moe/post/show/287835"),
+ )
+
+ def __init__(self, match):
+ MoebooruExtractor.__init__(self, match)
+ self.post_id = match.group(match.lastindex)
+
+ def posts(self):
+ params = {"tags": "id:" + self.post_id}
+ return self.request(self.root + "/post.json", params=params).json()
+
+
class MoebooruTagExtractor(MoebooruExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern_fmt = r"/post\?(?:[^&#]*&)*tags=([^&#]+)"
+ pattern = BASE_PATTERN + r"/post\?(?:[^&#]*&)*tags=([^&#]+)"
+ test = (
+ ("https://yande.re/post?tags=ouzoku+armor", {
+ "content": "59201811c728096b2d95ce6896fd0009235fe683",
+ }),
+ ("https://konachan.com/post?tags=patata", {
+ "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
+ }),
+ ("https://konachan.net/post?tags=patata"),
+ ("https://hypnohub.net/post?tags=gonoike_biwa", {
+ "url": "072330c34a1e773d0cafd00e64b8060d34b078b6",
+ }),
+ ("https://www.sakugabooru.com/post?tags=nichijou"),
+ ("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29"),
+ )
def __init__(self, match):
MoebooruExtractor.__init__(self, match)
- self.tags = text.unquote(match.group(1).replace("+", " "))
+ tags = match.group(match.lastindex)
+ self.tags = text.unquote(tags.replace("+", " "))
def metadata(self):
return {"search_tags": self.tags}
@@ -74,11 +151,25 @@ class MoebooruPoolExtractor(MoebooruExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
- pattern_fmt = r"/pool/show/(\d+)"
+ pattern = BASE_PATTERN + r"/pool/show/(\d+)"
+ test = (
+ ("https://yande.re/pool/show/318", {
+ "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68",
+ }),
+ ("https://konachan.com/pool/show/95", {
+ "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
+ }),
+ ("https://konachan.net/pool/show/95"),
+ ("https://hypnohub.net/pool/show/61", {
+ "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
+ }),
+ ("https://www.sakugabooru.com/pool/show/54"),
+ ("https://lolibooru.moe/pool/show/239"),
+ )
def __init__(self, match):
MoebooruExtractor.__init__(self, match)
- self.pool_id = match.group(1)
+ self.pool_id = match.group(match.lastindex)
def metadata(self):
return {"pool": text.parse_int(self.pool_id)}
@@ -88,29 +179,34 @@ class MoebooruPoolExtractor(MoebooruExtractor):
return self._pagination(self.root + "/post.json", params)
-class MoebooruPostExtractor(MoebooruExtractor):
- subcategory = "post"
- archive_fmt = "{id}"
- pattern_fmt = r"/post/show/(\d+)"
-
- def __init__(self, match):
- MoebooruExtractor.__init__(self, match)
- self.post_id = match.group(1)
-
- def posts(self):
- params = {"tags": "id:" + self.post_id}
- return self.request(self.root + "/post.json", params=params).json()
-
-
class MoebooruPopularExtractor(MoebooruExtractor):
subcategory = "popular"
directory_fmt = ("{category}", "popular", "{scale}", "{date}")
archive_fmt = "P_{scale[0]}_{date}_{id}"
- pattern_fmt = r"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?"
+ pattern = BASE_PATTERN + \
+ r"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?"
+ test = (
+ ("https://yande.re/post/popular_by_month?month=6&year=2014", {
+ "count": 40,
+ }),
+ ("https://yande.re/post/popular_recent"),
+ ("https://konachan.com/post/popular_by_month?month=11&year=2010", {
+ "count": 20,
+ }),
+ ("https://konachan.com/post/popular_recent"),
+ ("https://konachan.net/post/popular_recent"),
+ ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
+ "count": 20,
+ }),
+ ("https://hypnohub.net/post/popular_recent"),
+ ("https://www.sakugabooru.com/post/popular_recent"),
+ ("https://lolibooru.moe/post/popular_recent"),
+ )
def __init__(self, match):
MoebooruExtractor.__init__(self, match)
- self.scale, self.query = match.groups()
+ self.scale = match.group(match.lastindex-1)
+ self.query = match.group(match.lastindex)
def metadata(self):
self.params = params = text.parse_query(self.query)
@@ -138,108 +234,3 @@ class MoebooruPopularExtractor(MoebooruExtractor):
def posts(self):
url = "{}/post/popular_{}.json".format(self.root, self.scale)
return self.request(url, params=self.params).json()
-
-
-EXTRACTORS = {
- "yandere": {
- "root": "https://yande.re",
- "test-tag": ("https://yande.re/post?tags=ouzoku+armor", {
- "content": "59201811c728096b2d95ce6896fd0009235fe683",
- }),
- "test-pool": ("https://yande.re/pool/show/318", {
- "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68",
- }),
- "test-post": ("https://yande.re/post/show/51824", {
- "content": "59201811c728096b2d95ce6896fd0009235fe683",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "sasaki_tamaru",
- "tags_circle": "softhouse_chara",
- "tags_copyright": "ouzoku",
- "tags_general": str,
- },
- }),
- "test-popular": (
- ("https://yande.re/post/popular_by_month?month=6&year=2014", {
- "count": 40,
- }),
- ("https://yande.re/post/popular_recent"),
- ),
- },
- "konachan": {
- "root": "https://konachan.com",
- "pattern": r"konachan\.(?:com|net)",
- "test-tag": (
- ("https://konachan.com/post?tags=patata", {
- "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d",
- }),
- ("https://konachan.net/post?tags=patata"),
- ),
- "test-pool": (
- ("https://konachan.com/pool/show/95", {
- "content": "cf0546e38a93c2c510a478f8744e60687b7a8426",
- }),
- ("https://konachan.net/pool/show/95"),
- ),
- "test-post": (
- ("https://konachan.com/post/show/205189", {
- "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "patata",
- "tags_character": "clownpiece",
- "tags_copyright": "touhou",
- "tags_general": str,
- },
- }),
- ("https://konachan.net/post/show/205189"),
- ),
- "test-popular": (
- ("https://konachan.com/post/popular_by_month?month=11&year=2010", {
- "count": 20,
- }),
- ("https://konachan.com/post/popular_recent"),
- ("https://konachan.net/post/popular_recent"),
- ),
- },
- "hypnohub": {
- "root": "https://hypnohub.net",
- "test-tag": ("https://hypnohub.net/post?tags=gonoike_biwa", {
- "url": "072330c34a1e773d0cafd00e64b8060d34b078b6",
- }),
- "test-pool": ("https://hypnohub.net/pool/show/61", {
- "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
- }),
- "test-post": ("https://hypnohub.net/post/show/73964", {
- "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
- }),
- "test-popular": (
- ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
- "count": 20,
- }),
- ("https://hypnohub.net/post/popular_recent"),
- ),
- },
- "lolibooru": {
- "root": "https://lolibooru.moe",
- "test-tag" : ("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29",),
- "test-pool" : ("https://lolibooru.moe/pool/show/239",),
- "test-post" : ("https://lolibooru.moe/post/show/287835",),
- "test-popular": ("https://lolibooru.moe/post/popular_recent",),
- },
- "sakugabooru": {
- "root": "https://www.sakugabooru.com",
- "pattern": r"(?:www\.)?sakugabooru\.com",
- "test-tag" : ("https://www.sakugabooru.com/post?tags=nichijou",),
- "test-pool" : ("https://www.sakugabooru.com/pool/show/54",),
- "test-post" : ("https://www.sakugabooru.com/post/show/125570",),
- "test-popular": ("https://www.sakugabooru.com/post/popular_recent",),
- },
-}
-
-generate_extractors(EXTRACTORS, globals(), (
- MoebooruTagExtractor,
- MoebooruPoolExtractor,
- MoebooruPostExtractor,
- MoebooruPopularExtractor,
-))
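Note: with `generate_extractors()` gone, `MoebooruExtractor.update(INSTANCES)` folds every instance's domain pattern into one alternation that all subclass patterns build on; because the instance part can consume a variable number of groups, extractors read their own capture via `match.group(match.lastindex)`. A reduced sketch of that mechanism (the real `update` also registers category information):

    import re

    def build_base_pattern(instances):
        """Combine per-instance domain patterns into one alternation (sketch)."""
        domains = "|".join(
            info.get("pattern") or re.escape(info["root"].partition("://")[2])
            for info in instances.values()
        )
        return r"(?:https?://)?(?:" + domains + r")"

    BASE = build_base_pattern({"yandere": {"root": "https://yande.re"}})
    m = re.match(BASE + r"/post/show/(\d+)", "https://yande.re/post/show/51824")
    assert m.group(m.lastindex) == "51824"   # last group holds the post id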
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
new file mode 100644
index 0000000..db15572
--- /dev/null
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Seonghyeon Cho
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://comic.naver.com/"""
+
+from .common import Extractor, Message
+from .. import exception, text
+
+BASE_PATTERN = r"(?:https?://)?comic\.naver\.com/webtoon"
+
+
+class NaverwebtoonExtractor(Extractor):
+ category = "naverwebtoon"
+ root = "https://comic.naver.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1)
+
+
+class NaverwebtoonEpisodeExtractor(NaverwebtoonExtractor):
+ subcategory = "episode"
+ directory_fmt = ("{category}", "{comic}")
+ filename_fmt = "{episode:>03}-{num:>02}.{extension}"
+ archive_fmt = "{title_id}_{episode}_{num}"
+ pattern = (BASE_PATTERN + r"/detail\.nhn\?([^#]+)")
+ test = (
+ (("https://comic.naver.com/webtoon/detail.nhn?"
+ "titleId=26458&no=1&weekday=tue"), {
+ "url": "47a956ba8c7a837213d5985f50c569fcff986f75",
+ "content": "3806b6e8befbb1920048de9888dfce6220f69a60",
+ "count": 14
+ }),
+ )
+
+ def __init__(self, match):
+ NaverwebtoonExtractor.__init__(self, match)
+ query = text.parse_query(self.query)
+ self.title_id = query.get("titleId")
+ if not self.title_id:
+ raise exception.NotFoundError("titleId")
+ self.episode = query.get("no")
+ if not self.episode:
+ raise exception.NotFoundError("no")
+
+ def items(self):
+ url = "{}/webtoon/detail.nhn?{}".format(self.root, self.query)
+ page = self.request(url).text
+ data = self.get_job_metadata(page)
+
+ yield Message.Directory, data
+ for data["num"], url in enumerate(self.get_image_urls(page), 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ title, pos = text.extract(page, 'property="og:title" content="', '"')
+ comic, pos = text.extract(page, '<h2>', '<span', pos)
+ authors, pos = text.extract(page, 'class="wrt_nm">', '</span>', pos)
+ authors = authors.strip().split("/")
+ descr, pos = text.extract(page, '<p class="txt">', '</p>', pos)
+ genre, pos = text.extract(page, '<span class="genre">', '</span>', pos)
+ date, pos = text.extract(page, '<dd class="date">', '</dd>', pos)
+
+ return {
+ "title": title,
+ "comic": comic,
+ "authors": authors,
+ "description": descr,
+ "genre": genre,
+ "title_id": self.title_id,
+ "episode": self.episode,
+ "date": date,
+ }
+
+ @staticmethod
+ def get_image_urls(page):
+ view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0]
+ return text.extract_iter(view_area, '<img src="', '"')
+
+
+class NaverwebtoonComicExtractor(NaverwebtoonExtractor):
+ subcategory = "comic"
+ categorytransfer = True
+ pattern = (BASE_PATTERN + r"/list\.nhn\?([^#]+)")
+ test = (
+ ("https://comic.naver.com/webtoon/list.nhn?titleId=22073", {
+ "pattern": NaverwebtoonEpisodeExtractor.pattern,
+ "count": 32,
+ }),
+ )
+
+ def __init__(self, match):
+ NaverwebtoonExtractor.__init__(self, match)
+ query = text.parse_query(self.query)
+ self.title_id = query.get("titleId")
+ if not self.title_id:
+ raise exception.NotFoundError("titleId")
+ self.page_no = text.parse_int(query.get("page", 1))
+
+ def items(self):
+ url = self.root + "/webtoon/list.nhn"
+ params = {"titleId": self.title_id, "page": self.page_no}
+ data = {"_extractor": NaverwebtoonEpisodeExtractor}
+
+ while True:
+ page = self.request(url, params=params).text
+ data["page"] = self.page_no
+
+ for episode_url in self.get_episode_urls(page):
+ yield Message.Queue, episode_url, data
+
+ if 'class="next"' not in page:
+ return
+ params["page"] += 1
+
+ def get_episode_urls(self, page):
+ """Extract and return all episode urls in page"""
+ return [
+ self.root + "/webtoon/detail.nhn?" + query
+ for query in text.extract_iter(
+ page, '<a href="/webtoon/detail.nhn?', '"')
+ ][::2]
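Note: `get_episode_urls` keeps every other match (`[::2]`), presumably because each episode row links to `detail.nhn` twice (thumbnail and title), so halving the matches yields unique episode URLs. A self-contained sketch of that idea:

    import re

    def episode_urls(page, root="https://comic.naver.com"):
        """Collect unique episode URLs from a list page (sketch)."""
        queries = re.findall(r'<a href="/webtoon/detail\.nhn\?([^"]+)"', page)
        return [root + "/webtoon/detail.nhn?" + q for q in queries][::2]

    html = ('<a href="/webtoon/detail.nhn?titleId=22073&no=1"><img></a>'
            '<a href="/webtoon/detail.nhn?titleId=22073&no=1">Episode 1</a>')
    assert len(episode_urls(html)) == 1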
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 2ec7165..483c657 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -9,7 +9,7 @@
"""Utility classes to setup OAuth and link accounts to gallery-dl"""
from .common import Extractor, Message
-from . import deviantart, flickr, pixiv, reddit, smugmug, tumblr
+from . import deviantart, flickr, mastodon, pixiv, reddit, smugmug, tumblr
from .. import text, oauth, util, config, exception
from ..cache import cache
import urllib.parse
@@ -106,9 +106,9 @@ class OAuthBase(Extractor):
))
def _oauth2_authorization_code_grant(
- self, client_id, client_secret, auth_url, token_url,
+ self, client_id, client_secret, auth_url, token_url, *,
scope="read", key="refresh_token", auth=True,
- message_template=None, cache=None):
+ cache=None, instance=None):
"""Perform an OAuth2 authorization code grant"""
state = "gallery-dl_{}_{}".format(
@@ -117,12 +117,12 @@ class OAuthBase(Extractor):
)
auth_params = {
- "client_id": client_id,
+ "client_id" : client_id,
"response_type": "code",
- "state": state,
- "redirect_uri": self.redirect_uri,
- "duration": "permanent",
- "scope": scope,
+ "state" : state,
+ "redirect_uri" : self.redirect_uri,
+ "duration" : "permanent",
+ "scope" : scope,
}
# receive an authorization code
@@ -140,8 +140,8 @@ class OAuthBase(Extractor):
# exchange the authorization code for a token
data = {
- "grant_type": "authorization_code",
- "code": params["code"],
+ "grant_type" : "authorization_code",
+ "code" : params["code"],
"redirect_uri": self.redirect_uri,
}
@@ -159,27 +159,18 @@ class OAuthBase(Extractor):
self.send(data["error"])
return
+ token = data[key]
+ token_name = key.replace("_", "-")
+
# write to cache
if self.cache and cache:
- cache.update("#" + str(client_id), data[key])
- self.log.info("Writing 'refresh-token' to cache")
+ cache.update(instance or ("#" + str(client_id)), token)
+ self.log.info("Writing '%s' to cache", token_name)
# display token
- if message_template:
- msg = message_template.format(
- category=self.subcategory,
- key=key.partition("_")[0],
- token=data[key],
- instance=getattr(self, "instance", ""),
- client_id=client_id,
- client_secret=client_secret,
- )
- else:
- msg = self._generate_message(
- ("refresh-token",),
- (data[key],),
- )
- self.send(msg)
+ self.send(self._generate_message(
+ (token_name,), (token,),
+ ))
def _generate_message(self, names, values):
_vh, _va, _is, _it = (
@@ -326,8 +317,10 @@ class OAuthMastodon(OAuthBase):
def items(self):
yield Message.Version, 1
- application = self.oauth_config(self.instance)
- if not application:
+ for application in mastodon.INSTANCES.values():
+ if self.instance == application["root"].partition("://")[2]:
+ break
+ else:
application = self._register(self.instance)
self._oauth2_authorization_code_grant(
@@ -335,8 +328,9 @@ class OAuthMastodon(OAuthBase):
application["client-secret"],
"https://{}/oauth/authorize".format(self.instance),
"https://{}/oauth/token".format(self.instance),
+ instance=self.instance,
key="access_token",
- message_template=MASTODON_MSG_TEMPLATE,
+ cache=mastodon._access_token_cache,
)
@cache(maxage=10*365*24*3600, keyarg=1)
@@ -425,29 +419,3 @@ class OAuthPixiv(OAuthBase):
""")
code = input("code: ")
return code.rpartition("=")[2].strip()
-
-
-MASTODON_MSG_TEMPLATE = """
-Your 'access-token' is
-
-{token}
-
-Put this value into your configuration file as
-'extractor.mastodon.{instance}.{key}-token'.
-
-You can also add your 'client-id' and 'client-secret' values
-if you want to register another account in the future.
-
-Example:
-{{
- "extractor": {{
- "mastodon": {{
- "{instance}": {{
- "{key}-token": "{token}",
- "client-id": "{client_id}",
- "client-secret": "{client_secret}"
- }}
- }}
- }}
-}}
-"""
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 688c005..839e0b8 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,6 +23,7 @@ class PatreonExtractor(Extractor):
directory_fmt = ("{category}", "{creator[full_name]}")
filename_fmt = "{id}_{title}_{num:>02}.{extension}"
archive_fmt = "{id}_{num}"
+ browser = "firefox"
_warning = True
def items(self):
@@ -42,8 +43,6 @@ class PatreonExtractor(Extractor):
hashes = set()
yield Message.Directory, post
- yield Message.Metadata, post
-
for kind, url, name in itertools.chain(
self._images(post),
self._attachments(post),
@@ -249,9 +248,9 @@ class PatreonCreatorExtractor(PatreonExtractor):
creator_id = query.get("u")
if creator_id:
- url = "{}/user?u={}".format(self.root, creator_id)
+ url = "{}/user/posts?u={}".format(self.root, creator_id)
else:
- url = "{}/{}".format(self.root, self.creator.lower())
+ url = "{}/{}/posts".format(self.root, self.creator)
page = self.request(url, notfound="creator").text
campaign_id = text.extract(page, "/campaign/", "/")[0]
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index be976e9..db49b90 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -29,11 +29,12 @@ class PixivExtractor(Extractor):
Extractor.__init__(self, match)
self.api = PixivAppAPI(self)
self.load_ugoira = self.config("ugoira", True)
+ self.translated_tags = self.config("translated-tags", False)
def items(self):
+ tkey = "translated_name" if self.translated_tags else "name"
ratings = {0: "General", 1: "R-18", 2: "R-18G"}
metadata = self.metadata()
- yield Message.Version, 1
for work in self.works():
if not work["user"]["id"]:
@@ -45,7 +46,7 @@ class PixivExtractor(Extractor):
del work["image_urls"]
del work["meta_pages"]
work["num"] = 0
- work["tags"] = [tag["name"] for tag in work["tags"]]
+ work["tags"] = [tag[tkey] or tag["name"] for tag in work["tags"]]
work["date"] = text.parse_datetime(work["create_date"])
work["rating"] = ratings.get(work["x_restrict"])
work["suffix"] = ""
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index aa0ba6d..971347b 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,11 +11,8 @@
from .common import Extractor, Message
from .. import text
import urllib.parse
-import random
-import time
import json
-
BASE_PATTERN = r"(?:https?://)?((?:[^/.]+\.)?reactor\.cc)"
@@ -24,17 +21,14 @@ class ReactorExtractor(Extractor):
basecategory = "reactor"
filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
archive_fmt = "{post_id}_{num}"
+ instances = ()
+ request_interval = 5.0
def __init__(self, match):
Extractor.__init__(self, match)
self.root = "http://" + match.group(1)
self.session.headers["Referer"] = self.root
- self.wait_min = self.config("wait-min", 3)
- self.wait_max = self.config("wait-max", 6)
- if self.wait_max < self.wait_min:
- self.wait_max = self.wait_min
-
if not self.category:
# set category based on domain name
netloc = urllib.parse.urlsplit(self.root).netloc
@@ -60,8 +54,6 @@ class ReactorExtractor(Extractor):
def _pagination(self, url):
while True:
- time.sleep(random.uniform(self.wait_min, self.wait_max))
-
response = self.request(url)
if response.history:
# sometimes there is a redirect from
@@ -231,11 +223,11 @@ class JoyreactorSearchExtractor(ReactorSearchExtractor):
category = "joyreactor"
pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
test = (
- ("http://joyreactor.cc/search/Cirno", {
+ ("http://joyreactor.cc/search/Nature", {
"range": "1-25",
"count": ">= 20",
}),
- ("http://joyreactor.com/search?q=Cirno", {
+ ("http://joyreactor.com/search?q=Nature", {
"range": "1-25",
"count": ">= 20",
}),
@@ -305,10 +297,7 @@ class PornreactorSearchExtractor(ReactorSearchExtractor):
category = "pornreactor"
pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
test = (
- ("http://pornreactor.cc/search?q=ecchi+hentai", {
- "range": "1-25",
- "count": ">= 25",
- }),
+ ("http://pornreactor.cc/search?q=ecchi+hentai"),
("http://fapreactor.com/search/ecchi+hentai"),
)
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index ae1749e..7ffe5dc 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -47,12 +47,13 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))")
test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", {
- "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
- "keyword": "30fe110273e871305001f33c18634516a0a51421",
+ "url": "30d29c5afc65043bfd384c010257ec2d0ecbafa6",
+ "keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5",
})
def __init__(self, match):
ChapterExtractor.__init__(self, match)
+ self.gallery_url += "&quality=hq"
self.issue_id = match.group(2)
def metadata(self, page):
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 972750c..5d83299 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -125,17 +125,14 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor):
def items(self):
pnum = 1
- last = None
data = {"_extractor": SankakucomplexArticleExtractor}
- yield Message.Version, 1
while True:
url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
response = self.request(url, fatal=False)
if response.status_code >= 400:
return
- for url in text.extract_iter(response.text, 'data-direct="', '"'):
- if url != last:
- last = url
- yield Message.Queue, url, data
+ for url in util.unique_sequence(text.extract_iter(
+ response.text, 'data-direct="', '"')):
+ yield Message.Queue, url, data
pnum += 1
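Note: the manual `last` bookkeeping is replaced by `util.unique_sequence`, which yields elements while skipping consecutive duplicates, exactly what the removed inline code did. Its behavior, sketched:

    def unique_sequence(iterable):
        """Yield sequentially unique elements (sketch of util.unique_sequence)."""
        last = None
        for element in iterable:
            if element != last:
                last = element
                yield element

    assert list(unique_sequence("aabbca")) == ["a", "b", "c", "a"]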
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index d65f334..ba1ab08 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,28 +8,23 @@
"""Extractors for Shopify instances"""
-from .common import Extractor, Message, generate_extractors
+from .common import BaseExtractor, Message
from .. import text
import re
-class ShopifyExtractor(Extractor):
+class ShopifyExtractor(BaseExtractor):
"""Base class for Shopify extractors"""
basecategory = "shopify"
filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"
archive_fmt = "{id}"
def __init__(self, match):
- Extractor.__init__(self, match)
- self.item_url = self.root + match.group(1)
-
- def request(self, url, **kwargs):
- kwargs["retries"] = float("inf")
- return Extractor.request(self, url, **kwargs)
+ BaseExtractor.__init__(self, match)
+ self.item_url = self.root + match.group(match.lastindex)
def items(self):
data = self.metadata()
- yield Message.Version, 1
yield Message.Directory, data
headers = {"X-Requested-With": "XMLHttpRequest"}
@@ -58,22 +53,34 @@ class ShopifyExtractor(Extractor):
"""Return an iterable with all relevant product URLs"""
+BASE_PATTERN = ShopifyExtractor.update({
+ "fashionnova": {
+ "root": "https://www.fashionnova.com",
+ "pattern": r"(?:www\.)?fashionnova\.com",
+ },
+})
+
+
class ShopifyCollectionExtractor(ShopifyExtractor):
"""Base class for collection extractors for Shopify based sites"""
subcategory = "collection"
directory_fmt = ("{category}", "{collection[title]}")
- pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)"
-
- def __init__(self, match):
- ShopifyExtractor.__init__(self, match)
- self.params = match.group(2)
+ pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])"
+ test = (
+ ("https://www.fashionnova.com/collections/mini-dresses", {
+ "range": "1-20",
+ "count": 20,
+ "archive": False,
+ }),
+ ("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
+ ("https://www.fashionnova.com/collections/mini-dresses#1"),
+ )
def metadata(self):
return self.request(self.item_url + ".json").json()
def products(self):
- params = text.parse_query(self.params)
- params["page"] = text.parse_int(params.get("page"), 1)
+ params = {"page": 1}
fetch = True
last = None
@@ -107,36 +114,14 @@ class ShopifyProductExtractor(ShopifyExtractor):
"""Base class for product extractors for Shopify based sites"""
subcategory = "product"
directory_fmt = ("{category}", "Products")
- pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)"
+ pattern = BASE_PATTERN + r"((?:/collections/[\w-]+)?/products/[\w-]+)"
+ test = (
+ ("https://www.fashionnova.com/products/essential-slide-red", {
+ "pattern": r"https?://cdn\d*\.shopify.com/",
+ "count": 3,
+ }),
+ ("https://www.fashionnova.com/collections/flats/products/name"),
+ )
def products(self):
return (self.item_url,)
-
-
-EXTRACTORS = {
- "fashionnova": {
- "root": "https://www.fashionnova.com",
- "pattern": r"(?:www\.)?fashionnova\.com",
- "test-product": (
- ("https://www.fashionnova.com/products/essential-slide-red", {
- "pattern": r"https?://cdn\d*\.shopify.com/",
- "count": 3,
- }),
- ("https://www.fashionnova.com/collections/flats/products/name"),
- ),
- "test-collection": (
- ("https://www.fashionnova.com/collections/mini-dresses", {
- "range": "1-20",
- "count": 20,
- "archive": False,
- }),
- ("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
- ("https://www.fashionnova.com/collections/mini-dresses#1"),
- ),
- },
-}
-
-generate_extractors(EXTRACTORS, globals(), (
- ShopifyProductExtractor,
- ShopifyCollectionExtractor,
-))
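Note: collection pagination no longer seeds itself from the URL's query string; it always starts at `{"page": 1}` and walks the collection's `products.json` page by page. A reduced sketch of paging such an endpoint (stubbed fetcher; the real extractor also guards against a repeated last page):

    def products(fetch_page):
        """Walk numbered product pages until one comes back empty (sketch)."""
        params = {"page": 1}
        while True:
            batch = fetch_page(params)   # e.g. GET <collection>/products.json
            if not batch:
                return
            yield from batch
            params["page"] += 1

    pages = {1: ["dress-a", "dress-b"], 2: ["dress-c"], 3: []}
    assert list(products(lambda p: pages[p["page"]])) == [
        "dress-a", "dress-b", "dress-c"]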
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
new file mode 100644
index 0000000..849dc49
--- /dev/null
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://tumblrgallery.xyz/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?tumblrgallery\.xyz"
+
+
+class TumblrgalleryExtractor(GalleryExtractor):
+ """Base class for tumblrgallery extractors"""
+ category = "tumblrgallery"
+ filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}"
+ directory_fmt = ("{category}", "{gallery_id} {title}")
+ root = "https://tumblrgallery.xyz"
+
+
+class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
+ """Extractor for Tumblrblog on tumblrgallery.xyz"""
+ subcategory = "tumblrblog"
+ pattern = BASE_PATTERN + r"(/tumblrblog/gallery/(\d+)\.html)"
+ test = ("https://tumblrgallery.xyz/tumblrblog/gallery/103975.html",)
+
+ def __init__(self, match):
+ TumblrgalleryExtractor.__init__(self, match)
+ self.gallery_id = text.parse_int(match.group(2))
+
+ def metadata(self, page):
+ return {
+ "title" : text.unescape(text.extract(page, "<h1>", "</h1>"))[0],
+ "gallery_id": self.gallery_id,
+ }
+
+ def images(self, _):
+ page_num = 1
+ while True:
+ response = self.request(
+ "{}/tumblrblog/gallery/{}/{}.html"
+ .format(self.root, self.gallery_id, page_num),
+ allow_redirects=False
+ )
+ if response.status_code != 200:
+ return
+
+ page = response.text
+ page_num += 1
+
+ urls = list(text.extract_iter(
+ page,
+ '<div class="report xx-co-me"> <a href="',
+ '" data-fancybox="gallery"'
+ ))
+
+ for image_src in urls:
+ yield image_src, {
+ "id": text.extract(image_src, "tumblr_", "_")[0]
+ }
+
+
+class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
+ """Extractor for Posts on tumblrgallery.xyz"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"(/post/(\d+)\.html)"
+ test = ("https://tumblrgallery.xyz/post/405674.html",)
+
+ def __init__(self, match):
+ TumblrgalleryExtractor.__init__(self, match)
+ self.gallery_id = text.parse_int(match.group(2))
+
+ def metadata(self, page):
+ return {
+ "title" : text.remove_html(
+ text.unescape(text.extract(page, "<title>", "</title>")[0])
+ ).replace("_", "-"),
+ "gallery_id": self.gallery_id,
+ }
+
+ def images(self, page):
+ urls = list(text.extract_iter(
+ page,
+ '<div class="report xx-co-me"> <a href="',
+ '" data-fancybox="gallery"'
+ ))
+
+ for image_src in urls:
+ yield image_src, {
+ "id": text.extract(image_src, "tumblr_", "_")[0] or
+ text.nameext_from_url(image_src)["filename"]
+ }
+
+
+class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
+ """Extractor for Search result on tumblrgallery.xyz"""
+ subcategory = "search"
+ filename_fmt = "{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}"
+ directory_fmt = ("{category}", "{search_term}")
+ pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))"
+ test = ("https://tumblrgallery.xyz/s.php?q=everyday-life",)
+
+ def __init__(self, match):
+ TumblrgalleryExtractor.__init__(self, match)
+ self.search_term = match.group(2)
+
+ def metadata(self, page):
+ return {
+ "search_term": self.search_term,
+ }
+
+ def images(self, _):
+ page_num = 1
+ while True:
+ response = self.request(
+ "{}/s.php?q={}&page={}"
+ .format(self.root, self.search_term, page_num),
+ allow_redirects=False
+ )
+ if response.status_code != 200:
+ return
+
+ page = response.text
+ page_num += 1
+
+ gallery_ids = list(text.extract_iter(
+ page,
+ '<div class="title"><a href="post/',
+ '.html'
+ ))
+
+ for gallery_id in gallery_ids:
+ post_page = self.request(
+ "{}/post/{}.html"
+ .format(self.root, gallery_id),
+ allow_redirects=False
+ ).text
+ for image_src in TumblrgalleryPostExtractor.images(
+ self, post_page
+ ):
+ image_src[1]["title"] = text.remove_html(
+ text.unescape(
+ text.extract(post_page, "<title>", "</title>")[0]
+ )
+ ).replace("_", "-")
+ image_src[1]["gallery_id"] = gallery_id
+ yield image_src
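Note on the search extractor above: it calls TumblrgalleryPostExtractor.images unbound, passing itself as 'self', so the per-post extraction logic is written once and shared. A minimal, self-contained sketch of that idiom (class names here are illustrative, not gallery-dl API):

    class PostLike:
        def images(self, page):
            # yield one entry per whitespace-separated token
            for url in page.split():
                yield url, {}

    class SearchLike:
        def images(self, page):
            # reuse PostLike.images by calling it unbound with our own
            # 'self'; this works because Python methods are plain functions
            for url, data in PostLike.images(self, page):
                data["source"] = "search"
                yield url, data

    print(list(SearchLike().images("a.jpg b.jpg")))
    # [('a.jpg', {'source': 'search'}), ('b.jpg', {'source': 'search'})]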
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 7b6bf21..a7d2de5 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -362,6 +362,23 @@ class TwitterListMembersExtractor(TwitterExtractor):
yield Message.Queue, url, user
+class TwitterFollowingExtractor(TwitterExtractor):
+ """Extractor for followed users"""
+ subcategory = "following"
+ pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)"
+ test = (
+ ("https://twitter.com/supernaturepics/following"),
+ ("https://www.twitter.com/id:2976459548/following"),
+ )
+
+ def items(self):
+ self.login()
+ for user in TwitterAPI(self).user_following(self.user):
+ user["_extractor"] = TwitterTimelineExtractor
+ url = "{}/i/user/{}".format(self.root, user["rest_id"])
+ yield Message.Queue, url, user
+
+
class TwitterSearchExtractor(TwitterExtractor):
"""Extractor for all images from a search timeline"""
subcategory = "search"
@@ -451,6 +468,11 @@ class TwitterTweetExtractor(TwitterExtractor):
"date" : "dt:2020-08-20 04:00:28",
},
}),
+ # all Tweets from a conversation (#1319)
+ ("https://twitter.com/BlankArts_/status/1323314488611872769", {
+ "options": (("conversations", True),),
+ "count": ">= 50",
+ }),
)
def __init__(self, match):
@@ -458,6 +480,8 @@ class TwitterTweetExtractor(TwitterExtractor):
self.tweet_id = match.group(2)
def tweets(self):
+ if self.config("conversations", False):
+ return TwitterAPI(self).conversation(self.tweet_id)
return TwitterAPI(self).tweet(self.tweet_id)
@@ -537,6 +561,10 @@ class TwitterAPI():
break
return tweets
+ def conversation(self, conversation_id):
+ endpoint = "/2/timeline/conversation/{}.json".format(conversation_id)
+ return self._pagination(endpoint)
+
def timeline_profile(self, screen_name):
user_id = self._user_id_by_screen_name(screen_name)
endpoint = "/2/timeline/profile/{}.json".format(user_id)
@@ -577,18 +605,8 @@ class TwitterAPI():
params["spelling_corrections"] = "1"
return self._pagination(endpoint, params)
- def list_members(self, list_id):
- endpoint = "/graphql/3pV4YlpljXUTFAa1jVNWQw/ListMembers"
- variables = {
- "listId": list_id,
- "count" : 20,
- "withTweetResult": False,
- "withUserResult" : False,
- }
- return self._pagination_members(endpoint, variables)
-
def list_by_rest_id(self, list_id):
- endpoint = "/graphql/EhaI2uiCBJI97e28GN8WjQ/ListByRestId"
+ endpoint = "/graphql/18MAHTcDU-TdJSjWWmoH7w/ListByRestId"
params = {"variables": '{"listId":"' + list_id + '"'
',"withUserResult":false}'}
try:
@@ -596,8 +614,33 @@ class TwitterAPI():
except KeyError:
raise exception.NotFoundError("list")
+ def list_members(self, list_id):
+ endpoint = "/graphql/tA7h9hy4U0Yc9COfIOh3qQ/ListMembers"
+ variables = {
+ "listId": list_id,
+ "count" : 100,
+ "withTweetResult": False,
+ "withUserResult" : False,
+ }
+ return self._pagination_graphql(
+ endpoint, variables, "list", "members_timeline")
+
+ def user_following(self, screen_name):
+ endpoint = "/graphql/Q_QTiPvoXwsA13eoA7okIQ/Following"
+ variables = {
+ "userId": self._user_id_by_screen_name(screen_name),
+ "count" : 100,
+ "withTweetResult": False,
+ "withUserResult" : False,
+ "withTweetQuoteCount" : False,
+ "withHighlightedLabel" : False,
+ "includePromotedContent": False,
+ }
+ return self._pagination_graphql(
+ endpoint, variables, "user", "following_timeline")
+
def user_by_screen_name(self, screen_name):
- endpoint = "/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName"
+ endpoint = "/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName"
params = {"variables": '{"screen_name":"' + screen_name + '"'
',"withHighlightedLabel":true}'}
try:
@@ -691,6 +734,13 @@ class TwitterAPI():
tweet = True
cursor = cursor["value"]
+ elif entry_startswith("conversationThread-"):
+ tweet_ids.extend(
+ item["entryId"][6:]
+ for item in entry["content"]["timelineModule"]["items"]
+ if item["entryId"].startswith("tweet-")
+ )
+
# process tweets
for tweet_id in tweet_ids:
try:
@@ -728,15 +778,15 @@ class TwitterAPI():
return
params["cursor"] = cursor
- def _pagination_members(self, endpoint, variables):
+ def _pagination_graphql(self, endpoint, variables, key, timeline):
while True:
cursor = entry = stop = None
params = {"variables": json.dumps(variables)}
data = self._call(endpoint, params)
try:
- instructions = (data["data"]["list"]["members_timeline"]
- ["timeline"]["instructions"])
+ instructions = \
+ data["data"][key][timeline]["timeline"]["instructions"]
except KeyError:
raise exception.AuthorizationError()
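The twitter changes above add a 'conversations' option (checked in TwitterTweetExtractor.tweets) that switches a single-Tweet URL to the full /2/timeline/conversation/ endpoint, and generalize _pagination_members into _pagination_graphql so ListMembers and the new Following endpoint share one cursor loop. A rough sketch of enabling the option from Python, assuming gallery_dl's config.set(path, key, value) helper and that Job accepts a URL string (both present in this release's config.py/job.py):

    from gallery_dl import config, job

    # opt into fetching every Tweet of the conversation (#1319)
    config.set(("extractor", "twitter"), "conversations", True)
    job.DownloadJob(
        "https://twitter.com/BlankArts_/status/1323314488611872769").run()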
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index 545eb31..c653c01 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -122,7 +122,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
"total_photos": int,
"twitter_username": None,
"updated_at": str,
- "username": "johnwestrock"
+ "username": "davehoefler",
},
"views": int,
"width": 4480,
@@ -138,7 +138,7 @@ class UnsplashUserExtractor(UnsplashExtractor):
"""Extractor for all photos of an unsplash user"""
subcategory = "user"
pattern = BASE_PATTERN + r"/@(\w+)/?$"
- test = ("https://unsplash.com/@johnwestrock", {
+ test = ("https://unsplash.com/@davehoefler", {
"pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
"range": "1-30",
@@ -155,7 +155,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor):
"""Extractor for all likes of an unsplash user"""
subcategory = "favorite"
pattern = BASE_PATTERN + r"/@(\w+)/likes"
- test = ("https://unsplash.com/@johnwestrock/likes", {
+ test = ("https://unsplash.com/@davehoefler/likes", {
"pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+"
r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$",
"range": "1-30",
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 20980ac..e025a22 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -16,11 +16,35 @@ class WallhavenExtractor(Extractor):
"""Base class for wallhaven extractors"""
category = "wallhaven"
filename_fmt = "{category}_{id}_{resolution}.{extension}"
+ archive_fmt = "{id}"
root = "https://wallhaven.cc"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.api = WallhavenAPI(self)
+ def items(self):
+ metadata = self.metadata()
+ for wp in self.wallpapers():
+ self._transform(wp)
+ wp.update(metadata)
+ url = wp["url"]
+ yield Message.Directory, wp
+ yield Message.Url, url, text.nameext_from_url(url, wp)
+
+ def wallpapers(self):
+ """Return relevant 'wallpaper' objects"""
+
+ def metadata(self):
+ """Return general metadata"""
+ return ()
+
+ @staticmethod
+ def _transform(wp):
+ wp["url"] = wp.pop("path")
+ if "tags" in wp:
+ wp["tags"] = [t["name"] for t in wp["tags"]]
+ wp["date"] = text.parse_datetime(
+ wp.pop("created_at"), "%Y-%m-%d %H:%M:%S")
+ wp["width"] = wp.pop("dimension_x")
+ wp["height"] = wp.pop("dimension_y")
+ wp["wh_category"] = wp["category"]
class WallhavenSearchExtractor(WallhavenExtractor):
@@ -42,18 +66,57 @@ class WallhavenSearchExtractor(WallhavenExtractor):
WallhavenExtractor.__init__(self, match)
self.params = text.parse_query(match.group(1))
+ def wallpapers(self):
+ return WallhavenAPI(self).search(self.params.copy())
+
+ def metadata(self):
+ return {"search": self.params}
+
+
+class WallhavenCollectionExtractor(WallhavenExtractor):
+ """Extractor for a collection on wallhaven.cc"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{username}", "{collection_id}")
+ pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/favorites/(\d+)"
+ test = ("https://wallhaven.cc/user/AksumkA/favorites/74", {
+ "count": ">= 50",
+ })
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.username, self.collection_id = match.groups()
+
+ def wallpapers(self):
+ return WallhavenAPI(self).collection(self.username, self.collection_id)
+
+ def metadata(self):
+ return {"username": self.username, "collection_id": self.collection_id}
+
+
+class WallhavenCollectionsExtractor(WallhavenExtractor):
+ """Extractor for all collections of a wallhaven user"""
+ subcategory = "collections"
+ pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/favorites/?$"
+ test = ("https://wallhaven.cc/user/AksumkA/favorites", {
+ "pattern": WallhavenCollectionExtractor.pattern,
+ "count": 4,
+ })
+
+ def __init__(self, match):
+ WallhavenExtractor.__init__(self, match)
+ self.username = match.group(1)
+
def items(self):
- yield Message.Version, 1
- yield Message.Directory, {"search": self.params}
- for wp in self.api.search(self.params.copy()):
- wp["search"] = self.params
- yield Message.Url, wp["url"], wp
+ for collection in WallhavenAPI(self).collections(self.username):
+ collection["_extractor"] = WallhavenCollectionExtractor
+ url = "https://wallhaven.cc/user/{}/favorites/{}".format(
+ self.username, collection["id"])
+ yield Message.Queue, url, collection
class WallhavenImageExtractor(WallhavenExtractor):
"""Extractor for individual wallpaper on wallhaven.cc"""
subcategory = "image"
- archive_fmt = "{id}"
pattern = (r"(?:https?://)?(?:wallhaven\.cc/w/|whvn\.cc/"
r"|w\.wallhaven\.cc/[a-z]+/\w\w/wallhaven-)(\w+)")
test = (
@@ -65,7 +128,7 @@ class WallhavenImageExtractor(WallhavenExtractor):
"width" : 1920,
"height" : 1200,
"resolution" : "1920x1200",
- "ratio" : 1.6,
+ "ratio" : "1.6",
"colors" : list,
"tags" : list,
"file_size" : 278799,
@@ -95,15 +158,15 @@ class WallhavenImageExtractor(WallhavenExtractor):
WallhavenExtractor.__init__(self, match)
self.wallpaper_id = match.group(1)
- def items(self):
- data = self.api.info(self.wallpaper_id)
- yield Message.Version, 1
- yield Message.Directory, data
- yield Message.Url, data["url"], data
+ def wallpapers(self):
+ return (WallhavenAPI(self).info(self.wallpaper_id),)
class WallhavenAPI():
- """Minimal interface to wallhaven's API"""
+ """Interface for wallhaven's API
+
+ Ref: https://wallhaven.cc/help/api
+ """
def __init__(self, extractor):
self.extractor = extractor
@@ -117,32 +180,35 @@ class WallhavenAPI():
self.headers = {"X-API-Key": key}
def info(self, wallpaper_id):
- url = "https://wallhaven.cc/api/v1/w/" + wallpaper_id
- return self._update(self._call(url)["data"])
+ endpoint = "/v1/w/" + wallpaper_id
+ return self._call(endpoint)["data"]
+
+ def collection(self, username, collection_id):
+ endpoint = "/v1/collections/{}/{}".format(username, collection_id)
+ return self._pagination(endpoint)
+
+ def collections(self, username):
+ endpoint = "/v1/collections/" + username
+ return self._pagination(endpoint)
def search(self, params):
- url = "https://wallhaven.cc/api/v1/search"
- while True:
- data = self._call(url, params)
- yield from map(self._update, data["data"])
- if data["meta"]["current_page"] >= data["meta"]["last_page"]:
- return
- params["page"] = data["meta"]["current_page"] + 1
+ endpoint = "/v1/search"
+ return self._pagination(endpoint, params)
- def _call(self, url, params=None):
+ def _call(self, endpoint, params=None):
+ url = "https://wallhaven.cc/api" + endpoint
return self.extractor.request(
url, headers=self.headers, params=params).json()
- @staticmethod
- def _update(wp):
- width, _, height = wp["resolution"].partition("x")
- wp["url"] = wp.pop("path")
- if "tags" in wp:
- wp["tags"] = [t["name"] for t in wp["tags"]]
- wp["date"] = text.parse_datetime(
- wp.pop("created_at"), "%Y-%m-%d %H:%M:%S")
- wp["ratio"] = text.parse_float(wp["ratio"])
- wp["width"] = wp.pop("dimension_x")
- wp["height"] = wp.pop("dimension_y")
- wp["wh_category"] = wp["category"]
- return text.nameext_from_url(wp["url"], wp)
+ def _pagination(self, endpoint, params=None):
+ if params is None:
+ params = {}
+
+ while True:
+ data = self._call(endpoint, params)
+ yield from data["data"]
+
+ meta = data.get("meta")
+ if not meta or meta["current_page"] >= meta["last_page"]:
+ return
+ params["page"] = meta["current_page"] + 1
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index c1d32ef..0f40bb9 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -1,15 +1,17 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
import sys
+import json
import time
import errno
import logging
+import operator
import collections
from . import extractor, downloader, postprocessor
from . import config, text, util, output, exception
@@ -111,10 +113,6 @@ class Job():
if self.pred_queue(url, kwds):
self.handle_queue(url, kwds)
- elif msg[0] == Message.Metadata:
- self.update_kwdict(msg[1])
- self.handle_metadata(msg[1])
-
elif msg[0] == Message.Version:
if msg[1] != 1:
raise "unsupported message-version ({}, {})".format(
@@ -128,9 +126,6 @@ class Job():
def handle_directory(self, kwdict):
"""Handle Message.Directory"""
- def handle_metadata(self, kwdict):
- """Handle Message.Metadata"""
-
def handle_queue(self, url, kwdict):
"""Handle Message.Queue"""
@@ -280,15 +275,6 @@ class DownloadJob(Job):
for callback in self.hooks["post"]:
callback(self.pathfmt)
- def handle_metadata(self, kwdict):
- """Run postprocessors with metadata from 'kwdict'"""
- if "metadata" in self.hooks:
- kwdict["extension"] = "metadata"
- pathfmt = self.pathfmt
- pathfmt.set_filename(kwdict)
- for callback in self.hooks["metadata"]:
- callback(pathfmt)
-
def handle_queue(self, url, kwdict):
if url in self.visited:
return
@@ -456,7 +442,21 @@ class DownloadJob(Job):
if wlist is not None:
if isinstance(wlist, str):
wlist = wlist.split(",")
- blist = {e.category for e in extractor._list_classes()}
+
+ # build a set of all categories
+ blist = set()
+ add = blist.add
+ update = blist.update
+ get = operator.itemgetter(0)
+
+ for extr in extractor._list_classes():
+ category = extr.category
+ if category:
+ add(category)
+ else:
+ update(map(get, extr.instances))
+
+ # remove whitelisted categories
blist.difference_update(wlist)
return blist
@@ -576,6 +576,38 @@ class UrlJob(Job):
self._write_unsupported(url)
+class InfoJob(Job):
+ """Print extractor defaults and settings"""
+
+ def run(self):
+ ex = self.extractor
+ pm = self._print_multi
+ pc = self._print_config
+
+ if ex.basecategory:
+ pm("Category / Subcategory / Basecategory",
+ ex.category, ex.subcategory, ex.basecategory)
+ else:
+ pm("Category / Subcategory", ex.category, ex.subcategory)
+
+ pc("Filename format", "filename", ex.filename_fmt)
+ pc("Directory format", "directory", ex.directory_fmt)
+ pc("Request interval", "sleep-request", ex.request_interval)
+
+ return 0
+
+ def _print_multi(self, title, *values):
+ print(title, "\n ", " / ".join(json.dumps(v) for v in values), sep="")
+
+ def _print_config(self, title, optname, value):
+ optval = self.extractor.config(optname, util.SENTINEL)
+ if optval is not util.SENTINEL:
+ print(title, "(custom):\n ", json.dumps(optval))
+ print(title, "(default):\n ", json.dumps(value))
+ elif value:
+ print(title, "(default):\n ", json.dumps(value))
+
+
class DataJob(Job):
"""Collect extractor results and dump them"""
@@ -624,8 +656,5 @@ class DataJob(Job):
def handle_directory(self, kwdict):
self.data.append((Message.Directory, self.filter(kwdict)))
- def handle_metadata(self, kwdict):
- self.data.append((Message.Metadata, self.filter(kwdict)))
-
def handle_queue(self, url, kwdict):
self.data.append((Message.Queue, url, self.filter(kwdict)))
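InfoJob above backs the new -E/--extractor-info command-line option (wired up in option.py below): it resolves a URL to its extractor and prints category/subcategory plus the default and any user-configured filename format, directory format, and request interval, downloading nothing. A minimal sketch of driving it from Python, reusing a test URL from the unsplash diff above and assuming Job's URL-string constructor:

    from gallery_dl import job

    # prints the extractor's settings instead of downloading
    job.InfoJob("https://unsplash.com/@davehoefler").run()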
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 367b934..3e585fe 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2020 Mike Fährmann
+# Copyright 2017-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -98,8 +98,9 @@ def build_parser():
)
general.add_argument(
"-i", "--input-file",
- dest="inputfile", metavar="FILE",
- help="Download URLs found in FILE ('-' for stdin)",
+ dest="inputfiles", metavar="FILE", action="append",
+ help=("Download URLs found in FILE ('-' for stdin). "
+ "More than one --input-file can be specified"),
)
general.add_argument(
"--cookies",
@@ -136,9 +137,9 @@ def build_parser():
help="Print URLs instead of downloading",
)
output.add_argument(
- "-G",
+ "-G", "--resolve-urls",
dest="list_urls", action="store_const", const=128,
- help=argparse.SUPPRESS,
+ help="Print URLs instead of downloading; resolve intermediary URLs",
)
output.add_argument(
"-j", "--dump-json",
@@ -151,6 +152,11 @@ def build_parser():
help="Simulate data extraction; do not download anything",
)
output.add_argument(
+ "-E", "--extractor-info",
+ dest="jobtype", action="store_const", const=job.InfoJob,
+ help="Print extractor defaults and settings",
+ )
+ output.add_argument(
"-K", "--list-keywords",
dest="jobtype", action="store_const", const=job.KeywordJob,
help=("Print a list of available keywords and example values "
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
index faa4d6c..ee490e7 100644
--- a/gallery_dl/postprocessor/__init__.py
+++ b/gallery_dl/postprocessor/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,8 +8,6 @@
"""Post-processing modules"""
-import importlib
-
modules = [
"classify",
"compare",
@@ -28,16 +26,16 @@ def find(name):
except KeyError:
pass
- klass = None
+ cls = None
if name in modules: # prevent unwanted imports
try:
- module = importlib.import_module("." + name, __package__)
+ module = __import__(name, globals(), None, (), 1)
except ImportError:
pass
else:
- klass = module.__postprocessor__
- _cache[name] = klass
- return klass
+ cls = module.__postprocessor__
+ _cache[name] = cls
+ return cls
# --------------------------------------------------------------------
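The find() change above replaces importlib.import_module with a direct call to the __import__ builtin; with an empty fromlist and level=1 it performs the same package-relative import and returns the submodule itself. A small equivalence sketch (module and package names illustrative):

    import importlib

    def load_with_importlib(name, package):
        # e.g. load_with_importlib("metadata", "gallery_dl.postprocessor")
        return importlib.import_module("." + name, package)

    def load_with_dunder_import(name, caller_globals):
        # fromlist=() and level=1: import 'name' relative to the package
        # that 'caller_globals' belongs to, returning the submodule
        return __import__(name, caller_globals, None, (), 1)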
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index 5a54a77..2514219 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -39,10 +39,6 @@ class ExecPP(PostProcessor):
events = options.get("event")
if events is None:
events = ("after",)
- if options.get("final"):
- self.log.warning("'final' is deprecated, "
- "use '\"event\": \"finalize\"' instead")
- events = ("finalize",)
elif isinstance(events, str):
events = events.split(",")
for event in events:
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index c08f111..49696a0 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -39,7 +39,7 @@ class MetadataPP(PostProcessor):
if directory:
self._directory = self._directory_custom
sep = os.sep + (os.altsep or "")
- self._metadir = directory.rstrip(sep) + os.sep
+ self._metadir = util.expand_path(directory).rstrip(sep) + os.sep
filename = options.get("filename")
extfmt = options.get("extension-format")
@@ -55,10 +55,6 @@ class MetadataPP(PostProcessor):
events = options.get("event")
if events is None:
events = ("file",)
- if options.get("bypost"):
- self.log.warning("'bypost' is deprecated, use '\"event\": "
- "\"post\"' and 'filename' instead")
- events = ("metadata",)
elif isinstance(events, str):
events = events.split(",")
for event in events:
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 2161b9d..2466adf 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2020 Mike Fährmann
+# Copyright 2017-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -65,6 +65,15 @@ def unique(iterable):
yield element
+def unique_sequence(iterable):
+ """Yield sequentially unique elements from 'iterable'"""
+ last = None
+ for element in iterable:
+ if element != last:
+ last = element
+ yield element
+
+
def raises(cls):
"""Returns a function that raises 'cls' as exception"""
def wrap(*args):
@@ -731,21 +740,25 @@ class PathFormat():
}
def __init__(self, extractor):
- filename_fmt = extractor.config("filename", extractor.filename_fmt)
- directory_fmt = extractor.config("directory", extractor.directory_fmt)
- kwdefault = extractor.config("keywords-default")
+ filename_fmt = extractor.config("filename")
+ if filename_fmt is None:
+ filename_fmt = extractor.filename_fmt
+
+ directory_fmt = extractor.config("directory")
+ if directory_fmt is None:
+ directory_fmt = extractor.directory_fmt
extension_map = extractor.config("extension-map")
if extension_map is None:
extension_map = self.EXTENSION_MAP
self.extension_map = extension_map.get
+ kwdefault = extractor.config("keywords-default")
try:
self.filename_formatter = Formatter(
filename_fmt, kwdefault).format_map
except Exception as exc:
raise exception.FilenameFormatError(exc)
-
try:
self.directory_formatters = [
Formatter(dirfmt, kwdefault).format_map
@@ -754,20 +767,23 @@ class PathFormat():
except Exception as exc:
raise exception.DirectoryFormatError(exc)
- self.directory = self.realdirectory = ""
- self.filename = self.extension = self.prefix = ""
- self.path = self.realpath = self.temppath = ""
self.kwdict = {}
+ self.directory = self.realdirectory = \
+ self.filename = self.extension = self.prefix = \
+ self.path = self.realpath = self.temppath = ""
self.delete = self._create_directory = False
basedir = extractor._parentdir
if not basedir:
- basedir = expand_path(
- extractor.config("base-directory", (".", "gallery-dl")))
- if os.altsep and os.altsep in basedir:
- basedir = basedir.replace(os.altsep, os.sep)
- if basedir[-1] != os.sep:
- basedir += os.sep
+ basedir = extractor.config("base-directory")
+ if basedir is None:
+ basedir = "." + os.sep + "gallery-dl" + os.sep
+ elif basedir:
+ basedir = expand_path(basedir)
+ if os.altsep and os.altsep in basedir:
+ basedir = basedir.replace(os.altsep, os.sep)
+ if basedir[-1] != os.sep:
+ basedir += os.sep
self.basedirectory = basedir
restrict = extractor.config("path-restrict", "auto")
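Of the util.py additions, unique_sequence() collapses only adjacent duplicates, unlike unique(), which suppresses every repeat; the test_util.py hunk at the end of this patch pins down exactly that behavior. A quick comparison:

    from gallery_dl import util

    print(list(util.unique([1, 2, 1, 3, 2, 1])))           # [1, 2, 3]
    print(list(util.unique_sequence([1, 2, 1, 3, 2, 1])))  # [1, 2, 1, 3, 2, 1]
    print(list(util.unique_sequence("AABBCC")))            # ['A', 'B', 'C']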
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 8244a95..f1c49e9 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.16.5"
+__version__ = "1.17.0"
diff --git a/test/test_downloader.py b/test/test_downloader.py
index 99cfb62..42b5c72 100644
--- a/test/test_downloader.py
+++ b/test/test_downloader.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2018-2020 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -74,7 +74,7 @@ class TestDownloaderModule(unittest.TestCase):
self.assertEqual(downloader.find(1234) , None)
self.assertEqual(downloader.find(None) , None)
- @patch("importlib.import_module")
+ @patch("builtins.__import__")
def test_cache(self, import_module):
import_module.return_value = MockDownloaderModule()
downloader.find("http")
@@ -86,14 +86,14 @@ class TestDownloaderModule(unittest.TestCase):
downloader.find("ytdl")
self.assertEqual(import_module.call_count, 3)
- @patch("importlib.import_module")
+ @patch("builtins.__import__")
def test_cache_http(self, import_module):
import_module.return_value = MockDownloaderModule()
downloader.find("http")
downloader.find("https")
self.assertEqual(import_module.call_count, 1)
- @patch("importlib.import_module")
+ @patch("builtins.__import__")
def test_cache_https(self, import_module):
import_module.return_value = MockDownloaderModule()
downloader.find("https")
diff --git a/test/test_extractor.py b/test/test_extractor.py
index 8bc3a27..f04e1c7 100644
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@@ -219,10 +219,6 @@ class TestExtractorWait(unittest.TestCase):
class TextExtractorOAuth(unittest.TestCase):
- @classmethod
- def setUpClass(cls):
- mastodon.generate_extractors()
-
def test_oauth1(self):
for category in ("flickr", "smugmug", "tumblr"):
extr = extractor.find("oauth:" + category)
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 4e98a97..6bf887c 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2019-2020 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -53,7 +53,7 @@ class TestPostprocessorModule(unittest.TestCase):
self.assertEqual(postprocessor.find(1234) , None)
self.assertEqual(postprocessor.find(None) , None)
- @patch("importlib.import_module")
+ @patch("builtins.__import__")
def test_cache(self, import_module):
import_module.return_value = MockPostprocessorModule()
diff --git a/test/test_results.py b/test/test_results.py
index f7356d5..223ef57 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,14 +20,6 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from gallery_dl import extractor, util, job, config, exception # noqa E402
-# these don't work on Travis CI
-TRAVIS_SKIP = {
- "exhentai", "mangafox", "dynastyscans", "nijie", "instagram", "ngomik",
- "archivedmoe", "archiveofsins", "thebarchive", "fireden", "4plebs",
- "sankaku", "idolcomplex", "mangahere", "mangadex", "sankakucomplex",
- "warosu", "fuskator", "patreon", "komikcast", "twitter",
-}
-
# temporary issues, etc.
BROKEN = {
"imagevenue",
@@ -361,8 +353,6 @@ def generate_tests():
del sys.argv[1:]
else:
skip = set(BROKEN)
- if "CI" in os.environ and "TRAVIS" in os.environ:
- skip |= set(TRAVIS_SKIP)
if skip:
print("skipping:", ", ".join(skip))
fltr = lambda c, bc: c not in skip # noqa: E731
diff --git a/test/test_util.py b/test/test_util.py
index 8848ea0..06de735 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -461,6 +461,16 @@ class TestOther(unittest.TestCase):
self.assertSequenceEqual(
list(util.unique([1, 2, 1, 3, 2, 1])), [1, 2, 3])
+ def test_unique_sequence(self):
+ self.assertSequenceEqual(
+ list(util.unique_sequence("")), "")
+ self.assertSequenceEqual(
+ list(util.unique_sequence("AABBCC")), "ABC")
+ self.assertSequenceEqual(
+ list(util.unique_sequence("ABABABCAABBCC")), "ABABABCABC")
+ self.assertSequenceEqual(
+ list(util.unique_sequence([1, 2, 1, 3, 2, 1])), [1, 2, 1, 3, 2, 1])
+
def test_raises(self):
func = util.raises(Exception)
with self.assertRaises(Exception):