author    Unit 193 <unit193@ubuntu.com>  2020-03-16 23:20:22 -0400
committer Unit 193 <unit193@ubuntu.com>  2020-03-16 23:20:22 -0400
commit    f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2 (patch)
tree      70267e5f04db1da396e75fd4148d9c542683bbab
parent    2bd320e568d015940227b7355396701331e2cd1e (diff)
parent    e8cc000750de972384f2f34d02d42222b4018ae9 (diff)
download  gallery-dl-f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2.tar.bz2
          gallery-dl-f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2.tar.xz
          gallery-dl-f1baa4aa12d705e290f74c9fb4c6cd5eb2976fa2.tar.zst
Update upstream source from tag 'upstream/1.13.2'
Update to upstream version '1.13.2' with Debian dir a36309ac1ae7b23d042eaafd21c4267c2f840ab4
-rw-r--r--  PKG-INFO | 19
-rw-r--r--  README.rst | 17
-rw-r--r--  data/man/gallery-dl.1 | 2
-rw-r--r--  data/man/gallery-dl.conf.5 | 417
-rw-r--r--  gallery_dl.egg-info/PKG-INFO | 19
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt | 8
-rw-r--r--  gallery_dl/__init__.py | 13
-rw-r--r--  gallery_dl/cache.py | 39
-rw-r--r--  gallery_dl/cloudflare.py | 4
-rw-r--r--  gallery_dl/downloader/http.py | 6
-rw-r--r--  gallery_dl/extractor/3dbooru.py | 2
-rw-r--r--  gallery_dl/extractor/8kun.py | 91
-rw-r--r--  gallery_dl/extractor/8muses.py | 4
-rw-r--r--  gallery_dl/extractor/__init__.py | 7
-rw-r--r--  gallery_dl/extractor/bcy.py | 188
-rw-r--r--  gallery_dl/extractor/blogger.py | 48
-rw-r--r--  gallery_dl/extractor/booru.py | 42
-rw-r--r--  gallery_dl/extractor/common.py | 39
-rw-r--r--  gallery_dl/extractor/danbooru.py | 176
-rw-r--r--  gallery_dl/extractor/deviantart.py | 37
-rw-r--r--  gallery_dl/extractor/e621.py | 214
-rw-r--r--  gallery_dl/extractor/flickr.py | 3
-rw-r--r--  gallery_dl/extractor/furaffinity.py | 235
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py | 2
-rw-r--r--  gallery_dl/extractor/hentaihand.py | 134
-rw-r--r--  gallery_dl/extractor/hentainexus.py | 2
-rw-r--r--  gallery_dl/extractor/hiperdex.py | 137
-rw-r--r--  gallery_dl/extractor/hitomi.py | 146
-rw-r--r--  gallery_dl/extractor/imgbb.py | 12
-rw-r--r--  gallery_dl/extractor/imgur.py | 8
-rw-r--r--  gallery_dl/extractor/instagram.py | 14
-rw-r--r--  gallery_dl/extractor/issuu.py | 2
-rw-r--r--  gallery_dl/extractor/kabeuchi.py | 92
-rw-r--r--  gallery_dl/extractor/khinsider.py | 60
-rw-r--r--  gallery_dl/extractor/kissmanga.py | 2
-rw-r--r--  gallery_dl/extractor/luscious.py | 4
-rw-r--r--  gallery_dl/extractor/mangareader.py | 4
-rw-r--r--  gallery_dl/extractor/mangoxo.py | 35
-rw-r--r--  gallery_dl/extractor/newgrounds.py | 84
-rw-r--r--  gallery_dl/extractor/nozomi.py | 4
-rw-r--r--  gallery_dl/extractor/oauth.py | 14
-rw-r--r--  gallery_dl/extractor/paheal.py | 4
-rw-r--r--  gallery_dl/extractor/patreon.py | 82
-rw-r--r--  gallery_dl/extractor/piczel.py | 62
-rw-r--r--  gallery_dl/extractor/pixiv.py | 84
-rw-r--r--  gallery_dl/extractor/pururin.py | 6
-rw-r--r--  gallery_dl/extractor/realbooru.py | 2
-rw-r--r--  gallery_dl/extractor/reddit.py | 120
-rw-r--r--  gallery_dl/extractor/sexcom.py | 40
-rw-r--r--  gallery_dl/extractor/tsumino.py | 4
-rw-r--r--  gallery_dl/extractor/tumblr.py | 2
-rw-r--r--  gallery_dl/extractor/twitter.py | 118
-rw-r--r--  gallery_dl/extractor/vsco.py | 16
-rw-r--r--  gallery_dl/extractor/wallhaven.py | 6
-rw-r--r--  gallery_dl/extractor/weibo.py | 6
-rw-r--r--  gallery_dl/extractor/xhamster.py | 4
-rw-r--r--  gallery_dl/extractor/yaplog.py | 128
-rw-r--r--  gallery_dl/job.py | 9
-rw-r--r--  gallery_dl/output.py | 48
-rw-r--r--  gallery_dl/postprocessor/ugoira.py | 14
-rw-r--r--  gallery_dl/text.py | 6
-rw-r--r--  gallery_dl/util.py | 254
-rw-r--r--  gallery_dl/version.py | 2
-rw-r--r--  test/test_cache.py | 202
-rw-r--r--  test/test_cookies.py | 7
-rw-r--r--  test/test_results.py | 16
-rw-r--r--  test/test_util.py | 131
67 files changed, 2825 insertions(+), 934 deletions(-)
diff --git a/PKG-INFO b/PKG-INFO
index e86eb0c..84237a6 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.12.3
+Version: 1.13.2
Summary: Command-line program to download image-galleries and -collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.bin>`__
These executables include a Python 3.8 interpreter
and all required Python packages.
@@ -141,6 +141,13 @@ Description: ==========
$ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703"
+ Filter manga chapters by language and chapter number:
+
+ .. code:: bash
+
+ $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+
+
| Search a remote resource for URLs and download images from them:
| (URLs for which no extractor can be found will be silently ignored)
@@ -186,8 +193,8 @@ Description: ==========
Some extractors require you to provide valid login-credentials in the form of
a username & password pair. This is necessary for
``pixiv``, ``nijie``, and ``seiga``
- and optional (but strongly recommended) for
- ``danbooru``, ``exhentai``, ``idolcomplex``, ``instagram``,
+ and optional for
+ ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``,
``luscious``, ``sankaku``, ``tsumino``, and ``twitter``.
You can set the necessary information in your configuration file
@@ -240,7 +247,7 @@ Description: ==========
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.3.tar.gz
+ .. _stable: https://github.com/mikf/gallery-dl/archive/v1.13.2.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
diff --git a/README.rst b/README.rst
index f450c81..3b5945c 100644
--- a/README.rst
+++ b/README.rst
@@ -83,8 +83,8 @@ Download a standalone executable file,
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.bin>`__
These executables include a Python 3.8 interpreter
and all required Python packages.
@@ -130,6 +130,13 @@ Get the direct URL of an image from a site that requires authentication:
$ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703"
+Filter manga chapters by language and chapter number:
+
+.. code:: bash
+
+ $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+
+
| Search a remote resource for URLs and download images from them:
| (URLs for which no extractor can be found will be silently ignored)
@@ -175,8 +182,8 @@ Username & Password
Some extractors require you to provide valid login-credentials in the form of
a username & password pair. This is necessary for
``pixiv``, ``nijie``, and ``seiga``
-and optional (but strongly recommended) for
-``danbooru``, ``exhentai``, ``idolcomplex``, ``instagram``,
+and optional for
+``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``,
``luscious``, ``sankaku``, ``tsumino``, and ``twitter``.
You can set the necessary information in your configuration file
@@ -229,7 +236,7 @@ access to *gallery-dl*. Authorize it and you will be shown one or more
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
-.. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.3.tar.gz
+.. _stable: https://github.com/mikf/gallery-dl/archive/v1.13.2.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
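
The credential options described above can also be set programmatically instead of through a configuration file. A minimal sketch, assuming the tuple-path ``config.set(path, key, value)`` signature that appears later in this diff (``gallery_dl/__init__.py``); the credential values are placeholders:

.. code:: python

    from gallery_dl import config

    # Placeholder credentials -- substitute your own. For danbooru and
    # e621 the "password" is the API key from your user profile, not
    # your account password.
    config.set(("extractor", "danbooru"), "username", "your-username")
    config.set(("extractor", "danbooru"), "password", "your-api-key")
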
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index a2cd77d..304c345 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2020-01-19" "1.12.3" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2020-03-14" "1.13.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index d7bb941..4ad93f8 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2020-01-19" "1.12.3" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2020-03-14" "1.13.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -139,6 +139,18 @@ segment, which will be joined together and appended to the
.IP "Description:" 4
Directory path used as the base for all download destinations.
+.SS extractor.*.parent-directory
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Use an extractor's current target directory as
+\f[I]base-directory <extractor.*.base-directory_>\f[]
+for any spawned child extractors.
+
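As an illustration of the new parent-directory option, a minimal sketch using the same tuple-path ``config.set()`` call shown in ``gallery_dl/__init__.py`` later in this diff:

.. code:: python

    from gallery_dl import config

    # Sketch: spawned child extractors (e.g. from a search result) will
    # then reuse the parent's current target directory as their
    # base-directory.
    config.set(("extractor",), "parent-directory", True)
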
.SS extractor.*.path-restrict
.IP "Type:" 6
\f[I]string\f[]
@@ -155,12 +167,12 @@ in generated path segment names.
Special values:
-
+.br
* \f[I]"auto"\f[]: Use characters from \f[I]"unix"\f[] or \f[I]"windows"\f[]
depending on the local operating system
-
+.br
* \f[I]"unix"\f[]: \f[I]"/"\f[]
-
+.br
* \f[I]"windows"\f[]: \f[I]"\\\\\\\\|/<>:\\"?*"\f[]
Note: In a set with 2 or more characters, \f[I][]^-\\\f[] need to be
@@ -191,24 +203,24 @@ Controls the behavior when downloading files that have been
downloaded before, i.e. a file with the same filename already
exists or its ID is in a \f[I]download archive\f[].
-
+.br
* \f[I]true\f[]: Skip downloads
-
+.br
* \f[I]false\f[]: Overwrite already existing files
-
+.br
* \f[I]"abort"\f[]: Abort the current extractor run
-
+.br
* \f[I]"abort:N"\f[]: Skip downloads and abort extractor run
after \f[I]N\f[] consecutive skips
-
+.br
* \f[I]"exit"\f[]: Exit the program altogether
-
+.br
* \f[I]"exit:N"\f[]: Skip downloads and exit the program
after \f[I]N\f[] consecutive skips
-
+.br
* \f[I]"enumerate"\f[]: Add an enumeration index to the beginning of the
filename extension (\f[I]file.1.ext\f[], \f[I]file.2.ext\f[], etc.)
@@ -233,18 +245,43 @@ Number of seconds to sleep before each download.
The username and password to use when attempting to log in to
another site.
-Specifying username and password is required for the
-\f[I]pixiv\f[], \f[I]nijie\f[], and \f[I]seiga\f[]
-modules and optional (but strongly recommended) for
-\f[I]danbooru\f[], \f[I]exhentai\f[], \f[I]idolcomplex\f[], \f[I]instagram\f[],
-\f[I]luscious\f[], \f[I]sankaku\f[], \f[I]tsumino\f[], and \f[I]twitter\f[].
+Specifying a username and password is required for
+
+.br
+* \f[I]pixiv\f[]
+.br
+* \f[I]nijie\f[]
+.br
+* \f[I]seiga\f[]
+
+and optional for
+
+.br
+* \f[I]danbooru\f[]
+.br
+* \f[I]e621\f[]
+.br
+* \f[I]exhentai\f[]
+.br
+* \f[I]idolcomplex\f[]
+.br
+* \f[I]instagram\f[]
+.br
+* \f[I]luscious\f[]
+.br
+* \f[I]sankaku\f[]
+.br
+* \f[I]tsumino\f[]
+.br
+* \f[I]twitter\f[]
These values can also be set via the \f[I]-u/--username\f[] and
\f[I]-p/--password\f[] command-line options or by using a \f[I].netrc\f[] file.
(see Authentication_)
-Note: The password for \f[I]danbooru\f[] is the API key found in your
-user profile, not the password for your account.
+Note: The password values for \f[I]danbooru\f[] and \f[I]e621\f[] should be
+the API keys found in your user profile, not your actual account
+password.
.SS extractor.*.netrc
.IP "Type:" 6
@@ -264,14 +301,23 @@ Enable the use of \f[I].netrc\f[] authentication data.
\f[I]null\f[]
.IP "Description:" 4
-Source to read additional cookies from.
+Source to read additional cookies from. Either as
+.br
+* the \f[I]Path\f[] to a Mozilla/Netscape format cookies.txt file or
+.br
+* a JSON \f[I]object\f[] specifying cookies as a name-to-value mapping
-* If this is a \f[I]Path\f[], it specifies a
-Mozilla/Netscape format cookies.txt file.
+Example:
+
+.. code::
+
+{
+"cookie-name": "cookie-value",
+"sessionid" : "14313336321%3AsabDFvuASDnlpb%3A31",
+"isAdult" : "1"
+}
-* If this is an \f[I]object\f[], its key-value pairs, which should both
-be \f[I]strings\f[], will be used as cookie-names and -values.
.SS extractor.*.cookies-update
.IP "Type:" 6
@@ -281,8 +327,9 @@ be \f[I]strings\f[], will be used as cookie-names and -values.
\f[I]true\f[]
.IP "Description:" 4
-If \f[I]extractor.*.cookies\f[] specifies a cookies.txt file, update its
-contents with cookies received during data extraction.
+If \f[I]extractor.*.cookies\f[] specifies the \f[I]Path\f[] to a cookies.txt
+file and it can be opened and parsed without errors,
+update its contents with cookies received during data extraction.
.SS extractor.*.proxy
.IP "Type:" 6
@@ -294,10 +341,10 @@ contents with cookies received during data extraction.
.IP "Description:" 4
Proxy (or proxies) to be used for remote connections.
-
+.br
* If this is a \f[I]string\f[], it is the proxy URL for all
outgoing requests.
-
+.br
* If this is an \f[I]object\f[], it is a scheme-to-proxy mapping to
specify different proxy URLs for each scheme.
It is also possible to set a proxy for a specific host by using
@@ -482,8 +529,8 @@ Note: The index of the first image is \f[I]1\f[].
\f[I]string\f[]
.IP "Description:" 4
-Like \f[I]image-range\f[], but applies to delegated URLs
-like manga-chapters, etc.
+Like \f[I]image-range <extractor.*.image-range_>\f[],
+but applies to delegated URLs like manga-chapters, etc.
.SS extractor.*.image-filter
.IP "Type:" 6
@@ -509,9 +556,15 @@ by \f[I]-K\f[] or \f[I]-j\f[].
.IP "Type:" 6
\f[I]string\f[]
+.IP "Example:" 4
+"lang == 'en'"
+.br
+"language == 'French' and 10 <= chapter < 20"
+.br
+
.IP "Description:" 4
-Like \f[I]image-filter\f[], but applies to delegated URLs
-like manga-chapters, etc.
+Like \f[I]image-filter <extractor.*.image-filter_>\f[],
+but applies to delegated URLs like manga-chapters, etc.
.SS extractor.*.image-unique
.IP "Type:" 6
@@ -559,6 +612,16 @@ See \f[I]strptime\f[] for a list of formatting directives.
.IP "Description:" 4
Try to follow external URLs of embedded players.
+.SS extractor.blogger.videos
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Download embedded videos hosted on https://www.blogger.com/
+
.SS extractor.danbooru.ugoira
.IP "Type:" 6
\f[I]bool\f[]
@@ -569,9 +632,9 @@ Try to follow external URLs of embedded players.
.IP "Description:" 4
Controls the download target for Ugoira posts.
-
+.br
* \f[I]true\f[]: Original ZIP archives
-
+.br
* \f[I]false\f[]: Converted video files
.SS extractor.deviantart.extra
@@ -597,9 +660,9 @@ Note: Enabling this option also enables deviantart.metadata_.
Select the directory structure created by the Gallery- and
Favorite-Extractors.
-
+.br
* \f[I]true\f[]: Use a flat directory structure.
-
+.br
* \f[I]false\f[]: Collect a list of all gallery-folders or
favorites-collections and transfer any further work to other
extractors (\f[I]folder\f[] or \f[I]collection\f[]), which will then
@@ -648,11 +711,11 @@ You can use \f[I]"all"\f[] instead of listing all values separately.
.IP "Description:" 4
Selects the output format of journal entries.
-
+.br
* \f[I]"html"\f[]: HTML with (roughly) the same layout as on DeviantArt.
-
+.br
* \f[I]"text"\f[]: Plain text with image references and HTML tags removed.
-
+.br
* \f[I]"none"\f[]: Don't download journals.
.SS extractor.deviantart.mature
@@ -720,9 +783,9 @@ The \f[I]refresh-token\f[] value you get from
Using a \f[I]refresh-token\f[] allows you to access private or otherwise
not publicly available deviations.
-Note: Authenticating with a \f[I]refresh-token\f[] requires persistent
-storage in a \f[I]cache file <cache.file_>\f[].
-Otherwise the token will become invalid after its first use.
+Note: The \f[I]refresh-token\f[] becomes invalid
+\f[I]after 3 months <https://www.deviantart.com/developers/authentication#refresh>\f[]
+or whenever your \f[I]cache file <cache.file_>\f[] is deleted or cleared.
.SS extractor.deviantart.wait-min
.IP "Type:" 6
@@ -804,14 +867,33 @@ Extract and download videos.
.IP "Description:" 4
Sets the maximum allowed size for downloaded images.
-
+.br
* If this is an \f[I]integer\f[], it specifies the maximum image dimension
(width and height) in pixels.
-
+.br
* If this is a \f[I]string\f[], it should be one of Flickr's format specifiers
(\f[I]"Original"\f[], \f[I]"Large"\f[], ... or \f[I]"o"\f[], \f[I]"k"\f[], \f[I]"h"\f[],
\f[I]"l"\f[], ...) to use as an upper limit.
+.SS extractor.furaffinity.include
+.IP "Type:" 6
+\f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]"gallery"\f[]
+
+.IP "Example:" 4
+"scraps,favorite" or ["scraps", "favorite"]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"gallery"\f[], \f[I]"scraps"\f[], \f[I]"favorite"\f[].
+
+You can use \f[I]"all"\f[] instead of listing all values separately.
+
.SS extractor.gelbooru.api
.IP "Type:" 6
\f[I]bool\f[]
@@ -840,6 +922,18 @@ If the selected format is not available, \f[I]"mp4"\f[], \f[I]"webm"\f[]
and \f[I]"gif"\f[] (in that order) will be tried instead, until an
available format is found.
+.SS extractor.hitomi.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Try to extract
+\f[I]artist\f[], \f[I]group\f[], \f[I]parody\f[], and \f[I]characters\f[]
+metadata.
+
.SS extractor.imgur.mp4
.IP "Type:" 6
\f[I]bool\f[] or \f[I]string\f[]
@@ -850,12 +944,12 @@ available format is found.
.IP "Description:" 4
Controls whether to choose the GIF or MP4 version of an animation.
-
+.br
* \f[I]true\f[]: Follow Imgur's advice and choose MP4 if the
\f[I]prefer_video\f[] flag in an image's metadata is set.
-
+.br
* \f[I]false\f[]: Always choose GIF.
-
+.br
* \f[I]"always"\f[]: Always choose MP4.
.SS extractor.instagram.highlights
@@ -889,9 +983,9 @@ Download video files.
.IP "Description:" 4
Controls how to handle redirects to CAPTCHA pages.
-
+.br
* \f[I]"stop\f[]: Stop the current extractor run.
-
+.br
* \f[I]"wait\f[]: Ask the user to solve the CAPTCHA and wait.
.SS extractor.newgrounds.include
@@ -923,12 +1017,27 @@ You can use \f[I]"all"\f[] instead of listing all values separately.
.IP "Description:" 4
Controls how a user is directed to an OAuth authorization site.
-
+.br
* \f[I]true\f[]: Use Python's \f[I]webbrowser.open()\f[] method to automatically
open the URL in the user's browser.
-
+.br
* \f[I]false\f[]: Ask the user to copy & paste a URL from the terminal.
+.SS extractor.oauth.port
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]6414\f[]
+
+.IP "Description:" 4
+Port number to listen on during OAuth authorization.
+
+Note: All redirects will go to http://localhost:6414/, regardless
+of the port specified here. You'll have to manually adjust the
+port number in your browser's address bar when using a different
+port than the default.
+
.SS extractor.photobucket.subalbums
.IP "Type:" 6
\f[I]bool\f[]
@@ -939,6 +1048,16 @@ open the URL in the user's browser.
.IP "Description:" 4
Download subalbums.
+.SS extractor.pixiv.user.avatar
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Download user avatars.
+
.SS extractor.pixiv.ugoira
.IP "Type:" 6
\f[I]bool\f[]
@@ -949,9 +1068,12 @@ Download subalbums.
.IP "Description:" 4
Download Pixiv's Ugoira animations or ignore them.
-These animations come as a \f[I].zip\f[] file containing all the single
+These animations come as a \f[I].zip\f[] file containing all
animation frames in JPEG format.
+Use an \f[I]ugoira\f[] post processor to convert them
+to watchable videos. (Example__)
+
.SS extractor.plurk.comments
.IP "Type:" 6
\f[I]bool\f[]
@@ -983,9 +1105,9 @@ during the extraction process.
.IP "Description:" 4
Controls how to handle redirects to CAPTCHA pages.
-
+.br
* \f[I]"stop\f[]: Stop the current extractor run.
-
+.br
* \f[I]"wait\f[]: Ask the user to solve the CAPTCHA and wait.
.SS extractor.recursive.blacklist
@@ -1066,9 +1188,9 @@ This value sets the maximum recursion depth.
Special values:
-
+.br
* \f[I]0\f[]: Recursion is disabled
-
+.br
* \f[I]-1\f[]: Infinite recursion (don't do this)
.SS extractor.reddit.refresh-token
@@ -1088,6 +1210,25 @@ authorized to do so,
but requests to the reddit API are going to be rate limited
at 600 requests every 10 minutes/600 seconds.
+.SS extractor.reddit.videos
+.IP "Type:" 6
+\f[I]bool\f[] or \f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Control video download behavior.
+
+.br
+* \f[I]true\f[]: Download videos and use \f[I]youtube-dl\f[] to handle
+HLS and DASH manifests
+.br
+* \f[I]"ytdl"\f[]: Download videos and let \f[I]youtube-dl\f[] handle all of
+video extraction and download
+.br
+* \f[I]false\f[]: Ignore videos
+
.SS extractor.sankaku.wait-min & .wait-max
.IP "Type:" 6
\f[I]float\f[]
@@ -1161,11 +1302,11 @@ Search posts for inline images and videos.
\f[I]true\f[]
.IP "Description:" 4
-
+.br
* \f[I]true\f[]: Extract media from reblogged posts
-
+.br
* \f[I]false\f[]: Skip reblogged posts
-
+.br
* \f[I]"same-blog"\f[]: Skip reblogged posts unless the original post
is from the same blog
@@ -1222,18 +1363,18 @@ Extract \f[I]TwitPic <https://twitpic.com/>\f[] embeds.
\f[I]bool\f[] or \f[I]string\f[]
.IP "Default:" 9
-\f[I]false\f[]
+\f[I]true\f[]
.IP "Description:" 4
Control video download behavior.
-
+.br
* \f[I]true\f[]: Download videos and use \f[I]youtube-dl\f[] to handle
HLS \f[I].m3u8\f[] manifests
-
+.br
* \f[I]"ytdl"\f[]: Download videos and let \f[I]youtube-dl\f[] handle all of
video extraction and download
-
+.br
* \f[I]false\f[]: Skip video Tweets
.SS extractor.vsco.videos
@@ -1282,9 +1423,9 @@ Note: This requires 1 additional HTTP request for each post.
.IP "Description:" 4
Reverse the order of chapter URLs extracted from manga pages.
-
+.br
* \f[I]true\f[]: Start with the latest chapter
-
+.br
* \f[I]false\f[]: Start with the first chapter
.SH DOWNLOADER OPTIONS
@@ -1319,11 +1460,11 @@ to set file modification times.
.IP "Description:" 4
Controls the use of \f[I].part\f[] files during file downloads.
-
+.br
* \f[I]true\f[]: Write downloaded data into \f[I].part\f[] files and rename
them upon download completion. This mode additionally supports
resuming incomplete downloads.
-
+.br
* \f[I]false\f[]: Do not use \f[I].part\f[] files and write data directly
into the actual output files.
@@ -1451,9 +1592,9 @@ used to generate filenames for files downloaded with youtube-dl.
Special values:
-
+.br
* \f[I]null\f[]: generate filenames with \f[I]extractor.*.filename\f[]
-
+.br
* \f[I]"default"\f[]: use youtube-dl's default, currently \f[I]"%(title)s-%(id)s.%(ext)s"\f[]
Note: An output template other than \f[I]null\f[] might
@@ -1492,15 +1633,15 @@ All available options can be found in \f[I]youtube-dl's docstrings
.IP "Description:" 4
Controls the output string format and status indicators.
-
+.br
* \f[I]"null"\f[]: No output
-
+.br
* \f[I]"pipe"\f[]: Suitable for piping to other processes or files
-
+.br
* \f[I]"terminal"\f[]: Suitable for the standard Windows console
-
+.br
* \f[I]"color"\f[]: Suitable for terminals that understand ANSI escape codes and colors
-
+.br
* \f[I]"auto"\f[]: Automatically choose the best suitable output mode
.SS output.shorten
@@ -1525,12 +1666,12 @@ on one console line.
Controls the progress indicator when *gallery-dl* is run with
multiple URLs as arguments.
-
+.br
* \f[I]true\f[]: Show the default progress indicator
(\f[I]"[{current}/{total}] {url}"\f[])
-
+.br
* \f[I]false\f[]: Do not show any progress indicator
-
+.br
* Any \f[I]string\f[]: Show the progress indicator using this
as a custom \f[I]format string\f[]. Possible replacement keys are
\f[I]current\f[], \f[I]total\f[] and \f[I]url\f[].
@@ -1614,9 +1755,9 @@ in their default location.
.IP "Description:" 4
The action to take when files do not compare as equal.
-
+.br
* \f[I]"replace"\f[]: Replace/Overwrite the old version with the new one
-
+.br
* \f[I]"enumerate"\f[]: Add an enumeration index to the filename of the new
version like \f[I]skip = "enumerate" <extractor.*.skip_>\f[]
@@ -1654,13 +1795,13 @@ or to let it run asynchronously.
.IP "Description:" 4
The command to run.
-
+.br
* If this is a \f[I]string\f[], it will be executed using the system's
shell, e.g. \f[I]/bin/sh\f[]. Any \f[I]{}\f[] will be replaced
with the full path of a file or target directory, depending on
\f[I]exec.final\f[]
-
+.br
* If this is a \f[I]list\f[], the first element specifies the program
name and any further elements its arguments.
Each element of this list is treated as a \f[I]format string\f[] using
@@ -1689,12 +1830,12 @@ have been downloaded successfully.
.IP "Description:" 4
Select how to write metadata.
-
+.br
* \f[I]"json"\f[]: all metadata using \f[I]json.dump()
<https://docs.python.org/3/library/json.html#json.dump>\f[]
-
+.br
* \f[I]"tags"\f[]: \f[I]tags\f[] separated by newlines
-
+.br
* \f[I]"custom"\f[]: result of applying \f[I]metadata.content-format\f[]
to a file's metadata dictionary
@@ -1827,12 +1968,12 @@ Enable Two-Pass encoding.
.IP "Description:" 4
Controls the frame rate argument (\f[I]-r\f[]) for FFmpeg
-
+.br
* \f[I]"auto"\f[]: Automatically assign a fitting frame rate
based on delays between frames.
-
+.br
* any other \f[I]string\f[]: Use this value as argument for \f[I]-r\f[].
-
+.br
* \f[I]null\f[] or an empty \f[I]string\f[]: Don't set an explicit frame rate.
.SS ugoira.keep-files
@@ -1904,11 +2045,11 @@ Keep the actual files after writing them to a ZIP archive.
\f[I]"default"\f[]
.IP "Description:" 4
-
+.br
* \f[I]"default"\f[]: Write the central directory file header
once after everything is done or an exception is raised.
-
+.br
* \f[I]"safe"\f[]: Update the central directory file header
each time a file is stored in a ZIP archive.
@@ -1922,9 +2063,9 @@ case the Python interpreter gets shut down unexpectedly
\f[I]Path\f[]
.IP "Default:" 9
-
+.br
* \f[I]tempfile.gettempdir()\f[] + \f[I]".gallery-dl.cache"\f[] on Windows
-
+.br
* (\f[I]$XDG_CACHE_HOME\f[] or \f[I]"~/.cache"\f[]) + \f[I]"/gallery-dl/cache.sqlite3"\f[] on all other platforms
.IP "Description:" 4
@@ -1942,11 +2083,11 @@ this cache.
\f[I]true\f[]
.IP "Description:" 4
-
+.br
* \f[I]true\f[]: Update urllib3's default cipher list
-
+.br
* \f[I]false\f[]: Leave the default cipher list as is
-
+.br
* Any \f[I]string\f[]: Replace urllib3's default ciphers with these
(See \f[I]SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>\f[]
for details)
@@ -1968,25 +2109,25 @@ SSL-support.
\f[I]string\f[]
.IP "How To:" 4
-
+.br
* login and visit DeviantArt's
\f[I]Applications & Keys <https://www.deviantart.com/developers/apps>\f[]
section
-
+.br
* click "Register Application"
-
+.br
* scroll to "OAuth2 Redirect URI Whitelist (Required)"
and enter "https://mikf.github.io/gallery-dl/oauth-redirect.html"
-
+.br
* scroll to the bottom and agree to the API License Agreement,
Submission Policy, and Terms of Service.
-
+.br
* click "Save"
-
+.br
* copy \f[I]client_id\f[] and \f[I]client_secret\f[] of your new
application and put them in your configuration file
as \f[I]"client-id"\f[] and \f[I]"client-secret"\f[]
-
+.br
* get a new \f[I]refresh-token <extractor.deviantart.refresh-token_>\f[]
if necessary
@@ -1995,15 +2136,15 @@ if necessary
\f[I]string\f[]
.IP "How To:" 4
-
+.br
* login and \f[I]Create an App <https://www.flickr.com/services/apps/create/apply/>\f[]
in Flickr's \f[I]App Garden <https://www.flickr.com/services/>\f[]
-
+.br
* click "APPLY FOR A NON-COMMERCIAL KEY"
-
+.br
* fill out the form with a random name and description
and click "SUBMIT"
-
+.br
* copy \f[I]Key\f[] and \f[I]Secret\f[] and put them in your configuration
file
@@ -2020,19 +2161,19 @@ file
\f[I]string\f[]
.IP "How To:" 4
-
+.br
* login and visit the \f[I]apps <https://www.reddit.com/prefs/apps/>\f[]
section of your account's preferences
-
+.br
* click the "are you a developer? create an app..." button
-
+.br
* fill out the form, choose "installed app", preferably set
"http://localhost:6414/" as "redirect uri" and finally click
"create app"
-
+.br
* copy the client id (third line, under your application's name and
"installed app") and put it in your configuration file
-
+.br
* use "\f[I]Python:<application name>:v1.0 (by /u/<username>)\f[]" as
user-agent and replace \f[I]<application name>\f[] and \f[I]<username>\f[]
accordingly (see Reddit's
@@ -2043,15 +2184,15 @@ accordingly (see Reddit's
\f[I]string\f[]
.IP "How To:" 4
-
+.br
* login and \f[I]Apply for an API Key <https://api.smugmug.com/api/developer/apply>\f[]
-
+.br
* use a random name and description,
set "Type" to "Application", "Platform" to "All",
and "Use" to "Non-Commercial"
-
+.br
* fill out the two checkboxes at the bottom and click "Apply"
-
+.br
* copy \f[I]API Key\f[] and \f[I]API Secret\f[]
and put them in your configuration file
@@ -2060,20 +2201,20 @@ and put them in your configuration file
\f[I]string\f[]
.IP "How To:" 4
-
+.br
* login and visit Tumblr's
\f[I]Applications <https://www.tumblr.com/oauth/apps>\f[] section
-
+.br
* click "Register application"
-
+.br
* fill out the form: use a random name and description, set
https://example.org/ as "Application Website" and "Default
callback URL"
-
+.br
* solve Google's "I'm not a robot" challenge and click "Register"
-
+.br
* click "Show secret key" (below "OAuth Consumer Key")
-
+.br
* copy your \f[I]OAuth Consumer Key\f[] and \f[I]Secret Key\f[]
and put them in your configuration file
@@ -2093,9 +2234,9 @@ and put them in your configuration file
.IP "Description:" 4
A \f[I]Date\f[] value represents a specific point in time.
-
+.br
* If given as \f[I]string\f[], it is parsed according to \f[I]date-format\f[].
-
+.br
* If given as \f[I]integer\f[], it is interpreted as UTC timestamp.
.SS Path
@@ -2156,9 +2297,9 @@ The path \f[I]C:\\path\\to\\file.ext\f[] has therefore to be written as
.IP "Description:" 4
Extended logging output configuration.
-
+.br
* format
-
+.br
* General format string for logging messages
or a dictionary with format strings for each loglevel.
@@ -2169,39 +2310,39 @@ it is also possible to access the current
and \f[I]job <https://github.com/mikf/gallery-dl/blob/2e516a1e3e09cb8a9e36a8f6f7e41ce8d4402f5a/gallery_dl/job.py#L19>\f[]
objects as well as their attributes
(e.g. \f[I]"{extractor.url}"\f[])
-
+.br
* Default: \f[I]"[{name}][{levelname}] {message}"\f[]
-
+.br
* format-date
-
+.br
* Format string for \f[I]{asctime}\f[] fields in logging messages
(see \f[I]strftime() directives <https://docs.python.org/3/library/time.html#time.strftime>\f[])
-
+.br
* Default: \f[I]"%Y-%m-%d %H:%M:%S"\f[]
-
+.br
* level
-
+.br
* Minimum logging message level
(one of \f[I]"debug"\f[], \f[I]"info"\f[], \f[I]"warning"\f[], \f[I]"error"\f[], \f[I]"exception"\f[])
-
+.br
* Default: \f[I]"info"\f[]
-
+.br
* path
-
+.br
* \f[I]Path\f[] to the output file
-
+.br
* mode
-
+.br
* Mode in which the file is opened;
use \f[I]"w"\f[] to truncate or \f[I]"a"\f[] to append
(see \f[I]open() <https://docs.python.org/3/library/functions.html#open>\f[])
-
+.br
* Default: \f[I]"w"\f[]
-
+.br
* encoding
-
+.br
* File encoding
-
+.br
* Default: \f[I]"utf-8"\f[]
Note: path, mode and encoding are only applied when configuring
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 8f4897f..92ded16 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.12.3
+Version: 1.13.2
Summary: Command-line program to download image-galleries and -collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -94,8 +94,8 @@ Description: ==========
put it into your `PATH <https://en.wikipedia.org/wiki/PATH_(variable)>`__,
and run it inside a command prompt (like ``cmd.exe``).
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.12.3/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.13.2/gallery-dl.bin>`__
These executables include a Python 3.8 interpreter
and all required Python packages.
@@ -141,6 +141,13 @@ Description: ==========
$ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703"
+ Filter manga chapters by language and chapter number:
+
+ .. code:: bash
+
+ $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+
+
| Search a remote resource for URLs and download images from them:
| (URLs for which no extractor can be found will be silently ignored)
@@ -186,8 +193,8 @@ Description: ==========
Some extractors require you to provide valid login-credentials in the form of
a username & password pair. This is necessary for
``pixiv``, ``nijie``, and ``seiga``
- and optional (but strongly recommended) for
- ``danbooru``, ``exhentai``, ``idolcomplex``, ``instagram``,
+ and optional for
+ ``danbooru``, ``e621``, ``exhentai``, ``idolcomplex``, ``instagram``,
``luscious``, ``sankaku``, ``tsumino``, and ``twitter``.
You can set the necessary information in your configuration file
@@ -240,7 +247,7 @@ Description: ==========
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
- .. _stable: https://github.com/mikf/gallery-dl/archive/v1.12.3.tar.gz
+ .. _stable: https://github.com/mikf/gallery-dl/archive/v1.13.2.tar.gz
.. _dev: https://github.com/mikf/gallery-dl/archive/master.tar.gz
.. _Python: https://www.python.org/downloads/
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index bbe9bbe..ecb052c 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -34,10 +34,12 @@ gallery_dl/extractor/35photo.py
gallery_dl/extractor/3dbooru.py
gallery_dl/extractor/4chan.py
gallery_dl/extractor/500px.py
+gallery_dl/extractor/8kun.py
gallery_dl/extractor/8muses.py
gallery_dl/extractor/__init__.py
gallery_dl/extractor/adultempire.py
gallery_dl/extractor/artstation.py
+gallery_dl/extractor/bcy.py
gallery_dl/extractor/behance.py
gallery_dl/extractor/blogger.py
gallery_dl/extractor/bobx.py
@@ -53,6 +55,7 @@ gallery_dl/extractor/fallenangels.py
gallery_dl/extractor/flickr.py
gallery_dl/extractor/foolfuuka.py
gallery_dl/extractor/foolslide.py
+gallery_dl/extractor/furaffinity.py
gallery_dl/extractor/fuskator.py
gallery_dl/extractor/gelbooru.py
gallery_dl/extractor/gfycat.py
@@ -61,8 +64,10 @@ gallery_dl/extractor/hentai2read.py
gallery_dl/extractor/hentaicafe.py
gallery_dl/extractor/hentaifoundry.py
gallery_dl/extractor/hentaifox.py
+gallery_dl/extractor/hentaihand.py
gallery_dl/extractor/hentaihere.py
gallery_dl/extractor/hentainexus.py
+gallery_dl/extractor/hiperdex.py
gallery_dl/extractor/hitomi.py
gallery_dl/extractor/hypnohub.py
gallery_dl/extractor/idolcomplex.py
@@ -75,6 +80,7 @@ gallery_dl/extractor/imgth.py
gallery_dl/extractor/imgur.py
gallery_dl/extractor/instagram.py
gallery_dl/extractor/issuu.py
+gallery_dl/extractor/kabeuchi.py
gallery_dl/extractor/keenspot.py
gallery_dl/extractor/khinsider.py
gallery_dl/extractor/kissmanga.py
@@ -142,7 +148,6 @@ gallery_dl/extractor/wikiart.py
gallery_dl/extractor/xhamster.py
gallery_dl/extractor/xvideos.py
gallery_dl/extractor/yandere.py
-gallery_dl/extractor/yaplog.py
gallery_dl/extractor/yuki.py
gallery_dl/postprocessor/__init__.py
gallery_dl/postprocessor/classify.py
@@ -153,6 +158,7 @@ gallery_dl/postprocessor/metadata.py
gallery_dl/postprocessor/mtime.py
gallery_dl/postprocessor/ugoira.py
gallery_dl/postprocessor/zip.py
+test/test_cache.py
test/test_config.py
test/test_cookies.py
test/test_downloader.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index ffaed3d..6fba5e2 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -129,17 +129,8 @@ def main():
for opts in args.options:
config.set(*opts)
- # stream logging handler
- output.configure_logging_handler(
- "log", logging.getLogger().handlers[0])
-
- # file logging handler
- handler = output.setup_logging_handler(
- "logfile", lvl=args.loglevel)
- if handler:
- logging.getLogger().addHandler(handler)
-
# loglevels
+ output.configure_logging(args.loglevel)
if args.loglevel >= logging.ERROR:
config.set(("output",), "mode", "null")
elif args.loglevel <= logging.DEBUG:
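
The removed handler boilerplate is folded into a single ``output.configure_logging()`` call. A hypothetical sketch of the pattern (not the actual implementation in ``gallery_dl/output.py``):

.. code:: python

    import logging

    def configure_logging(level):
        # Hypothetical stand-in for output.configure_logging(): attach a
        # stream handler at the requested level, using the default format
        # string documented in gallery-dl.conf.5.
        root = logging.getLogger()
        root.setLevel(logging.DEBUG)
        handler = logging.StreamHandler()
        handler.setLevel(level)
        handler.setFormatter(logging.Formatter(
            "[{name}][{levelname}] {message}", style="{"))
        root.addHandler(handler)
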
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py
index c48b53f..6cde65d 100644
--- a/gallery_dl/cache.py
+++ b/gallery_dl/cache.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -96,12 +96,12 @@ class DatabaseCacheDecorator():
# database lookup
fullkey = "%s-%s" % (self.key, key)
- cursor = self.cursor()
- try:
- cursor.execute("BEGIN EXCLUSIVE")
- except sqlite3.OperationalError:
- pass # Silently swallow exception - workaround for Python 3.6
- try:
+ with self.database() as db:
+ cursor = db.cursor()
+ try:
+ cursor.execute("BEGIN EXCLUSIVE")
+ except sqlite3.OperationalError:
+ pass # Silently swallow exception - workaround for Python 3.6
cursor.execute(
"SELECT value, expires FROM data WHERE key=? LIMIT 1",
(fullkey,),
@@ -118,37 +118,38 @@ class DatabaseCacheDecorator():
"INSERT OR REPLACE INTO data VALUES (?,?,?)",
(fullkey, pickle.dumps(value), expires),
)
- finally:
- self.db.commit()
+
self.cache[key] = value, expires
return value
def update(self, key, value):
expires = int(time.time()) + self.maxage
self.cache[key] = value, expires
- self.cursor().execute(
- "INSERT OR REPLACE INTO data VALUES (?,?,?)",
- ("%s-%s" % (self.key, key), pickle.dumps(value), expires),
- )
+ with self.database() as db:
+ db.execute(
+ "INSERT OR REPLACE INTO data VALUES (?,?,?)",
+ ("%s-%s" % (self.key, key), pickle.dumps(value), expires),
+ )
def invalidate(self, key):
try:
del self.cache[key]
except KeyError:
pass
- self.cursor().execute(
- "DELETE FROM data WHERE key=? LIMIT 1",
- ("%s-%s" % (self.key, key),),
- )
+ with self.database() as db:
+ db.execute(
+ "DELETE FROM data WHERE key=?",
+ ("%s-%s" % (self.key, key),),
+ )
- def cursor(self):
+ def database(self):
if self._init:
self.db.execute(
"CREATE TABLE IF NOT EXISTS data "
"(key TEXT PRIMARY KEY, value TEXT, expires INTEGER)"
)
DatabaseCacheDecorator._init = False
- return self.db.cursor()
+ return self.db
def memcache(maxage=None, keyarg=None):
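
The rewritten decorator routes every read and write through ``with self.database() as db:``, leaning on the fact that ``sqlite3`` connections are context managers that commit on success and roll back on error. A self-contained sketch of that behavior:

.. code:: python

    import sqlite3

    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE data (key TEXT PRIMARY KEY, value TEXT)")

    # sqlite3 connections are context managers: the open transaction is
    # committed when the block exits normally and rolled back when it
    # raises, replacing the manual try/finally + db.commit() removed above.
    with db as conn:
        conn.execute("INSERT OR REPLACE INTO data VALUES (?, ?)",
                     ("cache-key", "cache-value"))
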
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
index 6e23c83..6ba5480 100644
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -38,7 +38,7 @@ def solve_challenge(session, response, kwargs):
params = cf_kwargs["data"] = collections.OrderedDict()
page = response.text
- url = root + text.extract(page, 'action="', '"')[0]
+ url = root + text.unescape(text.extract(page, 'action="', '"')[0])
params["r"] = text.extract(page, 'name="r" value="', '"')[0]
params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 9cd2aa6..844e422 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -193,6 +193,9 @@ class HttpDownloader(DownloaderBase):
mtype = response.headers.get("Content-Type", "image/jpeg")
mtype = mtype.partition(";")[0]
+ if "/" not in mtype:
+ mtype = "image/" + mtype
+
if mtype in MIMETYPE_MAP:
return MIMETYPE_MAP[mtype]
@@ -231,6 +234,8 @@ MIMETYPE_MAP = {
"image/png": "png",
"image/gif": "gif",
"image/bmp": "bmp",
+ "image/x-bmp": "bmp",
+ "image/x-ms-bmp": "bmp",
"image/webp": "webp",
"image/svg+xml": "svg",
@@ -247,6 +252,7 @@ MIMETYPE_MAP = {
"application/zip": "zip",
"application/x-zip": "zip",
"application/x-zip-compressed": "zip",
+ "application/rar": "rar",
"application/x-rar": "rar",
"application/x-rar-compressed": "rar",
"application/x-7z-compressed": "7z",
diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py
index ac96211..3773ee5 100644
--- a/gallery_dl/extractor/3dbooru.py
+++ b/gallery_dl/extractor/3dbooru.py
@@ -67,7 +67,7 @@ class _3dbooruPopularExtractor(booru.MoebooruPopularMixin, _3dbooruExtractor):
r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
r"(?:\?(?P<query>[^#]*))?")
test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", {
- "url": "8b1a5c5b7a10f8f5d3d6124d1aabfee0277078cb",
+ "pattern": r"http://behoimi\.org/data/../../[0-9a-f]{32}\.jpg",
"count": 20,
})
diff --git a/gallery_dl/extractor/8kun.py b/gallery_dl/extractor/8kun.py
new file mode 100644
index 0000000..7162920
--- /dev/null
+++ b/gallery_dl/extractor/8kun.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://8kun.top/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class _8kunThreadExtractor(Extractor):
+ """Extractor for 8kun threads"""
+ category = "8kun"
+ subcategory = "thread"
+ directory_fmt = ("{category}", "{board}", "{thread} {title}")
+ filename_fmt = "{time}{num:?-//} {filename}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)"
+ test = ("https://8kun.top/test/res/65248.html", {
+ "pattern": r"https://media\.8kun\.top/file_store/\w{64}\.\w+",
+ "count": ">= 8",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "https://8kun.top/{}/res/{}.json".format(self.board, self.thread)
+ posts = self.request(url).json()["posts"]
+ title = posts[0].get("sub") or text.remove_html(posts[0]["com"])
+ process = self._process
+
+ data = {
+ "board" : self.board,
+ "thread": self.thread,
+ "title" : text.unescape(title)[:50],
+ "num" : 0,
+ }
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ for post in posts:
+ if "filename" in post:
+ yield process(post, data)
+ if "extra_files" in post:
+ for post["num"], filedata in enumerate(
+ post["extra_files"], 1):
+ yield process(post, filedata)
+
+ @staticmethod
+ def _process(post, data):
+ post.update(data)
+ post["extension"] = post["ext"][1:]
+ url = "https://media.8kun.top/file_store/" + post["tim"] + post["ext"]
+ return Message.Url, url, post
+
+
+class _8kunBoardExtractor(Extractor):
+ """Extractor for 8kun boards"""
+ category = "8kun"
+ subcategory = "board"
+ pattern = r"(?:https?://)?8kun\.top/([^/?&#]+)/(?:index|\d+)\.html"
+ test = (
+ ("https://8kun.top/v/index.html", {
+ "pattern": _8kunThreadExtractor.pattern,
+ "count": ">= 100",
+ }),
+ ("https://8kun.top/v/2.html"),
+ ("https://8kun.top/v/index.html?PageSpeed=noscript"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board = match.group(1)
+
+ def items(self):
+ url = "https://8kun.top/{}/threads.json".format(self.board)
+ threads = self.request(url).json()
+
+ for page in threads:
+ for thread in page["threads"]:
+ url = "https://8kun.top/{}/res/{}.html".format(
+ self.board, thread["no"])
+ thread["page"] = page["page"]
+ thread["_extractor"] = _8kunThreadExtractor
+ yield Message.Queue, url, thread
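
A quick check that the new thread extractor's URL pattern captures the board and thread id that ``items()`` uses to build the JSON endpoint:

.. code:: python

    import re

    # Pattern copied from _8kunThreadExtractor above; group 1 is the
    # board, group 2 the thread id.
    pattern = r"(?:https?://)?8kun\.top/([^/]+)/res/(\d+)"
    match = re.match(pattern, "https://8kun.top/test/res/65248.html")
    board, thread = match.groups()
    print(board, thread)  # test 65248
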
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index 089a0e9..dec5972 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -40,7 +40,7 @@ class _8musesAlbumExtractor(Extractor):
"parent" : 10454,
"views" : int,
"likes" : int,
- "date" : "type:datetime",
+ "date" : "dt:2018-07-10 00:00:00",
},
},
}),
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 66203fe..74c553d 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -15,9 +15,11 @@ modules = [
"3dbooru",
"4chan",
"500px",
+ "8kun",
"8muses",
"adultempire",
"artstation",
+ "bcy",
"behance",
"blogger",
"bobx",
@@ -28,6 +30,7 @@ modules = [
"exhentai",
"fallenangels",
"flickr",
+ "furaffinity",
"fuskator",
"gelbooru",
"gfycat",
@@ -36,8 +39,10 @@ modules = [
"hentaicafe",
"hentaifoundry",
"hentaifox",
+ "hentaihand",
"hentaihere",
"hentainexus",
+ "hiperdex",
"hitomi",
"hypnohub",
"idolcomplex",
@@ -49,6 +54,7 @@ modules = [
"imgur",
"instagram",
"issuu",
+ "kabeuchi",
"keenspot",
"khinsider",
"kissmanga",
@@ -110,7 +116,6 @@ modules = [
"xhamster",
"xvideos",
"yandere",
- "yaplog",
"yuki",
"foolfuuka",
"foolslide",
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
new file mode 100644
index 0000000..c3049a4
--- /dev/null
+++ b/gallery_dl/extractor/bcy.py
@@ -0,0 +1,188 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://bcy.net/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+import re
+
+
+class BcyExtractor(Extractor):
+ """Base class for bcy extractors"""
+ category = "bcy"
+ directory_fmt = ("{category}", "{user[id]} {user[name]}")
+ filename_fmt = "{post[id]} {id}.{extension}"
+ archive_fmt = "{post[id]}_{id}"
+ root = "https://bcy.net"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item_id = match.group(1)
+
+ def items(self):
+ sub = re.compile(r"^https?://p\d+-bcy\.byteimg\.com/img/banciyuan").sub
+ iroot = "https://img-bcy-qn.pstatp.com"
+ noop = self.config("noop")
+
+ for post in self.posts():
+ if not post["image_list"]:
+ continue
+
+ multi = None
+ tags = post.get("post_tags") or ()
+ data = {
+ "user": {
+ "id" : post["uid"],
+ "name" : post["uname"],
+ "avatar" : sub(iroot, post["avatar"].partition("~")[0]),
+ },
+ "post": {
+ "id" : text.parse_int(post["item_id"]),
+ "tags" : [t["tag_name"] for t in tags],
+ "date" : text.parse_timestamp(post["ctime"]),
+ "parody" : post["work"],
+ "content": post["plain"],
+ "likes" : post["like_count"],
+ "shares" : post["share_count"],
+ "replies": post["reply_count"],
+ },
+ }
+
+ yield Message.Directory, data
+ for data["num"], image in enumerate(post["image_list"], 1):
+ data["id"] = image["mid"]
+ data["width"] = image["w"]
+ data["height"] = image["h"]
+
+ url = image["path"].partition("~")[0]
+ text.nameext_from_url(url, data)
+
+ if data["extension"]:
+ if not url.startswith(iroot):
+ url = sub(iroot, url)
+ data["filter"] = ""
+ yield Message.Url, url, data
+
+ else:
+ if not multi:
+ if len(post["multi"]) < len(post["image_list"]):
+ multi = self._data_from_post(post["item_id"])
+ multi = multi["post_data"]["multi"]
+ else:
+ multi = post["multi"]
+ image = multi[data["num"] - 1]
+
+ if image["origin"]:
+ data["filter"] = "watermark"
+ yield Message.Url, image["origin"], data
+
+ if noop:
+ data["extension"] = ""
+ data["filter"] = "noop"
+ yield Message.Url, image["original_path"], data
+
+ def posts(self):
+ """Returns an iterable with all relevant 'post' objects"""
+
+ def _data_from_post(self, post_id):
+ url = "{}/item/detail/{}".format(self.root, post_id)
+ page = self.request(url).text
+ return json.loads(
+ text.extract(page, 'JSON.parse("', '");')[0]
+ .replace('\\\\u002F', '/')
+ .replace('\\"', '"')
+ )["detail"]
+
+
+class BcyUserExtractor(BcyExtractor):
+ """Extractor for user timelines"""
+ subcategory = "user"
+ pattern = r"(?:https?://)?bcy\.net/u/(\d+)"
+ test = (
+ ("https://bcy.net/u/1933712", {
+ "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg",
+ "count": ">= 25",
+ }),
+ ("https://bcy.net/u/109282764041", {
+ "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
+ r"~tplv-banciyuan-logo-v3:.+\.image",
+ "range": "1-25",
+ "count": 25,
+ }),
+ )
+
+ def posts(self):
+ url = self.root + "/apiv3/user/selfPosts"
+ params = {"uid": self.item_id, "since": None}
+
+ while True:
+ data = self.request(url, params=params).json()
+
+ item = None
+ for item in data["data"]["items"]:
+ yield item["item_detail"]
+
+ if not item:
+ return
+ params["since"] = item["since"]
+
+
+class BcyPostExtractor(BcyExtractor):
+ """Extractor for individual posts"""
+ subcategory = "post"
+ pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)"
+ test = (
+ ("https://bcy.net/item/detail/6355835481002893070", {
+ "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3",
+ "count": 1,
+ "keyword": {
+ "user": {
+ "id" : 1933712,
+ "name" : "wukloo",
+ "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/",
+ },
+ "post": {
+ "id" : 6355835481002893070,
+ "tags" : list,
+ "date" : "dt:2016-11-22 08:47:46",
+ "parody" : "东方PROJECT",
+ "content": "re:根据微博的建议稍微做了点修改",
+ "likes" : int,
+ "shares" : int,
+ "replies": int,
+ },
+ "id": 8330182,
+ "num": 1,
+ "width" : 3000,
+ "height": 1687,
+ "filename": "712e0780b09011e696f973c3d1568337",
+ "extension": "jpg",
+ },
+ }),
+ # only watermarked images available
+ ("https://bcy.net/item/detail/6780546160802143236", {
+ "pattern": r"https://p\d-bcy.byteimg.com/img/banciyuan/[0-9a-f]+"
+ r"~tplv-banciyuan-logo-v3:.+\.image",
+ "count": 8,
+ "keyword": {"filter": "watermark"}
+ }),
+ # only visible to logged in users
+ ("https://bcy.net/item/detail/6747523535150783495", {
+ "count": 0,
+ }),
+ )
+
+ def posts(self):
+ data = self._data_from_post(self.item_id)
+ post = data["post_data"]
+ post["image_list"] = post["multi"]
+ post["plain"] = text.parse_unicode_escapes(post["plain"])
+ post.update(data["detail_user"])
+ return (post,)
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 31bbaf8..2657b5d 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text
+import json
import re
BASE_PATTERN = (
@@ -28,6 +29,7 @@ class BloggerExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
+ self.videos = self.config("videos", True)
self.blog = match.group(1) or match.group(2)
self.api = BloggerAPI(self)
@@ -41,24 +43,41 @@ class BloggerExtractor(Extractor):
del blog["selfLink"]
sub = re.compile(r"/s\d+/").sub
- findall = re.compile(
- r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)"').findall
+ findall_image = re.compile(
+ r'src="(https?://\d+\.bp\.blogspot\.com/[^"]+)').findall
+ findall_video = re.compile(
+ r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
for post in self.posts(blog):
- images = findall(post["content"])
- if not images:
+ content = post["content"]
+
+ files = findall_image(content)
+ for idx, url in enumerate(files):
+ files[idx] = sub("/s0/", url).replace("http:", "https:", 1)
+
+ if self.videos and 'id="BLOG_video-' in content:
+ page = self.request(post["url"]).text
+ for url in findall_video(page):
+ page = self.request(url).text
+ video_config = json.loads(text.extract(
+ page, 'var VIDEO_CONFIG =', '\n')[0])
+ files.append(max(
+ video_config["streams"],
+ key=lambda x: x["format_id"],
+ )["play_url"])
+
+ if not files:
continue
post["author"] = post["author"]["displayName"]
post["replies"] = post["replies"]["totalItems"]
- post["content"] = text.remove_html(post["content"])
+ post["content"] = text.remove_html(content)
post["date"] = text.parse_datetime(post["published"])
del post["selfLink"]
del post["blog"]
yield Message.Directory, {"blog": blog, "post": post}
- for num, url in enumerate(images, 1):
- url = sub("/s0/", url).replace("http:", "https:", 1)
+ for num, url in enumerate(files, 1):
yield Message.Url, url, text.nameext_from_url(url, {
"blog": blog,
"post": post,
@@ -80,7 +99,7 @@ class BloggerPostExtractor(BloggerExtractor):
"pattern": r"https://3.bp.blogspot.com/.*/s0/Icy-Moonrise-.*.jpg",
"keyword": {
"blog": {
- "date" : "type:datetime",
+ "date" : "dt:2010-11-21 18:19:42",
"description": "",
"id" : "5623928067739466034",
"kind" : "blogger#blog",
@@ -95,7 +114,7 @@ class BloggerPostExtractor(BloggerExtractor):
"post": {
"author" : "Julian Bunker",
"content" : str,
- "date" : "type:datetime",
+ "date" : "dt:2010-12-26 01:08:00",
"etag" : str,
"id" : "6955139236418998998",
"kind" : "blogger#post",
@@ -112,6 +131,11 @@ class BloggerPostExtractor(BloggerExtractor):
("blogger:http://www.julianbunker.com/2010/12/moon-rise.html", {
"url": "9928429fb62f712eb4de80f53625eccecc614aae",
}),
+ # video (#587)
+ (("http://cfnmscenesinmovies.blogspot.com/2011/11/"
+ "cfnm-scene-jenna-fischer-in-office.html"), {
+ "pattern": r"https://.+\.googlevideo\.com/videoplayback",
+ }),
)
def __init__(self, match):
@@ -171,8 +195,8 @@ class BloggerAPI():
def _pagination(self, endpoint, params):
while True:
data = self._call(endpoint, params)
- yield from data["items"]
-
+ if "items" in data:
+ yield from data["items"]
if "nextPageToken" not in data:
return
params["pageToken"] = data["nextPageToken"]
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index ac45e0b..162e9cc 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -27,7 +27,6 @@ class BooruExtractor(SharedConfigMixin, Extractor):
page_start = 1
page_limit = None
sort = False
- ugoira = True
def __init__(self, match):
super().__init__(match)
@@ -52,11 +51,7 @@ class BooruExtractor(SharedConfigMixin, Extractor):
for image in images:
try:
- if "pixiv_ugoira_frame_data" in image and \
- "large_file_url" in image and not self.ugoira:
- url = image["large_file_url"]
- else:
- url = image["file_url"]
+ url = image["file_url"]
except KeyError:
continue
if url.startswith("/"):
@@ -112,12 +107,6 @@ class XmlParserMixin():
return [post.attrib for post in root]
-class DanbooruPageMixin():
- """Pagination for Danbooru v2"""
- def update_page(self, data):
- self.params["page"] = "b{}".format(data["id"])
-
-
class MoebooruPageMixin():
"""Pagination for Moebooru and Danbooru v1"""
def update_page(self, data):
@@ -214,8 +203,8 @@ class PostMixin():
self.params["tags"] = "id:" + self.post
-class PopularMixin():
- """Extraction and metadata handling for Danbooru v2"""
+class MoebooruPopularMixin():
+ """Extraction and metadata handling for Moebooru and Danbooru v1"""
subcategory = "popular"
directory_fmt = ("{category}", "popular", "{scale}", "{date}")
archive_fmt = "P_{scale[0]}_{date}_{id}"
@@ -225,37 +214,20 @@ class PopularMixin():
def __init__(self, match):
super().__init__(match)
self.params.update(text.parse_query(match.group("query")))
+ self.scale = match.group("scale")
def get_metadata(self, fmt="%Y-%m-%d"):
- date = self.get_date() or datetime.datetime.utcnow().strftime(fmt)
+ date = self.get_date() or datetime.date.today().isoformat()
scale = self.get_scale() or "day"
if scale == "week":
- dt = datetime.datetime.strptime(date, fmt)
- dt -= datetime.timedelta(days=dt.weekday())
- date = dt.strftime(fmt)
+ date = datetime.date.fromisoformat(date)
+ date = (date - datetime.timedelta(days=date.weekday())).isoformat()
elif scale == "month":
date = date[:-3]
return {"date": date, "scale": scale}
- def get_scale(self):
- if "scale" in self.params:
- return self.params["scale"]
- return None
-
- def get_date(self):
- if "date" in self.params:
- return self.params["date"][:10]
- return None
-
-
-class MoebooruPopularMixin(PopularMixin):
- """Extraction and metadata handling for Moebooru and Danbooru v1"""
- def __init__(self, match):
- super().__init__(match)
- self.scale = match.group("scale")
-
def get_date(self):
if "year" in self.params:
return "{:>04}-{:>02}-{:>02}".format(
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 55b15d4..19ee182 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -16,7 +16,6 @@ import logging
import datetime
import requests
import threading
-import http.cookiejar
from .message import Message
from .. import config, text, util, exception, cloudflare
@@ -40,6 +39,7 @@ class Extractor():
self._cookiefile = None
self._cookiejar = self.session.cookies
+ self._parentdir = ""
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
@@ -197,13 +197,13 @@ class Extractor():
self._update_cookies_dict(cookies, self.cookiedomain)
elif isinstance(cookies, str):
cookiefile = util.expand_path(cookies)
- cookiejar = http.cookiejar.MozillaCookieJar()
try:
- cookiejar.load(cookiefile)
- except OSError as exc:
+ with open(cookiefile) as fp:
+ cookies = util.load_cookiestxt(fp)
+ except Exception as exc:
self.log.warning("cookies: %s", exc)
else:
- self._cookiejar.update(cookiejar)
+ self._update_cookies(cookies)
self._cookiefile = cookiefile
else:
self.log.warning(
@@ -218,11 +218,9 @@ class Extractor():
def _store_cookies(self):
"""Store the session's cookiejar in a cookies.txt file"""
if self._cookiefile and self.config("cookies-update", True):
- cookiejar = http.cookiejar.MozillaCookieJar()
- for cookie in self._cookiejar:
- cookiejar.set_cookie(cookie)
try:
- cookiejar.save(self._cookiefile)
+ with open(self._cookiefile, "w") as fp:
+ util.save_cookiestxt(fp, self._cookiejar)
except OSError as exc:
self.log.warning("cookies: %s", exc)
@@ -248,15 +246,22 @@ class Extractor():
def _check_cookies(self, cookienames, *, domain=None):
"""Check if all 'cookienames' are in the session's cookiejar"""
+ if not self._cookiejar:
+ return False
+
if domain is None:
domain = self.cookiedomain
-
names = set(cookienames)
+ now = time.time()
+
for cookie in self._cookiejar:
- if cookie.domain == domain:
- names.discard(cookie.name)
- if not names:
- return True
+ if cookie.name in names and cookie.domain == domain:
+ if cookie.expires and cookie.expires < now:
+ self.log.warning("Cookie '%s' has expired", cookie.name)
+ else:
+ names.discard(cookie.name)
+ if not names:
+ return True
return False
def _get_date_min_max(self, dmin=None, dmax=None):
@@ -491,12 +496,6 @@ def generate_extractors(extractor_data, symtable, classes):
symtable[Extr.__name__] = prev = Extr
-# Reduce strictness of the expected magic string in cookiejar files.
-# (This allows the use of Wget-generated cookiejars without modification)
-http.cookiejar.MozillaCookieJar.magic_re = re.compile(
- "#( Netscape)? HTTP Cookie File", re.IGNORECASE)
-
-
# Undo automatic pyOpenSSL injection by requests
pyopenssl = config.get((), "pyopenssl", False)
if not pyopenssl:
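
Two things change in `common.py` above: cookies.txt files are now read and written by `util.load_cookiestxt`/`util.save_cookiestxt` instead of `http.cookiejar.MozillaCookieJar` (which also made the relaxed `magic_re` patch obsolete), and `_check_cookies` stops counting expired cookies as a valid login. The essence of the expiry check, sketched for any iterable of cookie objects:

```python
import time

def has_valid_cookies(cookies, names, domain):
    """Return True if every name in 'names' has an
    unexpired cookie for 'domain'."""
    names = set(names)
    now = time.time()
    for cookie in cookies:
        if cookie.name in names and cookie.domain == domain:
            if cookie.expires and cookie.expires < now:
                continue  # expired cookies no longer count
            names.discard(cookie.name)
            if not names:
                return True
    return False
```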
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index e8d3abf..3fdeaf9 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -1,69 +1,154 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://danbooru.donmai.us/"""
+"""Extractors for https://danbooru.donmai.us/"""
-from . import booru
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+import datetime
BASE_PATTERN = (
r"(?:https?://)?"
- r"(?P<subdomain>danbooru|hijiribe|sonohara|safebooru)"
- r"\.donmai\.us")
+ r"(danbooru|hijiribe|sonohara|safebooru)"
+ r"\.donmai\.us"
+)
-class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor):
+class DanbooruExtractor(SharedConfigMixin, Extractor):
"""Base class for danbooru extractors"""
+ basecategory = "booru"
category = "danbooru"
+ filename_fmt = "{category}_{id}_{md5}.{extension}"
page_limit = 1000
+ page_start = None
+ per_page = 100
def __init__(self, match):
- super().__init__(match)
- self.subdomain = match.group("subdomain")
- self.scheme = "https" if self.subdomain == "danbooru" else "http"
- self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format(
- scheme=self.scheme, subdomain=self.subdomain)
+ Extractor.__init__(self, match)
+ self.root = "https://{}.donmai.us".format(match.group(1))
self.ugoira = self.config("ugoira", True)
+ self.params = {}
username, api_key = self._get_auth_info()
if username:
self.log.debug("Using HTTP Basic Auth for user '%s'", username)
self.session.auth = (username, api_key)
-
-class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor):
- """Extractor for images from danbooru based on search-tags"""
- pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
+ def skip(self, num):
+ pages = num // self.per_page
+ if pages >= self.page_limit:
+ pages = self.page_limit - 1
+ self.page_start = pages + 1
+ return pages * self.per_page
+
+ def items(self):
+ data = self.metadata()
+ for post in self.posts():
+ try:
+ url = post["file_url"]
+ except KeyError:
+ continue
+
+ text.nameext_from_url(url, post)
+ if post["extension"] == "zip":
+ if self.ugoira:
+ post["frames"] = self.request(
+ "{}/posts/{}.json?only=pixiv_ugoira_frame_data".format(
+ self.root, post["id"])
+ ).json()["pixiv_ugoira_frame_data"]["data"]
+ else:
+ url = post["large_file_url"]
+ post["extension"] = "webm"
+
+ post.update(data)
+ yield Message.Directory, post
+ yield Message.Url, url, post
+
+ def metadata(self):
+ return {}
+
+ def posts(self):
+ return self._pagination(self.root + "/posts.json")
+
+ def _pagination(self, url, pagenum=False):
+ params = self.params.copy()
+ params["limit"] = self.per_page
+ params["page"] = self.page_start
+
+ while True:
+ posts = self.request(url, params=params).json()
+ yield from posts
+
+ if len(posts) < self.per_page:
+ return
+
+ if pagenum:
+ params["page"] += 1
+ else:
+ params["page"] = "b{}".format(posts[-1]["id"])
+
+
+class DanbooruTagExtractor(DanbooruExtractor):
+ """Extractor for danbooru posts from tag searches"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=([^&#]+)"
test = (
("https://danbooru.donmai.us/posts?tags=bonocho", {
"content": "b196fb9f1668109d7774a0a82efea3ffdda07746",
}),
# test page transitions
- ("https://danbooru.donmai.us/posts?tags=canvas_%28cocktail_soft%29", {
- "count": ">= 50",
+ ("https://danbooru.donmai.us/posts?tags=mushishi", {
+ "count": ">= 300",
}),
("https://hijiribe.donmai.us/posts?tags=bonocho"),
("https://sonohara.donmai.us/posts?tags=bonocho"),
("https://safebooru.donmai.us/posts?tags=bonocho"),
)
+ def __init__(self, match):
+ DanbooruExtractor.__init__(self, match)
+ self.params["tags"] = text.unquote(match.group(2).replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.params["tags"]}
+
-class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor):
- """Extractor for image-pools from danbooru"""
- pattern = BASE_PATTERN + r"/pools/(?P<pool>\d+)"
+class DanbooruPoolExtractor(DanbooruExtractor):
+ """Extractor for posts from danbooru pools"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
+ archive_fmt = "p_{pool[id]}_{id}"
+ pattern = BASE_PATTERN + r"/pools/(\d+)"
test = ("https://danbooru.donmai.us/pools/7659", {
"content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",
})
-
-class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor):
- """Extractor for single images from danbooru"""
- pattern = BASE_PATTERN + r"/posts/(?P<post>\d+)"
+ def __init__(self, match):
+ DanbooruExtractor.__init__(self, match)
+ self.pool_id = match.group(2)
+ self.params["tags"] = "pool:" + self.pool_id
+
+ def metadata(self):
+ url = "{}/pools/{}.json".format(self.root, self.pool_id)
+ pool = self.request(url).json()
+ pool["name"] = pool["name"].replace("_", " ")
+ del pool["post_ids"]
+ return {"pool": pool}
+
+
+class DanbooruPostExtractor(DanbooruExtractor):
+ """Extractor for single danbooru posts"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/posts/(\d+)"
test = (
("https://danbooru.donmai.us/posts/294929", {
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
@@ -74,20 +159,47 @@ class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor):
})
)
+ def __init__(self, match):
+ DanbooruExtractor.__init__(self, match)
+ self.post_id = match.group(2)
-class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor):
+ def posts(self):
+ url = "{}/posts/{}.json".format(self.root, self.post_id)
+ return (self.request(url).json(),)
+
+
+class DanbooruPopularExtractor(DanbooruExtractor):
"""Extractor for popular images from danbooru"""
- pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?"
+ subcategory = "popular"
+ directory_fmt = ("{category}", "popular", "{scale}", "{date}")
+ archive_fmt = "P_{scale[0]}_{date}_{id}"
+ pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
test = (
("https://danbooru.donmai.us/explore/posts/popular"),
(("https://danbooru.donmai.us/explore/posts/popular"
- "?date=2013-06-06+03%3A34%3A22+-0400&scale=week"), {
- "count": ">= 1",
+ "?date=2013-06-06&scale=week"), {
+ "range": "1-120",
+ "count": 120,
}),
)
def __init__(self, match):
- super().__init__(match)
- urlfmt = "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json"
- self.api_url = urlfmt.format(
- scheme=self.scheme, subdomain=self.subdomain)
+ DanbooruExtractor.__init__(self, match)
+ self.params.update(text.parse_query(match.group(2)))
+
+ def metadata(self):
+ self.page_start = self.page_start or 1
+ scale = self.params.get("scale", "day")
+ date = self.params.get("date") or datetime.date.today().isoformat()
+
+ if scale == "week":
+ date = datetime.date.fromisoformat(date)
+ date = (date - datetime.timedelta(days=date.weekday())).isoformat()
+ elif scale == "month":
+ date = date[:-3]
+
+ return {"date": date, "scale": scale}
+
+ def posts(self):
+ url = self.root + "/explore/posts/popular.json"
+ return self._pagination(url, True)
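
The rewritten danbooru `_pagination` above supports both paging styles the site offers: numeric pages (used by the popular endpoint) and keyset paging via `page=b<id>`, which requests posts with ids below the last one seen and sidesteps deep-pagination limits for ordinary searches. A standalone sketch, with `fetch` as a placeholder for the JSON request:

```python
def paginate_posts(fetch, per_page=100, pagenum=False):
    """Yield posts, advancing by page number or by id keyset."""
    params = {"limit": per_page}
    if pagenum:
        params["page"] = 1
    while True:
        posts = fetch(params)
        yield from posts
        if len(posts) < per_page:
            return
        if pagenum:
            params["page"] += 1
        else:
            # "b<id>": only posts with ids below the last one seen
            params["page"] = "b{}".format(posts[-1]["id"])
```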
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 02a14e3..90b27d1 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -93,9 +93,11 @@ class DeviantartExtractor(Extractor):
if content["src"].startswith("https://images-wixmp-"):
if deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069
- content["src"] = re.sub(
+ intermediary, count = re.subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*",
r"/intermediary\1", content["src"])
+ if count and self._check_url(intermediary):
+ content["src"] = intermediary
if self.quality:
content["src"] = re.sub(
r"q_\d+", self.quality, content["src"])
@@ -261,6 +263,9 @@ class DeviantartExtractor(Extractor):
if mtype and mtype.startswith("image/"):
content.update(data)
+ def _check_url(self, url):
+ return self.request(url, method="HEAD", fatal=False).status_code < 400
+
class DeviantartUserExtractor(DeviantartExtractor):
"""Extractor for an artist's user profile"""
@@ -717,7 +722,7 @@ class DeviantartExtractorV2(DeviantartExtractor):
# select largest video
target = max(media["types"],
key=lambda x: text.parse_int(x.get("q", "")[:-1]))
- src = target["s"]
+ src = target["b"]
elif target["t"] == "flash":
src = target["s"]
@@ -737,8 +742,10 @@ class DeviantartExtractorV2(DeviantartExtractor):
if src.startswith("https://images-wixmp-"):
if deviation["index"] <= 790677560:
# https://github.com/r888888888/danbooru/issues/4069
- src = re.sub(
+ intermediary, count = re.subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", src)
+ if count and self._check_url(intermediary):
+ src = intermediary
if self.quality:
src = re.sub(r"q_\d+", self.quality, src)
@@ -811,15 +818,17 @@ class DeviantartDeviationExtractor(DeviantartExtractorV2):
}),
# video
("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", {
- "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b",
+ "pattern": r"https://wixmp-.+wixmp.com/v/mp4/.+\.720p\.\w+.mp4",
"keyword": {
"filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5",
"extension": "mp4",
"target": {
"d": 306,
- "f": 9963639,
- "q": "1080p",
+ "f": 19367585,
+ "h": 720,
+ "q": "720p",
"t": "video",
+ "w": 1364,
"src": str,
},
}
@@ -952,11 +961,15 @@ class DeviantartAPI():
self.folders = extractor.config("folders", False)
self.metadata = extractor.extra or extractor.config("metadata", False)
- self.refresh_token = extractor.config("refresh-token")
- self.client_id = extractor.config("client-id", self.CLIENT_ID)
+ self.client_id = extractor.config(
+ "client-id", self.CLIENT_ID)
self.client_secret = extractor.config(
"client-secret", self.CLIENT_SECRET)
+ self.refresh_token = extractor.config("refresh-token")
+ if self.refresh_token == "cache":
+ self.refresh_token = "#" + str(self.client_id)
+
self.log.debug(
"Using %s API credentials (client-id %s)",
"default" if self.client_id == self.CLIENT_ID else "custom",
@@ -1026,8 +1039,12 @@ class DeviantartAPI():
"type" : kind,
"include_session": "false",
}
- return self.extractor.request(
- url, headers=headers, params=params, fatal=None).json()
+ response = self.extractor.request(
+ url, headers=headers, params=params, fatal=None)
+ if response.status_code == 404:
+ raise exception.StopExtraction(
+ "Your account must use the Eclipse interface.")
+ return response.json()
def deviation_metadata(self, deviations):
""" Fetch deviation metadata for a set of deviations"""
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index f245ddf..bc3f67a 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -1,71 +1,193 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://e621.net/"""
+"""Extractors for https://e621.net/"""
-from . import booru
+from .common import Extractor, Message, SharedConfigMixin
+from .. import text
+import datetime
+import time
-class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+BASE_PATTERN = r"(?:https?://)?e(621|926)\.net"
+
+
+class E621Extractor(SharedConfigMixin, Extractor):
"""Base class for e621 extractors"""
+ basecategory = "booru"
category = "e621"
- api_url = "https://e621.net/post/index.json"
- post_url = "https://e621.net/post/show/{}"
+ filename_fmt = "{category}_{id}_{file[md5]}.{extension}"
page_limit = 750
+ page_start = None
+ per_page = 200
+ _last_request = 0
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.root = "https://e{}.net".format(match.group(1))
+ self.params = {}
+
+ username, api_key = self._get_auth_info()
+ if username:
+ self.log.debug("Using HTTP Basic Auth for user '%s'", username)
+ self.session.auth = (username, api_key)
+
+ def request(self, url, **kwargs):
+ diff = time.time() - E621Extractor._last_request
+ if diff < 1.0:
+            self.log.debug("Sleeping for %s seconds", 1.0 - diff)
+            time.sleep(1.0 - diff)
+ kwargs["headers"] = {"User-Agent": "gallery-dl/1.13.0 (by mikf)"}
+ response = Extractor.request(self, url, **kwargs)
+ E621Extractor._last_request = time.time()
+ return response
+
+ def items(self):
+ data = self.metadata()
+ for post in self.posts():
+ file = post["file"]
+
+ if not file["url"]:
+ ihash = file["md5"]
+ file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format(
+ self.root[8:], ihash[0:2], ihash[2:4], ihash, file["ext"])
+
+ post["filename"] = file["md5"]
+ post["extension"] = file["ext"]
+ post.update(data)
+ yield Message.Directory, post
+ yield Message.Url, file["url"], post
+
+ def metadata(self):
+ return {}
+ def posts(self):
+ return self._pagination(self.root + "/posts.json")
-class E621TagExtractor(booru.TagMixin, E621Extractor):
- """Extractor for images from e621.net based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?e621\.net/post"
- r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)")
+ def _pagination(self, url):
+ params = self.params.copy()
+ params["limit"] = self.per_page
+ tags = params.get("tags", "")
+
+ while True:
+ posts = self.request(url, params=params).json()["posts"]
+ yield from posts
+
+ if len(posts) < self.per_page:
+ return
+ params["tags"] = "id:<{} {}".format(posts[-1]["id"], tags)
+
+
+class E621TagExtractor(E621Extractor):
+ """Extractor for e621 posts from tag searches"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/posts?(?:\?.*?tags=|/index/\d+/)([^&#]+)"
test = (
- ("https://e621.net/post/index/1/anry", {
+ ("https://e621.net/posts?tags=anry", {
"url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
"content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
}),
+ ("https://e926.net/posts?tags=anry"),
+ ("https://e621.net/post/index/1/anry"),
("https://e621.net/post?tags=anry"),
)
+ def __init__(self, match):
+ E621Extractor.__init__(self, match)
+ self.params["tags"] = text.unquote(match.group(2).replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.params["tags"]}
+
-class E621PoolExtractor(booru.PoolMixin, E621Extractor):
- """Extractor for image-pools from e621.net"""
- pattern = r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(?P<pool>\d+)"
- test = ("https://e621.net/pool/show/73", {
- "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
- "content": "c2c87b7a9150509496cddc75ccab08109922876a",
- })
-
-
-class E621PostExtractor(booru.PostMixin, E621Extractor):
- """Extractor for single images from e621.net"""
- pattern = r"(?:https?://)?(?:www\.)?e621\.net/post/show/(?P<post>\d+)"
- test = ("https://e621.net/post/show/535", {
- "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
- "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
- "options": (("tags", True),),
- "keyword": {
- "tags_artist": "anry",
- "tags_general": str,
- "tags_species": str,
- },
- })
-
-
-class E621PopularExtractor(booru.MoebooruPopularMixin, E621Extractor):
- """Extractor for popular images from 621.net"""
- pattern = (r"(?:https?://)?(?:www\.)?e621\.net"
- r"/post/popular_by_(?P<scale>day|week|month)"
- r"(?:\?(?P<query>[^#]*))?")
- test = ("https://e621.net/post/popular_by_month?month=6&year=2013", {
- "count": 32,
- })
+class E621PoolExtractor(E621Extractor):
+ """Extractor for e621 pools"""
+ subcategory = "pool"
+ directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
+ archive_fmt = "p_{pool[id]}_{id}"
+ pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)"
+ test = (
+ ("https://e621.net/pools/73", {
+ "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
+ "content": "c2c87b7a9150509496cddc75ccab08109922876a",
+ }),
+ ("https://e621.net/pool/show/73"),
+ )
def __init__(self, match):
- super().__init__(match)
- self.api_url = "https://e621.net/post/popular_by_{scale}.json".format(
- scale=self.scale)
+ E621Extractor.__init__(self, match)
+ self.pool_id = match.group(2)
+ self.params["tags"] = "pool:" + self.pool_id
+
+ def metadata(self):
+ url = "{}/pools/{}.json".format(self.root, self.pool_id)
+ pool = self.request(url).json()
+ pool["name"] = pool["name"].replace("_", " ")
+ del pool["post_ids"]
+ return {"pool": pool}
+
+
+class E621PostExtractor(E621Extractor):
+ """Extractor for single e621 posts"""
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)"
+ test = (
+ ("https://e621.net/posts/535", {
+ "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
+ "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+ }),
+ ("https://e621.net/post/show/535"),
+ )
+
+ def __init__(self, match):
+ E621Extractor.__init__(self, match)
+ self.post_id = match.group(2)
+
+ def posts(self):
+ url = "{}/posts/{}.json".format(self.root, self.post_id)
+ return (self.request(url).json()["post"],)
+
+
+class E621PopularExtractor(E621Extractor):
+ """Extractor for popular images from e621"""
+ subcategory = "popular"
+ directory_fmt = ("{category}", "popular", "{scale}", "{date}")
+ archive_fmt = "P_{scale[0]}_{date}_{id}"
+ pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
+ test = (
+ ("https://e621.net/explore/posts/popular"),
+ (("https://e621.net/explore/posts/popular"
+ "?date=2019-06-01&scale=month"), {
+ "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
+ "count": ">= 70",
+ })
+ )
+
+ def __init__(self, match):
+ E621Extractor.__init__(self, match)
+ self.params.update(text.parse_query(match.group(2)))
+
+ def metadata(self):
+ scale = self.params.get("scale", "day")
+ date = self.params.get("date") or datetime.date.today().isoformat()
+ date = date[:10]
+
+ if scale == "week":
+ date = datetime.date.fromisoformat(date)
+ date = (date - datetime.timedelta(days=date.weekday())).isoformat()
+ elif scale == "month":
+ date = date[:-3]
+
+ return {"date": date, "scale": scale}
+
+ def posts(self):
+ url = self.root + "/explore/posts/popular.json"
+ return self._pagination(url)
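
The `request` override in the new e621 extractor above implements the site's API etiquette: a descriptive User-Agent and at most one request per second, tracked through the class-level `_last_request` timestamp so all e621 extractors share one clock (and sleeping only for the remaining fraction of the interval). The same throttle as a reusable sketch:

```python
import time

class RateLimiter:
    """Enforce a minimum interval between consecutive requests."""

    def __init__(self, interval=1.0):
        self.interval = interval
        self.last = 0.0

    def wait(self):
        diff = time.time() - self.last
        if diff < self.interval:
            time.sleep(self.interval - diff)
        self.last = time.time()
```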
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 967fd9c..a9d3c9d 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -61,7 +61,8 @@ class FlickrImageExtractor(FlickrExtractor):
test = (
("https://www.flickr.com/photos/departingyyz/16089302239", {
"pattern": pattern,
- "content": "0821a28ee46386e85b02b67cf2720063440a228c",
+ "content": ("3133006c6d657fe54cf7d4c46b82abbcb0efaf9f",
+ "0821a28ee46386e85b02b67cf2720063440a228c"),
"keyword": {
"comments": int,
"description": str,
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
new file mode 100644
index 0000000..ba60e19
--- /dev/null
+++ b/gallery_dl/extractor/furaffinity.py
@@ -0,0 +1,235 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.furaffinity.net/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net"
+
+
+class FuraffinityExtractor(Extractor):
+ """Base class for furaffinity extractors"""
+ category = "furaffinity"
+ directory_fmt = ("{category}", "{user!l}")
+ filename_fmt = "{id} {title}.{extension}"
+ archive_fmt = "{id}"
+ cookiedomain = ".furaffinity.net"
+ root = "https://www.furaffinity.net"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1)
+ self.offset = 0
+
+ def items(self):
+ for post_id in util.advance(self.posts(), self.offset):
+ post = self._parse_post(post_id)
+ if post:
+ yield Message.Directory, post
+ yield Message.Url, post["url"], post
+
+ def posts(self):
+ return self._pagination()
+
+ def skip(self, num):
+ self.offset += num
+ return num
+
+ def _parse_post(self, post_id):
+ url = "{}/view/{}/".format(self.root, post_id)
+ extr = text.extract_from(self.request(url).text)
+ title, _, artist = text.unescape(extr(
+ 'property="og:title" content="', '"')).rpartition(" by ")
+ path = extr('href="//d.facdn.net/', '"')
+
+ if not path:
+ self.log.warning(
+ "Unable to download post %s (\"%s\")",
+ post_id, text.remove_html(
+ extr('System Message', '</section>') or
+ extr('System Message', '</table>')
+ )
+ )
+ return None
+
+ pi = text.parse_int
+ rh = text.remove_html
+
+ data = text.nameext_from_url(path, {
+ "id" : pi(post_id),
+ "title" : title,
+ "artist": artist,
+ "user" : self.user or artist,
+ "url" : "https://d.facdn.net/" + path
+ })
+
+ tags = extr('class="tags-row">', '</section>')
+ if tags:
+ # new site layout
+ data["tags"] = text.split_html(tags)
+ data["description"] = text.unescape(rh(extr(
+ 'class="section-body">', '</div>'), "", ""))
+ data["views"] = pi(rh(extr('class="views">', '</span>')))
+ data["favorites"] = pi(rh(extr('class="favorites">', '</span>')))
+ data["comments"] = pi(rh(extr('class="comments">', '</span>')))
+ data["rating"] = rh(extr('class="rating">', '</span>'))
+ data["fa_category"] = rh(extr('>Category</strong>', '</span>'))
+ data["theme"] = rh(extr('>', '<'))
+ data["species"] = rh(extr('>Species</strong>', '</div>'))
+ data["gender"] = rh(extr('>Gender</strong>', '</div>'))
+ data["width"] = pi(extr("<span>", "x"))
+ data["height"] = pi(extr("", "p"))
+ else:
+ # old site layout
+ data["fa_category"] = extr("<b>Category:</b>", "<").strip()
+ data["theme"] = extr("<b>Theme:</b>", "<").strip()
+ data["species"] = extr("<b>Species:</b>", "<").strip()
+ data["gender"] = extr("<b>Gender:</b>", "<").strip()
+ data["favorites"] = pi(extr("<b>Favorites:</b>", "<"))
+ data["comments"] = pi(extr("<b>Comments:</b>", "<"))
+ data["views"] = pi(extr("<b>Views:</b>", "<"))
+ data["width"] = pi(extr("<b>Resolution:</b>", "x"))
+ data["height"] = pi(extr("", "<"))
+ data["tags"] = text.split_html(extr(
+ 'id="keywords">', '</div>'))[::2]
+ data["rating"] = extr('<img alt="', ' ')
+ data["description"] = text.unescape(text.remove_html(extr(
+ "</table>", "</table>"), "", ""))
+ data["date"] = text.parse_timestamp(data["filename"].partition(".")[0])
+
+ return data
+
+ def _pagination(self):
+ num = 1
+
+ while True:
+ url = "{}/{}/{}/{}/".format(
+ self.root, self.subcategory, self.user, num)
+ page = self.request(url).text
+ post_id = None
+
+ for post_id in text.extract_iter(page, 'id="sid-', '"'):
+ yield post_id
+
+ if not post_id:
+ return
+ num += 1
+
+ def _pagination_favorites(self):
+ path = "/favorites/{}/".format(self.user)
+
+ while path:
+ page = self.request(self.root + path).text
+ yield from text.extract_iter(page, 'id="sid-', '"')
+ path = text.extract(page, 'right" href="', '"')[0]
+
+
+class FuraffinityGalleryExtractor(FuraffinityExtractor):
+ """Extractor for a furaffinity user's gallery"""
+ subcategory = "gallery"
+ pattern = BASE_PATTERN + r"/gallery/([^/?&#]+)"
+ test = ("https://www.furaffinity.net/gallery/mirlinthloth/", {
+ "pattern": r"https://d.facdn.net/art/mirlinthloth/\d+/\d+.\w+\.\w+",
+ "range": "45-50",
+ "count": 6,
+ })
+
+
+class FuraffinityScrapsExtractor(FuraffinityExtractor):
+ """Extractor for a furaffinity user's scraps"""
+ subcategory = "scraps"
+ directory_fmt = ("{category}", "{user!l}", "Scraps")
+ pattern = BASE_PATTERN + r"/scraps/([^/?&#]+)"
+ test = ("https://www.furaffinity.net/scraps/mirlinthloth/", {
+ "pattern": r"https://d.facdn.net/art/[^/]+(/stories)?/\d+/\d+.\w+.\w+",
+ "count": ">= 3",
+ })
+
+
+class FuraffinityFavoriteExtractor(FuraffinityExtractor):
+ """Extractor for a furaffinity user's favorites"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{user!l}", "Favorites")
+ pattern = BASE_PATTERN + r"/favorites/([^/?&#]+)"
+ test = ("https://www.furaffinity.net/favorites/mirlinthloth/", {
+ "pattern": r"https://d.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+",
+ "range": "45-50",
+ "count": 6,
+ })
+
+ def posts(self):
+ return self._pagination_favorites()
+
+
+class FuraffinityPostExtractor(FuraffinityExtractor):
+ """Extractor for individual posts on furaffinity"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)"
+ test = (
+ ("https://www.furaffinity.net/view/21835115/", {
+ "url": "eae4ef93d99365c69b31a37561bd800c03d336ad",
+ "keyword": {
+ "artist" : "mirlinthloth",
+ "date" : "dt:2016-11-27 17:24:06",
+ "description": "A Song made playing the game Cosmic DJ.",
+ "extension" : "mp3",
+ "filename" : r"re:\d+\.\w+_dj_fennmink_-_bude_s_4_ever",
+ "id" : 21835115,
+ "tags" : list,
+ "title" : "Bude's 4 Ever",
+ "url" : "re:https://d.facdn.net/art/mirlinthloth/music",
+ "user" : "mirlinthloth",
+ "views" : int,
+ "favorites" : int,
+ "comments" : int,
+ "rating" : "General",
+ "fa_category": "Music",
+ "theme" : "All",
+ "species" : "Unspecified / Any",
+ "gender" : "Any",
+ "width" : 120,
+ "height" : 120,
+ },
+ }),
+ ("https://furaffinity.net/view/21835115/"),
+ ("https://sfw.furaffinity.net/view/21835115/"),
+ ("https://www.furaffinity.net/full/21835115/"),
+ )
+
+ def posts(self):
+ post_id = self.user
+ self.user = None
+ return (post_id,)
+
+
+class FuraffinityUserExtractor(FuraffinityExtractor):
+ """Extractor for furaffinity user profiles"""
+ subcategory = "user"
+ cookiedomain = None
+ pattern = BASE_PATTERN + r"/user/([^/?&#]+)"
+ test = (
+ ("https://www.furaffinity.net/user/mirlinthloth/", {
+ "pattern": r"/gallery/mirlinthloth/$",
+ }),
+ ("https://www.furaffinity.net/user/mirlinthloth/", {
+ "options": (("include", "all"),),
+ "pattern": r"/(gallery|scraps|favorites)/mirlinthloth/$",
+ "count": 3,
+ }),
+ )
+
+ def items(self):
+ base = "{}/{{}}/{}/".format(self.root, self.user)
+ return self._dispatch_extractors((
+ (FuraffinityGalleryExtractor , base.format("gallery")),
+ (FuraffinityScrapsExtractor , base.format("scraps")),
+ (FuraffinityFavoriteExtractor, base.format("favorites")),
+ ), ("gallery",))
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 19f9481..6e82091 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -247,7 +247,7 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):
"content": "91bf01497c39254b6dfb234a18e8f01629c77fd1",
"keyword": {
"artist" : "Tenpura",
- "date" : "type:datetime",
+ "date" : "dt:2016-02-22 14:41:19",
"description": "Thank you!",
"height" : 700,
"index" : 407501,
diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py
new file mode 100644
index 0000000..302999b
--- /dev/null
+++ b/gallery_dl/extractor/hentaihand.py
@@ -0,0 +1,134 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hentaihand.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import collections
+
+
+class HentaihandGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries on hentaihand.com"""
+ category = "hentaihand"
+ root = "https://hentaihand.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com"
+ r"/(?:comi|view)c/(\d+)")
+ test = (
+ ("https://hentaihand.com/comic/272772/kouda-tomohiro-chiyomi-bl", {
+ "pattern": r"https://i.hentaihand.com/.*/images/full/\d+.jpg$",
+ "count": 19,
+ "keyword": {
+ "artists" : ["kouda tomohiro"],
+ "categories": ["manga"],
+ "date" : "Feb. 6, 2020, 3:19 p.m.",
+ "gallery_id": 272772,
+ "lang" : "en",
+ "language" : "English",
+ "relationships": ["family", "step family"],
+ "tags" : list,
+ "title" : r"re:\[Kouda Tomohiro\] Chiyomi Blizzard",
+ "title_jp" : r"re:\[幸田朋弘\] ちよみブリザード",
+ },
+ }),
+ ("https://hentaihand.com/viewc/272772/kouda-tomohiro-chiyomi-bl"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/comic/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+
+ title_en = text.unescape(extr("<h1>", "<"))
+ title_jp = text.unescape(extr("<h2>", "<"))
+ tags = extr('<section id="tags"', "</section>")
+
+ data = {
+ "gallery_id" : text.parse_int(self.gallery_id),
+ "title" : title_en or title_jp,
+ "title_en" : title_en,
+ "title_jp" : title_jp,
+
+            # locale-formatted ("Feb. 6, 2020, 3:19 p.m."); not strptime()-parsable
+ "date" : extr('datetime="', '"'),
+ }
+
+ tdict = collections.defaultdict(list)
+ for path in text.extract_iter(tags, 'href="/', '"'):
+ kind, _, name = path.partition("/")
+ tdict[kind].append(name.replace("+", " "))
+ data.update(tdict)
+
+ if "languages" in data:
+ data["language"] = data["languages"][-1].capitalize()
+ data["lang"] = util.language_to_code(data["language"])
+ del data["languages"]
+ return data
+
+ def images(self, _):
+ url = "{}/viewc/{}/1".format(self.root, self.gallery_id)
+ page = self.request(url).text
+ images = text.extract(page, "var images", ";")[0]
+ return [(img, None) for img in text.extract_iter(images, "'", "'")]
+
+
+class HentaihandTagExtractor(Extractor):
+ """Extractor for tag searches on hentaihand.com"""
+ category = "hentaihand"
+ subcategory = "tag"
+ root = "https://hentaihand.com"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com"
+ r"(/(?:parody|characters|tags|artists|groups|languages"
+ r"|categories|relationships)/[^#]+)")
+ test = (
+ ("https://hentaihand.com/artists/tony+taka", {
+ "pattern": HentaihandGalleryExtractor.pattern,
+ "count": ">= 50",
+ }),
+ ("https://hentaihand.com/artists/tony+taka/popular?page=2"),
+ ("https://hentaihand.com/tags/full+color"),
+ ("https://hentaihand.com/languages/japanese"),
+ ("https://hentaihand.com/categories/manga"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path, _, query = match.group(1).partition("?")
+ self.query = text.parse_query(query)
+ self.query["page"] = text.parse_int(self.query.get("page"), 1)
+
+ def items(self):
+ yield Message.Version, 1
+ url = self.root + self.path
+ params = self.query.copy()
+ data = {"_extractor": HentaihandGalleryExtractor}
+
+ while True:
+ page = self.request(url, params=params).text
+
+ for path in text.extract_iter(page, '<a href="/comic/', '"'):
+ yield Message.Queue, self.root + "/comic/" + path, data
+
+ pos = page.find(">(current)<")
+ if pos < 0 or page.find('class="page-link" href="', pos) < 0:
+ break
+ params["page"] += 1
+
+
+class HentaihandSearchExtractor(HentaihandTagExtractor):
+ """Extractor for search results on hentaihand.com"""
+ subcategory = "search"
+ pattern = r"(?i)(?:https?://)?(?:www\.)?hentaihand\.com(/search/?[^#]+)"
+ test = ("https://hentaihand.com/search?q=color", {
+ "pattern": HentaihandGalleryExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ })
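
In the hentaihand gallery metadata above, tag links arrive as `kind/name` paths and are grouped into per-kind lists with `collections.defaultdict`. The grouping step in isolation:

```python
import collections

paths = ["artists/tony+taka", "tags/full+color", "tags/color"]
tdict = collections.defaultdict(list)
for path in paths:
    kind, _, name = path.partition("/")
    tdict[kind].append(name.replace("+", " "))

assert tdict["artists"] == ["tony taka"]
assert tdict["tags"] == ["full color", "color"]
```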
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index 193cadf..ad97eba 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -22,7 +22,7 @@ class HentainexusGalleryExtractor(GalleryExtractor):
test = (
("https://hentainexus.com/view/5688", {
"url": "746d0043e20030f1171aae5ea113176607302517",
- "keyword": "c1b7091e2bc2f733f6401711e072ad11cf93dd69",
+ "keyword": "77702b42f8f76ecfe5d8a14cfbbcbd855eb14d7f",
}),
("https://hentainexus.com/read/5688"),
)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
new file mode 100644
index 0000000..e0b0f50
--- /dev/null
+++ b/gallery_dl/extractor/hiperdex.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hiperdex.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+from ..cache import memcache
+import re
+
+
+class HiperdexBase():
+ """Base class for hiperdex extractors"""
+ category = "hiperdex"
+ root = "https://hiperdex.com"
+
+ @memcache(keyarg=1)
+ def manga_data(self, manga, page=None):
+ if not page:
+ url = "{}/manga/{}/".format(self.root, manga)
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ return {
+ "manga" : text.unescape(extr(
+ "<title>", "<").rpartition("&")[0].strip()),
+ "score" : text.parse_float(extr(
+ 'id="averagerate">', '<')),
+ "author" : text.remove_html(extr(
+ 'class="author-content">', '</div>')),
+ "artist" : text.remove_html(extr(
+ 'class="artist-content">', '</div>')),
+ "genre" : text.split_html(extr(
+ 'class="genres-content">', '</div>'))[::2],
+ "type" : extr(
+ 'class="summary-content">', '<').strip(),
+ "release": text.parse_int(text.remove_html(extr(
+ 'class="summary-content">', '</div>'))),
+ "status" : extr(
+ 'class="summary-content">', '<').strip(),
+ "description": text.remove_html(text.unescape(extr(
+ 'class="description-summary">', '</div>'))),
+ "language": "English",
+ "lang" : "en",
+ }
+
+ def chapter_data(self, chapter):
+ chapter, _, minor = chapter.partition("-")
+ data = {
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": "." + minor if minor and minor != "end" else "",
+ }
+ data.update(self.manga_data(self.manga.lower()))
+ return data
+
+
+class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
+ """Extractor for manga chapters from hiperdex.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.com"
+ r"(/manga/([^/?&#]+)/([^/?&#]+))")
+ test = ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", {
+ "url": "111bc3ee14ce91d78c275770ef63b56c9ac15d8d",
+ "keyword": {
+ "artist" : "Sasuga Kei",
+ "author" : "Sasuga Kei",
+ "chapter": 154,
+ "chapter_minor": ".5",
+ "description": "re:Natsuo Fujii is in love with his teacher, Hina",
+ "genre" : list,
+ "manga" : "Domestic na Kanojo",
+ "release": 2014,
+ "score" : float,
+ "type" : "Manga",
+ },
+ })
+
+ def __init__(self, match):
+ path, self.manga, self.chapter = match.groups()
+ ChapterExtractor.__init__(self, match, self.root + path + "/")
+
+ def metadata(self, _):
+ return self.chapter_data(self.chapter)
+
+ def images(self, page):
+ return [
+ (url.strip(), None)
+ for url in re.findall(r'id="image-\d+"\s+src="([^"]+)', page)
+ ]
+
+
+class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
+ """Extractor for manga from hiperdex.com"""
+ chapterclass = HiperdexChapterExtractor
+ pattern = r"(?:https?://)?(?:www\.)?hiperdex\.com(/manga/([^/?&#]+))/?$"
+ test = ("https://hiperdex.com/manga/youre-not-that-special/", {
+ "count": 51,
+ "pattern": HiperdexChapterExtractor.pattern,
+ "keyword": {
+ "artist" : "Bolp",
+ "author" : "Abyo4",
+ "chapter": int,
+ "chapter_minor": "",
+ "description": "re:I didn’t think much of the creepy girl in ",
+ "genre" : list,
+ "manga" : "You're Not That Special!",
+ "release": 2019,
+ "score" : float,
+ "status" : "Completed",
+ "type" : "Manhwa",
+ },
+ })
+
+ def __init__(self, match):
+ path, self.manga = match.groups()
+ MangaExtractor.__init__(self, match, self.root + path + "/")
+
+ def chapters(self, page):
+ self.manga_data(self.manga, page)
+ results = []
+ last = None
+
+ page = text.extract(page, 'class="page-content-listing', '</ul>')[0]
+ for match in HiperdexChapterExtractor.pattern.finditer(page):
+ path = match.group(1)
+ if last != path:
+ last = path
+ results.append((
+ self.root + path,
+ self.chapter_data(path.rpartition("/")[2]),
+ ))
+
+ return results
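
`manga_data` above is decorated with `@memcache(keyarg=1)` so the manga page is fetched and parsed once per slug, no matter how many chapters reuse it. A simplified sketch of such a keyed memoization decorator (the real one in `gallery_dl.cache` takes more options):

```python
import functools

def memcache(keyarg):
    """Memoize a function on one positional argument."""
    def decorator(func):
        cache = {}

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            key = args[keyarg]
            if key not in cache:
                cache[key] = func(*args, **kwargs)
            return cache[key]
        return wrapper
    return decorator
```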
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index d6fdcf2..3baf819 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://hitomi.la/"""
+"""Extractors for https://hitomi.la/"""
from .common import GalleryExtractor
from .. import text, util
@@ -27,21 +27,25 @@ class HitomiGalleryExtractor(GalleryExtractor):
"keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
"count": 16,
}),
+ # download test
("https://hitomi.la/galleries/1401410.html", {
- # download test
"range": "1",
"content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
}),
+ # Game CG with scenes (#321)
("https://hitomi.la/galleries/733697.html", {
- # Game CG with scenes (#321)
- "url": "21064f9e3c244aca87f1a91967a3fbe79032c4ce",
+ "url": "b4cbc76032852db4a655bf6a2c4d58eae8153c8e",
"count": 210,
}),
+ # fallback for galleries only available through /reader/ URLs
("https://hitomi.la/galleries/1045954.html", {
- # fallback for galleries only available through /reader/ URLs
- "url": "0a67f5e6c3c6a384b578e328f4817fa6ccdf856a",
+ "url": "f3aa914ad148437f72d307268fa0d250eabe8dab",
"count": 1413,
}),
+ # gallery with "broken" redirect
+ ("https://hitomi.la/cg/scathacha-sama-okuchi-ecchi-1291900.html", {
+ "count": 10,
+ }),
("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"),
("https://hitomi.la/manga/867789.html"),
("https://hitomi.la/doujinshi/867789.html"),
@@ -51,84 +55,90 @@ class HitomiGalleryExtractor(GalleryExtractor):
)
def __init__(self, match):
- self.gallery_id = match.group(1)
- self.fallback = False
- url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
+ gid = match.group(1)
+ url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
GalleryExtractor.__init__(self, match, url)
+ self.info = None
+ self.session.headers["Referer"] = "{}/reader/{}.html".format(
+ self.root, gid)
+
+ def metadata(self, page):
+ self.info = info = json.loads(page.partition("=")[2])
+
+ data = self._data_from_gallery_info(info)
+ if self.config("metadata", True):
+ data.update(self._data_from_gallery_page(info))
+ return data
+
+ def _data_from_gallery_info(self, info):
+ language = info.get("language")
+ if language:
+ language = language.capitalize()
+
+ tags = []
+ for tinfo in info["tags"]:
+ tag = tinfo["tag"]
+ if tinfo.get("female"):
+ tag += " ♀"
+ elif tinfo.get("male"):
+ tag += " ♂"
+ tags.append(string.capwords(tag))
+
+ return {
+ "gallery_id": text.parse_int(info["id"]),
+ "title" : info["title"],
+ "type" : info["type"].capitalize(),
+ "language" : language,
+ "lang" : util.language_to_code(language),
+ "tags" : tags,
+ "date" : text.parse_datetime(
+ info["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
+ }
+
+ def _data_from_gallery_page(self, info):
+ url = "{}/galleries/{}.html".format(self.root, info["id"])
- def request(self, url, **kwargs):
- response = GalleryExtractor.request(self, url, fatal=False, **kwargs)
- if response.status_code == 404:
- self.fallback = True
- url = url.replace("/galleries/", "/reader/")
- response = GalleryExtractor.request(self, url, **kwargs)
- elif b"<title>Redirect</title>" in response.content:
+ # follow redirects
+ while True:
+ response = self.request(url, fatal=False)
+ if b"<title>Redirect</title>" not in response.content:
+ break
url = text.extract(response.text, "href='", "'")[0]
if not url.startswith("http"):
url = text.urljoin(self.root, url)
- response = self.request(url, **kwargs)
- return response
- def metadata(self, page):
- if self.fallback:
- return {
- "gallery_id": text.parse_int(self.gallery_id),
- "title": text.unescape(text.extract(
- page, "<title>", "<")[0].rpartition(" | ")[0]),
- }
-
- extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
- data = {
- "gallery_id": text.parse_int(self.gallery_id),
- "title" : text.unescape(extr('.html">', '<').strip()),
- "artist" : self._prep(extr('<h2>', '</h2>')),
- "group" : self._prep(extr('<td>Group</td><td>', '</td>')),
- "type" : self._prep_1(extr('<td>Type</td><td>', '</td>')),
- "language" : self._prep_1(extr('<td>Language</td><td>', '</td>')),
- "parody" : self._prep(extr('<td>Series</td><td>', '</td>')),
- "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
- "tags" : self._prep(extr('<td>Tags</td><td>', '</td>')),
- "date" : self._date(extr('<span class="date">', '</span>')),
+ if response.status_code >= 400:
+ return {}
+
+ def prep(value):
+ return [
+ text.unescape(string.capwords(v))
+ for v in text.extract_iter(value or "", '.html">', '<')
+ ]
+
+ extr = text.extract_from(response.text)
+ return {
+ "artist" : prep(extr('<h2>', '</h2>')),
+ "group" : prep(extr('<td>Group</td><td>', '</td>')),
+ "parody" : prep(extr('<td>Series</td><td>', '</td>')),
+ "characters": prep(extr('<td>Characters</td><td>', '</td>')),
}
- if data["language"] == "N/a":
- data["language"] = None
- data["lang"] = util.language_to_code(data["language"])
- return data
-
- def images(self, page):
- # set Referer header before image downloads (#239)
- self.session.headers["Referer"] = self.gallery_url
-
- # get 'galleryinfo'
- url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)
- page = self.request(url).text
+ def images(self, _):
result = []
- for image in json.loads(page.partition("=")[2]):
+ for image in self.info["files"]:
ihash = image["hash"]
idata = text.nameext_from_url(image["name"])
# see https://ltn.hitomi.la/common.js
- offset = int(ihash[-3:-1], 16) % 3
+ inum = int(ihash[-3:-1], 16)
+ frontends = 2 if inum < 0x30 else 3
+ inum = 1 if inum < 0x09 else inum
+
url = "https://{}a.hitomi.la/images/{}/{}/{}.{}".format(
- chr(97 + offset),
+ chr(97 + (inum % frontends)),
ihash[-1], ihash[-3:-1], ihash,
idata["extension"],
)
result.append((url, idata))
return result
-
- @staticmethod
- def _prep(value):
- return [
- text.unescape(string.capwords(v))
- for v in text.extract_iter(value or "", '.html">', '<')
- ]
-
- @staticmethod
- def _prep_1(value):
- return text.remove_html(value).capitalize()
-
- @staticmethod
- def _date(value):
- return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")
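
The updated hitomi image URLs above follow the logic in `ltn.hitomi.la/common.js`: two hex digits of the file hash select which `*.hitomi.la` image frontend serves the file. Isolated, with the constants exactly as they appear in the diff (the hash value below is illustrative):

```python
def frontend_char(ihash):
    """Pick the subdomain letter ('a', 'b', ...) for an image hash."""
    inum = int(ihash[-3:-1], 16)
    frontends = 2 if inum < 0x30 else 3
    inum = 1 if inum < 0x09 else inum
    return chr(97 + inum % frontends)

assert frontend_char("21a") == "b"  # 0x21 = 33 -> 2 frontends, 33 % 2 = 1
```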
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index 15152b7..d0aa4f2 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -111,13 +111,13 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
test = (
("https://ibb.co/album/i5PggF", {
"range": "1-80",
- "url": "570872b6eb3e11cf10b618922b780fed204c3f09",
- "keyword": "0f2fc956728c36540c577578bd168d2459d6ae4b",
+ "url": "70afec9fcc3a6de62a6b644b487d892d8d47cf1a",
+ "keyword": "569e1d88ebdd27655387559cdf1cd526a3e1ab69",
}),
("https://ibb.co/album/i5PggF?sort=title_asc", {
"range": "1-80",
- "url": "e2e387b8fdb3690bd75d804d0af2833112e385cd",
- "keyword": "a307fc9d2085bdc0eb7c538c8d866c59198d460c",
+ "url": "a2dfc58fe3348fa37e242082bd5a85eaa490d0a5",
+ "keyword": "5bb79c82411c3770d673fac64a0a98fa28111c3b",
}),
# no user data (#471)
("https://ibb.co/album/kYKpwF", {
@@ -192,12 +192,12 @@ class ImgbbImageExtractor(ImgbbExtractor):
subcategory = "image"
pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)"
test = ("https://ibb.co/fUqh5b", {
- "pattern": "https://image.ibb.co/dY5FQb/Arundel-Ireeman-5.jpg",
+ "pattern": r"https://i\.ibb\.co/g3kvx80/Arundel-Ireeman-5\.jpg",
"content": "c5a0965178a8b357acd8aa39660092918c63795e",
"keyword": {
"id" : "fUqh5b",
"title" : "Arundel Ireeman 5",
- "url" : "https://image.ibb.co/dY5FQb/Arundel-Ireeman-5.jpg",
+ "url" : "https://i.ibb.co/g3kvx80/Arundel-Ireeman-5.jpg",
"width" : 960,
"height": 719,
"user" : "folkie",
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 5084e80..0813ea9 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://imgur.com/"""
+"""Extractors for https://imgur.com/"""
from .common import Extractor, Message
from .. import text, exception
@@ -65,7 +65,7 @@ class ImgurImageExtractor(ImgurExtractor):
"account_url" : None,
"animated" : False,
"bandwidth" : int,
- "date" : "type:datetime",
+ "date" : "dt:2016-11-10 14:24:35",
"datetime" : 1478787875,
"description" : None,
"edited" : "0",
@@ -142,7 +142,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
"cover_edited": None,
"cover_height": 1400,
"cover_width" : 951,
- "date" : "type:datetime",
+ "date" : "dt:2015-10-09 10:37:50",
"datetime" : 1444387070,
"description" : None,
"favorite" : False,
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 05adac1..96afea1 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann
+# Copyright 2018-2019 Leonardo Taccari
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.instagram.com/"""
+"""Extractors for https://www.instagram.com/"""
from .common import Extractor, Message
from .. import text, exception
@@ -129,6 +130,7 @@ class InstagramExtractor(Extractor):
'owner_id': media['owner']['id'],
'username': media['owner']['username'],
'fullname': media['owner']['full_name'],
+ "post_shortcode": media['shortcode'],
'description': text.parse_unicode_escapes('\n'.join(
edge['node']['text']
for edge in media['edge_media_to_caption']['edges']
@@ -306,12 +308,13 @@ class InstagramImageExtractor(InstagramExtractor):
r"/v(p/[0-9a-f]+/[0-9A-F]+)?/t51.2885-15/e35"
r"/44877605_725955034447492_3123079845831750529_n.jpg",
"keyword": {
- "date": "type:datetime",
+ "date": "dt:2018-11-29 01:04:04",
"description": str,
"height": int,
"likes": int,
"media_id": "1922949326347663701",
"shortcode": "BqvsDleB3lV",
+ "post_shortcode": "BqvsDleB3lV",
"typename": "GraphImage",
"username": "instagram",
"width": int,
@@ -324,6 +327,7 @@ class InstagramImageExtractor(InstagramExtractor):
"keyword": {
"sidecar_media_id": "1875629777499953996",
"sidecar_shortcode": "BoHk1haB5tM",
+ "post_shortcode": "BoHk1haB5tM",
"likes": int,
"username": "instagram",
}
@@ -333,7 +337,7 @@ class InstagramImageExtractor(InstagramExtractor):
("https://www.instagram.com/p/Bqxp0VSBgJg/", {
"pattern": r"/47129943_191645575115739_8539303288426725376_n\.mp4",
"keyword": {
- "date": "type:datetime",
+ "date": "dt:2018-11-29 19:23:58",
"description": str,
"height": int,
"likes": int,
@@ -349,7 +353,7 @@ class InstagramImageExtractor(InstagramExtractor):
("https://www.instagram.com/tv/BkQjCfsBIzi/", {
"pattern": r"/10000000_1760663964018792_716207142595461120_n\.mp4",
"keyword": {
- "date": "type:datetime",
+ "date": "dt:2018-06-20 19:51:32",
"description": str,
"height": int,
"likes": int,
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index 49d68ef..b34b288 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -35,7 +35,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
"access" : "public",
"articleStories": list,
"contentRating" : dict,
- "date" : "type:datetime",
+ "date" : "dt:2019-09-16 00:00:00",
"description" : "re:Motions, the brand new publication by I",
"documentId" : r"re:\d+-d99ec95935f15091b040cb8060f05510",
"documentName" : "motions-1-2019",
diff --git a/gallery_dl/extractor/kabeuchi.py b/gallery_dl/extractor/kabeuchi.py
new file mode 100644
index 0000000..a8702f1
--- /dev/null
+++ b/gallery_dl/extractor/kabeuchi.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://kabe-uchiroom.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+class KabeuchiUserExtractor(Extractor):
+ """Extractor for all posts of a user on kabe-uchiroom.com"""
+ category = "kabeuchi"
+ subcategory = "user"
+ directory_fmt = ("{category}", "{twitter_user_id} {twitter_id}")
+ filename_fmt = "{id}_{num:>02}{title:?_//}.{extension}"
+ archive_fmt = "{id}_{num}"
+ root = "https://kabe-uchiroom.com"
+ pattern = r"(?:https?://)?kabe-uchiroom\.com/mypage/?\?id=(\d+)"
+ test = (
+ ("https://kabe-uchiroom.com/mypage/?id=919865303848255493", {
+ "pattern": (r"https://kabe-uchiroom\.com/accounts/upfile/3/"
+ r"919865303848255493/\w+\.jpe?g"),
+ "count": ">= 24",
+ }),
+ ("https://kabe-uchiroom.com/mypage/?id=123456789", {
+ "exception": exception.NotFoundError,
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def items(self):
+ base = "{}/accounts/upfile/{}/{}/".format(
+ self.root, self.user_id[-1], self.user_id)
+ keys = ("image1", "image2", "image3", "image4", "image5", "image6")
+
+ for post in self.posts():
+ if post.get("is_ad") or not post["image1"]:
+ continue
+
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%Y-%m-%d %H:%M:%S")
+ yield Message.Directory, post
+
+ for key in keys:
+ name = post[key]
+ if not name:
+ break
+ url = base + name
+ post["num"] = ord(key[-1]) - 48
+ yield Message.Url, url, text.nameext_from_url(name, post)
+
+ def posts(self):
+ url = "{}/mypage/?id={}".format(self.root, self.user_id)
+ response = self.request(url)
+ if response.history and response.url == self.root + "/":
+ raise exception.NotFoundError("user")
+ target_id = text.extract(response.text, 'user_friend_id = "', '"')[0]
+ return self._pagination(target_id)
+
+ def _pagination(self, target_id):
+ url = "{}/get_posts.php".format(self.root)
+ data = {
+ "user_id" : "0",
+ "target_id" : target_id,
+ "type" : "uploads",
+ "sort_type" : "0",
+ "category_id": "all",
+ "latest_post": "",
+ "page_num" : 0,
+ }
+
+ while True:
+ info = self.request(url, method="POST", data=data).json()
+ datas = info["datas"]
+
+ if not datas or not isinstance(datas, list):
+ return
+ yield from datas
+
+ last_id = datas[-1]["id"]
+ if last_id == info["last_data"]:
+ return
+ data["latest_post"] = last_id
+ data["page_num"] += 1
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index c9e6959..822a743 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract soundtracks from https://downloads.khinsider.com/"""
+"""Extractors for https://downloads.khinsider.com/"""
from .common import Extractor, Message, AsynchronousMixin
from .. import text, exception
@@ -16,54 +16,52 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
"""Extractor for soundtracks from khinsider.com"""
category = "khinsider"
subcategory = "soundtrack"
- directory_fmt = ("{category}", "{album}")
- archive_fmt = "{album}_{filename}.{extension}"
+ directory_fmt = ("{category}", "{album[name]}")
+ archive_fmt = "{filename}.{extension}"
pattern = (r"(?:https?://)?downloads\.khinsider\.com"
r"/game-soundtracks/album/([^/?&#]+)")
+ root = "https://downloads.khinsider.com"
test = (("https://downloads.khinsider.com"
"/game-soundtracks/album/horizon-riders-wii"), {
- "pattern": r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+"
- r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3",
- "count": 1,
- "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679",
+ "pattern": r"https?://vgmdownloads.com/soundtracks/horizon-riders-wii/"
+ r"[^/]+/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3",
+ "keyword": "5b2c35cce638c326cab2a4f7a79f245d008d62ff",
})
- root = "https://downloads.khinsider.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.album = match.group(1)
def items(self):
- url = (self.root + "/game-soundtracks/album/" + self.album)
+ url = self.root + "/game-soundtracks/album/" + self.album
page = self.request(url, encoding="utf-8").text
- data = self.get_job_metadata(page)
+ if "Download all songs at once:" not in page:
+ raise exception.NotFoundError("soundtrack")
+
+ data = self.metadata(page)
yield Message.Version, 1
yield Message.Directory, data
- for url, track in self.get_album_tracks(page):
+ for track in self.tracks(page):
track.update(data)
- yield Message.Url, url, track
+ yield Message.Url, track["url"], track
- def get_job_metadata(self, page):
- """Collect metadata for extractor-job"""
- if "Download all songs at once:" not in page:
- raise exception.NotFoundError("soundtrack")
- data = text.extract_all(page, (
- ("album", "Album name: <b>", "</b>"),
- ("count", "Number of Files: <b>", "</b>"),
- ("size" , "Total Filesize: <b>", "</b>"),
- ("date" , "Date added: <b>", "</b>"),
- ("type" , "Album type: <b>", "</b>"),
- ))[0]
- data["album"] = text.unescape(data["album"])
- return data
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ return {"album": {
+ "name" : text.unescape(extr("Album name: <b>", "<")),
+ "count": text.parse_int(extr("Number of Files: <b>", "<")),
+ "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
+ "date" : extr("Date added: <b>", "<"),
+ "type" : extr("Album type: <b>", "<"),
+ }}
- def get_album_tracks(self, page):
- """Collect url and metadata for all tracks of a soundtrack"""
+ def tracks(self, page):
page = text.extract(page, '<table id="songlist">', '</table>')[0]
+
for num, url in enumerate(text.extract_iter(
page, '<td class="clickable-row"><a href="', '"'), 1):
url = text.urljoin(self.root, url)
page = self.request(url, encoding="utf-8").text
- url = text.extract(
- page, '<p><a style="color: #21363f;" href="', '"')[0]
- yield url, text.nameext_from_url(url, {"num": num})
+
+ url = text.extract(page, 'style="color: #21363f;" href="', '"')[0]
+ yield text.nameext_from_url(url, {"num": num, "url": url})
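
metadata() now uses text.extract_from(), which returns a closure that scans the page once, left to right, instead of restarting from the top for each field. A standalone sketch of that idea (illustrative only, not gallery-dl's actual implementation):

    def extract_from(txt):
        """Return extr(begin, end) that consumes txt left to right."""
        pos = 0
        def extr(begin, end):
            nonlocal pos
            try:
                first = txt.index(begin, pos) + len(begin)
                last = txt.index(end, first)
            except ValueError:
                return ""
            pos = last + len(end)
            return txt[first:last]
        return extr

    extr = extract_from("Album name: <b>Foo</b> Number of Files: <b>3</b>")
    assert extr("Album name: <b>", "<") == "Foo"
    assert extr("Number of Files: <b>", "<") == "3"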
diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py
index 7151de0..8809589 100644
--- a/gallery_dl/extractor/kissmanga.py
+++ b/gallery_dl/extractor/kissmanga.py
@@ -94,7 +94,7 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
}),
("https://kissmanga.com/Manga/Houseki-no-Kuni/Oneshot?id=404189", {
"count": 49,
- "keyword": "d44d1b21d08e4dbf888b0c450a3f1bc919588b4f",
+ "keyword": "cea131c9fe9c71309b3270cd86718d4d1198c31c",
}),
("https://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608"),
)
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index c80cf14..c31de1c 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -59,7 +59,7 @@ class LusciousAlbumExtractor(LusciousExtractor):
"cover" : "re:https://\\w+.luscious.net/.+/277031/",
"created" : 1479625853,
"created_by" : "NTRshouldbeillegal",
- "date" : "type:datetime",
+ "date" : "dt:2016-11-20 07:10:53",
"description" : "Enjoy.",
"download_url": "/download/824778/277031/",
"genres" : list,
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
index d24d452..31083dc 100644
--- a/gallery_dl/extractor/mangareader.py
+++ b/gallery_dl/extractor/mangareader.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -38,7 +38,7 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))"
test = (("https://www.mangareader.net"
"/karate-shoukoushi-kohinata-minoru/11"), {
- "url": "061cc92a07edf17bb991ce0821fa4c77a147a860",
+ "url": "3d8a5b900856d59b8d8e83908d0df392be92c0f4",
"keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6",
})
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index 114a48e..8cd7fa5 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
import hashlib
+import time
class MangoxoExtractor(Extractor):
@@ -35,28 +36,34 @@ class MangoxoExtractor(Extractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- page = self.request(self.root + "/login/").text
- token = text.extract(page, 'id="loginToken" value="', '"')[0]
- if not token:
- self.log.debug("failed to extract 'loginToken'")
-
- url = self.root + "/login/loginxmm"
+ url = self.root + "/api/login"
headers = {
"X-Requested-With": "XMLHttpRequest",
"Referer": self.root + "/login",
}
- data = {
- "name": username,
- "password": hashlib.md5(password.encode()).hexdigest(),
- "loginToken": token,
- }
+ data = self._sign_by_md5(username, password)
response = self.request(url, method="POST", headers=headers, data=data)
- if response.json().get("result") != "1":
- raise exception.AuthenticationError()
+ data = response.json()
+ if str(data.get("result")) != "1":
+ raise exception.AuthenticationError(data.get("msg"))
return {"SESSION": self.session.cookies.get("SESSION")}
@staticmethod
+ def _sign_by_md5(username, password):
+ # https://dns.mangoxo.com/libs/plugins/phoenix-ui/js/phoenix-ui.js
+ params = [
+ ("username" , username),
+ ("password" , password),
+ ("timestamp", str(int(time.time()))),
+ ]
+ query = "&".join("=".join(item) for item in sorted(params))
+ query += "&secretKey=996293536"
+ sign = hashlib.md5(query.encode()).hexdigest()
+ params.append(("sign", sign.upper()))
+ return params
+
+ @staticmethod
def _total_pages(page):
return text.parse_int(text.extract(page, "total :", ",")[0])
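
The login change computes the request signature client-side. The scheme in _sign_by_md5() can be reproduced in isolation; the secretKey value and field names are taken verbatim from the code above, while the credentials below are placeholders:

    import hashlib
    import time

    def sign(username, password, secret="996293536"):
        """Build the signed parameter list as _sign_by_md5() does."""
        params = [
            ("username", username),
            ("password", password),
            ("timestamp", str(int(time.time()))),
        ]
        # sort by key, join as a query string, append the secret, then MD5
        query = "&".join("=".join(p) for p in sorted(params))
        query += "&secretKey=" + secret
        digest = hashlib.md5(query.encode()).hexdigest()
        return params + [("sign", digest.upper())]

    print(sign("user@example.com", "hunter2"))   # placeholder credentials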
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 54e60b0..21afeae 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,7 @@
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
+import itertools
import json
@@ -35,16 +36,17 @@ class NewgroundsExtractor(Extractor):
for post_url in self.posts():
try:
- file = self.extract_post(post_url)
- url = file["url"]
- # except Exception:
+ post = self.extract_post(post_url)
+ url = post.get("url")
except OSError:
url = None
- if not url:
- self.log.warning("Unable to get download URL for %s", post_url)
- continue
- yield Message.Directory, file
- yield Message.Url, url, text.nameext_from_url(url, file)
+
+ if url:
+ yield Message.Directory, post
+ yield Message.Url, url, text.nameext_from_url(url, post)
+ else:
+ self.log.warning(
+ "Unable to get download URL for '%s'", post_url)
def posts(self):
"""Return urls of all relevant image pages"""
@@ -82,7 +84,10 @@ class NewgroundsExtractor(Extractor):
}
def extract_post(self, post_url):
- page = self.request(post_url).text
+ response = self.request(post_url, fatal=False)
+ if response.status_code >= 400:
+ return {}
+ page = response.text
extr = text.extract_from(page)
if "/art/view/" in post_url:
@@ -97,8 +102,7 @@ class NewgroundsExtractor(Extractor):
data["favorites"] = text.parse_int(extr(
'id="faves_load">', '<').replace(",", ""))
data["score"] = text.parse_float(extr('id="score_number">', '<'))
- data["tags"] = text.split_html(extr(
- '<dd class="tags">', '</dd>'))
+ data["tags"] = text.split_html(extr('<dd class="tags">', '</dd>'))
data["artist"] = [
text.extract(user, '//', '.')[0]
for user in text.extract_iter(page, '<div class="item-user">', '>')
@@ -194,7 +198,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
"keyword": {
"artist" : ["tomfulp"],
"comment" : "re:Consider this the bottom threshold for ",
- "date" : "type:datetime",
+ "date" : "dt:2009-06-04 14:44:05",
"description": "re:Consider this the bottom threshold for ",
"favorites" : int,
"filename" : "94_tomfulp_ryu-is-hawt",
@@ -241,7 +245,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"keyword": {
"artist" : ["psychogoldfish", "tomfulp"],
"comment" : "re:People have been asking me how I like the ",
- "date" : "type:datetime",
+ "date" : "dt:2012-02-08 21:40:56",
"description": "re:People have been asking how I like the ",
"favorites" : int,
"filename" : "527818_alternate_1896",
@@ -259,7 +263,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor):
"keyword": {
"artist" : ["zj", "tomfulp"],
"comment" : "re:RECORDED 12-09-2014\n\nFrom The ZJ \"Late ",
- "date" : "type:datetime",
+ "date" : "dt:2015-02-23 19:31:59",
"description": "From The ZJ Report Show!",
"favorites" : int,
"index" : 609768,
@@ -334,3 +338,53 @@ class NewgroundsUserExtractor(NewgroundsExtractor):
(NewgroundsAudioExtractor , base + "audio"),
(NewgroundsMoviesExtractor, base + "movies"),
), ("art",))
+
+
+class NewgroundsFavoriteExtractor(NewgroundsExtractor):
+ """Extractor for posts favorited by a newgrounds user"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{user}", "Favorites")
+ pattern = (r"(?:https?://)?([^.]+)\.newgrounds\.com"
+ r"/favorites(?:/(art|audio|movies))?/?")
+ test = (
+ ("https://tomfulp.newgrounds.com/favorites/art", {
+ "range": "1-10",
+ "count": ">= 10",
+ }),
+ ("https://tomfulp.newgrounds.com/favorites/audio"),
+ ("https://tomfulp.newgrounds.com/favorites/movies"),
+ ("https://tomfulp.newgrounds.com/favorites/"),
+ )
+
+ def __init__(self, match):
+ NewgroundsExtractor.__init__(self, match)
+ self.kind = match.group(2)
+
+ def posts(self):
+ if self.kind:
+ return self._pagination(self.kind)
+ return itertools.chain.from_iterable(
+ self._pagination(k) for k in ("art", "audio", "movies")
+ )
+
+ def _pagination(self, kind):
+ num = 1
+ headers = {
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": self.user_root,
+ }
+
+ while True:
+ url = "{}/favorites/{}/{}".format(self.user_root, kind, num)
+ response = self.request(url, headers=headers)
+ if response.history:
+ return
+
+ favs = list(text.extract_iter(
+ response.text, 'href="//www.newgrounds.com', '"'))
+ for path in favs:
+ yield self.root + path
+ if len(favs) < 24:
+ return
+ num += 1
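
The favorites pagination increments a page number and stops either on a redirect (response.history is non-empty) or when a page holds fewer than the usual 24 entries. The same loop in isolation, with extract_links() standing in for the text.extract_iter() call:

    # Sketch only: extract_links() is a placeholder for parsing the page.
    import requests

    def favorites(user_root, kind, extract_links, session=requests):
        num = 1
        while True:
            response = session.get(
                "{}/favorites/{}/{}".format(user_root, kind, num))
            if response.history:        # redirected away -> no such page
                return
            links = extract_links(response.text)
            yield from links
            if len(links) < 24:         # short page -> last page
                return
            num += 1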
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 97be789..dfe31e3 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -86,7 +86,7 @@ class NozomiPostExtractor(NozomiExtractor):
"character": ["patchouli knowledge"],
"copyright": ["touhou"],
"dataid" : "re:aaa9f7c632cde1e1a5baaff3fb6a6d857ec73df7fdc5cf5a",
- "date" : "type:datetime",
+ "date" : "dt:2016-07-26 02:32:03",
"extension": "jpg",
"favorites": int,
"filename" : str,
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 74835bf..2f5b429 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2019 Mike Fährmann
+# Copyright 2017-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Utility classes to setup OAuth and link a users account to gallery-dl"""
+"""Utility classes to setup OAuth and link accounts to gallery-dl"""
from .common import Extractor, Message
from . import deviantart, flickr, reddit, smugmug, tumblr
@@ -38,7 +38,7 @@ class OAuthBase(Extractor):
print("Waiting for response. (Cancel with Ctrl+c)")
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
- server.bind(("localhost", 6414))
+ server.bind(("localhost", self.config("port", 6414)))
server.listen(1)
# workaround for ctrl+c not working during server.accept on Windows
@@ -98,7 +98,7 @@ class OAuthBase(Extractor):
def _oauth2_authorization_code_grant(
self, client_id, client_secret, auth_url, token_url,
scope="read", key="refresh_token", auth=True,
- message_template=None):
+ message_template=None, cache=None):
"""Perform an OAuth2 authorization code grant"""
state = "gallery-dl_{}_{}".format(
@@ -162,6 +162,11 @@ class OAuthBase(Extractor):
client_secret=client_secret,
))
+ # write to cache
+ if cache and config.get(("extractor", self.category), "cache"):
+ cache.update("#" + str(client_id), data[key])
+ self.log.info("Writing 'refresh-token' to cache")
+
class OAuthDeviantart(OAuthBase):
subcategory = "deviantart"
@@ -179,6 +184,7 @@ class OAuthDeviantart(OAuthBase):
"https://www.deviantart.com/oauth2/authorize",
"https://www.deviantart.com/oauth2/token",
scope="browse",
+ cache=deviantart._refresh_token_cache,
)
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index a4731d0..931fb13 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -45,7 +45,7 @@ class PahealTagExtractor(PahealExtractor):
directory_fmt = ("{category}", "{search_tags}")
pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
r"/post/list/([^/?&#]+)")
- test = ("https://rule34.paheal.net/post/list/k-on/1", {
+ test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", {
"pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
"count": ">= 15"
})
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 1e52559..0d51df2 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import memcache
import collections
+import itertools
import json
@@ -33,43 +34,62 @@ class PatreonExtractor(Extractor):
PatreonExtractor._warning = False
for post in self.posts():
- ids = set()
post["num"] = 0
- content = post.get("content")
- postfile = post.get("post_file")
+ hashes = set()
yield Message.Directory, post
yield Message.Metadata, text.nameext_from_url(
post["creator"].get("image_url", ""), post)
- for image in post["images"]:
- url = image.get("download_url")
- if not url:
- continue
- ids.add(url.split("/")[-2])
- name = image.get("file_name") or self._filename(url) or url
+ for kind, url, name in itertools.chain(
+ self._postfile(post),
+ self._images(post),
+ self._attachments(post),
+ self._content(post),
+ ):
+ fhash = url.rsplit("/", 2)[1]
+ if fhash not in hashes:
+ hashes.add(fhash)
+ post["hash"] = fhash
+ post["type"] = kind
+ post["num"] += 1
+ yield Message.Url, url, text.nameext_from_url(name, post)
+ else:
+ self.log.debug("skipping %s (%s %s)", url, fhash, kind)
- post["num"] += 1
- post["type"] = "image"
- yield Message.Url, url, text.nameext_from_url(name, post)
+ @staticmethod
+ def _postfile(post):
+ postfile = post.get("post_file")
+ if postfile:
+ return (("postfile", postfile["url"], postfile["name"]),)
+ return ()
+
+ def _images(self, post):
+ for image in post["images"]:
+ url = image.get("download_url")
+ if url:
+ name = image.get("file_name") or self._filename(url) or url
+ yield "image", url, name
- if postfile and postfile["url"].split("/")[-2] not in ids:
- post["num"] += 1
- post["type"] = "postfile"
- text.nameext_from_url(postfile["name"], post)
- yield Message.Url, postfile["url"], post
+ def _attachments(self, post):
+ for attachment in post["attachments"]:
+ url = self.request(
+ attachment["url"], method="HEAD",
+ allow_redirects=False, fatal=False,
+ ).headers.get("Location")
- for attachment in post["attachments"]:
- post["num"] += 1
- post["type"] = "attachment"
- text.nameext_from_url(attachment["name"], post)
- yield Message.Url, attachment["url"], post
+ if url:
+ yield "attachment", url, attachment["name"]
- if content:
- for url in text.extract_iter(content, 'src="', '"'):
- post["num"] += 1
- post["type"] = "content"
- yield Message.Url, url, text.nameext_from_url(url, post)
+ @staticmethod
+ def _content(post):
+ content = post.get("content")
+ if content:
+ for img in text.extract_iter(
+ content, '<img data-media-id="', '>'):
+ url = text.extract(img, 'src="', '"')[0]
+ if url:
+ yield "content", url, url
def posts(self):
"""Return all relevant post objects"""
@@ -238,11 +258,13 @@ class PatreonPostExtractor(PatreonExtractor):
subcategory = "post"
pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?&#]+)"
test = (
+ # postfile + attachments
("https://www.patreon.com/posts/precious-metal-23563293", {
"count": 4,
}),
- ("https://www.patreon.com/posts/er1-28201153", {
- "count": 1,
+ # postfile + content
+ ("https://www.patreon.com/posts/19987002", {
+ "count": 4,
}),
("https://www.patreon.com/posts/not-found-123", {
"exception": exception.NotFoundError,
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
index 5f50245..35f9f91 100644
--- a/gallery_dl/extractor/piczel.py
+++ b/gallery_dl/extractor/piczel.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,27 +22,30 @@ class PiczelExtractor(Extractor):
def items(self):
yield Message.Version, 1
- for image in self.unpack(self.images()):
- url = self.root + "/static" + image["image"]["image"]["url"]
- yield Message.Directory, image
- yield Message.Url, url, text.nameext_from_url(url, image)
-
- @staticmethod
- def unpack(images):
- """Unpack 'images' into individual image objects"""
- for image in images:
- if image["multi"]:
- multi = image["images"]
- del image["images"]
- for image["num"], img in enumerate(multi):
- image["image"] = img
- yield image
+ for post in self.posts():
+ post["tags"] = [t["title"] for t in post["tags"] if t["title"]]
+ post["date"] = text.parse_datetime(
+ post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+
+ if post["multi"]:
+ images = post["images"]
+ del post["images"]
+ yield Message.Directory, post
+ for post["num"], image in enumerate(images):
+ if "id" in image:
+ del image["id"]
+ post.update(image)
+ url = post["image"]["url"]
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
else:
- image["num"] = 0
- yield image
+ yield Message.Directory, post
+ post["num"] = 0
+ url = post["image"]["url"]
+ yield Message.Url, url, text.nameext_from_url(url, post)
- def images(self):
- """Return an iterable with all relevant image objects"""
+ def posts(self):
+ """Return an iterable with all relevant post objects"""
def _pagination(self, url, folder_id=None):
params = {
@@ -53,26 +56,26 @@ class PiczelExtractor(Extractor):
while True:
data = self.request(url, params=params).json()
- yield from data
-
- if len(data) < 32:
+ if not data:
return
params["from_id"] = data[-1]["id"]
+ yield from data
class PiczelUserExtractor(PiczelExtractor):
"""Extractor for all images from a user's gallery"""
subcategory = "user"
pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$"
- test = ("https://piczel.tv/gallery/Maximumwarp", {
- "count": ">= 45",
+ test = ("https://piczel.tv/gallery/Bikupan", {
+ "range": "1-100",
+ "count": ">= 100",
})
def __init__(self, match):
PiczelExtractor.__init__(self, match)
self.user = match.group(1)
- def images(self):
+ def posts(self):
url = "{}/api/users/{}/gallery".format(self.root, self.user)
return self._pagination(url)
@@ -92,7 +95,7 @@ class PiczelFolderExtractor(PiczelExtractor):
PiczelExtractor.__init__(self, match)
self.user, self.folder_id = match.groups()
- def images(self):
+ def posts(self):
url = "{}/api/users/{}/gallery".format(self.root, self.user)
return self._pagination(url, self.folder_id)
@@ -106,6 +109,7 @@ class PiczelImageExtractor(PiczelExtractor):
"content": "df9a053a24234474a19bce2b7e27e0dec23bff87",
"keyword": {
"created_at": "2018-07-22T05:13:58.000Z",
+ "date": "dt:2018-07-22 05:13:58",
"description": None,
"extension": "png",
"favorites_count": int,
@@ -118,7 +122,7 @@ class PiczelImageExtractor(PiczelExtractor):
"nsfw": False,
"num": 0,
"password_protected": False,
- "tags": "fanart, commission, altair, recreators, ",
+ "tags": ["fanart", "commission", "altair", "recreators"],
"title": "Altair",
"user": dict,
"views": int,
@@ -129,6 +133,6 @@ class PiczelImageExtractor(PiczelExtractor):
PiczelExtractor.__init__(self, match)
self.image_id = match.group(1)
- def images(self):
+ def posts(self):
url = "{}/api/gallery/image/{}".format(self.root, self.image_id)
return (self.request(url).json(),)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8a10028..eaf97fd 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -12,6 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
from datetime import datetime, timedelta
+import itertools
import hashlib
import time
@@ -27,11 +28,11 @@ class PixivExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.api = PixivAppAPI(self)
- self.user_id = -1
self.load_ugoira = self.config("ugoira", True)
def items(self):
- metadata = self.get_metadata()
+ ratings = {0: "General", 1: "R-18", 2: "R-18G"}
+ metadata = self.metadata()
yield Message.Version, 1
for work in self.works():
@@ -46,6 +47,7 @@ class PixivExtractor(Extractor):
work["num"] = 0
work["tags"] = [tag["name"] for tag in work["tags"]]
work["date"] = text.parse_datetime(work["create_date"])
+ work["rating"] = ratings.get(work["x_restrict"])
work["suffix"] = ""
work.update(metadata)
@@ -74,11 +76,9 @@ class PixivExtractor(Extractor):
def works(self):
"""Return an iterable containing all relevant 'work'-objects"""
- def get_metadata(self, user=None):
+ def metadata(self):
"""Collect metadata for extractor-job"""
- if not user:
- user = self.api.user_detail(self.user_id)
- return {"user": user}
+ return {}
class PixivUserExtractor(PixivExtractor):
@@ -102,8 +102,15 @@ class PixivUserExtractor(PixivExtractor):
"&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), {
"url": "25b1cd81153a8ff82eec440dd9f20a4a22079658",
}),
+        # avatar (#595, #623)

+ ("https://www.pixiv.net/en/users/173530", {
+ "options": (("avatar", True),),
+ "content": "22af450d4dbaf4973d370f164f66f48c7382a6de",
+ "range": "1",
+ }),
+ # deleted account
("http://www.pixiv.net/member_illust.php?id=173531", {
- "exception": exception.NotFoundError,
+ "count": 0,
}),
("https://www.pixiv.net/en/users/173530"),
("https://www.pixiv.net/en/users/173530/manga"),
@@ -136,6 +143,27 @@ class PixivUserExtractor(PixivExtractor):
if tag in [t["name"].lower() for t in work["tags"]]
)
+ if self.config("avatar"):
+ user = self.api.user_detail(self.user_id)
+ url = user["profile_image_urls"]["medium"].replace("_170.", ".")
+ avatar = {
+ "create_date" : None,
+ "height" : 0,
+ "id" : "avatar",
+ "image_urls" : None,
+ "meta_pages" : (),
+ "meta_single_page": {"original_image_url": url},
+ "page_count" : 1,
+ "sanity_level" : 0,
+ "tags" : (),
+ "title" : "avatar",
+ "type" : "avatar",
+ "user" : user,
+ "width" : 0,
+ "x_restrict" : 0,
+ }
+ works = itertools.chain((avatar,), works)
+
return works
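
Prepending the synthetic avatar work uses itertools.chain so the underlying API iterator stays lazy; the pattern in isolation:

    import itertools

    def with_header(first_item, iterator):
        """Yield first_item, then everything from iterator, lazily."""
        return itertools.chain((first_item,), iterator)

    items = with_header({"id": "avatar"}, iter([{"id": 1}, {"id": 2}]))
    assert [i["id"] for i in items] == ["avatar", 1, 2]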
@@ -203,15 +231,9 @@ class PixivWorkExtractor(PixivExtractor):
def __init__(self, match):
PixivExtractor.__init__(self, match)
self.illust_id = match.group(1) or match.group(2)
- self.load_ugoira = True
- self.work = None
def works(self):
- return (self.work,)
-
- def get_metadata(self, user=None):
- self.work = self.api.illust_detail(self.illust_id)
- return PixivExtractor.get_metadata(self, self.work["user"])
+ return (self.api.illust_detail(self.illust_id),)
class PixivFavoriteExtractor(PixivExtractor):
@@ -220,8 +242,8 @@ class PixivFavoriteExtractor(PixivExtractor):
directory_fmt = ("{category}", "bookmarks",
"{user_bookmark[id]} {user_bookmark[account]}")
archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
- pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/"
- r"(?:(?:en/)?users/(\d+)/(bookmarks/artworks|following)"
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?"
+ r"users/(\d+)/(bookmarks/artworks(?:/([^/?&#]+))?|following)"
r"|bookmark\.php(?:\?([^#]*))?)")
test = (
("https://www.pixiv.net/en/users/173530/bookmarks/artworks", {
@@ -231,20 +253,29 @@ class PixivFavoriteExtractor(PixivExtractor):
"url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
}),
# bookmarks with specific tag
+ (("https://www.pixiv.net/en/users/3137110"
+ "/bookmarks/artworks/%E3%81%AF%E3%82%93%E3%82%82%E3%82%93"), {
+ "url": "379b28275f786d946e01f721e54afe346c148a8c",
+ }),
+ # bookmarks with specific tag (legacy url)
(("https://www.pixiv.net/bookmark.php?id=3137110"
"&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), {
- "count": 2,
+ "url": "379b28275f786d946e01f721e54afe346c148a8c",
}),
# own bookmarks
("https://www.pixiv.net/bookmark.php", {
"url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
}),
+ # own bookmarks with tag (#596)
+ ("https://www.pixiv.net/bookmark.php?tag=foobar", {
+ "count": 0,
+ }),
# followed users (#515)
("https://www.pixiv.net/en/users/173530/following", {
"pattern": PixivUserExtractor.pattern,
"count": ">= 12",
}),
- # followed users (#515)
+ # followed users (legacy url) (#515)
("https://www.pixiv.net/bookmark.php?id=173530&type=user", {
"pattern": PixivUserExtractor.pattern,
"count": ">= 12",
@@ -255,11 +286,11 @@ class PixivFavoriteExtractor(PixivExtractor):
)
def __init__(self, match):
- uid, kind, query = match.groups()
+ uid, kind, self.tag, query = match.groups()
if query:
self.query = text.parse_query(query)
- uid = self.query.get("id", -1)
+ uid = self.query.get("id")
if not uid:
self.subcategory = "bookmark"
elif self.query.get("type") == "user":
@@ -280,12 +311,15 @@ class PixivFavoriteExtractor(PixivExtractor):
if "tag" in self.query:
tag = text.unquote(self.query["tag"])
+ elif self.tag:
+ tag = text.unquote(self.tag)
+
if "rest" in self.query and self.query["rest"] == "hide":
restrict = "private"
return self.api.user_bookmarks_illust(self.user_id, tag, restrict)
- def get_metadata(self, user=None):
+ def metadata(self):
if self.user_id:
user = self.api.user_detail(self.user_id)
else:
@@ -301,7 +335,7 @@ class PixivFavoriteExtractor(PixivExtractor):
for preview in self.api.user_following(self.user_id):
user = preview["user"]
user["_extractor"] = PixivUserExtractor
- url = "https://www.pixiv.net/member.php?id={}".format(user["id"])
+ url = "https://www.pixiv.net/users/{}".format(user["id"])
yield Message.Queue, url, user
@@ -327,7 +361,7 @@ class PixivRankingExtractor(PixivExtractor):
def works(self):
return self.api.illust_ranking(self.mode, self.date)
- def get_metadata(self, user=None):
+ def metadata(self):
query = text.parse_query(self.query)
mode = query.get("mode", "daily").lower()
@@ -393,7 +427,7 @@ class PixivSearchExtractor(PixivExtractor):
def works(self):
return self.api.search_illust(self.word, self.sort, self.target)
- def get_metadata(self, user=None):
+ def metadata(self):
query = text.parse_query(self.query)
if self.word:
@@ -446,7 +480,7 @@ class PixivFollowExtractor(PixivExtractor):
def works(self):
return self.api.illust_follow()
- def get_metadata(self, user=None):
+ def metadata(self):
self.api.login()
return {"user_follow": self.api.user}
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
index aa5c9c6..721fc2f 100644
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -21,8 +21,8 @@ class PururinGalleryExtractor(GalleryExtractor):
("https://pururin.io/gallery/38661/iowant-2", {
"pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg",
"keyword": {
- "title" : "Iowant 2!!",
- "title_en" : "Iowant 2!!",
+ "title" : "re:I ?owant 2!!",
+ "title_en" : "re:I ?owant 2!!",
"title_jp" : "",
"gallery_id": 38661,
"count" : 19,
diff --git a/gallery_dl/extractor/realbooru.py b/gallery_dl/extractor/realbooru.py
index 6d89151..70b4833 100644
--- a/gallery_dl/extractor/realbooru.py
+++ b/gallery_dl/extractor/realbooru.py
@@ -30,7 +30,7 @@ class RealbooruTagExtractor(booru.TagMixin, RealbooruExtractor):
pattern = (r"(?:https?://)?(?:www\.)?realbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
test = ("https://realbooru.com/index.php?page=post&s=list&tags=wine", {
- "count": 64,
+ "count": ">= 64",
})
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 4c83019..a312c1c 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -9,74 +9,104 @@
"""Extractors for https://www.reddit.com/"""
from .common import Extractor, Message
-from .. import text, util, extractor, exception
+from .. import text, util, exception
from ..cache import cache
class RedditExtractor(Extractor):
"""Base class for reddit extractors"""
category = "reddit"
+ directory_fmt = ("{category}", "{subreddit}")
+ filename_fmt = "{id} {title[:242]}.{extension}"
+ archive_fmt = "{filename}"
cookiedomain = None
def __init__(self, match):
Extractor.__init__(self, match)
self.api = RedditAPI(self)
- self.max_depth = int(self.config("recursion", 0))
- self._visited = set()
+ self.max_depth = self.config("recursion", 0)
def items(self):
- subre = RedditSubmissionExtractor.pattern
+ match_submission = RedditSubmissionExtractor.pattern.match
+ match_subreddit = RedditSubredditExtractor.pattern.match
+ match_user = RedditUserExtractor.pattern.match
+
+ parentdir = self.config("parent-directory")
+ videos = self.config("videos", True)
+
submissions = self.submissions()
+ visited = set()
depth = 0
yield Message.Version, 1
- with extractor.blacklist(
- util.SPECIAL_EXTRACTORS,
- [RedditSubredditExtractor, RedditUserExtractor]):
- while True:
- extra = []
- for url, data in self._urls(submissions):
- if url[0] == "#":
+
+ while True:
+ extra = []
+
+ for submission, comments in submissions:
+ urls = []
+
+ if submission:
+ yield Message.Directory, submission
+ visited.add(submission["id"])
+ url = submission["url"]
+
+ if url.startswith("https://i.redd.it/"):
+ text.nameext_from_url(url, submission)
+ yield Message.Url, url, submission
+
+ elif submission["is_video"]:
+ if videos:
+ text.nameext_from_url(url, submission)
+ if videos == "ytdl":
+ url = "https://www.reddit.com" + \
+ submission["permalink"]
+ else:
+ submission["_ytdl_extra"] = {
+ "title": submission["title"],
+ }
+ yield Message.Url, "ytdl:" + url, submission
+
+ elif not submission["is_self"]:
+ urls.append((url, submission))
+
+ elif parentdir:
+ yield Message.Directory, comments[0]
+
+ if self.api.comments:
+ if submission:
+ for url in text.extract_iter(
+ submission["selftext_html"] or "",
+ ' href="', '"'):
+ urls.append((url, submission))
+ for comment in comments:
+ for url in text.extract_iter(
+ comment["body_html"] or "", ' href="', '"'):
+ urls.append((url, comment))
+
+ for url, data in urls:
+ if not url or url[0] == "#":
continue
if url[0] == "/":
url = "https://www.reddit.com" + url
- match = subre.match(url)
+ match = match_submission(url)
if match:
extra.append(match.group(1))
- else:
+ elif not match_user(url) and not match_subreddit(url):
yield Message.Queue, text.unescape(url), data
- if not extra or depth == self.max_depth:
- return
- depth += 1
- submissions = (
- self.api.submission(sid) for sid in extra
- if sid not in self._visited
- )
+ if not extra or depth == self.max_depth:
+ return
+ depth += 1
+ submissions = (
+ self.api.submission(sid) for sid in extra
+                if sid not in visited
+ )
def submissions(self):
"""Return an iterable containing all (submission, comments) tuples"""
- def _urls(self, submissions):
- for submission, comments in submissions:
-
- if submission:
- self._visited.add(submission["id"])
-
- if not submission["is_self"]:
- yield submission["url"], submission
-
- for url in text.extract_iter(
- submission["selftext_html"] or "", ' href="', '"'):
- yield url, submission
-
- if comments:
- for comment in comments:
- for url in text.extract_iter(
- comment["body_html"] or "", ' href="', '"'):
- yield url, comment
-
class RedditSubredditExtractor(RedditExtractor):
"""Extractor for URLs from subreddits on reddit.com"""
@@ -84,7 +114,10 @@ class RedditSubredditExtractor(RedditExtractor):
pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/"
r"([^/?&#]+(?:/[a-z]+)?)/?(?:\?([^#]*))?(?:$|#)")
test = (
- ("https://www.reddit.com/r/lavaporn/"),
+ ("https://www.reddit.com/r/lavaporn/", {
+ "range": "1-20",
+ "count": ">= 20",
+ }),
("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"),
("https://old.reddit.com/r/lavaporn/"),
("https://np.reddit.com/r/lavaporn/"),
@@ -210,7 +243,7 @@ class RedditAPI():
link_id = "t3_" + submission_id if self.morecomments else None
submission, comments = self._call(endpoint, {"limit": self.comments})
return (submission["data"]["children"][0]["data"],
- self._flatten(comments, link_id) if self.comments else None)
+ self._flatten(comments, link_id) if self.comments else ())
def submissions_subreddit(self, subreddit, params):
"""Collect all (submission, comments)-tuples of a subreddit"""
@@ -290,7 +323,8 @@ class RedditAPI():
raise exception.AuthorizationError()
if data["error"] == 404:
raise exception.NotFoundError()
- raise Exception(data["message"])
+ self.log.debug(data)
+ raise exception.StopExtraction(data.get("message"))
return data
def _pagination(self, endpoint, params):
@@ -315,7 +349,7 @@ class RedditAPI():
except exception.AuthorizationError:
pass
else:
- yield post, None
+ yield post, ()
elif kind == "t1" and self.comments:
yield None, (post,)
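
The rewritten items() dispatches each submission by URL shape instead of relying on an extractor blacklist. Roughly, with the Message plumbing stripped out (a sketch of the decision tree, not the exact control flow):

    def classify(submission, videos=True):
        """Return (kind, url) for a submission dict; illustrative only."""
        url = submission["url"]
        if url.startswith("https://i.redd.it/"):
            return "file", url                  # direct image upload
        if submission["is_video"]:
            if not videos:
                return "skip", None
            if videos == "ytdl":                # let youtube-dl resolve it
                return "ytdl", ("https://www.reddit.com"
                                + submission["permalink"])
            return "ytdl", url                  # v.redd.it stream
        if not submission["is_self"]:
            return "queue", url                 # hand off to other extractors
        return "self", None                     # text post, links come from
                                                # selftext_html instead

    assert classify({"url": "https://i.redd.it/x.jpg", "is_video": False,
                     "is_self": False})[0] == "file"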
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 2c9746e..521b034 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -83,11 +83,11 @@ class SexcomExtractor(Extractor):
data["url"] = "ytdl:" + text.extract(
extr('<iframe', '>'), ' src="', '"')[0]
else:
- data["url"] = extr(' src="', '"')
+ data["url"] = text.unescape(extr(' src="', '"').partition("?")[0])
text.nameext_from_url(data["url"], data)
data["uploader"] = extr('itemprop="author">', '<')
- data["date"] = extr('datetime="', '"')
+ data["date"] = text.parse_datetime(extr('datetime="', '"'))
data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>'))
data["comments"] = text.parse_int(extr('Comments (', ')'))
@@ -102,28 +102,28 @@ class SexcomPinExtractor(SexcomExtractor):
test = (
# picture
("https://www.sex.com/pin/56714360/", {
- "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86",
- "content": "963ed681cf53904173c7581b713c7f9471f04db0",
+ "pattern": "https://cdn.sex.com/images/.+/2018/10/02/20037816.jpg",
+ "content": "e579e3283fea812d0545a3f79734b79bc3c51acb",
"keyword": {
- "comments": int,
- "date": "2018-10-02T21:18:17-04:00",
+ "comments" : int,
+ "date" : "dt:2018-10-02 21:18:17",
"extension": "jpg",
- "filename": "20037816",
- "likes": int,
- "pin_id": 56714360,
- "repins": int,
- "tags": list,
+ "filename" : "20037816",
+ "likes" : int,
+ "pin_id" : 56714360,
+ "repins" : int,
+ "tags" : list,
"thumbnail": str,
- "title": "Pin #56714360",
- "type": "picture",
- "uploader": "alguem",
- "url": str,
+ "title" : "Pin #56714360",
+ "type" : "picture",
+ "uploader" : "alguem",
+ "url" : str,
},
}),
# gif
("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", {
- "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1",
- "content": "a54b37eb39d565094c54ad7d21244fe8f978fb14",
+ "pattern": "https://cdn.sex.com/images/.+/2014/01/26/4829951.gif",
+ "content": "af6726d74d11d819e1c885fe5303f711862eae96",
}),
# video
("https://www.sex.com/pin/55748341/", {
@@ -134,10 +134,6 @@ class SexcomPinExtractor(SexcomExtractor):
("https://www.sex.com/pin/55847384-very-nicely-animated/", {
"pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2",
}),
- # 404
- ("https://www.sex.com/pin/55847385/", {
- "count": 0,
- }),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 298b7e0..31dbdad 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -52,7 +52,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
"title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
"title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
"gallery_id": 40996,
- "date" : "type:datetime",
+ "date" : "dt:2018-06-29 00:00:00",
"count" : 42,
"collection": "",
"artist" : ["Itou Life"],
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index a1f2199..0505fa9 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -310,7 +310,7 @@ class TumblrTagExtractor(TumblrExtractor):
def __init__(self, match):
TumblrExtractor.__init__(self, match)
- self.tag = text.unquote(match.group(3))
+ self.tag = text.unquote(match.group(3).replace("-", " "))
def posts(self):
return self.api.posts(self.blog, {"tag": self.tag})
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index dc558c0..2a04463 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://twitter.com/"""
+"""Extractors for https://twitter.com/"""
from .common import Extractor, Message
from .. import text, exception
@@ -21,8 +21,11 @@ class TwitterExtractor(Extractor):
directory_fmt = ("{category}", "{user[name]}")
filename_fmt = "{tweet_id}_{num}.{extension}"
archive_fmt = "{tweet_id}_{retweet_id}_{num}"
+ cookiedomain = ".twitter.com"
root = "https://twitter.com"
sizes = (":orig", ":large", ":medium", ":small")
+ user_agent = ("Mozilla/5.0 (Windows NT 6.1; WOW64; "
+ "Trident/7.0; rv:11.0) like Gecko")
def __init__(self, match):
Extractor.__init__(self, match)
@@ -32,7 +35,7 @@ class TwitterExtractor(Extractor):
self.retweets = self.config("retweets", True)
self.twitpic = self.config("twitpic", False)
self.content = self.config("content", False)
- self.videos = self.config("videos", False)
+ self.videos = self.config("videos", True)
if self.content:
self._emoji_sub = re.compile(
@@ -117,7 +120,8 @@ class TwitterExtractor(Extractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- page = self.request(self.root + "/login").text
+ headers = {"User-Agent": self.user_agent}
+ page = self.request(self.root + "/login", headers=headers).text
pos = page.index('name="authenticity_token"')
token = text.extract(page, 'value="', '"', pos-80)[0]
@@ -131,11 +135,15 @@ class TwitterExtractor(Extractor):
"redirect_after_login" : "",
"remember_me" : "1",
}
- response = self.request(url, method="POST", data=data)
-
+ response = self.request(url, method="POST", headers=headers, data=data)
if "/error" in response.url:
raise exception.AuthenticationError()
- return self.session.cookies
+
+ return {
+ cookie.name: cookie.value
+ for cookie in self.session.cookies
+ if cookie.domain and "twitter.com" in cookie.domain
+ }
def _data_from_tweet(self, tweet):
extr = text.extract_from(tweet)
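
_login_impl() now returns only the twitter.com cookies instead of the whole session jar, which keeps the cached login state minimal. Filtering a requests cookie jar by domain, in isolation:

    import requests

    def domain_cookies(session, domain="twitter.com"):
        """Map cookie names to values for cookies scoped to 'domain'."""
        return {
            cookie.name: cookie.value
            for cookie in session.cookies
            if cookie.domain and domain in cookie.domain
        }

    session = requests.Session()
    session.cookies.set("ct0", "token", domain=".twitter.com")
    session.cookies.set("other", "x", domain=".example.com")
    assert domain_cookies(session) == {"ct0": "token"}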
@@ -353,7 +361,11 @@ class TwitterTweetExtractor(TwitterExtractor):
# content with emoji, newlines, hashtags (#338)
("https://twitter.com/yumi_san0112/status/1151144618936823808", {
"options": (("content", True),),
- "keyword": "0b7a3d05607b480c1412dfd85f8606478313e7bf",
+ "keyword": {"content": (
+ "re:晴、お誕生日おめでとう🎉!\n実は下の名前が同じなので結構親近感ある"
+ "アイドルです✨\n今年の晴ちゃんめちゃくちゃ可愛い路線攻めてるから、そろ"
+ "そろまたかっこいい晴が見たいですねw\n#結城晴生誕祭2019\n#結城晴生誕祭"
+ )},
}),
# Reply to another tweet (#403)
("https://twitter.com/tyson_hesse/status/1103767554424598528", {
@@ -365,9 +377,12 @@ class TwitterTweetExtractor(TwitterExtractor):
"pattern": r"https://pbs.twimg.com/media/EAel0vUUYAAZ4Bq.jpg:orig",
}),
# quoted tweet (#526)
- ("https://twitter.com/Meiyu_miu/status/1070693241413021696", {
- "count": 4,
- "keyword": "0c627af2b8cdccc7e0da8fd221155c4a4a3141a8",
+ ("https://twitter.com/Pistachio/status/1222690391817932803", {
+ "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg",
+ "keyword": {
+ "author": {"name": "Afro_Herper", "id": 786047748508221440},
+ "user" : {"name": "Pistachio" , "id": 3533231},
+ },
}),
# TwitPic embeds (#579)
("https://twitter.com/i/web/status/112900228289540096", {
@@ -384,11 +399,7 @@ class TwitterTweetExtractor(TwitterExtractor):
def tweets(self):
url = "{}/i/web/status/{}".format(self.root, self.tweet_id)
cookies = {"app_shell_visited": "1"}
- headers = {
- "Referer" : url,
- "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; "
- "Trident/7.0; rv:11.0) like Gecko",
- }
+ headers = {"User-Agent": self.user_agent, "Referer": url}
response = self.request(url, cookies=cookies, headers=headers)
if response.history and response.url == self.root + "/":
@@ -400,6 +411,81 @@ class TwitterTweetExtractor(TwitterExtractor):
return (page[beg:end],)
+class TwitterBookmarkExtractor(TwitterExtractor):
+ """Extractor for bookmarked tweets"""
+ subcategory = "bookmark"
+ pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()"
+ test = ("https://twitter.com/i/bookmarks",)
+
+ def items(self):
+ self.login()
+ if not self.logged_in:
+ raise exception.AuthorizationError("Login required")
+ for cookie in self.session.cookies:
+ cookie.expires = None
+
+ url = "https://api.twitter.com/2/timeline/bookmark.json"
+ params = {
+ "include_profile_interstitial_type": "1",
+ "include_blocking": "1",
+ "include_blocked_by": "1",
+ "include_followed_by": "1",
+ "include_want_retweets": "1",
+ "include_mute_edge": "1",
+ "include_can_dm": "1",
+ "include_can_media_tag": "1",
+ "skip_status": "1",
+ "cards_platform": "Web-12",
+ "include_cards": "1",
+ "include_composer_source": "true",
+ "include_ext_alt_text": "true",
+ "include_reply_count": "1",
+ "tweet_mode": "extended",
+ "include_entities": "true",
+ "include_user_entities": "true",
+ "include_ext_media_color": "true",
+ "include_ext_media_availability": "true",
+ "send_error_codes": "true",
+ "simple_quoted_tweets": "true",
+ "count": "100",
+ "cursor": None,
+ "ext": "mediaStats%2CcameraMoment",
+ }
+ headers = {
+ "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
+ "COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
+ "4FA33AGWWjCpTnA",
+ "Origin": self.root,
+ "Referer": self.root + "/i/bookmarks",
+ "x-csrf-token": self.session.cookies.get("ct0"),
+ "x-twitter-active-user": "yes",
+ "x-twitter-auth-type": "Auth2Session",
+ "x-twitter-client-language": "en",
+ }
+
+ while True:
+ response = self.request(
+ url, params=params, headers=headers, fatal=False)
+ if response.status_code >= 400:
+ raise exception.StopExtraction(response.text)
+ data = response.json()
+ tweets = data["globalObjects"]["tweets"]
+
+ if not tweets:
+ return
+ for tweet_id, tweet_data in tweets.items():
+ tweet_url = "{}/i/web/status/{}".format(self.root, tweet_id)
+ tweet_data["_extractor"] = TwitterTweetExtractor
+ yield Message.Queue, tweet_url, tweet_data
+
+ inst = data["timeline"]["instructions"][0]
+ for entry in inst["addEntries"]["entries"]:
+ if entry["entryId"].startswith("cursor-bottom-"):
+ params["cursor"] = \
+ entry["content"]["operation"]["cursor"]["value"]
+ break
+
+
@memcache()
def _guest_token(extr, headers):
return extr.request(
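
Paging the bookmark timeline means pulling the next cursor out of the addEntries instruction; just that step, with the response layout copied from the loop above:

    def next_cursor(data):
        """Return the bottom cursor from a timeline response, or None."""
        inst = data["timeline"]["instructions"][0]
        for entry in inst["addEntries"]["entries"]:
            if entry["entryId"].startswith("cursor-bottom-"):
                return entry["content"]["operation"]["cursor"]["value"]
        return None

    sample = {"timeline": {"instructions": [{"addEntries": {"entries": [
        {"entryId": "cursor-bottom-1",
         "content": {"operation": {"cursor": {"value": "ABC"}}}},
    ]}}]}}
    assert next_cursor(sample) == "ABC"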
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index a24d3fe..a020064 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -34,6 +34,9 @@ class VscoExtractor(Extractor):
yield Message.Directory, {"user": self.user}
for img in self.images():
+ if not img or "responsive_url" not in img:
+ continue
+
if img["is_video"]:
if not videos:
continue
@@ -98,6 +101,8 @@ class VscoExtractor(Extractor):
@staticmethod
def _transform_media(media):
+ if "responsiveUrl" not in media:
+ return None
media["_id"] = media["id"]
media["is_video"] = media["isVideo"]
media["grid_name"] = media["gridName"]
@@ -111,18 +116,19 @@ class VscoExtractor(Extractor):
class VscoUserExtractor(VscoExtractor):
"""Extractor for images from a user on vsco.co"""
subcategory = "user"
- pattern = BASE_PATTERN + r"(?:/images(?:/\d+)?)?/?(?:$|[?#])"
+ pattern = BASE_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])"
test = (
- ("https://vsco.co/missuri/images/1", {
+ ("https://vsco.co/missuri/gallery", {
"pattern": r"https://image(-aws.+)?\.vsco\.co/[0-9a-f/]+/vsco\w+",
"range": "1-80",
"count": 80,
}),
+ ("https://vsco.co/missuri/images/1"),
("https://vsco.co/missuri"),
)
def images(self):
- url = "{}/{}/images/1".format(self.root, self.user)
+ url = "{}/{}/gallery".format(self.root, self.user)
data = self._extract_preload_state(url)
tkn = data["users"]["currentUser"]["tkn"]
@@ -186,7 +192,7 @@ class VscoImageExtractor(VscoExtractor):
"grid" : "erenyildiz",
"meta" : dict,
"tags" : list,
- "date" : "type:datetime",
+ "date" : "dt:2019-07-21 19:12:11",
"video" : False,
"width" : 1537,
"height": 1537,
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 737c253..043da0b 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://wallhaven.cc/"""
+"""Extractors for https://wallhaven.cc/"""
from .common import Extractor, Message
from .. import text
@@ -77,7 +77,7 @@ class WallhavenImageExtractor(WallhavenExtractor):
"group" : "Owner/Developer",
"username" : "AksumkA",
},
- "date" : "type:datetime",
+ "date" : "dt:2014-08-31 06:17:19",
"wh_category": "anime",
"views" : int,
"favorites" : int,
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 49fa082..6a779d9 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -56,7 +56,7 @@ class WeiboExtractor(Extractor):
info = obj["page_info"]["media_info"]
url = info.get("stream_url_hd") or info.get("stream_url")
- if url and not info.get("goto"):
+ if url:
data = text.nameext_from_url(url, {
"num" : num,
"pid" : 0,
@@ -65,6 +65,10 @@ class WeiboExtractor(Extractor):
"height": 0,
"status": status,
})
+ if data["extension"] == "m3u8":
+ url = "ytdl:" + url
+ data["extension"] = "mp4"
+ data["_ytdl_extra"] = {"protocol": "m3u8_native"}
yield Message.Url, url, data
if self.retweets and "retweeted_status" in obj:
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 62acb28..0422589 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -49,7 +49,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
"pageURL": str,
"thumbURL": str,
"gallery": {
- "date": "type:datetime",
+ "date": "dt:2019-04-16 00:07:31",
"description": "",
"dislikes": int,
"id": 11748968,
diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py
deleted file mode 100644
index b07ba4b..0000000
--- a/gallery_dl/extractor/yaplog.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://yaplog.jp/"""
-
-from .common import Extractor, Message, AsynchronousMixin
-from .. import text, util
-
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?yaplog\.jp/([\w-]+)"
-
-
-class YaplogExtractor(AsynchronousMixin, Extractor):
- """Base class for yaplog extractors"""
- category = "yaplog"
- root = "https://yaplog.jp"
- filename_fmt = "{post[id]}_{post[title]}_{id}.{extension}"
- directory_fmt = ("{category}", "{post[user]}")
- archive_fmt = "{post[user]}_{id}"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.user = match.group(1)
-
- def items(self):
- yield Message.Version, 1
- for post, urls in self.posts():
- yield Message.Directory, {"post": post}
- for num, url in enumerate(urls, 1):
- page = self.request(url).text if num > 1 else url
- iurl = text.extract(page, '<img src="', '"')[0]
- if iurl[0] == "/":
- iurl = text.urljoin(self.root, iurl)
- name, _, ext = iurl.rpartition("/")[2].rpartition(".")
- iid = name.rpartition("_")[0] or name
- image = {
- "url" : iurl,
- "num" : num,
- "id" : text.parse_int(iid, iid),
- "filename" : name,
- "extension": ext,
- "post" : post,
- }
- yield Message.Url, iurl, image
-
- def posts(self):
- """Return an iterable with (data, image page URLs) tuples"""
-
- def _parse_post(self, url):
- page = self.request(url).text
- title, pos = text.extract(page, 'class="title">', '<')
- date , pos = text.extract(page, 'class="date">' , '<', pos)
- pid , pos = text.extract(page, '/archive/' , '"', pos)
- prev , pos = text.extract(page, 'class="last"><a href="', '"', pos)
-
- urls = list(text.extract_iter(page, '<li><a href="', '"', pos))
- if urls:
- urls[0] = page # cache HTML of first page
-
- if len(urls) == 24 and text.extract(page, '(1/', ')')[0] != '24':
- # there are a maximum of 24 image entries in an /image/ page
- # -> search /archive/ page for the rest
- url = "{}/{}/archive/{}".format(self.root, self.user, pid)
- page = self.request(url).text
-
- base = "{}/{}/image/{}/".format(self.root, self.user, pid)
- for part in util.advance(text.extract_iter(
- page, base, '"', pos), 24):
- urls.append(base + part)
-
- return prev, urls, {
- "id" : text.parse_int(pid),
- "title": text.unescape(title[:-3]),
- "user" : self.user,
- "date" : text.parse_datetime(date, "%B %d [%a], %Y, %H:%M"),
- }
-
-
-class YaplogBlogExtractor(YaplogExtractor):
- """Extractor for a user's blog on yaplog.jp"""
- subcategory = "blog"
- pattern = BASE_PATTERN + r"/?(?:$|[?&#])"
- test = ("https://yaplog.jp/omitakashi3", {
- "pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/",
- "count": ">= 2",
- })
-
- def posts(self):
- url = "{}/{}/image/".format(self.root, self.user)
- while url:
- url, images, data = self._parse_post(url)
- yield data, images
-
-
-class YaplogPostExtractor(YaplogExtractor):
- """Extractor for images from a blog post on yaplog.jp"""
- subcategory = "post"
- pattern = BASE_PATTERN + r"/(?:archive|image)/(\d+)"
- test = (
- ("https://yaplog.jp/imamiami0726/image/1299", {
- "url": "896cae20fa718735a57e723c48544e830ff31345",
- "keyword": "22df8ad6cb534514c6bb2ff000381d156769a620",
- }),
- # complete image URLs (#443)
- ("https://yaplog.jp/msjane/archive/246", {
- "pattern": r"https://yaplog.jp/cv/msjane/img/246/img\d+_t.jpg"
- }),
- # empty post (#443)
- ("https://yaplog.jp/f_l_a_s_c_o/image/872", {
- "count": 0,
- }),
- # blog names with '-' (#443)
- ("https://yaplog.jp/a-pierrot-o/image/3946/22779"),
- )
-
- def __init__(self, match):
- YaplogExtractor.__init__(self, match)
- self.post_id = match.group(2)
-
- def posts(self):
- url = "{}/{}/image/{}".format(self.root, self.user, self.post_id)
- _, images, data = self._parse_post(url)
- return ((data, images),)
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index c717dc2..6ba2572 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -182,7 +182,14 @@ class DownloadJob(Job):
self.downloaders = {}
self.postprocessors = None
self.out = output.select()
- self.visited = parent.visited if parent else set()
+
+ if parent:
+ self.visited = parent.visited
+ pfmt = parent.pathfmt
+ if pfmt and parent.extractor.config("parent-directory"):
+ self.extractor._parentdir = pfmt.directory
+ else:
+ self.visited = set()
def handle_url(self, url, kwdict, fallback=None):
"""Download the resource specified in 'url'"""
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 38e2f60..f084950 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -81,6 +81,36 @@ def initialize_logging(loglevel):
return logging.getLogger("gallery-dl")
+def configure_logging(loglevel):
+ root = logging.getLogger()
+ minlevel = loglevel
+
+ # stream logging handler
+ handler = root.handlers[0]
+ opts = config.interpolate(("output",), "log")
+ if opts:
+ if isinstance(opts, str):
+ opts = {"format": opts}
+ if handler.level == LOG_LEVEL and "level" in opts:
+ handler.setLevel(opts["level"])
+ if "format" in opts or "format-date" in opts:
+ handler.setFormatter(Formatter(
+ opts.get("format", LOG_FORMAT),
+ opts.get("format-date", LOG_FORMAT_DATE),
+ ))
+ if minlevel > handler.level:
+ minlevel = handler.level
+
+ # file logging handler
+ handler = setup_logging_handler("logfile", lvl=loglevel)
+ if handler:
+ root.addHandler(handler)
+ if minlevel > handler.level:
+ minlevel = handler.level
+
+ root.setLevel(minlevel)
+
+
def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
"""Setup a new logging handler"""
opts = config.interpolate(("output",), key)
@@ -112,22 +142,6 @@ def setup_logging_handler(key, fmt=LOG_FORMAT, lvl=LOG_LEVEL):
return handler
-def configure_logging_handler(key, handler):
- """Configure a logging handler"""
- opts = config.interpolate(("output",), key)
- if not opts:
- return
- if isinstance(opts, str):
- opts = {"format": opts}
- if handler.level == LOG_LEVEL and "level" in opts:
- handler.setLevel(opts["level"])
- if "format" in opts or "format-date" in opts:
- handler.setFormatter(Formatter(
- opts.get("format", LOG_FORMAT),
- opts.get("format-date", LOG_FORMAT_DATE),
- ))
-
-
# --------------------------------------------------------------------
# Utility functions
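
configure_logging() folds the old configure_logging_handler() into a single pass and lowers the root logger to the minimum level of its handlers. The handler-adjustment core, stripped of gallery-dl's config plumbing (LOG_LEVEL/LOG_FORMAT replaced by explicit defaults):

    import logging

    def apply_log_opts(handler, opts,
                       default_level=logging.INFO,
                       default_fmt="[%(name)s][%(levelname)s] %(message)s"):
        """Apply an 'output.log'-style option value to a handler."""
        if isinstance(opts, str):           # a bare string sets the format
            opts = {"format": opts}
        if handler.level == default_level and "level" in opts:
            handler.setLevel(opts["level"])
        if "format" in opts or "format-date" in opts:
            handler.setFormatter(logging.Formatter(
                opts.get("format", default_fmt),
                opts.get("format-date"),
            ))

    handler = logging.StreamHandler()
    handler.setLevel(logging.INFO)
    apply_log_opts(handler, {"level": logging.DEBUG, "format": "%(message)s"})
    assert handler.level == logging.DEBUG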
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 162eb9e..706e706 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -42,7 +42,7 @@ class UgoiraPP(PostProcessor):
if arg == "-vcodec" or arg in ("-c", "-codec") and (
not stream or stream.partition(":")[0] in ("v", "V")):
vcodec = self.args[index + 1]
- # use filter if libx264/5 is explicitly or implicitly used
+        # use filter when encoding with libx264/5
self.prevent_odd = (
vcodec in ("libx264", "libx265") or
not vcodec and self.extension.lower() in ("mp4", "mkv"))
@@ -91,12 +91,12 @@ class UgoiraPP(PostProcessor):
# collect command-line arguments
args = [self.ffmpeg]
if rate_in:
- args += ["-r", str(rate_in)]
- args += ["-i", ffconcat]
+ args += ("-r", str(rate_in))
+ args += ("-i", ffconcat)
if rate_out:
- args += ["-r", str(rate_out)]
+ args += ("-r", str(rate_out))
if self.prevent_odd:
- args += ["-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)"]
+ args += ("-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)")
if self.args:
args += self.args
self.log.debug("ffmpeg args: %s", args)
@@ -106,8 +106,8 @@ class UgoiraPP(PostProcessor):
try:
if self.twopass:
if "-f" not in args:
- args += ["-f", self.extension]
- args += ["-passlogfile", tempdir + "/ffmpeg2pass", "-pass"]
+ args += ("-f", self.extension)
+ args += ("-passlogfile", tempdir + "/ffmpeg2pass", "-pass")
self._exec(args + ["1", "-y", os.devnull])
self._exec(args + ["2", pathfmt.realpath])
else:
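Note: swapping the list literals for tuples is behavior-neutral (list += tuple extends in place); the functional piece is the crop filter, which is needed because libx264/libx265 reject odd frame dimensions with yuv420p input. A sketch of the resulting two-pass invocation, with placeholder paths and frame rate:

    import os
    import subprocess

    # Placeholder inputs; UgoiraPP derives these from the frame data.
    args = ["ffmpeg", "-r", "30", "-i", "frames.ffconcat",
            "-vf", "crop=iw-mod(iw\\,2):ih-mod(ih\\,2)",  # force even w/h
            "-f", "mp4", "-passlogfile", "/tmp/ffmpeg2pass", "-pass"]
    subprocess.check_call(args + ["1", "-y", os.devnull])  # analysis pass
    subprocess.check_call(args + ["2", "out.mp4"])         # encoding pass
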
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 72dad5b..a3f4e0a 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -15,6 +15,8 @@ import datetime
import urllib.parse
+HTML_RE = re.compile("<[^>]+>")
+
INVALID_XML_CHARS = (
"\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
"\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
@@ -39,7 +41,7 @@ def clean_xml(xmldata, repl=""):
def remove_html(txt, repl=" ", sep=" "):
"""Remove html-tags from a string"""
try:
- txt = re.sub("<[^>]+>", repl, txt)
+ txt = HTML_RE.sub(repl, txt)
except TypeError:
return ""
if sep:
@@ -51,7 +53,7 @@ def split_html(txt, sep=None):
"""Split input string by html-tags"""
try:
return [
- x.strip() for x in re.split("<[^>]+>", txt)
+ x.strip() for x in HTML_RE.split(txt)
if x and not x.isspace()
]
except TypeError:
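Note: hoisting the pattern into the module-level HTML_RE skips the regex-cache lookup that re.sub()/re.split() perform on every call; results are unchanged:

    import re

    HTML_RE = re.compile("<[^>]+>")

    print(HTML_RE.sub(" ", "<p>foo<br/>bar</p>").split())           # ['foo', 'bar']
    print([s for s in HTML_RE.split("<p>foo</p><p>bar</p>") if s])  # ['foo', 'bar']
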
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 13bf80e..232047c 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -21,6 +21,7 @@ import datetime
import operator
import itertools
import urllib.parse
+from http.cookiejar import Cookie
from email.utils import mktime_tz, parsedate_tz
from . import text, exception
@@ -135,6 +136,67 @@ def remove_directory(path):
pass
+def load_cookiestxt(fp):
+ """Parse a Netscape cookies.txt file and return a list of its Cookies"""
+ cookies = []
+
+ for line in fp:
+
+ line = line.lstrip()
+ # strip '#HttpOnly_'
+ if line.startswith("#HttpOnly_"):
+ line = line[10:]
+ # ignore empty lines and comments
+ if not line or line[0] in ("#", "$"):
+ continue
+ # strip trailing '\n'
+ if line[-1] == "\n":
+ line = line[:-1]
+
+ domain, domain_specified, path, secure, expires, name, value = \
+ line.split("\t")
+ if not name:
+ name = value
+ value = None
+
+ cookies.append(Cookie(
+ 0, name, value,
+ None, False,
+ domain,
+ domain_specified == "TRUE",
+ domain.startswith("."),
+ path, False,
+ secure == "TRUE",
+ None if expires == "0" or not expires else expires,
+ False, None, None, {},
+ ))
+
+ return cookies
+
+
+def save_cookiestxt(fp, cookies):
+ """Write 'cookies' in Netscape cookies.txt format to 'fp'"""
+ fp.write("# Netscape HTTP Cookie File\n\n")
+
+ for cookie in cookies:
+ if cookie.value is None:
+ name = ""
+ value = cookie.name
+ else:
+ name = cookie.name
+ value = cookie.value
+
+ fp.write("\t".join((
+ cookie.domain,
+ "TRUE" if cookie.domain.startswith(".") else "FALSE",
+ cookie.path,
+ "TRUE" if cookie.secure else "FALSE",
+ "0" if cookie.expires is None else str(cookie.expires),
+ name,
+ value,
+ )) + "\n")
+
+
def code_to_language(code, default=None):
"""Map an ISO 639-1 language code to its actual name"""
return CODES.get((code or "").lower(), default)
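Note: the two new helpers are symmetric, as a quick round-trip shows (cookie data is illustrative; the line format matches the test_util.py cases further down):

    import io
    from gallery_dl import util

    data = ".example.org\tTRUE\t/\tTRUE\t0\tname\tvalue\n"
    cookies = util.load_cookiestxt(io.StringIO(data))

    buf = io.StringIO()
    util.save_cookiestxt(buf, cookies)
    assert buf.getvalue().endswith(data)  # header line, then the cookie again
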
@@ -419,63 +481,85 @@ class Formatter():
self.format_map = self.fields[0][1]
else:
self.format_map = lambda _: format_string
- del self.result
- del self.fields
+ del self.result, self.fields
- def format_map(self, kwargs):
- """Apply 'kwargs' to the initial format_string and return its result"""
+ def format_map(self, kwdict):
+ """Apply 'kwdict' to the initial format_string and return its result"""
+ result = self.result
for index, func in self.fields:
- self.result[index] = func(kwargs)
- return "".join(self.result)
+ result[index] = func(kwdict)
+ return "".join(result)
def _field_access(self, field_name, format_spec, conversion):
- first, rest = _string.formatter_field_name_split(field_name)
+ fmt = self._parse_format_spec(format_spec, conversion)
+
+ if "|" in field_name:
+ return self._apply_list([
+ self._parse_field_name(fn)
+ for fn in field_name.split("|")
+ ], fmt)
+ else:
+ key, funcs = self._parse_field_name(field_name)
+ if funcs:
+ return self._apply(key, funcs, fmt)
+ return self._apply_simple(key, fmt)
+ @staticmethod
+ def _parse_field_name(field_name):
+ first, rest = _string.formatter_field_name_split(field_name)
funcs = []
+
for is_attr, key in rest:
if is_attr:
func = operator.attrgetter
- elif ":" in key:
- func = self._slicegetter
else:
func = operator.itemgetter
+ try:
+ if ":" in key:
+ start, _, stop = key.partition(":")
+ stop, _, step = stop.partition(":")
+ start = int(start) if start else None
+ stop = int(stop) if stop else None
+ step = int(step) if step else None
+ key = slice(start, stop, step)
+ except TypeError:
+ pass # key is an integer
+
funcs.append(func(key))
- if conversion:
- funcs.append(self.CONVERSIONS[conversion])
+ return first, funcs
- if format_spec:
- if format_spec[0] == "?":
- func = self._format_optional
- elif format_spec[0] == "L":
- func = self._format_maxlen
- elif format_spec[0] == "J":
- func = self._format_join
- elif format_spec[0] == "R":
- func = self._format_replace
- else:
- func = self._format_default
- fmt = func(format_spec)
- else:
- fmt = str
+ def _parse_format_spec(self, format_spec, conversion):
+ fmt = self._build_format_func(format_spec)
+ if not conversion:
+ return fmt
- if funcs:
- return self._apply(first, funcs, fmt)
- return self._apply_simple(first, fmt)
+ conversion = self.CONVERSIONS[conversion]
+ if fmt is format:
+ return conversion
+ else:
+ def chain(obj):
+ return fmt(conversion(obj))
+ return chain
- def _apply_simple(self, key, fmt):
- def wrap(obj):
- if key in obj:
- obj = obj[key]
- else:
- obj = self.default
- return fmt(obj)
- return wrap
+ def _build_format_func(self, format_spec):
+ if format_spec:
+ fmt = format_spec[0]
+ if fmt == "?":
+ return self._parse_optional(format_spec)
+ if fmt == "L":
+ return self._parse_maxlen(format_spec)
+ if fmt == "J":
+ return self._parse_join(format_spec)
+ if fmt == "R":
+ return self._parse_replace(format_spec)
+ return self._default_format(format_spec)
+ return format
def _apply(self, key, funcs, fmt):
- def wrap(obj):
+ def wrap(kwdict):
try:
- obj = obj[key]
+ obj = kwdict[key]
for func in funcs:
obj = func(obj)
except Exception:
@@ -483,54 +567,66 @@ class Formatter():
return fmt(obj)
return wrap
- @staticmethod
- def _slicegetter(key):
- start, _, stop = key.partition(":")
- stop, _, step = stop.partition(":")
- start = int(start) if start else None
- stop = int(stop) if stop else None
- step = int(step) if step else None
- return operator.itemgetter(slice(start, stop, step))
+ def _apply_simple(self, key, fmt):
+ def wrap(kwdict):
+ return fmt(kwdict[key] if key in kwdict else self.default)
+ return wrap
- @staticmethod
- def _format_optional(format_spec):
- def wrap(obj):
- if not obj:
- return ""
- return before + format(obj, format_spec) + after
+ def _apply_list(self, lst, fmt):
+ def wrap(kwdict):
+ for key, funcs in lst:
+ try:
+ obj = kwdict[key]
+ for func in funcs:
+ obj = func(obj)
+ if obj is not None:
+ break
+ except Exception:
+ pass
+ else:
+ obj = self.default
+ return fmt(obj)
+ return wrap
+
+ def _parse_optional(self, format_spec):
before, after, format_spec = format_spec.split("/", 2)
before = before[1:]
- return wrap
+ fmt = self._build_format_func(format_spec)
- @staticmethod
- def _format_maxlen(format_spec):
- def wrap(obj):
- obj = format(obj, format_spec)
- return obj if len(obj) <= maxlen else replacement
+ def optional(obj):
+ return before + fmt(obj) + after if obj else ""
+ return optional
+
+ def _parse_maxlen(self, format_spec):
maxlen, replacement, format_spec = format_spec.split("/", 2)
maxlen = text.parse_int(maxlen[1:])
- return wrap
+ fmt = self._build_format_func(format_spec)
- @staticmethod
- def _format_join(format_spec):
- def wrap(obj):
- obj = separator.join(obj)
- return format(obj, format_spec)
+ def mlen(obj):
+ obj = fmt(obj)
+ return obj if len(obj) <= maxlen else replacement
+ return mlen
+
+ def _parse_join(self, format_spec):
separator, _, format_spec = format_spec.partition("/")
separator = separator[1:]
- return wrap
+ fmt = self._build_format_func(format_spec)
- @staticmethod
- def _format_replace(format_spec):
- def wrap(obj):
- obj = obj.replace(old, new)
- return format(obj, format_spec)
+ def join(obj):
+ return fmt(separator.join(obj))
+ return join
+
+ def _parse_replace(self, format_spec):
old, new, format_spec = format_spec.split("/", 2)
old = old[1:]
- return wrap
+ fmt = self._build_format_func(format_spec)
+
+ def replace(obj):
+ return fmt(obj.replace(old, new))
+ return replace
@staticmethod
- def _format_default(format_spec):
+ def _default_format(format_spec):
def wrap(obj):
return format(obj, format_spec)
return wrap
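Note: splitting _parse_field_name() from _parse_format_spec() is what enables the new '|' operator; each alternative field is parsed separately while the format spec is compiled once and shared. Behavior, as pinned down by the new tests in test_util.py below (kwdict is illustrative):

    from gallery_dl import util

    kwdict = {"title": "hello", "tags": ["a", "b", "c"], "alt": None}

    # '|' picks the first alternative whose value is not None
    print(util.Formatter("{alt|title}").format_map(kwdict))           # hello
    # 'J' joins, 'L' caps the length, 'R' replaces; specs now chain
    print(util.Formatter("{tags:J-/}").format_map(kwdict))            # a-b-c
    print(util.Formatter("{title:L3/too long/}").format_map(kwdict))  # too long
    print(util.Formatter("{title:Rh/H/}").format_map(kwdict))         # Hello
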
@@ -565,12 +661,14 @@ class PathFormat():
self.delete = False
self.path = self.realpath = self.temppath = ""
- basedir = expand_path(
- extractor.config("base-directory", (".", "gallery-dl")))
- if os.altsep and os.altsep in basedir:
- basedir = basedir.replace(os.altsep, os.sep)
- if basedir[-1] != os.sep:
- basedir += os.sep
+ basedir = extractor._parentdir
+ if not basedir:
+ basedir = expand_path(
+ extractor.config("base-directory", (".", "gallery-dl")))
+ if os.altsep and os.altsep in basedir:
+ basedir = basedir.replace(os.altsep, os.sep)
+ if basedir[-1] != os.sep:
+ basedir += os.sep
self.basedirectory = basedir
restrict = extractor.config("path-restrict", "auto")
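Note: PathFormat now prefers extractor._parentdir, so extractors spawned by a parent appear to inherit an already-normalized base directory; the altsep/trailing-separator fixup only runs in the config fallback. Roughly (a sketch, with expand_path() approximated via os.path):

    import os

    def resolve_basedir(parentdir, configured="."):
        basedir = parentdir  # set by a parent extractor, may be ""
        if not basedir:
            basedir = os.path.expandvars(os.path.expanduser(configured))
            if os.altsep and os.altsep in basedir:
                basedir = basedir.replace(os.altsep, os.sep)
            if basedir[-1] != os.sep:
                basedir += os.sep
        return basedir
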
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 36d729e..9171f15 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.12.3"
+__version__ = "1.13.2"
diff --git a/test/test_cache.py b/test/test_cache.py
new file mode 100644
index 0000000..31ece7e
--- /dev/null
+++ b/test/test_cache.py
@@ -0,0 +1,202 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+import unittest
+import tempfile
+import time
+
+from gallery_dl import config, util
+dbpath = tempfile.mkstemp()[1]
+config.set(("cache",), "file", dbpath)
+from gallery_dl import cache # noqa
+
+
+def tearDownModule():
+ util.remove_file(dbpath)
+
+
+class TestCache(unittest.TestCase):
+
+ def test_decorator(self):
+
+ @cache.memcache()
+ def mc1():
+ pass
+
+ @cache.memcache(maxage=10)
+ def mc2():
+ pass
+
+ @cache.cache()
+ def dbc():
+ pass
+
+ self.assertIsInstance(mc1, cache.CacheDecorator)
+ self.assertIsInstance(mc2, cache.MemoryCacheDecorator)
+ self.assertIsInstance(dbc, cache.DatabaseCacheDecorator)
+
+ def test_keyarg_mem_simple(self):
+ @cache.memcache(keyarg=2)
+ def ka(a, b, c):
+ return a+b+c
+
+ self.assertEqual(ka(1, 1, 1), 3)
+ self.assertEqual(ka(2, 2, 2), 6)
+
+ self.assertEqual(ka(0, 0, 1), 3)
+ self.assertEqual(ka(9, 9, 1), 3)
+ self.assertEqual(ka(0, 0, 2), 6)
+ self.assertEqual(ka(9, 9, 2), 6)
+
+ def test_keyarg_mem(self):
+ @cache.memcache(keyarg=2, maxage=10)
+ def ka(a, b, c):
+ return a+b+c
+
+ self.assertEqual(ka(1, 1, 1), 3)
+ self.assertEqual(ka(2, 2, 2), 6)
+
+ self.assertEqual(ka(0, 0, 1), 3)
+ self.assertEqual(ka(9, 9, 1), 3)
+ self.assertEqual(ka(0, 0, 2), 6)
+ self.assertEqual(ka(9, 9, 2), 6)
+
+ def test_keyarg_db(self):
+ @cache.cache(keyarg=2, maxage=10)
+ def ka(a, b, c):
+ return a+b+c
+
+ self.assertEqual(ka(1, 1, 1), 3)
+ self.assertEqual(ka(2, 2, 2), 6)
+
+ self.assertEqual(ka(0, 0, 1), 3)
+ self.assertEqual(ka(9, 9, 1), 3)
+ self.assertEqual(ka(0, 0, 2), 6)
+ self.assertEqual(ka(9, 9, 2), 6)
+
+ def test_expires_mem(self):
+ @cache.memcache(maxage=1)
+ def ex(a, b, c):
+ return a+b+c
+
+ self.assertEqual(ex(1, 1, 1), 3)
+ self.assertEqual(ex(2, 2, 2), 3)
+ self.assertEqual(ex(3, 3, 3), 3)
+
+ time.sleep(2)
+ self.assertEqual(ex(3, 3, 3), 9)
+ self.assertEqual(ex(2, 2, 2), 9)
+ self.assertEqual(ex(1, 1, 1), 9)
+
+ def test_expires_db(self):
+ @cache.cache(maxage=1)
+ def ex(a, b, c):
+ return a+b+c
+
+ self.assertEqual(ex(1, 1, 1), 3)
+ self.assertEqual(ex(2, 2, 2), 3)
+ self.assertEqual(ex(3, 3, 3), 3)
+
+ time.sleep(2)
+ self.assertEqual(ex(3, 3, 3), 9)
+ self.assertEqual(ex(2, 2, 2), 9)
+ self.assertEqual(ex(1, 1, 1), 9)
+
+ def test_update_mem_simple(self):
+ @cache.memcache(keyarg=0)
+ def up(a, b, c):
+ return a+b+c
+
+ self.assertEqual(up(1, 1, 1), 3)
+ up.update(1, 0)
+ up.update(2, 9)
+ self.assertEqual(up(1, 0, 0), 0)
+ self.assertEqual(up(2, 0, 0), 9)
+
+ def test_update_mem(self):
+ @cache.memcache(keyarg=0, maxage=10)
+ def up(a, b, c):
+ return a+b+c
+
+ self.assertEqual(up(1, 1, 1), 3)
+ up.update(1, 0)
+ up.update(2, 9)
+ self.assertEqual(up(1, 0, 0), 0)
+ self.assertEqual(up(2, 0, 0), 9)
+
+ def test_update_db(self):
+ @cache.cache(keyarg=0, maxage=10)
+ def up(a, b, c):
+ return a+b+c
+
+ self.assertEqual(up(1, 1, 1), 3)
+ up.update(1, 0)
+ up.update(2, 9)
+ self.assertEqual(up(1, 0, 0), 0)
+ self.assertEqual(up(2, 0, 0), 9)
+
+ def test_invalidate_mem_simple(self):
+ @cache.memcache(keyarg=0)
+ def inv(a, b, c):
+ return a+b+c
+
+ self.assertEqual(inv(1, 1, 1), 3)
+ inv.invalidate(1)
+ inv.invalidate(2)
+ self.assertEqual(inv(1, 0, 0), 1)
+ self.assertEqual(inv(2, 0, 0), 2)
+
+ def test_invalidate_mem(self):
+ @cache.memcache(keyarg=0, maxage=10)
+ def inv(a, b, c):
+ return a+b+c
+
+ self.assertEqual(inv(1, 1, 1), 3)
+ inv.invalidate(1)
+ inv.invalidate(2)
+ self.assertEqual(inv(1, 0, 0), 1)
+ self.assertEqual(inv(2, 0, 0), 2)
+
+ def test_invalidate_db(self):
+ @cache.cache(keyarg=0, maxage=10)
+ def inv(a, b, c):
+ return a+b+c
+
+ self.assertEqual(inv(1, 1, 1), 3)
+ inv.invalidate(1)
+ inv.invalidate(2)
+ self.assertEqual(inv(1, 0, 0), 1)
+ self.assertEqual(inv(2, 0, 0), 2)
+
+ def test_database_read(self):
+ @cache.cache(keyarg=0, maxage=10)
+ def db(a, b, c):
+ return a+b+c
+
+ # initialize cache
+ self.assertEqual(db(1, 1, 1), 3)
+ db.update(2, 6)
+
+ # check and clear the in-memory portion of said cache
+ self.assertEqual(db.cache[1][0], 3)
+ self.assertEqual(db.cache[2][0], 6)
+ db.cache.clear()
+ self.assertEqual(db.cache, {})
+
+ # fetch results from database
+ self.assertEqual(db(1, 0, 0), 3)
+ self.assertEqual(db(2, 0, 0), 6)
+
+ # check in-memory cache updates
+ self.assertEqual(db.cache[1][0], 3)
+ self.assertEqual(db.cache[2][0], 6)
+
+
+if __name__ == '__main__':
+ unittest.main()
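Note: the new module covers all three decorator flavors: memcache() (plain in-memory), memcache(maxage=...) (expiring), and cache() (additionally persisted to the SQLite file set via the ("cache",) "file" option before import). Typical use, mirroring the tests (function name and arguments are illustrative):

    from gallery_dl import cache

    @cache.cache(maxage=3600, keyarg=0)
    def login(username, password):
        ...  # expensive call; its result is cached per username

    # login("user", "pw")      -> computed once, then served from cache
    # login.update("user", t)  -> seed or overwrite the cached value
    # login.invalidate("user") -> force recomputation on the next call
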
diff --git a/test/test_cookies.py b/test/test_cookies.py
index 4f294bf..c39a5e6 100644
--- a/test/test_cookies.py
+++ b/test/test_cookies.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2017-2019 Mike Fährmann
+# Copyright 2017-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -12,7 +12,6 @@ from unittest import mock
import logging
import tempfile
-import http.cookiejar
from os.path import join
import gallery_dl.config as config
@@ -34,7 +33,7 @@ class TestCookiejar(unittest.TestCase):
cls.invalid_cookiefile = join(cls.path.name, "invalid.txt")
with open(cls.invalid_cookiefile, "w") as file:
file.write("""# asd
-.example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE
+.example.org\tTRUE/FALSE\t253402210800\tNAME\tVALUE
""")
@classmethod
@@ -55,7 +54,7 @@ class TestCookiejar(unittest.TestCase):
self.assertEqual(cookie.value , "VALUE")
def test_invalid_cookiefile(self):
- self._test_warning(self.invalid_cookiefile, http.cookiejar.LoadError)
+ self._test_warning(self.invalid_cookiefile, ValueError)
def test_invalid_filename(self):
self._test_warning(join(self.path.name, "nothing"), FileNotFoundError)
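Note: with cookies.txt parsing handled by util.load_cookiestxt() rather than http.cookiejar, a malformed line now surfaces as the ValueError raised when unpacking it into seven tab-separated fields, which is what the updated assertion checks. For instance (input line illustrative):

    import io
    from gallery_dl import util

    try:
        util.load_cookiestxt(io.StringIO("only\tfour\tfields\there\n"))
    except ValueError:
        pass  # e.g. "not enough values to unpack (expected 7, got 4)"
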
diff --git a/test/test_results.py b/test/test_results.py
index e87b4b8..538abfa 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -12,6 +12,7 @@ import sys
import re
import json
import hashlib
+import datetime
import unittest
from gallery_dl import extractor, util, job, config, exception
@@ -21,14 +22,17 @@ TRAVIS_SKIP = {
"exhentai", "kissmanga", "mangafox", "dynastyscans", "nijie", "bobx",
"archivedmoe", "archiveofsins", "thebarchive", "fireden", "4plebs",
"sankaku", "idolcomplex", "mangahere", "readcomiconline", "mangadex",
- "sankakucomplex", "warosu", "fuskator", "patreon",
+ "sankakucomplex", "warosu", "fuskator", "patreon", "komikcast",
}
# temporary issues, etc.
BROKEN = {
- "imxto",
+ "35photo",
"mangapark",
"photobucket",
+ "sexcom",
+ "hentaicafe",
+ "worldthree",
}
@@ -154,6 +158,9 @@ class TestExtractorResults(unittest.TestCase):
elif isinstance(test, str):
if test.startswith("re:"):
self.assertRegex(value, test[3:], msg=key)
+ elif test.startswith("dt:"):
+ self.assertIsInstance(value, datetime.datetime, msg=key)
+ self.assertEqual(str(value), test[3:], msg=key)
elif test.startswith("type:"):
self.assertEqual(type(value).__name__, test[5:], msg=key)
else:
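Note: "dt:" joins the existing "re:" and "type:" prefixes; it asserts the value is a datetime.datetime and compares str(value) against the remainder of the string. In an extractor's test metadata this looks like (timestamp illustrative):

    "date": "dt:2020-03-16 23:20:22",
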
@@ -267,7 +274,7 @@ class TestFormatter(util.Formatter):
return ""
def _apply_simple(self, key, fmt):
- if key == "extension" or "._format_optional." in repr(fmt):
+ if key == "extension" or "._parse_optional." in repr(fmt):
return self._noop
def wrap(obj):
@@ -275,7 +282,7 @@ class TestFormatter(util.Formatter):
return wrap
def _apply(self, key, funcs, fmt):
- if key == "extension" or "._format_optional." in repr(fmt):
+ if key == "extension" or "._parse_optional." in repr(fmt):
return self._noop
def wrap(obj):
@@ -301,6 +308,7 @@ def setup_test_config():
config.set(("extractor", "nijie") , "username", email)
config.set(("extractor", "seiga") , "username", email)
config.set(("extractor", "danbooru") , "username", None)
+ config.set(("extractor", "e621") , "username", None)
config.set(("extractor", "instagram") , "username", None)
config.set(("extractor", "twitter") , "username", None)
diff --git a/test/test_util.py b/test/test_util.py
index 5a103cf..ffabd37 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,10 @@
import unittest
import sys
+import io
import random
import string
+import http.cookiejar
from gallery_dl import util, text, exception
@@ -158,11 +160,106 @@ class TestISO639_1(unittest.TestCase):
self.assertEqual(func(*args), result)
+class TestCookiesTxt(unittest.TestCase):
+
+ def test_load_cookiestxt(self):
+
+ def _assert(content, expected):
+ cookies = util.load_cookiestxt(io.StringIO(content, None))
+ for c, e in zip(cookies, expected):
+ self.assertEqual(c.__dict__, e.__dict__)
+
+ _assert("", [])
+ _assert("\n\n\n", [])
+ _assert("$ Comment", [])
+ _assert("# Comment", [])
+ _assert(" # Comment \n\n $ Comment ", [])
+ _assert(
+ ".example.org\tTRUE\t/\tTRUE\t0\tname\tvalue",
+ [self._cookie("name", "value", ".example.org")],
+ )
+ _assert(
+ ".example.org\tTRUE\t/\tTRUE\t\tname\t",
+ [self._cookie("name", "", ".example.org")],
+ )
+ _assert(
+ "# Netscape HTTP Cookie File\n"
+ "\n"
+ "# default\n"
+ ".example.org TRUE / FALSE 0 n1 v1\n"
+ ".example.org TRUE / TRUE 2145945600 n2 v2\n"
+ ".example.org TRUE /path FALSE 0 n3\n"
+ "\n"
+ " # # extra # # \n"
+ "www.example.org FALSE / FALSE n4 \n"
+ "www.example.org FALSE /path FALSE 100 n5 v5\n",
+ [
+ self._cookie(
+ "n1", "v1", ".example.org", True, "/", False),
+ self._cookie(
+ "n2", "v2", ".example.org", True, "/", True, 2145945600),
+ self._cookie(
+ "n3", None, ".example.org", True, "/path", False),
+ self._cookie(
+ "n4", "" , "www.example.org", False, "/", False),
+ self._cookie(
+ "n5", "v5", "www.example.org", False, "/path", False, 100),
+ ],
+ )
+
+ with self.assertRaises(ValueError):
+ util.load_cookiestxt("example.org\tTRUE\t/\tTRUE\t0\tname")
+
+ def test_save_cookiestxt(self):
+
+ def _assert(cookies, expected):
+ fp = io.StringIO(newline=None)
+ util.save_cookiestxt(fp, cookies)
+ self.assertMultiLineEqual(fp.getvalue(), expected)
+
+ _assert([], "# Netscape HTTP Cookie File\n\n")
+ _assert(
+ [self._cookie("name", "value", ".example.org")],
+ "# Netscape HTTP Cookie File\n\n"
+ ".example.org\tTRUE\t/\tTRUE\t0\tname\tvalue\n",
+ )
+ _assert(
+ [
+ self._cookie(
+ "n1", "v1", ".example.org", True, "/", False),
+ self._cookie(
+ "n2", "v2", ".example.org", True, "/", True, 2145945600),
+ self._cookie(
+ "n3", None, ".example.org", True, "/path", False),
+ self._cookie(
+ "n4", "" , "www.example.org", False, "/", False),
+ self._cookie(
+ "n5", "v5", "www.example.org", False, "/path", False, 100),
+ ],
+ "# Netscape HTTP Cookie File\n"
+ "\n"
+ ".example.org TRUE / FALSE 0 n1 v1\n"
+ ".example.org TRUE / TRUE 2145945600 n2 v2\n"
+ ".example.org TRUE /path FALSE 0 n3\n"
+ "www.example.org FALSE / FALSE 0 n4 \n"
+ "www.example.org FALSE /path FALSE 100 n5 v5\n",
+ )
+
+ def _cookie(self, name, value, domain, domain_specified=True,
+ path="/", secure=True, expires=None):
+ return http.cookiejar.Cookie(
+ 0, name, value, None, False,
+ domain, domain_specified, domain.startswith("."),
+ path, False, secure, expires, False, None, None, {},
+ )
+
+
class TestFormatter(unittest.TestCase):
kwdict = {
"a": "hElLo wOrLd",
"b": "äöü",
+ "d": {"a": "foo", "b": 0, "c": None},
"l": ["a", "b", "c"],
"n": None,
"u": "%27%3C%20/%20%3E%27",
@@ -227,6 +324,26 @@ class TestFormatter(unittest.TestCase):
self._run_test("{missing[key]}", replacement, default)
self._run_test("{missing:?a//}", "a" + default, default)
+ def test_alternative(self):
+ self._run_test("{a|z}" , "hElLo wOrLd")
+ self._run_test("{z|a}" , "hElLo wOrLd")
+ self._run_test("{z|y|a}" , "hElLo wOrLd")
+ self._run_test("{z|y|x|a}", "hElLo wOrLd")
+ self._run_test("{z|n|a|y}", "hElLo wOrLd")
+
+ self._run_test("{z|a!C}" , "Hello World")
+ self._run_test("{z|a:Rh/C/}" , "CElLo wOrLd")
+ self._run_test("{z|a!C:RH/C/}", "Cello World")
+ self._run_test("{z|y|x:?</>/}", "")
+
+ self._run_test("{d[c]|d[b]|d[a]}", "0")
+ self._run_test("{d[a]|d[b]|d[c]}", "foo")
+ self._run_test("{d[z]|d[y]|d[x]}", "None")
+
+ def test_indexing(self):
+ self._run_test("{l[0]}" , "a")
+ self._run_test("{a[6]}" , "w")
+
def test_slicing(self):
v = self.kwdict["a"]
self._run_test("{a[1:10]}" , v[1:10])
@@ -273,6 +390,18 @@ class TestFormatter(unittest.TestCase):
self._run_test("{a!l:Rl//}" , "heo word")
self._run_test("{name:Rame/othing/}", "Nothing")
+ def test_chain_special(self):
+ # multiple replacements
+ self._run_test("{a:Rh/C/RE/e/RL/l/}", "Cello wOrld")
+ self._run_test("{d[b]!s:R1/Q/R2/A/R0/Y/}", "Y")
+
+ # join-and-replace
+ self._run_test("{l:J-/Rb/E/}", "a-E-c")
+
+ # optional-and-maxlen
+ self._run_test("{d[a]:?</>/L1/too long/}", "<too long>")
+ self._run_test("{d[c]:?</>/L5/too long/}", "")
+
def _run_test(self, format_string, result, default=None):
formatter = util.Formatter(format_string, default)
output = formatter.format_map(self.kwdict)