author    Unit 193 <unit193@unit193.net>    2021-04-13 19:33:55 -0400
committer Unit 193 <unit193@unit193.net>    2021-04-13 19:33:55 -0400
commit    027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289 (patch)
tree      45e9927640751d54f1c2331595e6a804807a388f
parent    a7f4d54b42ad98cd8e28bff2891097e0eebfac7c (diff)
parent    d27dcd4646242d6da8436f14c7b37ce864355858 (diff)
download  gallery-dl-027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289.tar.bz2
          gallery-dl-027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289.tar.xz
          gallery-dl-027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289.tar.zst
Update upstream source from tag 'upstream/1.17.2'
Update to upstream version '1.17.2' with Debian dir 223e9a6bbd333c762be6ae0b8588efbfc0885dd0
-rw-r--r--  CHANGELOG.md                          |  52
-rw-r--r--  PKG-INFO                              |   9
-rw-r--r--  README.rst                            |   7
-rw-r--r--  data/man/gallery-dl.1                 |   2
-rw-r--r--  data/man/gallery-dl.conf.5            |  64
-rw-r--r--  docs/gallery-dl.conf                  |   2
-rw-r--r--  gallery_dl.egg-info/PKG-INFO          |   9
-rw-r--r--  gallery_dl.egg-info/SOURCES.txt       |  10
-rw-r--r--  gallery_dl/extractor/__init__.py      |  10
-rw-r--r--  gallery_dl/extractor/architizer.py    | 101
-rw-r--r--  gallery_dl/extractor/aryion.py        |   3
-rw-r--r--  gallery_dl/extractor/bcy.py           |  15
-rw-r--r--  gallery_dl/extractor/common.py        |   2
-rw-r--r--  gallery_dl/extractor/derpibooru.py    | 188
-rw-r--r--  gallery_dl/extractor/deviantart.py    |  78
-rw-r--r--  gallery_dl/extractor/dynastyscans.py  |   7
-rw-r--r--  gallery_dl/extractor/erome.py         |  14
-rw-r--r--  gallery_dl/extractor/exhentai.py      |  78
-rw-r--r--  gallery_dl/extractor/gelbooru.py      |  26
-rw-r--r--  gallery_dl/extractor/gelbooru_v01.py  |   9
-rw-r--r--  gallery_dl/extractor/hentaicafe.py    | 173
-rw-r--r--  gallery_dl/extractor/hentaifox.py     | 104
-rw-r--r--  gallery_dl/extractor/hentainexus.py   | 185
-rw-r--r--  gallery_dl/extractor/imagehosts.py    |   3
-rw-r--r--  gallery_dl/extractor/imgur.py         |  23
-rw-r--r--  gallery_dl/extractor/instagram.py     |   1
-rw-r--r--  gallery_dl/extractor/komikcast.py     |  29
-rw-r--r--  gallery_dl/extractor/manganelo.py     | 119
-rw-r--r--  gallery_dl/extractor/mangareader.py   |  95
-rw-r--r--  gallery_dl/extractor/mangastream.py   |  54
-rw-r--r--  gallery_dl/extractor/nozomi.py        |  54
-rw-r--r--  gallery_dl/extractor/philomena.py     | 216
-rw-r--r--  gallery_dl/extractor/pinterest.py     |  33
-rw-r--r--  gallery_dl/extractor/pixiv.py         |   4
-rw-r--r--  gallery_dl/extractor/sankaku.py       |   8
-rw-r--r--  gallery_dl/extractor/tapas.py         | 205
-rw-r--r--  gallery_dl/extractor/tumblr.py        |   3
-rw-r--r--  gallery_dl/extractor/twitter.py       |  74
-rw-r--r--  gallery_dl/extractor/unsplash.py      |   2
-rw-r--r--  gallery_dl/extractor/vk.py            |  88
-rw-r--r--  gallery_dl/extractor/weasyl.py        |   2
-rw-r--r--  gallery_dl/extractor/wikiart.py       |   2
-rw-r--r--  gallery_dl/job.py                     |  32
-rw-r--r--  gallery_dl/text.py                    |  47
-rw-r--r--  gallery_dl/version.py                 |   2
-rw-r--r--  test/test_text.py                     |  34
46 files changed, 1270 insertions, 1008 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ef4148a..d57583e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,57 @@
# Changelog
+## 1.17.2 - 2021-04-02
+### Additions
+- [deviantart] add support for posts from watched users ([#794](https://github.com/mikf/gallery-dl/issues/794))
+- [manganelo] add `chapter` and `manga` extractors ([#1415](https://github.com/mikf/gallery-dl/issues/1415))
+- [pinterest] add `search` extractor ([#1411](https://github.com/mikf/gallery-dl/issues/1411))
+- [sankaku] add `tag_string` metadata field ([#1388](https://github.com/mikf/gallery-dl/issues/1388))
+- [sankaku] add enumeration index for books ([#1388](https://github.com/mikf/gallery-dl/issues/1388))
+- [tapas] add `series` and `episode` extractors ([#692](https://github.com/mikf/gallery-dl/issues/692))
+- [tapas] implement login with username & password ([#692](https://github.com/mikf/gallery-dl/issues/692))
+- [twitter] allow specifying a custom format for user results ([#1337](https://github.com/mikf/gallery-dl/issues/1337))
+- [twitter] add extractor for direct image links ([#1417](https://github.com/mikf/gallery-dl/issues/1417))
+- [vk] add support for albums ([#474](https://github.com/mikf/gallery-dl/issues/474))
+### Fixes
+- [aryion] unescape paths ([#1414](https://github.com/mikf/gallery-dl/issues/1414))
+- [bcy] improve pagination
+- [deviantart] update `watch` URL pattern ([#794](https://github.com/mikf/gallery-dl/issues/794))
+- [deviantart] fix arguments for search/popular results ([#1408](https://github.com/mikf/gallery-dl/issues/1408))
+- [deviantart] use fallback for `/intermediary/` URLs
+- [exhentai] improve and simplify image limit checks
+- [komikcast] fix extraction
+- [pixiv] fix `favorite` URL pattern ([#1405](https://github.com/mikf/gallery-dl/issues/1405))
+- [sankaku] simplify `pool` tags ([#1388](https://github.com/mikf/gallery-dl/issues/1388))
+- [twitter] improve error message when trying to log in with 2FA ([#1409](https://github.com/mikf/gallery-dl/issues/1409))
+- [twitter] don't use youtube-dl for cards when videos are disabled ([#1416](https://github.com/mikf/gallery-dl/issues/1416))
+
+## 1.17.1 - 2021-03-19
+### Additions
+- [architizer] add `project` and `firm` extractors ([#1369](https://github.com/mikf/gallery-dl/issues/1369))
+- [deviantart] add `watch` extractor ([#794](https://github.com/mikf/gallery-dl/issues/794))
+- [exhentai] support `/tag/` URLs ([#1363](https://github.com/mikf/gallery-dl/issues/1363))
+- [gelbooru_v01] support `drawfriends.booru.org`, `vidyart.booru.org`, and `tlb.booru.org` by default
+- [nozomi] support `/index-N.html` URLs ([#1365](https://github.com/mikf/gallery-dl/issues/1365))
+- [philomena] add generalized extractors for philomena sites ([#1379](https://github.com/mikf/gallery-dl/issues/1379))
+- [philomena] support post URLs without `/images/`
+- [twitter] implement `users` option ([#1337](https://github.com/mikf/gallery-dl/issues/1337))
+- implement `parent-metadata` option ([#1364](https://github.com/mikf/gallery-dl/issues/1364))
+### Changes
+- [deviantart] revert previous changes to `extra` option ([#1356](https://github.com/mikf/gallery-dl/issues/1356), [#1387](https://github.com/mikf/gallery-dl/issues/1387))
+### Fixes
+- [exhentai] improve favorites count extraction ([#1360](https://github.com/mikf/gallery-dl/issues/1360))
+- [gelbooru] update domain for video downloads ([#1368](https://github.com/mikf/gallery-dl/issues/1368))
+- [hentaifox] improve image and metadata extraction ([#1366](https://github.com/mikf/gallery-dl/issues/1366), [#1378](https://github.com/mikf/gallery-dl/issues/1378))
+- [imgur] fix and improve rate limit handling ([#1386](https://github.com/mikf/gallery-dl/issues/1386))
+- [weasyl] improve favorites URL pattern ([#1374](https://github.com/mikf/gallery-dl/issues/1374))
+- use type check before applying `browser` option ([#1358](https://github.com/mikf/gallery-dl/issues/1358))
+- ensure `-s/--simulate` always prints filenames ([#1360](https://github.com/mikf/gallery-dl/issues/1360))
+### Removals
+- [hentaicafe] remove module
+- [hentainexus] remove module
+- [mangareader] remove module
+- [mangastream] remove module
+
## 1.17.0 - 2021-03-05
### Additions
- [cyberdrop] add support for `https://cyberdrop.me/` ([#1328](https://github.com/mikf/gallery-dl/issues/1328))
diff --git a/PKG-INFO b/PKG-INFO
index 7a9a43a..f3ee6d3 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery_dl
-Version: 1.17.0
+Version: 1.17.2
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -75,8 +75,8 @@ Description: ==========
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__
| Executables build from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -223,6 +223,7 @@ Description: ==========
``pinterest``,
``sankaku``,
``subscribestar``,
+ ``tapas``,
``tsumino``,
and ``twitter``.
@@ -328,7 +329,7 @@ Description: ==========
.. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
- .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
+ .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
diff --git a/README.rst b/README.rst
index 20ed222..4cbaa0e 100644
--- a/README.rst
+++ b/README.rst
@@ -64,8 +64,8 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__
| Executables build from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -212,6 +212,7 @@ and optional for
``pinterest``,
``sankaku``,
``subscribestar``,
+``tapas``,
``tsumino``,
and ``twitter``.
@@ -317,7 +318,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with
.. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
-.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
+.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index c420d9b..1ab1ec6 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2021-03-05" "1.17.0" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2021-04-02" "1.17.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index c0629bb..608c2e5 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2021-03-05" "1.17.0" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2021-04-02" "1.17.2" "gallery-dl Manual"
.\" disable hyphenation
.nh
.\" disable justification (adjust text to left margin only)
@@ -155,6 +155,17 @@ Use an extractor's current target directory as
for any spawned child extractors.
+.SS extractor.*.parent-metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Overwrite any metadata provided by a child extractor with its parent's.
+
+
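A note on the new option above: the description implies a simple merge where the parent extractor's fields win on collisions. A minimal sketch of those assumed semantics, as plain dict operations (the actual implementation is in gallery_dl/job.py, which this diff changes but does not excerpt):

    # Assumed parent-metadata semantics: on key collisions the parent
    # extractor's metadata overwrites whatever the child provided.
    parent_kwdict = {"category": "deviantart", "username": "parent-user"}
    child_kwdict  = {"category": "directlink", "title": "some image"}
    child_kwdict.update(parent_kwdict)   # parent values win
    # child_kwdict -> {'category': 'deviantart', 'title': 'some image',
    #                  'username': 'parent-user'}
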
.SS extractor.*.path-restrict
.IP "Type:" 6
\f[I]string\f[] or \f[I]object\f[]
@@ -352,6 +363,8 @@ and optional for
.br
* \f[I]subscribestar\f[]
.br
+* \f[I]tapas\f[]
+.br
* \f[I]tsumino\f[]
.br
* \f[I]twitter\f[]
@@ -863,7 +876,7 @@ See \f[I]Filters\f[] for details.
\f[I]false\f[]
.IP "Description:" 4
-Download embedded Deviations and Sta.sh resources from
+Download extra Sta.sh resources from
description texts and journals.
Note: Enabling this option also enables deviantart.metadata_.
@@ -1046,21 +1059,6 @@ depending on the input URL
* \f[I]"exhentai.org"\f[]: Use \f[I]exhentai.org\f[] for all URLs
-.SS extractor.exhentai.limits
-.IP "Type:" 6
-\f[I]bool\f[] or \f[I]integer\f[]
-
-.IP "Default:" 9
-\f[I]true\f[]
-
-.IP "Description:" 4
-Check image download limits
-and stop extraction when they are exceeded.
-
-If this value is an \f[I]integer\f[], it gets used as the limit maximum
-instead of the value listed on \f[I]https://e-hentai.org/home.php\f[]
-
-
.SS extractor.exhentai.metadata
.IP "Type:" 6
\f[I]bool\f[]
@@ -1272,7 +1270,7 @@ A (comma-separated) list of subcategories to include
when processing a user profile.
Possible values are
-\f[I]"posts"\f[], \f[I]"stories"\f[], \f[I]"highlights"\f[], \f[I]"channel"\f[].
+\f[I]"posts"\f[], \f[I]reels\f[], \f[I]"stories"\f[], \f[I]"highlights"\f[], \f[I]"channel"\f[].
You can use \f[I]"all"\f[] instead of listing all values separately.
@@ -1599,6 +1597,7 @@ linked to in the initial set of submissions.
This value sets the maximum recursion depth.
Special values:
+
.br
* \f[I]0\f[]: Recursion is disabled
.br
@@ -1844,6 +1843,35 @@ will be taken from the original Tweets, not the Retweets.
Extract \f[I]TwitPic\f[] embeds.
+.SS extractor.twitter.users
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"timeline"\f[]
+
+.IP "Example:" 4
+"https://twitter.com/search?q=from:{legacy[screen_name]}"
+
+.IP "Description:" 4
+Format string for user URLs generated from
+.br
+\f[I]following\f[] and \f[I]list-members\f[] queries,
+whose replacement field values come from Twitter \f[I]user\f[] objects
+.br
+(\f[I]Example\f[])
+
+Special values:
+
+.br
+* \f[I]"timeline"\f[]: \f[I]https://twitter.com/i/user/{rest_id}\f[]
+.br
+* \f[I]"media"\f[]: \f[I]https://twitter.com/id:{rest_id}/media\f[]
+
+Note: To allow gallery-dl to follow custom URL formats, set the \f[I]blacklist\f[]
+for \f[I]twitter\f[] to a non-default value, e.g. an empty string \f[I]""\f[].
+
+
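To make the format-string mechanics above concrete, a small sketch; the user object shape is assumed from Twitter's API, with only the referenced fields shown:

    # The "users" option is an str.format template filled from a Twitter
    # user object; {legacy[screen_name]} and {rest_id} index into that dict.
    user = {"rest_id": "12", "legacy": {"screen_name": "jack"}}
    fmt = "https://twitter.com/search?q=from:{legacy[screen_name]}"
    print(fmt.format_map(user))
    # https://twitter.com/search?q=from:jack
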
.SS extractor.twitter.videos
.IP "Type:" 6
\f[I]bool\f[] or \f[I]string\f[]
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index acf60c7..8a3d9e2 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -79,7 +79,6 @@
"username": null,
"password": null,
"domain": "auto",
- "limits": true,
"metadata": false,
"original": true,
"sleep-request": 5.0
@@ -254,6 +253,7 @@
"replies": true,
"retweets": true,
"twitpic": false,
+ "users": "timeline",
"videos": true
},
"unsplash":
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index fbf67fe..f233a1a 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: gallery-dl
-Version: 1.17.0
+Version: 1.17.2
Summary: Command-line program to download image galleries and collections from several image hosting sites
Home-page: https://github.com/mikf/gallery-dl
Author: Mike Fährmann
@@ -75,8 +75,8 @@ Description: ==========
Prebuilt executable files with a Python interpreter and
required Python packages included are available for
- - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__
- - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__
+ - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__
+ - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__
| Executables build from the latest commit can be found at
| https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -223,6 +223,7 @@ Description: ==========
``pinterest``,
``sankaku``,
``subscribestar``,
+ ``tapas``,
``tsumino``,
and ``twitter``.
@@ -328,7 +329,7 @@ Description: ==========
.. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf
.. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf
.. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst
- .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
+ .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md
.. _Python: https://www.python.org/downloads/
.. _PyPI: https://pypi.org/
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 89ae8ed..09e7097 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -42,6 +42,7 @@ gallery_dl/extractor/8kun.py
gallery_dl/extractor/8muses.py
gallery_dl/extractor/__init__.py
gallery_dl/extractor/adultempire.py
+gallery_dl/extractor/architizer.py
gallery_dl/extractor/artstation.py
gallery_dl/extractor/aryion.py
gallery_dl/extractor/bcy.py
@@ -51,7 +52,6 @@ gallery_dl/extractor/booru.py
gallery_dl/extractor/common.py
gallery_dl/extractor/cyberdrop.py
gallery_dl/extractor/danbooru.py
-gallery_dl/extractor/derpibooru.py
gallery_dl/extractor/deviantart.py
gallery_dl/extractor/directlink.py
gallery_dl/extractor/dynastyscans.py
@@ -70,12 +70,10 @@ gallery_dl/extractor/gelbooru_v02.py
gallery_dl/extractor/gfycat.py
gallery_dl/extractor/hbrowse.py
gallery_dl/extractor/hentai2read.py
-gallery_dl/extractor/hentaicafe.py
gallery_dl/extractor/hentaifoundry.py
gallery_dl/extractor/hentaifox.py
gallery_dl/extractor/hentaihand.py
gallery_dl/extractor/hentaihere.py
-gallery_dl/extractor/hentainexus.py
gallery_dl/extractor/hiperdex.py
gallery_dl/extractor/hitomi.py
gallery_dl/extractor/idolcomplex.py
@@ -102,9 +100,8 @@ gallery_dl/extractor/mangadex.py
gallery_dl/extractor/mangafox.py
gallery_dl/extractor/mangahere.py
gallery_dl/extractor/mangakakalot.py
+gallery_dl/extractor/manganelo.py
gallery_dl/extractor/mangapark.py
-gallery_dl/extractor/mangareader.py
-gallery_dl/extractor/mangastream.py
gallery_dl/extractor/mangoxo.py
gallery_dl/extractor/mastodon.py
gallery_dl/extractor/message.py
@@ -122,6 +119,7 @@ gallery_dl/extractor/nsfwalbum.py
gallery_dl/extractor/oauth.py
gallery_dl/extractor/paheal.py
gallery_dl/extractor/patreon.py
+gallery_dl/extractor/philomena.py
gallery_dl/extractor/photobucket.py
gallery_dl/extractor/photovogue.py
gallery_dl/extractor/piczel.py
@@ -149,6 +147,7 @@ gallery_dl/extractor/slideshare.py
gallery_dl/extractor/smugmug.py
gallery_dl/extractor/speakerdeck.py
gallery_dl/extractor/subscribestar.py
+gallery_dl/extractor/tapas.py
gallery_dl/extractor/test.py
gallery_dl/extractor/tsumino.py
gallery_dl/extractor/tumblr.py
@@ -156,6 +155,7 @@ gallery_dl/extractor/tumblrgallery.py
gallery_dl/extractor/twitter.py
gallery_dl/extractor/unsplash.py
gallery_dl/extractor/vanillarock.py
+gallery_dl/extractor/vk.py
gallery_dl/extractor/vsco.py
gallery_dl/extractor/wallhaven.py
gallery_dl/extractor/warosu.py
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 57794d0..3d61515 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -17,6 +17,7 @@ modules = [
"8kun",
"8muses",
"adultempire",
+ "architizer",
"artstation",
"aryion",
"bcy",
@@ -24,7 +25,6 @@ modules = [
"blogger",
"cyberdrop",
"danbooru",
- "derpibooru",
"deviantart",
"dynastyscans",
"e621",
@@ -40,12 +40,10 @@ modules = [
"gfycat",
"hbrowse",
"hentai2read",
- "hentaicafe",
"hentaifoundry",
"hentaifox",
"hentaihand",
"hentaihere",
- "hentainexus",
"hiperdex",
"hitomi",
"idolcomplex",
@@ -71,9 +69,8 @@ modules = [
"mangafox",
"mangahere",
"mangakakalot",
+ "manganelo",
"mangapark",
- "mangareader",
- "mangastream",
"mangoxo",
"myhentaigallery",
"myportfolio",
@@ -87,6 +84,7 @@ modules = [
"nsfwalbum",
"paheal",
"patreon",
+ "philomena",
"photobucket",
"photovogue",
"piczel",
@@ -112,12 +110,14 @@ modules = [
"smugmug",
"speakerdeck",
"subscribestar",
+ "tapas",
"tsumino",
"tumblr",
"tumblrgallery",
"twitter",
"unsplash",
"vanillarock",
+ "vk",
"vsco",
"wallhaven",
"warosu",
diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py
new file mode 100644
index 0000000..9629e25
--- /dev/null
+++ b/gallery_dl/extractor/architizer.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://architizer.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text
+
+
+class ArchitizerProjectExtractor(GalleryExtractor):
+ """Extractor for project pages on architizer.com"""
+ category = "architizer"
+ subcategory = "project"
+ root = "https://architizer.com"
+ directory_fmt = ("{category}", "{firm}", "{title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{gid}_{num}"
+ pattern = r"(?:https?://)?architizer\.com/projects/([^/?#]+)"
+ test = ("https://architizer.com/projects/house-lo/", {
+ "pattern": r"https://architizer-prod\.imgix\.net/media/mediadata"
+ r"/uploads/.+\.jpg$",
+ "keyword": {
+ "count": 27,
+ "description": str,
+ "firm": "Atelier Lina Bellovicova",
+ "gid": "225496",
+ "location": "Czechia",
+ "num": int,
+ "size": "1000 sqft - 3000 sqft",
+ "slug": "house-lo",
+ "status": "Built",
+ "subcategory": "project",
+ "title": "House LO",
+ "type": "Residential › Private House",
+ "year": "2018",
+ },
+ })
+
+ def __init__(self, match):
+ url = "{}/projects/{}/".format(self.root, match.group(1))
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ return {
+ "title" : extr("data-name='", "'"),
+ "slug" : extr("data-slug='", "'"),
+ "gid" : extr("data-gid='", "'").rpartition(".")[2],
+ "firm" : extr("data-firm-leaders-str='", "'"),
+ "location" : extr("<h2>", "<").strip(),
+ "type" : text.unescape(text.remove_html(extr(
+ '<div class="title">Type</div>', '<br'))),
+ "status" : text.remove_html(extr(
+ '<div class="title">STATUS</div>', '</')),
+ "year" : text.remove_html(extr(
+ '<div class="title">YEAR</div>', '</')),
+ "size" : text.remove_html(extr(
+ '<div class="title">SIZE</div>', '</')),
+ "description": text.unescape(extr(
+ '<span class="copy js-copy">', '</span></div>')
+ .replace("<br />", "\n")),
+ }
+
+ def images(self, page):
+ return [
+ (url, None)
+ for url in text.extract_iter(
+ page, "property='og:image:secure_url' content='", "?")
+ ]
+
+
+class ArchitizerFirmExtractor(Extractor):
+ """Extractor for all projects of a firm"""
+ category = "architizer"
+ subcategory = "firm"
+ root = "https://architizer.com"
+ pattern = r"(?:https?://)?architizer\.com/firms/([^/?#]+)"
+ test = ("https://architizer.com/firms/olson-kundig/", {
+ "pattern": ArchitizerProjectExtractor.pattern,
+ "count": ">= 90",
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.firm = match.group(1)
+
+ def items(self):
+ url = "{}/firms/{}/?requesting_merlin=pages".format(
+ self.root, self.firm)
+ page = self.request(url).text
+ data = {"_extractor": ArchitizerProjectExtractor}
+
+ for project in text.extract_iter(page, '<a href="/projects/', '"'):
+ if not project.startswith("q/"):
+ url = "{}/projects/{}".format(self.root, project)
+ yield Message.Queue, url, data
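The metadata() method of the new extractor above leans on text.extract_from, a cursor-style scanner. A self-contained sketch of its assumed behavior (mirroring gallery_dl.text, not imported from it):

    # Each extr(begin, end) call scans forward from where the previous match
    # ended, so fields must be requested in document order.
    def extract_from(txt, pos=0, default=""):
        def extr(begin, end):
            nonlocal pos
            try:
                first = txt.index(begin, pos) + len(begin)
                last = txt.index(end, first)
                pos = last + len(end)
                return txt[first:last]
            except ValueError:
                return default
        return extr

    page = "<div data-name='House LO' data-slug='house-lo'>"
    extr = extract_from(page)
    print(extr("data-name='", "'"), extr("data-slug='", "'"))
    # House LO house-lo
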
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index 6a90b76..ded2ae3 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -126,7 +126,8 @@ class AryionExtractor(Extractor):
"user" : self.user or artist,
"title" : title,
"artist": artist,
- "path" : text.split_html(extr("cookiecrumb'>", '</span'))[4:-1:2],
+ "path" : text.split_html(extr(
+ "cookiecrumb'>", '</span'))[4:-1:2],
"date" : extr("class='pretty-date' title='", "'"),
"size" : text.parse_int(clen),
"views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")),
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
index ec7020a..6e0003d 100644
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020 Mike Fährmann
+# Copyright 2020-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -125,12 +125,15 @@ class BcyUserExtractor(BcyExtractor):
while True:
data = self.request(url, params=params).json()
- item = None
- for item in data["data"]["items"]:
- yield item["item_detail"]
-
- if not item:
+ try:
+ items = data["data"]["items"]
+ except KeyError:
+ return
+ if not items:
return
+
+ for item in items:
+ yield item["item_detail"]
params["since"] = item["since"]
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index e9b9718..048e0a3 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -216,7 +216,7 @@ class Extractor():
headers.clear()
browser = self.config("browser") or self.browser
- if browser:
+ if browser and isinstance(browser, str):
browser, _, platform = browser.lower().partition(":")
if not platform or platform == "auto":
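The isinstance guard above matters because config values come from JSON, where "browser": true is a legal (if unintended) setting. A minimal sketch of the failure mode it prevents (#1358), assuming such a boolean value:

    browser = True        # JSON boolean from the config file
    # browser.lower()  -> AttributeError: 'bool' object has no attribute 'lower'
    if browser and isinstance(browser, str):
        browser = browser.lower()   # only reached for actual strings
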
diff --git a/gallery_dl/extractor/derpibooru.py b/gallery_dl/extractor/derpibooru.py
deleted file mode 100644
index 94f3729..0000000
--- a/gallery_dl/extractor/derpibooru.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://derpibooru.org/"""
-
-from .booru import BooruExtractor
-from .. import text, exception
-import operator
-
-BASE_PATTERN = r"(?:https?://)?derpibooru\.org"
-
-
-class DerpibooruExtractor(BooruExtractor):
- """Base class for derpibooru extractors"""
- category = "derpibooru"
- filename_fmt = "{filename}.{extension}"
- archive_fmt = "{id}"
- root = "https://derpibooru.org"
- request_interval = 1.0
- per_page = 50
-
- _file_url = operator.itemgetter("view_url")
-
- @staticmethod
- def _prepare(post):
- post["date"] = text.parse_datetime(post["created_at"])
-
- @staticmethod
- def _extended_tags(post):
- pass
-
- def _pagination(self, url, params):
- params["page"] = 1
- params["per_page"] = self.per_page
-
- api_key = self.config("api-key")
- if api_key:
- params["key"] = api_key
-
- filter_id = self.config("filter")
- if filter_id:
- params["filter_id"] = filter_id
- elif not api_key:
- params["filter_id"] = "56027" # "Everything" filter
-
- while True:
- data = self.request(url, params=params).json()
- yield from data["images"]
-
- if len(data["images"]) < self.per_page:
- return
- params["page"] += 1
-
-
-class DerpibooruPostExtractor(DerpibooruExtractor):
- """Extractor for single posts from derpibooru.org"""
- subcategory = "post"
- pattern = BASE_PATTERN + r"/images/(\d+)"
- test = ("https://derpibooru.org/images/1", {
- "content": "88449eeb0c4fa5d3583d0b794f6bc1d70bf7f889",
- "count": 1,
- "keyword": {
- "animated": False,
- "aspect_ratio": 1.0,
- "comment_count": int,
- "created_at": "2012-01-02T03:12:33Z",
- "date": "dt:2012-01-02 03:12:33",
- "deletion_reason": None,
- "description": "",
- "downvotes": int,
- "duplicate_of": None,
- "duration": 0.04,
- "extension": "png",
- "faves": int,
- "first_seen_at": "2012-01-02T03:12:33Z",
- "format": "png",
- "height": 900,
- "hidden_from_users": False,
- "id": 1,
- "mime_type": "image/png",
- "name": "1__safe_fluttershy_solo_cloud_happy_flying_upvotes+galore"
- "_artist-colon-speccysy_get_sunshine",
- "orig_sha512_hash": None,
- "processed": True,
- "representations": dict,
- "score": int,
- "sha512_hash": "f16c98e2848c2f1bfff3985e8f1a54375cc49f78125391aeb8"
- "0534ce011ead14e3e452a5c4bc98a66f56bdfcd07ef7800663"
- "b994f3f343c572da5ecc22a9660f",
- "size": 860914,
- "source_url": "https://www.deviantart.com/speccysy/art"
- "/Afternoon-Flight-215193985",
- "spoilered": False,
- "tag_count": 36,
- "tag_ids": list,
- "tags": list,
- "thumbnails_generated": True,
- "updated_at": "2020-05-28T13:14:07Z",
- "uploader": "Clover the Clever",
- "uploader_id": 211188,
- "upvotes": int,
- "view_url": str,
- "width": 900,
- "wilson_score": float,
- },
- })
-
- def __init__(self, match):
- DerpibooruExtractor.__init__(self, match)
- self.image_id = match.group(1)
-
- def posts(self):
- url = self.root + "/api/v1/json/images/" + self.image_id
- return (self.request(url).json()["image"],)
-
-
-class DerpibooruSearchExtractor(DerpibooruExtractor):
- """Extractor for search results on derpibooru.org"""
- subcategory = "search"
- directory_fmt = ("{category}", "{search_tags}")
- pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))"
- test = (
- ("https://derpibooru.org/search?q=cute", {
- "range": "40-60",
- "count": 21,
- }),
- ("https://derpibooru.org/tags/cute", {
- "range": "40-60",
- "count": 21,
- }),
- )
-
- def __init__(self, match):
- DerpibooruExtractor.__init__(self, match)
- query, tags = match.groups()
- self.params = text.parse_query(query) if query else {"q": tags}
-
- def metadata(self):
- return {"search_tags": self.params.get("q", "")}
-
- def posts(self):
- url = self.root + "/api/v1/json/search/images"
- return self._pagination(url, self.params)
-
-
-class DerpibooruGalleryExtractor(DerpibooruExtractor):
- """Extractor for galleries on derpibooru.org"""
- subcategory = "gallery"
- directory_fmt = ("{category}", "galleries",
- "{gallery[id]} {gallery[title]}")
- pattern = BASE_PATTERN + r"/galleries/(\d+)"
- test = ("https://derpibooru.org/galleries/1", {
- "pattern": r"https://derpicdn\.net/img/view/\d+/\d+/\d+/\d+[^/]+$",
- "keyword": {
- "gallery": {
- "description": "Indexes start at 1 :P",
- "id": 1,
- "spoiler_warning": "",
- "thumbnail_id": 1,
- "title": "The Very First Gallery",
- "user": "DeliciousBlackInk",
- "user_id": 365446,
- },
- },
- })
-
- def __init__(self, match):
- DerpibooruExtractor.__init__(self, match)
- self.gallery_id = match.group(1)
-
- def metadata(self):
- url = self.root + "/api/v1/json/search/galleries"
- params = {"q": "id:" + self.gallery_id}
- galleries = self.request(url, params=params).json()["galleries"]
- if not galleries:
- raise exception.NotFoundError("gallery")
- return {"gallery": galleries[0]}
-
- def posts(self):
- gallery_id = "gallery_id:" + self.gallery_id
- url = self.root + "/api/v1/json/search/images"
- params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id}
- return self._pagination(url, params)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 47286b7..9d1701f 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -20,7 +20,7 @@ import re
BASE_PATTERN = (
r"(?:https?://)?(?:"
- r"(?:www\.)?deviantart\.com/([\w-]+)|"
+ r"(?:www\.)?deviantart\.com/(?!watch/)([\w-]+)|"
r"(?!www\.)([\w-]+)\.deviantart\.com)"
)
@@ -78,10 +78,6 @@ class DeviantartExtractor(Extractor):
else:
self.user = profile["user"]["username"]
- if self.extra:
- finditer_stash = DeviantartStashExtractor.pattern.finditer
- finditer_deviation = DeviantartDeviationExtractor.pattern.finditer
-
yield Message.Version, 1
for deviation in self.deviations():
if isinstance(deviation, tuple):
@@ -109,7 +105,8 @@ class DeviantartExtractor(Extractor):
intermediary, count = re.subn(
r"(/f/[^/]+/[^/]+)/v\d+/.*",
r"/intermediary\1", content["src"], 1)
- if count and self._check_url(intermediary):
+ if count:
+ deviation["_fallback"] = (content["src"],)
content["src"] = intermediary
if self.quality:
content["src"] = re.sub(
@@ -138,14 +135,10 @@ class DeviantartExtractor(Extractor):
if self.extra:
txt = (deviation.get("description", "") +
deviation.get("_journal", ""))
- for match in finditer_stash(txt):
+ for match in DeviantartStashExtractor.pattern.finditer(txt):
url = text.ensure_http_scheme(match.group(0))
deviation["_extractor"] = DeviantartStashExtractor
yield Message.Queue, url, deviation
- for match in finditer_deviation(txt):
- url = text.ensure_http_scheme(match.group(0))
- deviation["_extractor"] = DeviantartDeviationExtractor
- yield Message.Queue, url, deviation
def deviations(self):
"""Return an iterable containing all relevant Deviation-objects"""
@@ -290,9 +283,6 @@ class DeviantartExtractor(Extractor):
if mtype and mtype.startswith("image/"):
content.update(data)
- def _check_url(self, url):
- return self.request(url, method="HEAD", fatal=False).status_code < 400
-
def _limited_request(self, url, **kwargs):
"""Limits HTTP requests to one every 2 seconds"""
kwargs["fatal"] = None
@@ -718,15 +708,16 @@ class DeviantartPopularExtractor(DeviantartExtractor):
if path:
self.category_path = path.strip("/")
if trange:
- trange = trange[8:] if trange.startswith("popular-") else ""
+ if trange.startswith("popular-"):
+ trange = trange[8:]
self.time_range = trange.replace("-", "").replace("hours", "hr")
if query:
self.search_term = query.get("q")
self.popular = {
"search": self.search_term or "",
- "range": trange or "24-hours",
- "path": self.category_path,
+ "range" : trange or "",
+ "path" : self.category_path,
}
def deviations(self):
@@ -738,6 +729,30 @@ class DeviantartPopularExtractor(DeviantartExtractor):
deviation["popular"] = self.popular
+class DeviantartWatchExtractor(DeviantartExtractor):
+ """Extractor for Deviations from watched users"""
+ subcategory = "watch"
+ pattern = (r"(?:https?://)?(?:www\.)?deviantart\.com"
+ r"/(?:watch/deviations|notifications/watch)()()")
+ test = (
+ ("https://www.deviantart.com/watch/deviations"),
+ ("https://www.deviantart.com/notifications/watch"),
+ )
+
+ def deviations(self):
+ return self.api.browse_deviantsyouwatch()
+
+
+class DeviantartWatchPostsExtractor(DeviantartExtractor):
+ """Extractor for Posts from watched users"""
+ subcategory = "watch-posts"
+ pattern = r"(?:https?://)?(?:www\.)?deviantart\.com/watch/posts()()"
+ test = ("https://www.deviantart.com/watch/posts",)
+
+ def deviations(self):
+ return self.api.browse_posts_deviantsyouwatch()
+
+
###############################################################################
# Eclipse #####################################################################
@@ -926,6 +941,20 @@ class DeviantartOAuthAPI():
self.client_id,
)
+ def browse_deviantsyouwatch(self, offset=0):
+ """Yield deviations from users you watch"""
+ endpoint = "browse/deviantsyouwatch"
+ params = {"limit": "50", "offset": offset,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params, public=False)
+
+ def browse_posts_deviantsyouwatch(self, offset=0):
+ """Yield posts from users you watch"""
+ endpoint = "browse/posts/deviantsyouwatch"
+ params = {"limit": "50", "offset": offset,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params, public=False, unpack=True)
+
def browse_popular(self, query=None, timerange=None, offset=0):
"""Yield popular deviations"""
endpoint = "browse/popular"
@@ -1085,16 +1114,21 @@ class DeviantartOAuthAPI():
self.log.error(msg)
return data
- def _pagination(self, endpoint, params, extend=True, public=True):
+ def _pagination(self, endpoint, params,
+ extend=True, public=True, unpack=False):
warn = True
while True:
data = self._call(endpoint, params, public=public)
if "results" not in data:
self.log.error("Unexpected API response: %s", data)
return
+ results = data["results"]
+ if unpack:
+ results = [item["journal"] for item in results
+ if "journal" in item]
if extend:
- if public and len(data["results"]) < params["limit"]:
+ if public and len(results) < params["limit"]:
if self.refresh_token_key:
self.log.debug("Switching to private access token")
public = False
@@ -1106,10 +1140,10 @@ class DeviantartOAuthAPI():
"oauth:deviantart' and follow the instructions to "
"be able to access them.")
if self.metadata:
- self._metadata(data["results"])
+ self._metadata(results)
if self.folders:
- self._folders(data["results"])
- yield from data["results"]
+ self._folders(results)
+ yield from results
if not data["has_more"]:
return
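Two of the deviantart changes above share one idea: instead of HEAD-checking /intermediary/ URLs up front (_check_url is removed), the original URL is stashed in _fallback and retried by the downloader if the rewritten URL fails. A sketch of the rewrite, with a made-up wixmp-style URL as the assumed input shape:

    import re

    # Assumed sample; real CDN URLs follow the /f/<id>/<name>/v1/... shape
    src = "https://images-wixmp.example/f/abc/pic.jpg/v1/fill/w_300/pic.jpg"
    intermediary, count = re.subn(
        r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", src, 1)
    if count:
        fallback = (src,)        # downloader falls back to the original URL
        src = intermediary       # try the full-size /intermediary/ URL first
    print(src)
    # https://images-wixmp.example/intermediary/f/abc/pic.jpg
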
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 7d26c47..67051c9 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,19 +1,18 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract manga-chapters from https://dynasty-scans.com/"""
+"""Extractors for https://dynasty-scans.com/"""
from .common import ChapterExtractor, Extractor, Message
from .. import text
import json
import re
-
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
@@ -36,7 +35,7 @@ class DynastyscansBase():
return {
"url" : self.root + url,
"image_id": text.parse_int(image_id),
- "tags" : text.split_html(text.unescape(tags)),
+ "tags" : text.split_html(tags),
"date" : text.remove_html(date),
"source" : text.unescape(src),
}
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 842de7e..2e2e952 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -85,14 +85,14 @@ class EromeAlbumExtractor(EromeExtractor):
"""Extractor for albums on erome.com"""
subcategory = "album"
pattern = BASE_PATTERN + r"/a/(\w+)"
- test = ("https://www.erome.com/a/KandxY7y", {
- "pattern": r"https://s\d+\.erome\.com/355/KandxY7y/\w+",
- "count": 26,
+ test = ("https://www.erome.com/a/TyFMI7ik", {
+ "pattern": r"https://s\d+\.erome\.com/\d+/TyFMI7ik/\w+",
+ "count": 9,
"keyword": {
- "album_id": "KandxY7y",
+ "album_id": "TyFMI7ik",
"num": int,
- "title": "Therealbrittfitt",
- "user": "pokow",
+ "title": "Ryan Ryans",
+ "user": "xanub",
},
})
@@ -103,7 +103,7 @@ class EromeAlbumExtractor(EromeExtractor):
class EromeUserExtractor(EromeExtractor):
subcategory = "user"
pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)"
- test = ("https://www.erome.com/gutiquq", {
+ test = ("https://www.erome.com/xanub", {
"range": "1-25",
"count": 25,
})
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 5a7de23..872a338 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -43,16 +43,8 @@ class ExhentaiExtractor(Extractor):
self.cookiedomain = "." + domain
Extractor.__init__(self, match)
- self.limits = self.config("limits", True)
self.original = self.config("original", True)
- if type(self.limits) is int:
- self._limit_max = self.limits
- self.limits = True
- else:
- self._limit_max = 0
-
- self._remaining = 0
self.session.headers["Referer"] = self.root + "/"
if version != "ex":
self.session.cookies.set("nw", "1", domain=self.cookiedomain)
@@ -77,7 +69,6 @@ class ExhentaiExtractor(Extractor):
self.log.info("no username given; using e-hentai.org")
self.root = "https://e-hentai.org"
self.original = False
- self.limits = False
self.session.cookies["nw"] = "1"
@cache(maxage=90*24*3600, keyarg=1)
@@ -206,8 +197,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
(self.image_from_page(ipage),), self.images_from_api())
for url, image in images:
data.update(image)
- if self.limits:
- self._check_limits(data)
if "/fullimg.php" in url:
data["extension"] = ""
yield Message.Url, url, data
@@ -246,6 +235,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"torrentcount" : extr('>Torrent Download (', ')'),
}
+ f = data["favorites"][0]
+ if f == "N":
+ data["favorites"] = "0"
+ elif f == "O":
+ data["favorites"] = "1"
+
data["lang"] = util.language_to_code(data["language"])
data["tags"] = [
text.unquote(tag.replace("+", " "))
@@ -293,6 +288,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["image_token"] = self.key["start"] = extr('var startkey="', '";')
self.key["show"] = extr('var showkey="', '";')
+ if iurl.endswith("g/509.gif"):
+ self._report_limits(data)
return url, text.nameext_from_url(iurl, data)
def images_from_api(self):
@@ -327,10 +324,20 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["num"] = request["page"]
data["image_token"] = imgkey
+
+ if imgurl.endswith("g/509.gif"):
+ self._report_limits(data)
yield url, text.nameext_from_url(imgurl, data)
request["imgkey"] = nextkey
+ def _report_limits(self, data):
+ ExhentaiExtractor.LIMIT = True
+ raise exception.StopExtraction(
+ "Image limit reached! "
+ "Continue with '%s/s/%s/%s-%s' as URL after resetting it.",
+ self.root, data["image_token"], self.gallery_id, data["num"])
+
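Instead of polling e-hentai.org/home.php for quota numbers (the removed _check_limits/_update_limits below), the extractor now treats the placeholder image itself as the signal: once the limit is hit, served image URLs end in g/509.gif. A tiny sketch of that check, with an assumed placeholder URL:

    iurl = "https://ehgt.org/g/509.gif"   # assumed shape of the placeholder
    if iurl.endswith("g/509.gif"):
        raise RuntimeError("Image limit reached!")  # stand-in for StopExtraction
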
def _gallery_page(self):
url = "{}/g/{}/{}/".format(
self.root, self.gallery_id, self.gallery_token)
@@ -354,35 +361,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.NotFoundError("image page")
return page
- def _check_limits(self, data):
- if not self._remaining or data["num"] % 25 == 0:
- self._update_limits()
- self._remaining -= data["cost"]
-
- if self._remaining <= 0:
- ExhentaiExtractor.LIMIT = True
- url = "{}/s/{}/{}-{}".format(
- self.root, data["image_token"], self.gallery_id, data["num"])
- raise exception.StopExtraction(
- "Image limit reached! Continue with '%s' "
- "as URL after resetting it.", url)
-
- def _update_limits(self):
- url = "https://e-hentai.org/home.php"
- cookies = {
- cookie.name: cookie.value
- for cookie in self.session.cookies
- if cookie.domain == self.cookiedomain and cookie.name != "igneous"
- }
-
- page = self.request(url, cookies=cookies).text
- current, pos = text.extract(page, "<strong>", "</strong>")
- maximum, pos = text.extract(page, "<strong>", "</strong>", pos)
- if self._limit_max:
- maximum = self._limit_max
- self.log.debug("Image Limits: %s/%s", current, maximum)
- self._remaining = text.parse_int(maximum) - text.parse_int(current)
-
@staticmethod
def _parse_image_info(url):
for part in url.split("/")[4:]:
@@ -418,9 +396,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
class ExhentaiSearchExtractor(ExhentaiExtractor):
"""Extractor for exhentai search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/?\?(.*)$"
+ pattern = BASE_PATTERN + r"/(?:\?([^#]*)|tag/([^/?#]+))"
test = (
("https://e-hentai.org/?f_search=touhou"),
+ ("https://exhentai.org/?f_cats=767&f_search=touhou"),
+ ("https://exhentai.org/tag/parody:touhou+project"),
(("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
"&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
"&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
@@ -432,10 +412,20 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
def __init__(self, match):
ExhentaiExtractor.__init__(self, match)
- self.params = text.parse_query(match.group(2))
- self.params["page"] = text.parse_int(self.params.get("page"))
self.search_url = self.root
+ _, query, tag = match.groups()
+ if tag:
+ if "+" in tag:
+ ns, _, tag = tag.rpartition(":")
+ tag = '{}:"{}$"'.format(ns, tag.replace("+", " "))
+ else:
+ tag += "$"
+ self.params = {"f_search": tag, "page": 0}
+ else:
+ self.params = text.parse_query(query)
+ self.params["page"] = text.parse_int(self.params.get("page"))
+
def items(self):
self.login()
data = {"_extractor": ExhentaiGalleryExtractor}
@@ -459,7 +449,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
"""Extractor for favorited exhentai galleries"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"
+ pattern = BASE_PATTERN + r"/favorites\.php(?:\?([^#]*)())?"
test = (
("https://e-hentai.org/favorites.php", {
"count": 1,
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 92d27a9..0042676 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -23,10 +23,16 @@ class GelbooruBase():
url = post["file_url"]
if url.startswith(("https://mp4.gelbooru.com/", "https://video-cdn")):
md5 = post["md5"]
- url = "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
- md5[0:2], md5[2:4], md5)
+ path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5)
+ post["_fallback"] = GelbooruBase._video_fallback(path)
+ url = "https://img3.gelbooru.com" + path
return url
+ @staticmethod
+ def _video_fallback(path):
+ yield "https://img2.gelbooru.com" + path
+ yield "https://img1.gelbooru.com" + path
+
class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
@@ -80,7 +86,15 @@ class GelbooruPostExtractor(GelbooruBase,
"""Extractor for single images from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=view&id=(?P<post>\d+)")
- test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
- "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
- "count": 1,
- })
+ test = (
+ ("https://gelbooru.com/index.php?page=post&s=view&id=313638", {
+ "content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
+ "count": 1,
+ }),
+ # video
+ ("https://gelbooru.com/index.php?page=post&s=view&id=5938076", {
+ "content": "6360452fa8c2f0c1137749e81471238564df832a",
+ "pattern": r"https://img\d\.gelbooru\.com/images"
+ r"/22/61/226111273615049235b001b381707bd0\.webm",
+ }),
+ )
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index 0935998..541f454 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -47,6 +47,9 @@ BASE_PATTERN = GelbooruV01Extractor.update({
"thecollection" : {"root": "https://the-collection.booru.org"},
"illusioncardsbooru": {"root": "https://illusioncards.booru.org"},
"allgirlbooru" : {"root": "https://allgirl.booru.org"},
+ "drawfriends" : {"root": "https://drawfriends.booru.org"},
+ "vidyart" : {"root": "https://vidyart.booru.org"},
+ "theloudbooru" : {"root": "https://tlb.booru.org"},
})
@@ -70,6 +73,9 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
"range": "1-25",
"count": 25,
}),
+ ("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"),
+ ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"),
+ ("https://tlb.booru.org/index.php?page=post&s=list&tags=all"),
)
def __init__(self, match):
@@ -133,6 +139,9 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor):
"width": "1600"
},
}),
+ ("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"),
+ ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"),
+ ("https://tlb.booru.org/index.php?page=post&s=view&id=127223"),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py
deleted file mode 100644
index aa79b67..0000000
--- a/gallery_dl/extractor/hentaicafe.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2018-2021 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://hentai.cafe/"""
-
-from . import foolslide
-from .. import text
-from .common import Extractor, Message
-from ..cache import memcache
-import re
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai\.cafe"
-
-
-class HentaicafeBase():
- """Base class for hentaicafe extractors"""
- category = "hentaicafe"
- root = "https://hentai.cafe"
-
- def _pagination(self, urlfmt):
- data = {"_extractor": HentaicafeMangaExtractor}
- pnum = text.parse_int(self.page_start, 1)
-
- while True:
- page = self.request(urlfmt(pnum)).text
-
- for entry in text.extract_iter(
- page, 'class="entry-featured', 'title="'):
- url = text.extract(entry, 'href="', '"')[0]
- if url:
- yield Message.Queue, url, data
-
- if '>&#x2192;<' not in page:
- return
- pnum += 1
-
-
-class HentaicafeChapterExtractor(HentaicafeBase,
- foolslide.FoolslideChapterExtractor):
- """Extractor for manga-chapters from hentai.cafe"""
- directory_fmt = ("{category}", "{manga}")
- filename_fmt = "c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}"
- pattern = BASE_PATTERN + r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
- test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", {
- "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
- "keyword": "6913608267d883c82b887303b9ced13821188329",
- })
-
- def metadata(self, page):
- info = text.unescape(text.extract(page, '<title>', '</title>')[0])
- manga, _, chapter_string = info.partition(" :: ")
-
- data = self._data(self.gallery_url.split("/")[5])
- if "manga" not in data:
- data["manga"] = manga
- data["chapter_string"] = chapter_string.rstrip(" :")
- return self.parse_chapter_url(self.gallery_url, data)
-
- @memcache(keyarg=1)
- def _data(self, manga):
- return {"artist": (), "tags": ()}
-
-
-class HentaicafeMangaExtractor(HentaicafeBase,
- foolslide.FoolslideMangaExtractor):
- """Extractor for manga from hentai.cafe"""
- pattern = BASE_PATTERN + r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?#]+)/?$"
- test = (
- # single chapter
- ("https://hentai.cafe/hazuki-yuuto-summer-blues/", {
- "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b",
- "keyword": "ced644ff94ea22e1991a5e44bf37c38a7e2ac2b3",
- }),
- # multi-chapter
- ("https://hentai.cafe/saitom-saitom-box/", {
- "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
- "keyword": "4c2262d680286a54357c334c1faca8f1b0e692e9",
- }),
- # new-style URL
- ("https://hentai.cafe/hc.fyi/2782", {
- "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
- "keyword": "4c2262d680286a54357c334c1faca8f1b0e692e9",
- }),
- # foolslide URL
- ("https://hentai.cafe/manga/series/saitom-box/", {
- "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
- "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c",
- }),
-
- )
-
- def items(self):
- page = Extractor.request(self, self.gallery_url).text
-
- chapters = self.chapters(page)
- if self.config("chapter-reverse", False):
- chapters.reverse()
-
- for chapter, data in chapters:
- data["_extractor"] = HentaicafeChapterExtractor
- yield Message.Queue, chapter, data
-
- def chapters(self, page):
- if "/manga/series/" in self.gallery_url:
- chapters = foolslide.FoolslideMangaExtractor.chapters(self, page)
- chapters.reverse()
- return chapters
-
- manga , pos = text.extract(page, '<title>', '<')
- url , pos = text.extract(page, 'rel="canonical" href="', '"', pos)
- tags , pos = text.extract(page, "<p>Tags: ", "</br>", pos)
- artist, pos = text.extract(page, "\nArtists: ", "</br>", pos)
- key , pos = text.extract(page, "/manga/read/", "/", pos)
- data = {
- "manga" : text.unescape(manga.rpartition(" | ")[0]),
- "manga_id": text.parse_int(url.rpartition("/")[2]),
- "tags" : text.split_html(tags)[::2],
- "artist" : text.split_html(artist),
- }
- HentaicafeChapterExtractor._data(key).update(data)
-
- return [
- (url, data)
- for url in re.findall(
- r'<a +class="x-btn[^"]*" +href="([^"]+)"', page)
- ]
-
-
-class HentaicafeSearchExtractor(HentaicafeBase, Extractor):
- """Extractor for hentaicafe search results"""
- subcategory = "search"
- pattern = BASE_PATTERN + r"/(?:page/(\d+)/?)?\?s=([^&#]+)"
- test = ("https://hentai.cafe/?s=benimura", {
- "pattern": HentaicafeMangaExtractor.pattern,
- "count": ">= 10",
- })
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.page_start, self.search = match.groups()
-
- def items(self):
- fmt = "{}/page/{}?s={}".format
- return self._pagination(lambda pnum: fmt(self.root, pnum, self.search))
-
-
-class HentaicafeTagExtractor(HentaicafeBase, Extractor):
- """Extractor for hentaicafe tag/artist searches"""
- subcategory = "tag"
- pattern = (BASE_PATTERN +
- r"/hc\.fyi/(tag|artist|category)/([^/?#]+)(?:/page/(\d+))?")
- test = (
- ("https://hentai.cafe/hc.fyi/tag/vanilla"),
- ("https://hentai.cafe/hc.fyi/category/book/page/5"),
- ("https://hentai.cafe/hc.fyi/artist/benimura-karu", {
- "pattern": HentaicafeMangaExtractor.pattern,
- "count": ">= 10",
- }),
- )
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.type, self.search, self.page_start = match.groups()
-
- def items(self):
- fmt = "{}/hc.fyi/{}/{}/page/{}".format
- return self._pagination(
- lambda pnum: fmt(self.root, self.type, self.search, pnum))
diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py
index 093f3fe..a5bebdd 100644
--- a/gallery_dl/extractor/hentaifox.py
+++ b/gallery_dl/extractor/hentaifox.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,6 +10,7 @@
from .common import GalleryExtractor, Extractor, Message
from .. import text
+import json
class HentaifoxBase():
@@ -21,61 +22,84 @@ class HentaifoxBase():
class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
"""Extractor for image galleries on hentaifox.com"""
pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
- test = ("https://hentaifox.com/gallery/56622/", {
- "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
- "keyword": "b7ff141331d0c7fc711ab28d45dfbb013a83d8e9",
- "count": 24,
- })
+ test = (
+ ("https://hentaifox.com/gallery/56622/", {
+ "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg",
+ "keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92",
+ "count": 24,
+ }),
+ # 'split_tag' element (#1378)
+ ("https://hentaifox.com/gallery/630/", {
+ "keyword": {
+ "artist": ["beti", "betty", "magi", "mimikaki"],
+ "characters": [
+ "aerith gainsborough",
+ "tifa lockhart",
+ "yuffie kisaragi"
+ ],
+ "count": 32,
+ "gallery_id": 630,
+ "group": ["cu-little2"],
+ "parody": ["darkstalkers | vampire", "final fantasy vii"],
+ "tags": ["femdom", "fingering", "masturbation", "yuri"],
+ "title": "Cu-Little Bakanya~",
+ "type": "doujinshi",
+ },
+ }),
+ )
def __init__(self, match):
GalleryExtractor.__init__(self, match)
self.gallery_id = match.group(2)
- def metadata(self, page, split=text.split_html):
+ @staticmethod
+ def _split(txt):
+ return [
+ text.remove_html(tag.partition(">")[2], "", "")
+ for tag in text.extract_iter(
+ txt, "class='tag_btn", "<span class='t_badge")
+ ]
+
+ def metadata(self, page):
extr = text.extract_from(page)
+ split = self._split
return {
"gallery_id": text.parse_int(self.gallery_id),
"title" : text.unescape(extr("<h1>", "</h1>")),
- "parody" : split(extr(">Parodies:" , "</ul>"))[::2],
- "characters": split(extr(">Characters:", "</ul>"))[::2],
- "tags" : split(extr(">Tags:" , "</ul>"))[::2],
- "artist" : split(extr(">Artists:" , "</ul>"))[::2],
- "group" : split(extr(">Groups:" , "</ul>"))[::2],
+ "parody" : split(extr(">Parodies:" , "</ul>")),
+ "characters": split(extr(">Characters:", "</ul>")),
+ "tags" : split(extr(">Tags:" , "</ul>")),
+ "artist" : split(extr(">Artists:" , "</ul>")),
+ "group" : split(extr(">Groups:" , "</ul>")),
"type" : text.remove_html(extr(">Category:", "<span")),
"language" : "English",
"lang" : "en",
}
def images(self, page):
- pos = page.find('id="load_all"')
- if pos >= 0:
- extr = text.extract
- load_id = extr(page, 'id="load_id" value="', '"', pos)[0]
- load_dir = extr(page, 'id="load_dir" value="', '"', pos)[0]
- load_pages = extr(page, 'id="load_pages" value="', '"', pos)[0]
-
- url = self.root + "/includes/thumbs_loader.php"
- data = {
- "u_id" : self.gallery_id,
- "g_id" : load_id,
- "img_dir" : load_dir,
- "visible_pages": "0",
- "total_pages" : load_pages,
- "type" : "2",
- }
- headers = {
- "Origin": self.root,
- "Referer": self.gallery_url,
- "X-Requested-With": "XMLHttpRequest",
- }
- page = self.request(
- url, method="POST", headers=headers, data=data).text
-
- return [
- (url.replace("t.", "."), None)
- for url in text.extract_iter(page, 'data-src="', '"')
- ]
+ cover, pos = text.extract(page, '<img src="', '"')
+ data , pos = text.extract(page, "$.parseJSON('", "');", pos)
+ path = "/".join(cover.split("/")[3:-1])
+
+ result = []
+ append = result.append
+ extmap = {"j": "jpg", "p": "png", "g": "gif"}
+ urlfmt = ("/" + path + "/{}.{}").format
+
+ server1 = "https://i.hentaifox.com"
+ server2 = "https://i2.hentaifox.com"
+
+ for num, image in json.loads(data).items():
+ ext, width, height = image.split(",")
+ path = urlfmt(num, extmap[ext])
+ append((server1 + path, {
+ "width" : width,
+ "height" : height,
+ "_fallback": (server2 + path,),
+ }))
+
+ return result
class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
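The rewritten images() above no longer POSTs to thumbs_loader.php; it reads the page-data blob embedded via $.parseJSON. A sketch of that blob's assumed format, where keys are page numbers and values are "ext,width,height" with one-letter extension codes:

    import json

    data = '{"1": "j,1200,1700", "2": "p,1200,1700"}'   # assumed sample blob
    extmap = {"j": "jpg", "p": "png", "g": "gif"}
    for num, image in json.loads(data).items():
        ext, width, height = image.split(",")
        print("{}.{} ({}x{})".format(num, extmap[ext], width, height))
    # 1.jpg (1200x1700)
    # 2.png (1200x1700)
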
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
deleted file mode 100644
index 6c1879c..0000000
--- a/gallery_dl/extractor/hentainexus.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2021 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://hentainexus.com/"""
-
-from .common import GalleryExtractor, Extractor, Message
-from .. import text, util
-import binascii
-import json
-
-
-class HentainexusGalleryExtractor(GalleryExtractor):
- """Extractor for image galleries on hentainexus.com"""
- category = "hentainexus"
- root = "https://hentainexus.com"
- pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
- r"/(?:view|read)/(\d+)")
- test = (
- ("https://hentainexus.com/view/5688", {
- "url": "f1761895fb7aca2f6ff9e09f839c0ee2fa7a5e54",
- "keyword": "5e5bb4b1553b1c6e126b198f9ae017a1a5d0a5ad",
- }),
- ("https://hentainexus.com/read/5688"),
- )
-
- def __init__(self, match):
- self.gallery_id = match.group(1)
- url = "{}/view/{}".format(self.root, self.gallery_id)
- GalleryExtractor.__init__(self, match, url)
-
- def metadata(self, page):
- rmve = text.remove_html
- extr = text.extract_from(page)
- data = {
- "gallery_id": text.parse_int(self.gallery_id),
- "tags" : extr('"og:description" content="', '"').split(", "),
- "thumbnail" : extr('"og:image" content="', '"'),
- "title" : extr('<h1 class="title">', '</h1>'),
- }
- for key in ("Artist", "Book", "Circle", "Event", "Language",
- "Magazine", "Parody", "Publisher", "Description"):
- data[key.lower()] = rmve(extr(
- 'viewcolumn">' + key + '</td>', '</td>'))
- data["lang"] = util.language_to_code(data["language"])
-
- if 'doujin' in data['tags']:
- data['type'] = 'Doujinshi'
- elif 'illustration' in data['tags']:
- data['type'] = 'Illustration'
- else:
- data['type'] = 'Manga'
- data["title_conventional"] = self._join_title(data)
- return data
-
- def images(self, _):
- url = "{}/read/{}".format(self.root, self.gallery_id)
- page = self.request(url).text
- data = json.loads(self._decode(text.extract(
- page, 'initReader("', '"')[0]))
-
- headers = None
- if not self.config("original", True):
- headers = {"_http_headers": {"Accept": "image/webp,*/*"}}
-
- pages = data.get("pages")
- if pages:
- return [(page, headers) for page in pages]
-
- base = data["b"] + data["r"]
- gid = data["i"]
- return [
- ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), headers)
- for page in data["f"]
- ]
-
- @staticmethod
- def _decode(data):
- # https://hentainexus.com/static/js/reader.min.js?r=13
- primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53)
- blob = binascii.a2b_base64(data)
- key = blob[0:64]
-
- C = 0
- for k in key:
- C = C ^ k
- for _ in range(8):
- if C & 1:
- C = C >> 1 ^ 0xc
- else:
- C = C >> 1
- k = primes[C & 0x7]
-
- x = 0
- S = list(range(256))
- for i in range(256):
- x = (x + S[i] + key[i % len(key)]) % 256
- S[i], S[x] = S[x], S[i]
-
- result = ""
- a = c = m = x = 0
- for n in range(64, len(blob)):
- a = (a + k) % 256
- x = (c + S[(x + S[a]) % 256]) % 256
- c = (c + a + S[a]) % 256
-
- S[a], S[x] = S[x], S[a]
- m = S[(x + S[(a + S[(m + c) % 256]) % 256]) % 256]
- result += chr(blob[n] ^ m)
-
- return result
-
- @staticmethod
- def _join_title(data):
- event = data['event']
- artist = data['artist']
- circle = data['circle']
- title = data['title']
- parody = data['parody']
- book = data['book']
- magazine = data['magazine']
-
- # a few galleries have a large number of artists or parodies,
- # which get replaced with "Various" in the title string
- if artist.count(',') >= 3:
- artist = 'Various'
- if parody.count(',') >= 3:
- parody = 'Various'
-
- jt = ''
- if event:
- jt += '({}) '.format(event)
- if circle:
- jt += '[{} ({})] '.format(circle, artist)
- else:
- jt += '[{}] '.format(artist)
- jt += title
- if parody.lower() != 'original work':
- jt += ' ({})'.format(parody)
- if book:
- jt += ' ({})'.format(book)
- if magazine:
- jt += ' ({})'.format(magazine)
- return jt
-
-
-class HentainexusSearchExtractor(Extractor):
- """Extractor for search results on hentainexus.com"""
- category = "hentainexus"
- subcategory = "search"
- root = "https://hentainexus.com"
- pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
- r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$")
- test = (
- ("https://hentainexus.com/?q=tag:%22heart+pupils%22%20tag:group", {
- "pattern": HentainexusGalleryExtractor.pattern,
- "count": ">= 50",
- }),
- ("https://hentainexus.com/page/3?q=tag:%22heart+pupils%22"),
- )
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.params = text.parse_query(match.group(1))
-
- def items(self):
- params = self.params
- path = "/"
- data = {"_extractor": HentainexusGalleryExtractor}
-
- while path:
- page = self.request(self.root + path, params=params).text
- extr = text.extract_from(page)
-
- while True:
- gallery_id = extr('<a href="/view/', '"')
- if not gallery_id:
- break
- yield Message.Queue, self.root + "/view/" + gallery_id, data
-
- path = extr('class="pagination-next" href="', '"')
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index abb6d10..d757e17 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -257,10 +257,11 @@ class ImgclickImageExtractor(ImagehostImageExtractor):
category = "imgclick"
pattern = r"(?:https?://)?((?:www\.)?imgclick\.net/([^/?#]+))"
test = ("http://imgclick.net/4tbrre1oxew9/test-_-_.png.html", {
- "url": "b967f2d372ffb9f5d3a927c6dd560e120b10a808",
+ "url": "140dcb250a325f2d26b2d918c18b8ac6a2a0f6ab",
"keyword": "6895256143eab955622fc149aa367777a8815ba3",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
})
+ https = False
params = "complex"
def get_info(self, page):
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index f6e8f2d..7009c7a 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -377,16 +377,17 @@ class ImgurAPI():
return self._call(endpoint)
def _call(self, endpoint, params=None):
- try:
- return self.extractor.request(
- "https://api.imgur.com" + endpoint,
- params=params, headers=self.headers,
- ).json()
- except exception.HttpError as exc:
- if exc.status != 403 or b"capacity" not in exc.response.content:
- raise
- self.extractor.sleep(seconds=600)
- return self._call(endpoint)
+ while True:
+ try:
+ return self.extractor.request(
+ "https://api.imgur.com" + endpoint,
+ params=params, headers=self.headers,
+ ).json()
+ except exception.HttpError as exc:
+ if exc.status not in (403, 429) or \
+ b"capacity" not in exc.response.content:
+ raise
+ self.extractor.wait(seconds=600)
def _pagination(self, endpoint, params=None, key=None):
num = 0
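Note: the reworked _call() above replaces the single recursive retry with a
loop, so a 403/429 "capacity" response can wait 600 seconds and retry any
number of times without growing the call stack. A minimal standalone sketch
of the same backoff pattern (names are hypothetical):

    import time

    class RateLimited(Exception):
        pass

    def call_with_backoff(fetch, wait=600):
        # Loop instead of recursing so repeated waits stay stack-safe.
        while True:
            try:
                return fetch()
            except RateLimited:
                time.sleep(wait)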
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 81355ce..74c6197 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -384,6 +384,7 @@ class InstagramUserExtractor(InstagramExtractor):
(InstagramStoriesExtractor , stories),
(InstagramHighlightsExtractor, base + "highlights/"),
(InstagramPostsExtractor , base + "posts/"),
+ (InstagramReelsExtractor , base + "reels/"),
(InstagramChannelExtractor , base + "channel/"),
), ("posts",))
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 8a4e413..6e5aec9 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -32,7 +32,7 @@ class KomikcastBase():
if manga:
data["manga"] = manga.partition(" Chapter ")[0]
- if title and title.lower() != "bahasa indonesia":
+ if title and not title.lower().startswith("bahasa indonesia"):
data["title"] = title.strip()
else:
data["title"] = ""
@@ -53,27 +53,23 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4",
}),
(("https://komikcast.com/chapter/"
- "tonari-no-kashiwagi-san-chapter-18b/"), {
- "url": "aff90dd21dbb945a726778b10bdef522af7c42fe",
- "keyword": "19b5783864c4299913de436513b124b028b557c1",
- }),
- (("https://komikcast.com/chapter/090-eko-to-issho-chapter-1/"), {
- "url": "cda104a32ea2b06b3d6b096726622f519ed1fa33",
+ "solo-spell-caster-chapter-37-bahasa-indonesia/"), {
+ "url": "c3d30de6c796ff6ff36eb86e2e6fa2f8add8e829",
+ "keyword": "ed8a0ff73098776988bf66fb700381a2c748f910",
}),
)
def metadata(self, page):
- info = text.extract(page, '<b>', "</b>")[0]
+ info = text.extract(page, "<title>", " &ndash; Komikcast<")[0]
return self.parse_chapter_string(info)
@staticmethod
def images(page):
readerarea = text.extract(
- page, '<div id="readerarea"', '<div class="navig')[0]
+ page, '<div class="main-reading-area', '</div')[0]
return [
(text.unescape(url), None)
for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
- if "/Banner-" not in url and "/WM-Sampingan." not in url
]
@@ -95,7 +91,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
data = self.metadata(page)
for item in text.extract_iter(
- page, '<span class="leftoff"><a href="', '</a>'):
+ page, '<a class="chapter-link-item" href="', '</a'):
url, _, chapter_string = item.rpartition('">Chapter ')
self.parse_chapter_string(chapter_string, data)
results.append((url, data.copy()))
@@ -104,14 +100,15 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
@staticmethod
def metadata(page):
"""Return a dict with general metadata"""
- manga , pos = text.extract(page, "<title>" , "</title>")
- genres, pos = text.extract(page, ">Genres:", "</span>", pos)
+ manga , pos = text.extract(page, "<title>" , " &ndash; Komikcast<")
+ genres, pos = text.extract(
+ page, 'class="komik_info-content-genre">', "</span>", pos)
author, pos = text.extract(page, ">Author:", "</span>", pos)
mtype , pos = text.extract(page, ">Type:" , "</span>", pos)
return {
- "manga": text.unescape(manga[:-12]),
+ "manga": text.unescape(manga),
+ "genres": text.split_html(genres),
"author": text.remove_html(author),
- "genres": text.split_html(genres)[::2],
"type": text.remove_html(mtype),
}
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
new file mode 100644
index 0000000..882031b
--- /dev/null
+++ b/gallery_dl/extractor/manganelo.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract manga-chapters and entire manga from https://manganelo.com/"""
+
+from .common import ChapterExtractor, MangaExtractor
+from .. import text
+import re
+
+
+class ManganeloBase():
+ """Base class for manganelo extractors"""
+ category = "manganelo"
+ root = "https://manganelo.com"
+
+ @staticmethod
+ def parse_page(page, data):
+ """Parse metadata on 'page' and add it to 'data'"""
+ text.extract_all(page, (
+ ("manga" , '<h1>', '</h1>'),
+ ('author' , '</i>Author(s) :</td>', '</tr>'),
+ ), values=data)
+ data["author"] = text.remove_html(data["author"])
+ return data
+
+
+class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
+ """Extractor for manga-chapters from manganelo.com"""
+ pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com"
+ r"(/chapter/\w+/chapter_[^/?#]+)")
+ test = (
+ ("https://manganelo.com/chapter/gq921227/chapter_23", {
+ "pattern": r"https://s\d+\.\w+\.com/mangakakalot/g\d+/gq921227/"
+ r"vol3_chapter_23_24_yen/\d+\.jpg",
+ "keyword": "3748087cf41abc97f991530e6fd53b291490d6d0",
+ "count": 25,
+ }),
+ ("https://manganelo.com/chapter/gamers/chapter_15", {
+ "keyword": "8f59f88d516247011fe122e05746c27e203c8191",
+ "content": "fbec629c71f66b246bfa0604204407c0d1c8ae38",
+ "count": 39,
+ }),
+ )
+
+ def __init__(self, match):
+ self.path = match.group(1)
+ ChapterExtractor.__init__(self, match, self.root + self.path)
+ self.session.headers['Referer'] = self.root
+
+ def metadata(self, page):
+ _ , pos = text.extract(page, '<a class="a-h" ', '/a>')
+ manga , pos = text.extract(page, '<a class="a-h" ', '/a>', pos)
+ info , pos = text.extract(page, '<a class="a-h" ', '/a>', pos)
+ author, pos = text.extract(page, '- Author(s) : ', '</p>', pos)
+
+ manga, _ = text.extract(manga, '">', '<')
+ info , _ = text.extract(info , '">', '<')
+ match = re.match(
+ r"(?:[Vv]ol\. *(\d+) )?"
+ r"[Cc]hapter *([^:]*)"
+ r"(?:: *(.+))?", info)
+ volume, chapter, title = match.groups() if match else ("", "", info)
+ chapter, sep, minor = chapter.partition(".")
+
+ return {
+ "manga" : text.unescape(manga),
+ "title" : text.unescape(title) if title else "",
+ "author" : text.unescape(author) if author else "",
+ "volume" : text.parse_int(volume),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": sep + minor,
+ "lang" : "en",
+ "language" : "English",
+ }
+
+ def images(self, page):
+ page = text.extract(
+ page, 'class="container-chapter-reader', '\n<div')[0]
+ return [
+ (url, None)
+ for url in text.extract_iter(page, '<img src="', '"')
+ ]
+
+
+class ManganeloMangaExtractor(ManganeloBase, MangaExtractor):
+ """Extractor for manga from manganelo.com"""
+ chapterclass = ManganeloChapterExtractor
+ pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com"
+ r"(/(?:manga/|read_)\w+)")
+ test = (
+ ("https://manganelo.com/manga/ol921234", {
+ "url": "8a1810edddbafcde993ecb3558a35c99d8d4f13e",
+ }),
+ ("https://manganelo.com/manga/read_otome_no_teikoku", {
+ "pattern": ManganeloChapterExtractor.pattern,
+ "count": ">= 40"
+ }),
+ )
+
+ def chapters(self, page):
+ results = []
+ data = self.parse_page(page, {"lang": "en", "language": "English"})
+
+ needle = 'class="chapter-name text-nowrap" href="'
+ pos = page.index('<ul class="row-content-chapter">')
+ while True:
+ url, pos = text.extract(page, needle, '"', pos)
+ if not url:
+ return results
+ data["title"], pos = text.extract(page, '>', '</a>', pos)
+ data["date"] , pos = text.extract(
+ page, 'class="chapter-time text-nowrap" title="', '">', pos)
+ chapter, sep, minor = url.rpartition("/chapter_")[2].partition(".")
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = sep + minor
+ results.append((url, data.copy()))
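Note: the chapter regex in ManganeloChapterExtractor.metadata() splits
strings such as "Vol.3 Chapter 23.5: Title" into volume, chapter, and title.
A quick standalone check of that pattern (sample strings are made up):

    import re

    pattern = re.compile(
        r"(?:[Vv]ol\. *(\d+) )?"
        r"[Cc]hapter *([^:]*)"
        r"(?:: *(.+))?")

    for info in ("Vol.3 Chapter 23.5: An Example", "Chapter 15"):
        volume, chapter, title = pattern.match(info).groups()
        chapter, sep, minor = chapter.partition(".")
        print(volume, chapter, sep + minor, title)
    # -> 3 23 .5 An Example
    # -> None 15  None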
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py
deleted file mode 100644
index 30b8ce3..0000000
--- a/gallery_dl/extractor/mangareader.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2020 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.mangareader.net/"""
-
-from .common import ChapterExtractor, MangaExtractor
-from .. import text
-from ..cache import memcache
-import json
-
-
-class MangareaderBase():
- """Base class for mangareader extractors"""
- category = "mangareader"
- root = "https://www.mangareader.net"
-
- @memcache(keyarg=1)
- def _manga_info(self, path, page=None):
- if not page:
- page = self.request(self.root + path).text
- extr = text.extract_from(page)
- data = {
- "manga" : text.unescape(extr('class="name">', '<')),
- "release" : text.unescape(extr('Year of Release :</td><td>', '<')),
- "author" : text.unescape(text.unescape(extr(
- 'Author :</td><td>', '<'))),
- "artist" : text.unescape(text.unescape(extr(
- 'Artist :</td><td>', '<'))),
- "lang" : "en",
- "language": "English",
- }
-
- extr('<table', '>')
- chapters = []
- while True:
- url = extr('</i> <a href="', '"')
- if not url:
- return chapters
- chapter = {
- "chapter": text.parse_int(url.rpartition("/")[2]),
- "title" : text.unescape(extr("</a> : ", "<")),
- "date" : extr("<td>", "<"),
- }
- chapter.update(data)
- chapters.append((self.root + url, chapter))
-
-
-class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor):
- """Extractor for manga-chapters from mangareader.net"""
- archive_fmt = "{manga}_{chapter}_{page}"
- pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?#]+)/(\d+))"
- test = (("https://www.mangareader.net"
- "/karate-shoukoushi-kohinata-minoru/11"), {
- "url": "45ece5668d1e9f65cf2225237d78de58660b54e4",
- "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6",
- })
-
- def __init__(self, match):
- ChapterExtractor.__init__(self, match)
- _, self.path, self.chapter = match.groups()
-
- def metadata(self, page):
- chapter = text.parse_int(self.chapter)
- return self._manga_info(self.path)[chapter-1][1]
-
- def images(self, page):
- data = json.loads(text.extract(
- page, 'document["mj"]=', '</script>')[0])
- return [
- (text.ensure_http_scheme(img["u"]), {
- "width" : text.parse_int(img["w"]),
- "height": text.parse_int(img["h"]),
- })
- for img in data["im"]
- ]
-
-
-class MangareaderMangaExtractor(MangareaderBase, MangaExtractor):
- """Extractor for manga from mangareader.net"""
- chapterclass = MangareaderChapterExtractor
- reverse = False
- pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?#]+)/?$"
- test = ("https://www.mangareader.net/mushishi", {
- "url": "bc203b858b4ad76e5d77e39118a7be0350e357da",
- "keyword": "031b3ea085921c552de017ecbb9b906e462229c9",
- })
-
- def chapters(self, page):
- path = self.manga_url[len(self.root):]
- return self._manga_info(path, page)
diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py
deleted file mode 100644
index 7ff0239..0000000
--- a/gallery_dl/extractor/mangastream.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2019 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract manga-chapters from https://readms.net/"""
-
-from .common import ChapterExtractor
-from .. import text
-
-
-class MangastreamChapterExtractor(ChapterExtractor):
- """Extractor for manga-chapters from mangastream.com"""
- category = "mangastream"
- archive_fmt = "{chapter_id}_{page}"
- pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)"
- r"/r(?:ead)?/([^/]*/([^/]+)/(\d+))")
- test = (
- ("https://readms.net/r/onepunch_man/087/4874/1"),
- ("https://mangastream.com/r/onepunch_man/087/4874/1"),
- )
- root = "https://readms.net"
-
- def __init__(self, match):
- self.part, self.chapter, self.chapter_id = match.groups()
- url = "{}/r/{}".format(self.root, self.part)
- ChapterExtractor.__init__(self, match, url)
-
- def metadata(self, page):
- manga, pos = text.extract(
- page, '<span class="hidden-xs hidden-sm">', "<")
- pos = page.find(self.part, pos)
- title, pos = text.extract(page, ' - ', '<', pos)
- count, pos = text.extract(page, 'Last Page (', ')', pos)
- return {
- "manga": manga,
- "chapter": text.unquote(self.chapter),
- "chapter_id": text.parse_int(self.chapter_id),
- "title": title,
- "count": text.parse_int(count, 1),
- "lang": "en",
- "language": "English",
- }
-
- def images(self, page):
- while True:
- pos = page.index(' class="page"')
- next_url = text.extract(page, ' href="', '"', pos)[0]
- image_url = text.extract(page, ' src="', '"', pos)[0]
- yield text.urljoin(self.root, image_url), None
- page = self.request(text.urljoin(self.root, next_url)).text
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 4eb3ee6..e1081da 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -69,12 +69,23 @@ class NozomiExtractor(Extractor):
post["dataid"] = post["filename"]
yield Message.Url, url, post
+ def posts(self):
+ url = "https://n.nozomi.la" + self.nozomi
+ offset = (text.parse_int(self.pnum, 1) - 1) * 256
+
+ while True:
+ headers = {"Range": "bytes={}-{}".format(offset, offset+255)}
+ response = self.request(url, headers=headers)
+ yield from decode_nozomi(response.content)
+
+ offset += 256
+ cr = response.headers.get("Content-Range", "").rpartition("/")[2]
+ if text.parse_int(cr, offset) <= offset:
+ return
+
def metadata(self):
return {}
- def posts(self):
- return ()
-
@staticmethod
def _list(src):
return [x["tagname_display"] for x in src] if src else ()
@@ -126,12 +137,29 @@ class NozomiPostExtractor(NozomiExtractor):
return (self.post_id,)
+class NozomiIndexExtractor(NozomiExtractor):
+ """Extractor for the nozomi.la index"""
+ subcategory = "index"
+ pattern = (r"(?:https?://)?nozomi\.la/"
+ r"(?:(index(?:-Popular)?)-(\d+)\.html)?(?:$|#|\?)")
+ test = (
+ ("https://nozomi.la/"),
+ ("https://nozomi.la/index-2.html"),
+ ("https://nozomi.la/index-Popular-33.html"),
+ )
+
+ def __init__(self, match):
+ NozomiExtractor.__init__(self, match)
+ index, self.pnum = match.groups()
+ self.nozomi = "/{}.nozomi".format(index or "index")
+
+
class NozomiTagExtractor(NozomiExtractor):
"""Extractor for posts from tag searches on nozomi.la"""
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{postid}"
- pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-\d+\."
+ pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\."
test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", {
"pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$",
"count": ">= 25",
@@ -140,25 +168,13 @@ class NozomiTagExtractor(NozomiExtractor):
def __init__(self, match):
NozomiExtractor.__init__(self, match)
- self.tags = text.unquote(match.group(1)).lower()
+ tags, self.pnum = match.groups()
+ self.tags = text.unquote(tags).lower()
+ self.nozomi = "/nozomi/{}.nozomi".format(self.tags)
def metadata(self):
return {"search_tags": self.tags}
- def posts(self):
- url = "https://n.nozomi.la/nozomi/{}.nozomi".format(self.tags)
- i = 0
-
- while True:
- headers = {"Range": "bytes={}-{}".format(i, i+255)}
- response = self.request(url, headers=headers)
- yield from decode_nozomi(response.content)
-
- i += 256
- cr = response.headers.get("Content-Range", "").rpartition("/")[2]
- if text.parse_int(cr, i) <= i:
- return
-
class NozomiSearchExtractor(NozomiExtractor):
"""Extractor for search results on nozomi.la"""
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
new file mode 100644
index 0000000..f3c5ac2
--- /dev/null
+++ b/gallery_dl/extractor/philomena.py
@@ -0,0 +1,216 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Philomena sites"""
+
+from .booru import BooruExtractor
+from .. import text, exception
+import operator
+
+
+class PhilomenaExtractor(BooruExtractor):
+ """Base class for philomena extractors"""
+ basecategory = "philomena"
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{id}"
+ request_interval = 1.0
+ per_page = 50
+
+ _file_url = operator.itemgetter("view_url")
+
+ @staticmethod
+ def _prepare(post):
+ post["date"] = text.parse_datetime(post["created_at"])
+
+ @staticmethod
+ def _extended_tags(post):
+ pass
+
+ def _pagination(self, url, params):
+ params["page"] = 1
+ params["per_page"] = self.per_page
+
+ api_key = self.config("api-key")
+ if api_key:
+ params["key"] = api_key
+
+ filter_id = self.config("filter")
+ if filter_id:
+ params["filter_id"] = filter_id
+ elif not api_key:
+ try:
+ params["filter_id"] = INSTANCES[self.category]["filter_id"]
+ except (KeyError, TypeError):
+ pass
+
+ while True:
+ data = self.request(url, params=params).json()
+ yield from data["images"]
+
+ if len(data["images"]) < self.per_page:
+ return
+ params["page"] += 1
+
+
+INSTANCES = {
+ "derpibooru": {"root": "https://derpibooru.org",
+ "filter_id": "56027"},
+ "ponybooru" : {"root": "https://ponybooru.org",
+ "filter_id": "2"},
+}
+
+BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
+
+
+class PhilomenaPostExtractor(PhilomenaExtractor):
+ """Extractor for single posts on a Philomena booru"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/(?:images/)?(\d+)"
+ test = (
+ ("https://derpibooru.org/images/1", {
+ "content": "88449eeb0c4fa5d3583d0b794f6bc1d70bf7f889",
+ "count": 1,
+ "keyword": {
+ "animated": False,
+ "aspect_ratio": 1.0,
+ "comment_count": int,
+ "created_at": "2012-01-02T03:12:33Z",
+ "date": "dt:2012-01-02 03:12:33",
+ "deletion_reason": None,
+ "description": "",
+ "downvotes": int,
+ "duplicate_of": None,
+ "duration": 0.04,
+ "extension": "png",
+ "faves": int,
+ "first_seen_at": "2012-01-02T03:12:33Z",
+ "format": "png",
+ "height": 900,
+ "hidden_from_users": False,
+ "id": 1,
+ "mime_type": "image/png",
+ "name": "1__safe_fluttershy_solo_cloud_happy_flying_upvotes+ga"
+ "lore_artist-colon-speccysy_get_sunshine",
+ "orig_sha512_hash": None,
+ "processed": True,
+ "representations": dict,
+ "score": int,
+ "sha512_hash": "f16c98e2848c2f1bfff3985e8f1a54375cc49f78125391"
+ "aeb80534ce011ead14e3e452a5c4bc98a66f56bdfcd07e"
+ "f7800663b994f3f343c572da5ecc22a9660f",
+ "size": 860914,
+ "source_url": "https://www.deviantart.com/speccysy/art"
+ "/Afternoon-Flight-215193985",
+ "spoilered": False,
+ "tag_count": 36,
+ "tag_ids": list,
+ "tags": list,
+ "thumbnails_generated": True,
+ "updated_at": "2020-05-28T13:14:07Z",
+ "uploader": "Clover the Clever",
+ "uploader_id": 211188,
+ "upvotes": int,
+ "view_url": str,
+ "width": 900,
+ "wilson_score": float,
+ },
+ }),
+ ("https://derpibooru.org/1"),
+ ("https://ponybooru.org/images/1", {
+ "content": "bca26f58fafd791fe07adcd2a28efd7751824605",
+ }),
+ )
+
+ def __init__(self, match):
+ PhilomenaExtractor.__init__(self, match)
+ self.image_id = match.group(match.lastindex)
+
+ def posts(self):
+ url = self.root + "/api/v1/json/images/" + self.image_id
+ return (self.request(url).json()["image"],)
+
+
+class PhilomenaSearchExtractor(PhilomenaExtractor):
+ """Extractor for Philomena search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "{search_tags}")
+ pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))"
+ test = (
+ ("https://derpibooru.org/search?q=cute", {
+ "range": "40-60",
+ "count": 21,
+ }),
+ ("https://derpibooru.org/tags/cute", {
+ "range": "40-60",
+ "count": 21,
+ }),
+ ("https://ponybooru.org/search?q=cute", {
+ "range": "40-60",
+ "count": 21,
+ }),
+ )
+
+ def __init__(self, match):
+ PhilomenaExtractor.__init__(self, match)
+ groups = match.groups()
+ if groups[-1]:
+ self.params = {"q": groups[-1]}
+ else:
+ self.params = text.parse_query(groups[-2])
+
+ def metadata(self):
+ return {"search_tags": self.params.get("q", "")}
+
+ def posts(self):
+ url = self.root + "/api/v1/json/search/images"
+ return self._pagination(url, self.params)
+
+
+class PhilomenaGalleryExtractor(PhilomenaExtractor):
+ """Extractor for Philomena galleries"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "galleries",
+ "{gallery[id]} {gallery[title]}")
+ pattern = BASE_PATTERN + r"/galleries/(\d+)"
+ test = (
+ ("https://derpibooru.org/galleries/1", {
+ "pattern": r"https://derpicdn\.net/img/view/\d+/\d+/\d+/\d+[^/]+$",
+ "keyword": {
+ "gallery": {
+ "description": "Indexes start at 1 :P",
+ "id": 1,
+ "spoiler_warning": "",
+ "thumbnail_id": 1,
+ "title": "The Very First Gallery",
+ "user": "DeliciousBlackInk",
+ "user_id": 365446,
+ },
+ },
+ }),
+ ("https://ponybooru.org/galleries/27", {
+ "count": ">= 24",
+ }),
+ )
+
+ def __init__(self, match):
+ PhilomenaExtractor.__init__(self, match)
+ self.gallery_id = match.group(match.lastindex)
+
+ def metadata(self):
+ url = self.root + "/api/v1/json/search/galleries"
+ params = {"q": "id:" + self.gallery_id}
+ galleries = self.request(url, params=params).json()["galleries"]
+ if not galleries:
+ raise exception.NotFoundError("gallery")
+ return {"gallery": galleries[0]}
+
+ def posts(self):
+ gallery_id = "gallery_id:" + self.gallery_id
+ url = self.root + "/api/v1/json/search/images"
+ params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id}
+ return self._pagination(url, params)
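Note: Philomena boorus hide content behind per-user filters, so
_pagination() falls back to the site's filter_id from INSTANCES unless an
api-key or filter option is configured. A hypothetical standalone query
against the same JSON endpoint, using derpibooru's default filter id from
the table above (assumes at least one result):

    import requests

    params = {"q": "cute", "page": 1, "per_page": 50, "filter_id": "56027"}
    data = requests.get(
        "https://derpibooru.org/api/v1/json/search/images",
        params=params).json()
    print(len(data["images"]), data["images"][0]["view_url"])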
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index e5a0486..25344e8 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -220,6 +220,27 @@ class PinterestSectionExtractor(PinterestExtractor):
return self.api.board_section_pins(self.section["id"])
+class PinterestSearchExtractor(PinterestExtractor):
+ """Extractor for Pinterest search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Search", "{search}")
+ pattern = BASE_PATTERN + r"/search/pins/?\?q=([^&#]+)"
+ test = ("https://www.pinterest.de/search/pins/?q=nature", {
+ "range": "1-50",
+ "count": ">= 50",
+ })
+
+ def __init__(self, match):
+ PinterestExtractor.__init__(self, match)
+ self.search = match.group(1)
+
+ def metadata(self):
+ return {"search": self.search}
+
+ def pins(self):
+ return self.api.search(self.search)
+
+
class PinterestRelatedPinExtractor(PinterestPinExtractor):
"""Extractor for related pins of another pin from pinterest.com"""
subcategory = "related-pin"
@@ -296,7 +317,7 @@ class PinterestAPI():
"Accept-Language" : "en-US,en;q=0.5",
"Referer" : BASE_URL + "/",
"X-Requested-With" : "XMLHttpRequest",
- "X-APP-VERSION" : "7a20185",
+ "X-APP-VERSION" : "31461e0",
"X-CSRFToken" : None,
"X-Pinterest-AppState": "active",
"Origin" : BASE_URL,
@@ -364,6 +385,11 @@ class PinterestAPI():
options = {"board_id": board_id, "add_vase": True}
return self._pagination("BoardRelatedPixieFeed", options)
+ def search(self, query):
+ """Yield pins from searches"""
+ options = {"query": query, "scope": "pins", "rs": "typed"}
+ return self._pagination("BaseSearch", options)
+
def login(self):
"""Login and obtain session cookies"""
username, password = self.extractor._get_auth_info()
@@ -421,7 +447,10 @@ class PinterestAPI():
def _pagination(self, resource, options):
while True:
data = self._call(resource, options)
- yield from data["resource_response"]["data"]
+ results = data["resource_response"]["data"]
+ if isinstance(results, dict):
+ results = results["results"]
+ yield from results
try:
bookmarks = data["resource"]["options"]["bookmarks"]
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index db49b90..ebbce67 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -254,8 +254,8 @@ class PixivFavoriteExtractor(PixivExtractor):
"{user_bookmark[id]} {user_bookmark[account]}")
archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?"
- r"users/(\d+)/(bookmarks/artworks(?:/([^/?#]+))?|following)"
- r"|bookmark\.php(?:\?([^#]*))?)")
+ r"users/(\d+)/(bookmarks/artworks|following)(?:/([^/?#]+))?"
+ r"|bookmark\.php)(?:\?([^#]*))?")
test = (
("https://www.pixiv.net/en/users/173530/bookmarks/artworks", {
"url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index a5f0138..ea5bb6d 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -55,6 +55,7 @@ class SankakuExtractor(BooruExtractor):
post["created_at"] = post["created_at"]["s"]
post["date"] = text.parse_timestamp(post["created_at"])
post["tags"] = [tag["name"] for tag in post["tags"]]
+ post["tag_string"] = " ".join(post["tags"])
def _extended_tags(self, post):
tags = collections.defaultdict(list)
@@ -63,6 +64,7 @@ class SankakuExtractor(BooruExtractor):
tags[types[tag["type"]]].append(tag["name"])
for key, value in tags.items():
post["tags_" + key] = value
+ post["tag_string_" + key] = " ".join(value)
class SankakuTagExtractor(SankakuExtractor):
@@ -122,7 +124,13 @@ class SankakuPoolExtractor(SankakuExtractor):
def metadata(self):
pool = SankakuAPI(self).pools(self.pool_id)
+ pool["tags"] = [tag["name"] for tag in pool["tags"]]
+ pool["artist_tags"] = [tag["name"] for tag in pool["artist_tags"]]
+
self._posts = pool.pop("posts")
+ for num, post in enumerate(self._posts, 1):
+ post["num"] = num
+
return {"pool": pool}
def posts(self):
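Note: the new tag_string fields mirror the tag lists as space-joined
strings, which is the form filename format strings usually want. A
hypothetical post dict showing the difference:

    post = {"id": 1234, "tags": ["tag_a", "tag_b"],
            "tag_string": "tag_a tag_b"}
    # Hypothetical filename format string using the joined form:
    print("{id} [{tag_string}]".format_map(post))  # 1234 [tag_a tag_b]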
diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py
new file mode 100644
index 0000000..ec1e044
--- /dev/null
+++ b/gallery_dl/extractor/tapas.py
@@ -0,0 +1,205 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://tapas.io/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+BASE_PATTERN = r"(?:https?://)?tapas\.io"
+
+
+class TapasExtractor(Extractor):
+ """Base class for tapas.io extractors"""
+ category = "tapas"
+ root = "https://tapas.io"
+ directory_fmt = ("{category}", "{series[title]}", "{id} {title}")
+ filename_fmt = "{num:>02}.{extension}"
+ archive_fmt = "{id}_{num}"
+ cookiedomain = ".tapas.io"
+ cookienames = ("_cpc_",)
+ _cache = None
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ if self._cache is None:
+ TapasExtractor._cache = {}
+
+ def items(self):
+ self.login()
+ headers = {"Accept": "application/json, text/javascript, */*;"}
+
+ for episode_id in self.episode_ids():
+ url = "{}/episode/{}".format(self.root, episode_id)
+ data = self.request(url, headers=headers).json()["data"]
+
+ episode = data["episode"]
+ if not episode.get("free") and not episode.get("unlocked"):
+ raise exception.StopExtraction(
+ "Episode '%s' not unlocked (ID %s) ",
+ episode["title"], episode_id)
+
+ html = data["html"]
+ series_id = text.rextract(html, 'data-series-id="', '"')[0]
+ try:
+ episode["series"] = self._cache[series_id]
+ except KeyError:
+ url = "{}/series/{}".format(self.root, series_id)
+ episode["series"] = self._cache[series_id] = self.request(
+ url, headers=headers).json()["data"]
+
+ episode["date"] = text.parse_datetime(episode["publish_date"])
+ yield Message.Directory, episode
+
+ if episode["book"]:
+ content, _ = text.extract(
+ html, '<div class="viewer">', '<div class="viewer-bottom')
+ episode["num"] = 1
+ episode["extension"] = "html"
+ yield Message.Url, "text:" + content, episode
+
+ else: # comic
+ for episode["num"], url in enumerate(text.extract_iter(
+ html, 'data-src="', '"'), 1):
+ yield Message.Url, url, text.nameext_from_url(url, episode)
+
+ def login(self):
+ if not self._check_cookies(self.cookienames):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ else:
+ sc = self.session.cookies.set
+ sc("birthDate" , "1981-02-03", domain=self.cookiedomain)
+ sc("adjustedBirthDate", "1981-02-03", domain=self.cookiedomain)
+
+ @cache(maxage=14*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ url = self.root + "/account/authenticate"
+ headers = {
+ "Referer" : url,
+ }
+ data = {
+ "from" : "https://tapas.io/",
+ "email" : username,
+ "password": password,
+ }
+ response = self.request(
+ url, method="POST", headers=headers, data=data)
+
+ if not response.history or \
+ "/account/signin_fail" in response.history[-1].url:
+ raise exception.AuthenticationError()
+
+ return {"_cpc_": response.history[0].cookies.get("_cpc_")}
+
+
+class TapasSeriesExtractor(TapasExtractor):
+ subcategory = "series"
+ pattern = BASE_PATTERN + r"/series/([^/?#]+)"
+ test = (
+ ("https://tapas.io/series/just-leave-me-be", {
+ "pattern": r"https://\w+\.cloudfront\.net/pc/\w\w/[0-9a-f-]+\.jpg",
+ "count": 127,
+ }),
+ ("https://tapas.io/series/yona", { # mature
+ "count": 26,
+ }),
+ )
+
+ def __init__(self, match):
+ TapasExtractor.__init__(self, match)
+ self.series_name = match.group(1)
+
+ def episode_ids(self):
+ url = "{}/series/{}".format(self.root, self.series_name)
+ series_id, _, episode_id = text.extract(
+ self.request(url).text, 'content="tapastic://series/', '"',
+ )[0].partition("/episodes/")
+
+ url = "{}/series/{}/episodes".format(self.root, series_id)
+ headers = {"Accept": "application/json, text/javascript, */*;"}
+ params = {
+ "eid" : episode_id,
+ "page" : 1,
+ "sort" : "OLDEST",
+ "last_access": "0",
+ "max_limit" : "20",
+ }
+
+ while True:
+ data = self.request(
+ url, params=params, headers=headers).json()["data"]
+ yield from text.extract_iter(
+ data["body"], 'data-href="/episode/', '"')
+
+ if not data["pagination"]["has_next"]:
+ return
+ params["page"] += 1
+
+
+class TapasEpisodeExtractor(TapasExtractor):
+ subcategory = "episode"
+ pattern = BASE_PATTERN + r"/episode/(\d+)"
+ test = ("https://tapas.io/episode/2068651", {
+ "url": "0e536117dfaa17972e83d2e0141e6f9e91a33611",
+ "pattern": "^text:",
+ "keyword": {
+ "book": True,
+ "comment_cnt": int,
+ "date": "dt:2021-02-23 16:02:07",
+ "early_access": False,
+ "escape_title": "You are a Tomb Raider (2)",
+ "free": True,
+ "id": 2068651,
+ "like_cnt": int,
+ "liked": bool,
+ "mature": False,
+ "next_ep_id": 2068652,
+ "nsfw": False,
+ "nu": False,
+ "num": 1,
+ "open_comments": True,
+ "pending_scene": 2,
+ "prev_ep_id": 2068650,
+ "publish_date": "2021-02-23T16:02:07Z",
+ "read": bool,
+ "related_ep_id": None,
+ "relative_publish_date": "Feb 23",
+ "scene": 2,
+ "scheduled": False,
+ "title": "You are a Tomb Raider (2)",
+ "unlock_cnt": 0,
+ "unlocked": False,
+ "view_cnt": int,
+
+ "series": {
+ "genre": dict,
+ "has_book_cover": True,
+ "has_top_banner": True,
+ "id": 199931,
+ "premium": True,
+ "sale_type": "PAID",
+ "subscribed": bool,
+ "thumbsup_cnt": int,
+ "title": "Tomb Raider King",
+ "type": "BOOKS",
+ "url": "tomb-raider-king-novel",
+ },
+ },
+ })
+
+ def __init__(self, match):
+ TapasExtractor.__init__(self, match)
+ self.episode_id = match.group(1)
+
+ def episode_ids(self):
+ return (self.episode_id,)
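Note: login() prefers real credentials (per the username & password support
added in this release) and otherwise just sets birth-date cookies to pass
the age gate. A sketch of that anonymous fallback, using the same cookie
names and values as above:

    import requests

    session = requests.Session()
    session.cookies.set("birthDate", "1981-02-03", domain=".tapas.io")
    session.cookies.set("adjustedBirthDate", "1981-02-03", domain=".tapas.io")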
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index cf57a4d..243710d 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -257,9 +257,6 @@ class TumblrPostExtractor(TumblrExtractor):
("https://mikf123.tumblr.com/post/167623351559/link-post", {
"count": 2,
}),
- ("https://muyanna.tumblr.com/post/180692431632/answer-post", {
- "count": 1,
- }),
("https://mikf123.tumblr.com/post/167633596145/video-post", {
"count": 2,
}),
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index a7d2de5..c323fe0 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -113,19 +113,18 @@ class TwitterExtractor(Extractor):
"url" : base + "orig",
"width" : width,
"height" : height,
- "_fallback": self._image_fallback(base, url),
+ "_fallback": self._image_fallback(base, url + ":"),
}))
else:
files.append({"url": media["media_url"]})
@staticmethod
- def _image_fallback(base, url):
- url += ":"
- yield url + "orig"
+ def _image_fallback(new, old):
+ yield old + "orig"
for size in ("large", "medium", "small"):
- yield base + size
- yield url + size
+ yield new + size
+ yield old + size
def _extract_card(self, tweet, files):
card = tweet["card"]
@@ -139,7 +138,7 @@ class TwitterExtractor(Extractor):
if key in bvals:
files.append(bvals[key]["image_value"])
return
- else:
+ elif self.videos:
url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"])
files.append({"url": url})
@@ -224,6 +223,22 @@ class TwitterExtractor(Extractor):
}
return cache[uid]
+ def _users_result(self, users):
+ userfmt = self.config("users")
+ if not userfmt or userfmt == "timeline":
+ cls = TwitterTimelineExtractor
+ fmt = (self.root + "/i/user/{rest_id}").format_map
+ elif userfmt == "media":
+ cls = TwitterMediaExtractor
+ fmt = (self.root + "/id:{rest_id}/media").format_map
+ else:
+ cls = None
+ fmt = userfmt.format_map
+
+ for user in users:
+ user["_extractor"] = cls
+ yield Message.Queue, fmt(user), user
+
def metadata(self):
"""Return general metadata"""
return {}
@@ -261,6 +276,10 @@ class TwitterExtractor(Extractor):
response = self.request(
url, method="POST", cookies=cookies, data=data)
+ if "/account/login_verification" in response.url:
+ raise exception.AuthenticationError(
+ "Login with two-factor authentication is not supported")
+
cookies = {
cookie.name: cookie.value
for cookie in self.session.cookies
@@ -320,6 +339,9 @@ class TwitterLikesExtractor(TwitterExtractor):
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)"
test = ("https://twitter.com/supernaturepics/likes",)
+ def metadata(self):
+ return {"user_likes": self.user}
+
def tweets(self):
return TwitterAPI(self).timeline_favorites(self.user)
@@ -356,10 +378,7 @@ class TwitterListMembersExtractor(TwitterExtractor):
def items(self):
self.login()
- for user in TwitterAPI(self).list_members(self.user):
- user["_extractor"] = TwitterTimelineExtractor
- url = "{}/i/user/{}".format(self.root, user["rest_id"])
- yield Message.Queue, url, user
+ return self._users_result(TwitterAPI(self).list_members(self.user))
class TwitterFollowingExtractor(TwitterExtractor):
@@ -373,10 +392,7 @@ class TwitterFollowingExtractor(TwitterExtractor):
def items(self):
self.login()
- for user in TwitterAPI(self).user_following(self.user):
- user["_extractor"] = TwitterTimelineExtractor
- url = "{}/i/user/{}".format(self.root, user["rest_id"])
- yield Message.Queue, url, user
+ return self._users_result(TwitterAPI(self).user_following(self.user))
class TwitterSearchExtractor(TwitterExtractor):
@@ -485,6 +501,34 @@ class TwitterTweetExtractor(TwitterExtractor):
return TwitterAPI(self).tweet(self.tweet_id)
+class TwitterImageExtractor(Extractor):
+ category = "twitter"
+ subcategory = "image"
+ pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
+ test = (
+ ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg%name=orig"),
+ ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.id, self.fmt = match.groups()
+
+ def items(self):
+ base = "https://pbs.twimg.com/media/" + self.id
+ new = base + "?format=" + self.fmt + "&name="
+ old = base + "." + self.fmt + ":"
+
+ data = {
+ "filename": self.id,
+ "extension": self.fmt,
+ "_fallback": TwitterExtractor._image_fallback(new, old),
+ }
+
+ yield Message.Directory, data
+ yield Message.Url, new + "orig", data
+
+
class TwitterAPI():
def __init__(self, extractor):
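Note: the renamed _image_fallback(new, old) parameters reflect the two
pbs.twimg.com URL styles, "?format=<ext>&name=<size>" (new) and
".<ext>:<size>" (old). A standalone sketch of the candidate order it yields:

    def image_fallback(new, old):
        # Same order as TwitterExtractor._image_fallback above:
        # legacy ':orig' first, then both styles per named size.
        yield old + "orig"
        for size in ("large", "medium", "small"):
            yield new + size
            yield old + size

    base = "https://pbs.twimg.com/media/EqcpviCVoAAG-QG"
    for url in image_fallback(base + "?format=jpg&name=", base + ".jpg:"):
        print(url)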
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index c653c01..886353f 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -69,7 +69,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
subcategory = "image"
pattern = BASE_PATTERN + r"/photos/([^/?#]+)"
test = ("https://unsplash.com/photos/lsoogGC_5dg", {
- "url": "00accb0a64d5a0df0db911f8b425892718dce524",
+ "url": "ac9d194f58b3fc9aacdfc9784c1b69868f212b6e",
"keyword": {
"alt_description": "re:silhouette of trees near body of water ",
"blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz",
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
new file mode 100644
index 0000000..1ce1140
--- /dev/null
+++ b/gallery_dl/extractor/vk.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://vk.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import re
+
+
+class VkPhotosExtractor(Extractor):
+ """Extractor for photos from a vk user"""
+ category = "vk"
+ subcategory = "photos"
+ directory_fmt = ("{category}", "{user[id]}")
+ filename_fmt = "{id}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://vk.com"
+ request_interval = 1.0
+ pattern = r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:albums|photos|id)(\d+)"
+ test = (
+ ("https://vk.com/id398982326", {
+ "pattern": r"https://sun\d+-\d+\.userapi\.com/c\d+/v\d+"
+ r"/[0-9a-f]+/[\w-]+\.jpg",
+ "count": ">= 35",
+ }),
+ ("https://m.vk.com/albums398982326"),
+ ("https://www.vk.com/id398982326?profile=1"),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def items(self):
+ user_id = self.user_id
+
+ if self.config("metadata"):
+ url = "{}/id{}".format(self.root, user_id)
+ extr = text.extract_from(self.request(url).text)
+ data = {"user": {
+ "id" : user_id,
+ "nick": text.unescape(extr(
+ "<title>", " | VK<")),
+ "name": text.unescape(extr(
+ '<h1 class="page_name">', "<")).replace(" ", " "),
+ "info": text.unescape(text.remove_html(extr(
+ '<span class="current_text">', '</span')))
+ }}
+ else:
+ data = {"user": {"id": user_id}}
+
+ photos_url = "{}/photos{}".format(self.root, user_id)
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "Origin" : self.root,
+ "Referer" : photos_url,
+ }
+ params = {
+ "al" : "1",
+ "al_ad" : "0",
+ "offset": 0,
+ "part" : "1",
+ }
+
+ yield Message.Directory, data
+ sub = re.compile(r"/imp[fg]/").sub
+ needle = 'data-id="{}_'.format(user_id)
+
+ while True:
+ offset, html = self.request(
+ photos_url, method="POST", headers=headers, data=params
+ ).json()["payload"][1]
+
+ cnt = 0  # guard against a NameError below when a page yields no photos
+ for cnt, photo in enumerate(text.extract_iter(html, needle, ')')):
+ data["id"] = photo[:photo.find('"')]
+ url = photo[photo.rindex("(")+1:]
+ url = sub("/", url.partition("?")[0])
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ if cnt <= 40 or offset == params["offset"]:
+ return
+ params["offset"] = offset
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index 1dd5b09..f8da191 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -203,7 +203,7 @@ class WeasylJournalsExtractor(WeasylExtractor):
class WeasylFavoriteExtractor(WeasylExtractor):
subcategory = "favorite"
directory_fmt = ("{category}", "{owner_login}", "Favorites")
- pattern = BASE_PATTERN + r"favorites\?userid=(\d+)&feature=submit"
+ pattern = BASE_PATTERN + r"favorites\?userid=(\d+)"
test = ("https://www.weasyl.com/favorites?userid=184616&feature=submit", {
"count": ">= 5",
})
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
index 428c6b5..7fd60b1 100644
--- a/gallery_dl/extractor/wikiart.py
+++ b/gallery_dl/extractor/wikiart.py
@@ -72,7 +72,7 @@ class WikiartArtistExtractor(WikiartExtractor):
pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$"
test = ("https://www.wikiart.org/en/thomas-cole", {
"url": "5ba2fbe6783fcce34e65014d16e5fbc581490c98",
- "keyword": "6d92913c55675e05553f000cfee5daff0b4107cf",
+ "keyword": "eb5b141cf33e6d279afd1518aae24e61cc0adf81",
})
def __init__(self, match):
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 0f40bb9..d3b4a90 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -42,7 +42,14 @@ class Job():
self.status = 0
self.pred_url = self._prepare_predicates("image", True)
self.pred_queue = self._prepare_predicates("chapter", False)
+ self.kwdict = {}
+ # user-supplied metadata
+ kwdict = self.extractor.config("keywords")
+ if kwdict:
+ self.kwdict.update(kwdict)
+
+ # data from parent job
if parent:
pextr = parent.extractor
@@ -57,9 +64,6 @@ class Job():
# reuse connection adapters
extr.session.adapters = pextr.session.adapters
- # user-supplied metadata
- self.userkwds = self.extractor.config("keywords")
-
def run(self):
"""Execute or run the job"""
sleep = self.extractor.config("sleep-extractor")
@@ -137,8 +141,8 @@ class Job():
extr = self.extractor
kwdict["category"] = extr.category
kwdict["subcategory"] = extr.subcategory
- if self.userkwds:
- kwdict.update(self.userkwds)
+ if self.kwdict:
+ kwdict.update(self.kwdict)
def _prepare_predicates(self, target, skip=True):
predicates = []
@@ -183,7 +187,7 @@ class Job():
class DownloadJob(Job):
"""Download images into appropriate directory/filename locations"""
- def __init__(self, url, parent=None):
+ def __init__(self, url, parent=None, kwdict=None):
Job.__init__(self, url, parent)
self.log = self.get_logger("download")
self.blacklist = None
@@ -198,6 +202,11 @@ class DownloadJob(Job):
pfmt = parent.pathfmt
if pfmt and parent.extractor.config("parent-directory"):
self.extractor._parentdir = pfmt.directory
+ if parent.extractor.config("parent-metadata"):
+ if parent.kwdict:
+ self.kwdict.update(parent.kwdict)
+ if kwdict:
+ self.kwdict.update(kwdict)
else:
self.visited = set()
@@ -280,8 +289,9 @@ class DownloadJob(Job):
return
self.visited.add(url)
- if "_extractor" in kwdict:
- extr = kwdict["_extractor"].from_url(url)
+ cls = kwdict.get("_extractor")
+ if cls:
+ extr = cls.from_url(url)
else:
extr = extractor.find(url)
if extr:
@@ -291,7 +301,7 @@ class DownloadJob(Job):
extr = None
if extr:
- self.status |= self.__class__(extr, self).run()
+ self.status |= self.__class__(extr, self, kwdict).run()
else:
self._write_unsupported(url)
@@ -474,7 +484,9 @@ class DownloadJob(Job):
class SimulationJob(DownloadJob):
"""Simulate the extraction process without downloading anything"""
- def handle_url(self, url, kwdict, fallback=None):
+ def handle_url(self, url, kwdict):
+ if not kwdict["extension"]:
+ kwdict["extension"] = "jpg"
self.pathfmt.set_filename(kwdict)
self.out.skip(self.pathfmt.path)
if self.sleep:
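Note: Job.kwdict now always exists: it starts from the user's "keywords"
option and, when the parent sets "parent-metadata", child DownloadJobs
inherit the parent's kwdict plus the queued item's kwdict. Shown as the
equivalent Python dict, a hypothetical configuration exercising both
options:

    # Equivalent of the JSON configuration file (hypothetical values):
    config = {
        "extractor": {
            "keywords": {"collection": "favorites"},  # user metadata
            "parent-metadata": True,  # pass kwdict on to child jobs
        },
    }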
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 8b06384..a6a9105 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,33 +10,11 @@
import re
import html
-import os.path
import datetime
import urllib.parse
-
HTML_RE = re.compile("<[^>]+>")
-INVALID_XML_CHARS = (
- "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
- "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
- "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a",
- "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
-)
-
-
-def clean_xml(xmldata, repl=""):
- """Replace/Remove invalid control characters in 'xmldata'"""
- if not isinstance(xmldata, str):
- try:
- xmldata = "".join(xmldata)
- except TypeError:
- return ""
- for char in INVALID_XML_CHARS:
- if char in xmldata:
- xmldata = xmldata.replace(char, repl)
- return xmldata
-
def remove_html(txt, repl=" ", sep=" "):
"""Remove html-tags from a string"""
@@ -49,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "):
return txt.strip()
-def split_html(txt, sep=None):
- """Split input string by html-tags"""
+def split_html(txt):
+ """Split input string by HTML tags"""
try:
return [
- x.strip() for x in HTML_RE.split(txt)
+ unescape(x).strip()
+ for x in HTML_RE.split(txt)
if x and not x.isspace()
]
except TypeError:
@@ -77,18 +56,22 @@ def filename_from_url(url):
def ext_from_url(url):
"""Extract the filename extension of an URL"""
- filename = filename_from_url(url)
- ext = os.path.splitext(filename)[1]
- return ext[1:].lower()
+ name, _, ext = filename_from_url(url).rpartition(".")
+ return ext.lower() if name else ""
def nameext_from_url(url, data=None):
"""Extract the last part of an URL and fill 'data' accordingly"""
if data is None:
data = {}
- name = unquote(filename_from_url(url))
- data["filename"], ext = os.path.splitext(name)
- data["extension"] = ext[1:].lower()
+
+ filename = unquote(filename_from_url(url))
+ name, _, ext = filename.rpartition(".")
+ if name:
+ data["filename"], data["extension"] = name, ext.lower()
+ else:
+ data["filename"], data["extension"] = filename, ""
+
return data
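Note: ext_from_url() and nameext_from_url() now use str.rpartition instead
of os.path.splitext, so a dotless filename yields an empty extension rather
than an empty name. A self-contained check mirroring the updated tests:

    from urllib.parse import unquote, urlsplit

    def nameext(url):
        # Same rpartition logic as the rewritten nameext_from_url
        filename = unquote(urlsplit(url).path.rpartition("/")[2])
        name, _, ext = filename.rpartition(".")
        return (name, ext.lower()) if name else (filename, "")

    print(nameext("http://example.org/v2/filename.ExT"))  # ('filename', 'ext')
    print(nameext("http://example.org/v2/filename"))      # ('filename', '')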
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index f1c49e9..b75f444 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.17.0"
+__version__ = "1.17.2"
diff --git a/test/test_text.py b/test/test_text.py
index 34585d1..1daefde 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
-# Copyright 2015-2020 Mike Fährmann
+# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -23,29 +23,6 @@ INVALID_ALT = ((), [], {}, None, "")
class TestText(unittest.TestCase):
- def test_clean_xml(self, f=text.clean_xml):
- # standard usage
- self.assertEqual(f(""), "")
- self.assertEqual(f("foo"), "foo")
- self.assertEqual(f("\tfoo\nbar\r"), "\tfoo\nbar\r")
- self.assertEqual(f("<foo>\ab\ba\fr\v</foo>"), "<foo>bar</foo>")
-
- # 'repl' argument
- repl = "#"
- self.assertEqual(f("", repl), "")
- self.assertEqual(f("foo", repl), "foo")
- self.assertEqual(f("\tfoo\nbar\r", repl), "\tfoo\nbar\r")
- self.assertEqual(
- f("<foo>\ab\ba\fr\v</foo>", repl), "<foo>#b#a#r#</foo>")
-
- # removal of all illegal control characters
- value = "".join(chr(x) for x in range(32))
- self.assertEqual(f(value), "\t\n\r")
-
- # 'invalid' arguments
- for value in INVALID:
- self.assertEqual(f(value), "")
-
def test_remove_html(self, f=text.remove_html):
result = "Hello World."
@@ -82,6 +59,10 @@ class TestText(unittest.TestCase):
self.assertEqual(
f("<div><b class='a'>Hello</b><i>World.</i></div>"), result)
+ # escaped HTML entities
+ self.assertEqual(
+ f("<i>&lt;foo&gt;</i> <i>&lt;bar&gt; </i>"), ["<foo>", "<bar>"])
+
# empty HTML
self.assertEqual(f("<div></div>"), empty)
self.assertEqual(f(" <div> </div> "), empty)
@@ -142,8 +123,9 @@ class TestText(unittest.TestCase):
# standard usage
self.assertEqual(f(""), "")
+ self.assertEqual(f("filename"), "")
self.assertEqual(f("filename.ext"), result)
- self.assertEqual(f("/filename.ext"), result)
+ self.assertEqual(f("/filename.ExT"), result)
self.assertEqual(f("example.org/filename.ext"), result)
self.assertEqual(f("http://example.org/v2/filename.ext"), result)
self.assertEqual(
@@ -160,7 +142,7 @@ class TestText(unittest.TestCase):
# standard usage
self.assertEqual(f(""), empty)
self.assertEqual(f("filename.ext"), result)
- self.assertEqual(f("/filename.ext"), result)
+ self.assertEqual(f("/filename.ExT"), result)
self.assertEqual(f("example.org/filename.ext"), result)
self.assertEqual(f("http://example.org/v2/filename.ext"), result)
self.assertEqual(