diff options
author | Unit 193 <unit193@unit193.net> | 2021-04-13 19:33:55 -0400 |
---|---|---|
committer | Unit 193 <unit193@unit193.net> | 2021-04-13 19:33:55 -0400 |
commit | 027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289 (patch) | |
tree | 45e9927640751d54f1c2331595e6a804807a388f | |
parent | a7f4d54b42ad98cd8e28bff2891097e0eebfac7c (diff) | |
parent | d27dcd4646242d6da8436f14c7b37ce864355858 (diff) | |
download | gallery-dl-027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289.tar.bz2 gallery-dl-027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289.tar.xz gallery-dl-027af8fd5fb02b9cb2fc3bc6e58ed41ac1056289.tar.zst |
Update upstream source from tag 'upstream/1.17.2'
Update to upstream version '1.17.2'
with Debian dir 223e9a6bbd333c762be6ae0b8588efbfc0885dd0
46 files changed, 1270 insertions, 1008 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index ef4148a..d57583e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,57 @@ # Changelog +## 1.17.2 - 2021-04-02 +### Additions +- [deviantart] add support for posts from watched users ([#794](https://github.com/mikf/gallery-dl/issues/794)) +- [manganelo] add `chapter` and `manga` extractors ([#1415](https://github.com/mikf/gallery-dl/issues/1415)) +- [pinterest] add `search` extractor ([#1411](https://github.com/mikf/gallery-dl/issues/1411)) +- [sankaku] add `tag_string` metadata field ([#1388](https://github.com/mikf/gallery-dl/issues/1388)) +- [sankaku] add enumeration index for books ([#1388](https://github.com/mikf/gallery-dl/issues/1388)) +- [tapas] add `series` and `episode` extractors ([#692](https://github.com/mikf/gallery-dl/issues/692)) +- [tapas] implement login with username & password ([#692](https://github.com/mikf/gallery-dl/issues/692)) +- [twitter] allow specifying a custom format for user results ([#1337](https://github.com/mikf/gallery-dl/issues/1337)) +- [twitter] add extractor for direct image links ([#1417](https://github.com/mikf/gallery-dl/issues/1417)) +- [vk] add support for albums ([#474](https://github.com/mikf/gallery-dl/issues/474)) +### Fixes +- [aryion] unescape paths ([#1414](https://github.com/mikf/gallery-dl/issues/1414)) +- [bcy] improve pagination +- [deviantart] update `watch` URL pattern ([#794](https://github.com/mikf/gallery-dl/issues/794)) +- [deviantart] fix arguments for search/popular results ([#1408](https://github.com/mikf/gallery-dl/issues/1408)) +- [deviantart] use fallback for `/intermediary/` URLs +- [exhentai] improve and simplify image limit checks +- [komikcast] fix extraction +- [pixiv] fix `favorite` URL pattern ([#1405](https://github.com/mikf/gallery-dl/issues/1405)) +- [sankaku] simplify `pool` tags ([#1388](https://github.com/mikf/gallery-dl/issues/1388)) +- [twitter] improve error message when trying to log in with 2FA ([#1409](https://github.com/mikf/gallery-dl/issues/1409)) +- [twitter] don't use youtube-dl for cards when videos are disabled ([#1416](https://github.com/mikf/gallery-dl/issues/1416)) + +## 1.17.1 - 2021-03-19 +### Additions +- [architizer] add `project` and `firm` extractors ([#1369](https://github.com/mikf/gallery-dl/issues/1369)) +- [deviantart] add `watch` extractor ([#794](https://github.com/mikf/gallery-dl/issues/794)) +- [exhentai] support `/tag/` URLs ([#1363](https://github.com/mikf/gallery-dl/issues/1363)) +- [gelbooru_v01] support `drawfriends.booru.org`, `vidyart.booru.org`, and `tlb.booru.org` by default +- [nozomi] support `/index-N.html` URLs ([#1365](https://github.com/mikf/gallery-dl/issues/1365)) +- [philomena] add generalized extractors for philomena sites ([#1379](https://github.com/mikf/gallery-dl/issues/1379)) +- [philomena] support post URLs without `/images/` +- [twitter] implement `users` option ([#1337](https://github.com/mikf/gallery-dl/issues/1337)) +- implement `parent-metadata` option ([#1364](https://github.com/mikf/gallery-dl/issues/1364)) +### Changes +- [deviantart] revert previous changes to `extra` option ([#1356](https://github.com/mikf/gallery-dl/issues/1356), [#1387](https://github.com/mikf/gallery-dl/issues/1387)) +### Fixes +- [exhentai] improve favorites count extraction ([#1360](https://github.com/mikf/gallery-dl/issues/1360)) +- [gelbooru] update domain for video downloads ([#1368](https://github.com/mikf/gallery-dl/issues/1368)) +- [hentaifox] improve image and metadata extraction ([#1366](https://github.com/mikf/gallery-dl/issues/1366), [#1378](https://github.com/mikf/gallery-dl/issues/1378)) +- [imgur] fix and improve rate limit handling ([#1386](https://github.com/mikf/gallery-dl/issues/1386)) +- [weasyl] improve favorites URL pattern ([#1374](https://github.com/mikf/gallery-dl/issues/1374)) +- use type check before applying `browser` option ([#1358](https://github.com/mikf/gallery-dl/issues/1358)) +- ensure `-s/--simulate` always prints filenames ([#1360](https://github.com/mikf/gallery-dl/issues/1360)) +### Removals +- [hentaicafe] remove module +- [hentainexus] remove module +- [mangareader] remove module +- [mangastream] remove module + ## 1.17.0 - 2021-03-05 ### Additions - [cyberdrop] add support for `https://cyberdrop.me/` ([#1328](https://github.com/mikf/gallery-dl/issues/1328)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.17.0 +Version: 1.17.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -223,6 +223,7 @@ Description: ========== ``pinterest``, ``sankaku``, ``subscribestar``, + ``tapas``, ``tsumino``, and ``twitter``. @@ -328,7 +329,7 @@ Description: ========== .. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst - .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst + .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ @@ -64,8 +64,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -212,6 +212,7 @@ and optional for ``pinterest``, ``sankaku``, ``subscribestar``, +``tapas``, ``tsumino``, and ``twitter``. @@ -317,7 +318,7 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst -.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst +.. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index c420d9b..1ab1ec6 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2021-03-05" "1.17.0" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2021-04-02" "1.17.2" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index c0629bb..608c2e5 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2021-03-05" "1.17.0" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2021-04-02" "1.17.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -155,6 +155,17 @@ Use an extractor's current target directory as for any spawned child extractors. +.SS extractor.*.parent-metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Overwrite any metadata provided by a child extractor with its parent's. + + .SS extractor.*.path-restrict .IP "Type:" 6 \f[I]string\f[] or \f[I]object\f[] @@ -352,6 +363,8 @@ and optional for .br * \f[I]subscribestar\f[] .br +* \f[I]tapas\f[] +.br * \f[I]tsumino\f[] .br * \f[I]twitter\f[] @@ -863,7 +876,7 @@ See \f[I]Filters\f[] for details. \f[I]false\f[] .IP "Description:" 4 -Download embedded Deviations and Sta.sh resources from +Download extra Sta.sh resources from description texts and journals. Note: Enabling this option also enables deviantart.metadata_. @@ -1046,21 +1059,6 @@ depending on the input URL * \f[I]"exhentai.org"\f[]: Use \f[I]exhentai.org\f[] for all URLs -.SS extractor.exhentai.limits -.IP "Type:" 6 -\f[I]bool\f[] or \f[I]integer\f[] - -.IP "Default:" 9 -\f[I]true\f[] - -.IP "Description:" 4 -Check image download limits -and stop extraction when they are exceeded. - -If this value is an \f[I]integer\f[], it gets used as the limit maximum -instead of the value listed on \f[I]https://e-hentai.org/home.php\f[] - - .SS extractor.exhentai.metadata .IP "Type:" 6 \f[I]bool\f[] @@ -1272,7 +1270,7 @@ A (comma-separated) list of subcategories to include when processing a user profile. Possible values are -\f[I]"posts"\f[], \f[I]"stories"\f[], \f[I]"highlights"\f[], \f[I]"channel"\f[]. +\f[I]"posts"\f[], \f[I]reels\f[], \f[I]"stories"\f[], \f[I]"highlights"\f[], \f[I]"channel"\f[]. You can use \f[I]"all"\f[] instead of listing all values separately. @@ -1599,6 +1597,7 @@ linked to in the initial set of submissions. This value sets the maximum recursion depth. Special values: + .br * \f[I]0\f[]: Recursion is disabled .br @@ -1844,6 +1843,35 @@ will be taken from the original Tweets, not the Retweets. Extract \f[I]TwitPic\f[] embeds. +.SS extractor.twitter.users +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"timeline"\f[] + +.IP "Example:" 4 +"https://twitter.com/search?q=from:{legacy[screen_name]}" + +.IP "Description:" 4 +Format string for user URLs generated from +.br +\f[I]following\f[] and \f[I]list-members\f[] queries, +whose replacement field values come from Twitter \f[I]user\f[] objects +.br +(\f[I]Example\f[]) + +Special values: + +.br +* \f[I]"timeline"\f[]: \f[I]https://twitter.com/i/user/{rest_id}\f[] +.br +* \f[I]"media"\f[]: \f[I]https://twitter.com/id:{rest_id}/media\f[] + +Note: To allow gallery-dl to follow custom URL formats, set the \f[I]blacklist\f[] +for \f[I]twitter\f[] to a non-default value, e.g. an empty string \f[I]""\f[]. + + .SS extractor.twitter.videos .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index acf60c7..8a3d9e2 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -79,7 +79,6 @@ "username": null, "password": null, "domain": "auto", - "limits": true, "metadata": false, "original": true, "sleep-request": 5.0 @@ -254,6 +253,7 @@ "replies": true, "retweets": true, "twitpic": false, + "users": "timeline", "videos": true }, "unsplash": diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index fbf67fe..f233a1a 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.17.0 +Version: 1.17.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -75,8 +75,8 @@ Description: ========== Prebuilt executable files with a Python interpreter and required Python packages included are available for - - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.exe>`__ - - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.0/gallery-dl.bin>`__ + - `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.exe>`__ + - `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.17.2/gallery-dl.bin>`__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -223,6 +223,7 @@ Description: ========== ``pinterest``, ``sankaku``, ``subscribestar``, + ``tapas``, ``tsumino``, and ``twitter``. @@ -328,7 +329,7 @@ Description: ========== .. _gallery-dl.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl.conf .. _gallery-dl-example.conf: https://github.com/mikf/gallery-dl/blob/master/docs/gallery-dl-example.conf .. _configuration.rst: https://github.com/mikf/gallery-dl/blob/master/docs/configuration.rst - .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst + .. _Supported Sites: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.md .. _Python: https://www.python.org/downloads/ .. _PyPI: https://pypi.org/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 89ae8ed..09e7097 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -42,6 +42,7 @@ gallery_dl/extractor/8kun.py gallery_dl/extractor/8muses.py gallery_dl/extractor/__init__.py gallery_dl/extractor/adultempire.py +gallery_dl/extractor/architizer.py gallery_dl/extractor/artstation.py gallery_dl/extractor/aryion.py gallery_dl/extractor/bcy.py @@ -51,7 +52,6 @@ gallery_dl/extractor/booru.py gallery_dl/extractor/common.py gallery_dl/extractor/cyberdrop.py gallery_dl/extractor/danbooru.py -gallery_dl/extractor/derpibooru.py gallery_dl/extractor/deviantart.py gallery_dl/extractor/directlink.py gallery_dl/extractor/dynastyscans.py @@ -70,12 +70,10 @@ gallery_dl/extractor/gelbooru_v02.py gallery_dl/extractor/gfycat.py gallery_dl/extractor/hbrowse.py gallery_dl/extractor/hentai2read.py -gallery_dl/extractor/hentaicafe.py gallery_dl/extractor/hentaifoundry.py gallery_dl/extractor/hentaifox.py gallery_dl/extractor/hentaihand.py gallery_dl/extractor/hentaihere.py -gallery_dl/extractor/hentainexus.py gallery_dl/extractor/hiperdex.py gallery_dl/extractor/hitomi.py gallery_dl/extractor/idolcomplex.py @@ -102,9 +100,8 @@ gallery_dl/extractor/mangadex.py gallery_dl/extractor/mangafox.py gallery_dl/extractor/mangahere.py gallery_dl/extractor/mangakakalot.py +gallery_dl/extractor/manganelo.py gallery_dl/extractor/mangapark.py -gallery_dl/extractor/mangareader.py -gallery_dl/extractor/mangastream.py gallery_dl/extractor/mangoxo.py gallery_dl/extractor/mastodon.py gallery_dl/extractor/message.py @@ -122,6 +119,7 @@ gallery_dl/extractor/nsfwalbum.py gallery_dl/extractor/oauth.py gallery_dl/extractor/paheal.py gallery_dl/extractor/patreon.py +gallery_dl/extractor/philomena.py gallery_dl/extractor/photobucket.py gallery_dl/extractor/photovogue.py gallery_dl/extractor/piczel.py @@ -149,6 +147,7 @@ gallery_dl/extractor/slideshare.py gallery_dl/extractor/smugmug.py gallery_dl/extractor/speakerdeck.py gallery_dl/extractor/subscribestar.py +gallery_dl/extractor/tapas.py gallery_dl/extractor/test.py gallery_dl/extractor/tsumino.py gallery_dl/extractor/tumblr.py @@ -156,6 +155,7 @@ gallery_dl/extractor/tumblrgallery.py gallery_dl/extractor/twitter.py gallery_dl/extractor/unsplash.py gallery_dl/extractor/vanillarock.py +gallery_dl/extractor/vk.py gallery_dl/extractor/vsco.py gallery_dl/extractor/wallhaven.py gallery_dl/extractor/warosu.py diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 57794d0..3d61515 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -17,6 +17,7 @@ modules = [ "8kun", "8muses", "adultempire", + "architizer", "artstation", "aryion", "bcy", @@ -24,7 +25,6 @@ modules = [ "blogger", "cyberdrop", "danbooru", - "derpibooru", "deviantart", "dynastyscans", "e621", @@ -40,12 +40,10 @@ modules = [ "gfycat", "hbrowse", "hentai2read", - "hentaicafe", "hentaifoundry", "hentaifox", "hentaihand", "hentaihere", - "hentainexus", "hiperdex", "hitomi", "idolcomplex", @@ -71,9 +69,8 @@ modules = [ "mangafox", "mangahere", "mangakakalot", + "manganelo", "mangapark", - "mangareader", - "mangastream", "mangoxo", "myhentaigallery", "myportfolio", @@ -87,6 +84,7 @@ modules = [ "nsfwalbum", "paheal", "patreon", + "philomena", "photobucket", "photovogue", "piczel", @@ -112,12 +110,14 @@ modules = [ "smugmug", "speakerdeck", "subscribestar", + "tapas", "tsumino", "tumblr", "tumblrgallery", "twitter", "unsplash", "vanillarock", + "vk", "vsco", "wallhaven", "warosu", diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py new file mode 100644 index 0000000..9629e25 --- /dev/null +++ b/gallery_dl/extractor/architizer.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://architizer.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text + + +class ArchitizerProjectExtractor(GalleryExtractor): + """Extractor for project pages on architizer.com""" + category = "architizer" + subcategory = "project" + root = "https://architizer.com" + directory_fmt = ("{category}", "{firm}", "{title}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{gid}_{num}" + pattern = r"(?:https?://)?architizer\.com/projects/([^/?#]+)" + test = ("https://architizer.com/projects/house-lo/", { + "pattern": r"https://architizer-prod\.imgix\.net/media/mediadata" + r"/uploads/.+\.jpg$", + "keyword": { + "count": 27, + "description": str, + "firm": "Atelier Lina Bellovicova", + "gid": "225496", + "location": "Czechia", + "num": int, + "size": "1000 sqft - 3000 sqft", + "slug": "house-lo", + "status": "Built", + "subcategory": "project", + "title": "House LO", + "type": "Residential › Private House", + "year": "2018", + }, + }) + + def __init__(self, match): + url = "{}/projects/{}/".format(self.root, match.group(1)) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + return { + "title" : extr("data-name='", "'"), + "slug" : extr("data-slug='", "'"), + "gid" : extr("data-gid='", "'").rpartition(".")[2], + "firm" : extr("data-firm-leaders-str='", "'"), + "location" : extr("<h2>", "<").strip(), + "type" : text.unescape(text.remove_html(extr( + '<div class="title">Type</div>', '<br'))), + "status" : text.remove_html(extr( + '<div class="title">STATUS</div>', '</')), + "year" : text.remove_html(extr( + '<div class="title">YEAR</div>', '</')), + "size" : text.remove_html(extr( + '<div class="title">SIZE</div>', '</')), + "description": text.unescape(extr( + '<span class="copy js-copy">', '</span></div>') + .replace("<br />", "\n")), + } + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter( + page, "property='og:image:secure_url' content='", "?") + ] + + +class ArchitizerFirmExtractor(Extractor): + """Extractor for all projects of a firm""" + category = "architizer" + subcategory = "firm" + root = "https://architizer.com" + pattern = r"(?:https?://)?architizer\.com/firms/([^/?#]+)" + test = ("https://architizer.com/firms/olson-kundig/", { + "pattern": ArchitizerProjectExtractor.pattern, + "count": ">= 90", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.firm = match.group(1) + + def items(self): + url = url = "{}/firms/{}/?requesting_merlin=pages".format( + self.root, self.firm) + page = self.request(url).text + data = {"_extractor": ArchitizerProjectExtractor} + + for project in text.extract_iter(page, '<a href="/projects/', '"'): + if not project.startswith("q/"): + url = "{}/projects/{}".format(self.root, project) + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 6a90b76..ded2ae3 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -126,7 +126,8 @@ class AryionExtractor(Extractor): "user" : self.user or artist, "title" : title, "artist": artist, - "path" : text.split_html(extr("cookiecrumb'>", '</span'))[4:-1:2], + "path" : text.split_html(extr( + "cookiecrumb'>", '</span'))[4:-1:2], "date" : extr("class='pretty-date' title='", "'"), "size" : text.parse_int(clen), "views" : text.parse_int(extr("Views</b>:", "<").replace(",", "")), diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py index ec7020a..6e0003d 100644 --- a/gallery_dl/extractor/bcy.py +++ b/gallery_dl/extractor/bcy.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Mike Fährmann +# Copyright 2020-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -125,12 +125,15 @@ class BcyUserExtractor(BcyExtractor): while True: data = self.request(url, params=params).json() - item = None - for item in data["data"]["items"]: - yield item["item_detail"] - - if not item: + try: + items = data["data"]["items"] + except KeyError: + return + if not items: return + + for item in items: + yield item["item_detail"] params["since"] = item["since"] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index e9b9718..048e0a3 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -216,7 +216,7 @@ class Extractor(): headers.clear() browser = self.config("browser") or self.browser - if browser: + if browser and isinstance(browser, str): browser, _, platform = browser.lower().partition(":") if not platform or platform == "auto": diff --git a/gallery_dl/extractor/derpibooru.py b/gallery_dl/extractor/derpibooru.py deleted file mode 100644 index 94f3729..0000000 --- a/gallery_dl/extractor/derpibooru.py +++ /dev/null @@ -1,188 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2021 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://derpibooru.org/""" - -from .booru import BooruExtractor -from .. import text, exception -import operator - -BASE_PATTERN = r"(?:https?://)?derpibooru\.org" - - -class DerpibooruExtractor(BooruExtractor): - """Base class for derpibooru extractors""" - category = "derpibooru" - filename_fmt = "{filename}.{extension}" - archive_fmt = "{id}" - root = "https://derpibooru.org" - request_interval = 1.0 - per_page = 50 - - _file_url = operator.itemgetter("view_url") - - @staticmethod - def _prepare(post): - post["date"] = text.parse_datetime(post["created_at"]) - - @staticmethod - def _extended_tags(post): - pass - - def _pagination(self, url, params): - params["page"] = 1 - params["per_page"] = self.per_page - - api_key = self.config("api-key") - if api_key: - params["key"] = api_key - - filter_id = self.config("filter") - if filter_id: - params["filter_id"] = filter_id - elif not api_key: - params["filter_id"] = "56027" # "Everything" filter - - while True: - data = self.request(url, params=params).json() - yield from data["images"] - - if len(data["images"]) < self.per_page: - return - params["page"] += 1 - - -class DerpibooruPostExtractor(DerpibooruExtractor): - """Extractor for single posts from derpibooru.org""" - subcategory = "post" - pattern = BASE_PATTERN + r"/images/(\d+)" - test = ("https://derpibooru.org/images/1", { - "content": "88449eeb0c4fa5d3583d0b794f6bc1d70bf7f889", - "count": 1, - "keyword": { - "animated": False, - "aspect_ratio": 1.0, - "comment_count": int, - "created_at": "2012-01-02T03:12:33Z", - "date": "dt:2012-01-02 03:12:33", - "deletion_reason": None, - "description": "", - "downvotes": int, - "duplicate_of": None, - "duration": 0.04, - "extension": "png", - "faves": int, - "first_seen_at": "2012-01-02T03:12:33Z", - "format": "png", - "height": 900, - "hidden_from_users": False, - "id": 1, - "mime_type": "image/png", - "name": "1__safe_fluttershy_solo_cloud_happy_flying_upvotes+galore" - "_artist-colon-speccysy_get_sunshine", - "orig_sha512_hash": None, - "processed": True, - "representations": dict, - "score": int, - "sha512_hash": "f16c98e2848c2f1bfff3985e8f1a54375cc49f78125391aeb8" - "0534ce011ead14e3e452a5c4bc98a66f56bdfcd07ef7800663" - "b994f3f343c572da5ecc22a9660f", - "size": 860914, - "source_url": "https://www.deviantart.com/speccysy/art" - "/Afternoon-Flight-215193985", - "spoilered": False, - "tag_count": 36, - "tag_ids": list, - "tags": list, - "thumbnails_generated": True, - "updated_at": "2020-05-28T13:14:07Z", - "uploader": "Clover the Clever", - "uploader_id": 211188, - "upvotes": int, - "view_url": str, - "width": 900, - "wilson_score": float, - }, - }) - - def __init__(self, match): - DerpibooruExtractor.__init__(self, match) - self.image_id = match.group(1) - - def posts(self): - url = self.root + "/api/v1/json/images/" + self.image_id - return (self.request(url).json()["image"],) - - -class DerpibooruSearchExtractor(DerpibooruExtractor): - """Extractor for search results on derpibooru.org""" - subcategory = "search" - directory_fmt = ("{category}", "{search_tags}") - pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))" - test = ( - ("https://derpibooru.org/search?q=cute", { - "range": "40-60", - "count": 21, - }), - ("https://derpibooru.org/tags/cute", { - "range": "40-60", - "count": 21, - }), - ) - - def __init__(self, match): - DerpibooruExtractor.__init__(self, match) - query, tags = match.groups() - self.params = text.parse_query(query) if query else {"q": tags} - - def metadata(self): - return {"search_tags": self.params.get("q", "")} - - def posts(self): - url = self.root + "/api/v1/json/search/images" - return self._pagination(url, self.params) - - -class DerpibooruGalleryExtractor(DerpibooruExtractor): - """Extractor for galleries on derpibooru.org""" - subcategory = "gallery" - directory_fmt = ("{category}", "galleries", - "{gallery[id]} {gallery[title]}") - pattern = BASE_PATTERN + r"/galleries/(\d+)" - test = ("https://derpibooru.org/galleries/1", { - "pattern": r"https://derpicdn\.net/img/view/\d+/\d+/\d+/\d+[^/]+$", - "keyword": { - "gallery": { - "description": "Indexes start at 1 :P", - "id": 1, - "spoiler_warning": "", - "thumbnail_id": 1, - "title": "The Very First Gallery", - "user": "DeliciousBlackInk", - "user_id": 365446, - }, - }, - }) - - def __init__(self, match): - DerpibooruExtractor.__init__(self, match) - self.gallery_id = match.group(1) - - def metadata(self): - url = self.root + "/api/v1/json/search/galleries" - params = {"q": "id:" + self.gallery_id} - galleries = self.request(url, params=params).json()["galleries"] - if not galleries: - raise exception.NotFoundError("gallery") - return {"gallery": galleries[0]} - - def posts(self): - gallery_id = "gallery_id:" + self.gallery_id - url = self.root + "/api/v1/json/search/images" - params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id} - return self._pagination(url, params) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 47286b7..9d1701f 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -20,7 +20,7 @@ import re BASE_PATTERN = ( r"(?:https?://)?(?:" - r"(?:www\.)?deviantart\.com/([\w-]+)|" + r"(?:www\.)?deviantart\.com/(?!watch/)([\w-]+)|" r"(?!www\.)([\w-]+)\.deviantart\.com)" ) @@ -78,10 +78,6 @@ class DeviantartExtractor(Extractor): else: self.user = profile["user"]["username"] - if self.extra: - finditer_stash = DeviantartStashExtractor.pattern.finditer - finditer_deviation = DeviantartDeviationExtractor.pattern.finditer - yield Message.Version, 1 for deviation in self.deviations(): if isinstance(deviation, tuple): @@ -109,7 +105,8 @@ class DeviantartExtractor(Extractor): intermediary, count = re.subn( r"(/f/[^/]+/[^/]+)/v\d+/.*", r"/intermediary\1", content["src"], 1) - if count and self._check_url(intermediary): + if count: + deviation["_fallback"] = (content["src"],) content["src"] = intermediary if self.quality: content["src"] = re.sub( @@ -138,14 +135,10 @@ class DeviantartExtractor(Extractor): if self.extra: txt = (deviation.get("description", "") + deviation.get("_journal", "")) - for match in finditer_stash(txt): + for match in DeviantartStashExtractor.pattern.finditer(txt): url = text.ensure_http_scheme(match.group(0)) deviation["_extractor"] = DeviantartStashExtractor yield Message.Queue, url, deviation - for match in finditer_deviation(txt): - url = text.ensure_http_scheme(match.group(0)) - deviation["_extractor"] = DeviantartDeviationExtractor - yield Message.Queue, url, deviation def deviations(self): """Return an iterable containing all relevant Deviation-objects""" @@ -290,9 +283,6 @@ class DeviantartExtractor(Extractor): if mtype and mtype.startswith("image/"): content.update(data) - def _check_url(self, url): - return self.request(url, method="HEAD", fatal=False).status_code < 400 - def _limited_request(self, url, **kwargs): """Limits HTTP requests to one every 2 seconds""" kwargs["fatal"] = None @@ -718,15 +708,16 @@ class DeviantartPopularExtractor(DeviantartExtractor): if path: self.category_path = path.strip("/") if trange: - trange = trange[8:] if trange.startswith("popular-") else "" + if trange.startswith("popular-"): + trange = trange[8:] self.time_range = trange.replace("-", "").replace("hours", "hr") if query: self.search_term = query.get("q") self.popular = { "search": self.search_term or "", - "range": trange or "24-hours", - "path": self.category_path, + "range" : trange or "", + "path" : self.category_path, } def deviations(self): @@ -738,6 +729,30 @@ class DeviantartPopularExtractor(DeviantartExtractor): deviation["popular"] = self.popular +class DeviantartWatchExtractor(DeviantartExtractor): + """Extractor for Deviations from watched users""" + subcategory = "watch" + pattern = (r"(?:https?://)?(?:www\.)?deviantart\.com" + r"/(?:watch/deviations|notifications/watch)()()") + test = ( + ("https://www.deviantart.com/watch/deviations"), + ("https://www.deviantart.com/notifications/watch"), + ) + + def deviations(self): + return self.api.browse_deviantsyouwatch() + + +class DeviantartWatchPostsExtractor(DeviantartExtractor): + """Extractor for Posts from watched users""" + subcategory = "watch-posts" + pattern = r"(?:https?://)?(?:www\.)?deviantart\.com/watch/posts()()" + test = ("https://www.deviantart.com/watch/posts",) + + def deviations(self): + return self.api.browse_posts_deviantsyouwatch() + + ############################################################################### # Eclipse ##################################################################### @@ -926,6 +941,20 @@ class DeviantartOAuthAPI(): self.client_id, ) + def browse_deviantsyouwatch(self, offset=0): + """Yield deviations from users you watch""" + endpoint = "browse/deviantsyouwatch" + params = {"limit": "50", "offset": offset, + "mature_content": self.mature} + return self._pagination(endpoint, params, public=False) + + def browse_posts_deviantsyouwatch(self, offset=0): + """Yield posts from users you watch""" + endpoint = "browse/posts/deviantsyouwatch" + params = {"limit": "50", "offset": offset, + "mature_content": self.mature} + return self._pagination(endpoint, params, public=False, unpack=True) + def browse_popular(self, query=None, timerange=None, offset=0): """Yield popular deviations""" endpoint = "browse/popular" @@ -1085,16 +1114,21 @@ class DeviantartOAuthAPI(): self.log.error(msg) return data - def _pagination(self, endpoint, params, extend=True, public=True): + def _pagination(self, endpoint, params, + extend=True, public=True, unpack=False): warn = True while True: data = self._call(endpoint, params, public=public) if "results" not in data: self.log.error("Unexpected API response: %s", data) return + results = data["results"] + if unpack: + results = [item["journal"] for item in results + if "journal" in item] if extend: - if public and len(data["results"]) < params["limit"]: + if public and len(results) < params["limit"]: if self.refresh_token_key: self.log.debug("Switching to private access token") public = False @@ -1106,10 +1140,10 @@ class DeviantartOAuthAPI(): "oauth:deviantart' and follow the instructions to " "be able to access them.") if self.metadata: - self._metadata(data["results"]) + self._metadata(results) if self.folders: - self._folders(data["results"]) - yield from data["results"] + self._folders(results) + yield from results if not data["has_more"]: return diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 7d26c47..67051c9 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -1,19 +1,18 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters from https://dynasty-scans.com/""" +"""Extractors for https://dynasty-scans.com/""" from .common import ChapterExtractor, Extractor, Message from .. import text import json import re - BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com" @@ -36,7 +35,7 @@ class DynastyscansBase(): return { "url" : self.root + url, "image_id": text.parse_int(image_id), - "tags" : text.split_html(text.unescape(tags)), + "tags" : text.split_html(tags), "date" : text.remove_html(date), "source" : text.unescape(src), } diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 842de7e..2e2e952 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -85,14 +85,14 @@ class EromeAlbumExtractor(EromeExtractor): """Extractor for albums on erome.com""" subcategory = "album" pattern = BASE_PATTERN + r"/a/(\w+)" - test = ("https://www.erome.com/a/KandxY7y", { - "pattern": r"https://s\d+\.erome\.com/355/KandxY7y/\w+", - "count": 26, + test = ("https://www.erome.com/a/TyFMI7ik", { + "pattern": r"https://s\d+\.erome\.com/\d+/TyFMI7ik/\w+", + "count": 9, "keyword": { - "album_id": "KandxY7y", + "album_id": "TyFMI7ik", "num": int, - "title": "Therealbrittfitt", - "user": "pokow", + "title": "Ryan Ryans", + "user": "xanub", }, }) @@ -103,7 +103,7 @@ class EromeAlbumExtractor(EromeExtractor): class EromeUserExtractor(EromeExtractor): subcategory = "user" pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)" - test = ("https://www.erome.com/gutiquq", { + test = ("https://www.erome.com/xanub", { "range": "1-25", "count": 25, }) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 5a7de23..872a338 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -43,16 +43,8 @@ class ExhentaiExtractor(Extractor): self.cookiedomain = "." + domain Extractor.__init__(self, match) - self.limits = self.config("limits", True) self.original = self.config("original", True) - if type(self.limits) is int: - self._limit_max = self.limits - self.limits = True - else: - self._limit_max = 0 - - self._remaining = 0 self.session.headers["Referer"] = self.root + "/" if version != "ex": self.session.cookies.set("nw", "1", domain=self.cookiedomain) @@ -77,7 +69,6 @@ class ExhentaiExtractor(Extractor): self.log.info("no username given; using e-hentai.org") self.root = "https://e-hentai.org" self.original = False - self.limits = False self.session.cookies["nw"] = "1" @cache(maxage=90*24*3600, keyarg=1) @@ -206,8 +197,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): (self.image_from_page(ipage),), self.images_from_api()) for url, image in images: data.update(image) - if self.limits: - self._check_limits(data) if "/fullimg.php" in url: data["extension"] = "" yield Message.Url, url, data @@ -246,6 +235,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "torrentcount" : extr('>Torrent Download (', ')'), } + f = data["favorites"][0] + if f == "N": + data["favorites"] = "0" + elif f == "O": + data["favorites"] = "1" + data["lang"] = util.language_to_code(data["language"]) data["tags"] = [ text.unquote(tag.replace("+", " ")) @@ -293,6 +288,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["image_token"] = self.key["start"] = extr('var startkey="', '";') self.key["show"] = extr('var showkey="', '";') + if iurl.endswith("g/509.gif"): + self._report_limits(data) return url, text.nameext_from_url(iurl, data) def images_from_api(self): @@ -327,10 +324,20 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = request["page"] data["image_token"] = imgkey + + if imgurl.endswith("g/509.gif"): + self._report_limits(data) yield url, text.nameext_from_url(imgurl, data) request["imgkey"] = nextkey + def _report_limits(self, data): + ExhentaiExtractor.LIMIT = True + raise exception.StopExtraction( + "Image limit reached! " + "Continue with '%s/s/%s/%s-%s' as URL after resetting it.", + self.root, data["image_token"], self.gallery_id, data["num"]) + def _gallery_page(self): url = "{}/g/{}/{}/".format( self.root, self.gallery_id, self.gallery_token) @@ -354,35 +361,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): raise exception.NotFoundError("image page") return page - def _check_limits(self, data): - if not self._remaining or data["num"] % 25 == 0: - self._update_limits() - self._remaining -= data["cost"] - - if self._remaining <= 0: - ExhentaiExtractor.LIMIT = True - url = "{}/s/{}/{}-{}".format( - self.root, data["image_token"], self.gallery_id, data["num"]) - raise exception.StopExtraction( - "Image limit reached! Continue with '%s' " - "as URL after resetting it.", url) - - def _update_limits(self): - url = "https://e-hentai.org/home.php" - cookies = { - cookie.name: cookie.value - for cookie in self.session.cookies - if cookie.domain == self.cookiedomain and cookie.name != "igneous" - } - - page = self.request(url, cookies=cookies).text - current, pos = text.extract(page, "<strong>", "</strong>") - maximum, pos = text.extract(page, "<strong>", "</strong>", pos) - if self._limit_max: - maximum = self._limit_max - self.log.debug("Image Limits: %s/%s", current, maximum) - self._remaining = text.parse_int(maximum) - text.parse_int(current) - @staticmethod def _parse_image_info(url): for part in url.split("/")[4:]: @@ -418,9 +396,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): class ExhentaiSearchExtractor(ExhentaiExtractor): """Extractor for exhentai search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/?\?(.*)$" + pattern = BASE_PATTERN + r"/(?:\?([^#]*)|tag/([^/?#]+))" test = ( ("https://e-hentai.org/?f_search=touhou"), + ("https://exhentai.org/?f_cats=767&f_search=touhou"), + ("https://exhentai.org/tag/parody:touhou+project"), (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0" "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0" "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), { @@ -432,10 +412,20 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): def __init__(self, match): ExhentaiExtractor.__init__(self, match) - self.params = text.parse_query(match.group(2)) - self.params["page"] = text.parse_int(self.params.get("page")) self.search_url = self.root + _, query, tag = match.groups() + if tag: + if "+" in tag: + ns, _, tag = tag.rpartition(":") + tag = '{}:"{}$"'.format(ns, tag.replace("+", " ")) + else: + tag += "$" + self.params = {"f_search": tag, "page": 0} + else: + self.params = text.parse_query(query) + self.params["page"] = text.parse_int(self.params.get("page")) + def items(self): self.login() data = {"_extractor": ExhentaiGalleryExtractor} @@ -459,7 +449,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): """Extractor for favorited exhentai galleries""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?" + pattern = BASE_PATTERN + r"/favorites\.php(?:\?([^#]*)())?" test = ( ("https://e-hentai.org/favorites.php", { "count": 1, diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 92d27a9..0042676 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -23,10 +23,16 @@ class GelbooruBase(): url = post["file_url"] if url.startswith(("https://mp4.gelbooru.com/", "https://video-cdn")): md5 = post["md5"] - url = "https://img2.gelbooru.com/images/{}/{}/{}.webm".format( - md5[0:2], md5[2:4], md5) + path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5) + post["_fallback"] = GelbooruBase._video_fallback(path) + url = "https://img3.gelbooru.com" + path return url + @staticmethod + def _video_fallback(path): + yield "https://img2.gelbooru.com" + path + yield "https://img1.gelbooru.com" + path + class GelbooruTagExtractor(GelbooruBase, gelbooru_v02.GelbooruV02TagExtractor): @@ -80,7 +86,15 @@ class GelbooruPostExtractor(GelbooruBase, """Extractor for single images from gelbooru.com""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=post&s=view&id=(?P<post>\d+)") - test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", { - "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", - "count": 1, - }) + test = ( + ("https://gelbooru.com/index.php?page=post&s=view&id=313638", { + "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "count": 1, + }), + # video + ("https://gelbooru.com/index.php?page=post&s=view&id=5938076", { + "content": "6360452fa8c2f0c1137749e81471238564df832a", + "pattern": r"https://img\d\.gelbooru\.com/images" + r"/22/61/226111273615049235b001b381707bd0\.webm", + }), + ) diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 0935998..541f454 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -47,6 +47,9 @@ BASE_PATTERN = GelbooruV01Extractor.update({ "thecollection" : {"root": "https://the-collection.booru.org"}, "illusioncardsbooru": {"root": "https://illusioncards.booru.org"}, "allgirlbooru" : {"root": "https://allgirl.booru.org"}, + "drawfriends" : {"root": "https://drawfriends.booru.org"}, + "vidyart" : {"root": "https://vidyart.booru.org"}, + "theloudbooru" : {"root": "https://tlb.booru.org"}, }) @@ -70,6 +73,9 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor): "range": "1-25", "count": 25, }), + ("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"), + ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"), + ("https://tlb.booru.org/index.php?page=post&s=list&tags=all"), ) def __init__(self, match): @@ -133,6 +139,9 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor): "width": "1600" }, }), + ("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"), + ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"), + ("https://tlb.booru.org/index.php?page=post&s=view&id=127223"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py deleted file mode 100644 index aa79b67..0000000 --- a/gallery_dl/extractor/hentaicafe.py +++ /dev/null @@ -1,173 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2018-2021 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://hentai.cafe/""" - -from . import foolslide -from .. import text -from .common import Extractor, Message -from ..cache import memcache -import re - -BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai\.cafe" - - -class HentaicafeBase(): - """Base class for hentaicafe extractors""" - category = "hentaicafe" - root = "https://hentai.cafe" - - def _pagination(self, urlfmt): - data = {"_extractor": HentaicafeMangaExtractor} - pnum = text.parse_int(self.page_start, 1) - - while True: - page = self.request(urlfmt(pnum)).text - - for entry in text.extract_iter( - page, 'class="entry-featured', 'title="'): - url = text.extract(entry, 'href="', '"')[0] - if url: - yield Message.Queue, url, data - - if '>→<' not in page: - return - pnum += 1 - - -class HentaicafeChapterExtractor(HentaicafeBase, - foolslide.FoolslideChapterExtractor): - """Extractor for manga-chapters from hentai.cafe""" - directory_fmt = ("{category}", "{manga}") - filename_fmt = "c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}" - pattern = BASE_PATTERN + r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" - test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", { - "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2", - "keyword": "6913608267d883c82b887303b9ced13821188329", - }) - - def metadata(self, page): - info = text.unescape(text.extract(page, '<title>', '</title>')[0]) - manga, _, chapter_string = info.partition(" :: ") - - data = self._data(self.gallery_url.split("/")[5]) - if "manga" not in data: - data["manga"] = manga - data["chapter_string"] = chapter_string.rstrip(" :") - return self.parse_chapter_url(self.gallery_url, data) - - @memcache(keyarg=1) - def _data(self, manga): - return {"artist": (), "tags": ()} - - -class HentaicafeMangaExtractor(HentaicafeBase, - foolslide.FoolslideMangaExtractor): - """Extractor for manga from hentai.cafe""" - pattern = BASE_PATTERN + r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?#]+)/?$" - test = ( - # single chapter - ("https://hentai.cafe/hazuki-yuuto-summer-blues/", { - "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b", - "keyword": "ced644ff94ea22e1991a5e44bf37c38a7e2ac2b3", - }), - # multi-chapter - ("https://hentai.cafe/saitom-saitom-box/", { - "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", - "keyword": "4c2262d680286a54357c334c1faca8f1b0e692e9", - }), - # new-style URL - ("https://hentai.cafe/hc.fyi/2782", { - "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", - "keyword": "4c2262d680286a54357c334c1faca8f1b0e692e9", - }), - # foolslide URL - ("https://hentai.cafe/manga/series/saitom-box/", { - "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", - "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c", - }), - - ) - - def items(self): - page = Extractor.request(self, self.gallery_url).text - - chapters = self.chapters(page) - if self.config("chapter-reverse", False): - chapters.reverse() - - for chapter, data in chapters: - data["_extractor"] = HentaicafeChapterExtractor - yield Message.Queue, chapter, data - - def chapters(self, page): - if "/manga/series/" in self.gallery_url: - chapters = foolslide.FoolslideMangaExtractor.chapters(self, page) - chapters.reverse() - return chapters - - manga , pos = text.extract(page, '<title>', '<') - url , pos = text.extract(page, 'rel="canonical" href="', '"', pos) - tags , pos = text.extract(page, "<p>Tags: ", "</br>", pos) - artist, pos = text.extract(page, "\nArtists: ", "</br>", pos) - key , pos = text.extract(page, "/manga/read/", "/", pos) - data = { - "manga" : text.unescape(manga.rpartition(" | ")[0]), - "manga_id": text.parse_int(url.rpartition("/")[2]), - "tags" : text.split_html(tags)[::2], - "artist" : text.split_html(artist), - } - HentaicafeChapterExtractor._data(key).update(data) - - return [ - (url, data) - for url in re.findall( - r'<a +class="x-btn[^"]*" +href="([^"]+)"', page) - ] - - -class HentaicafeSearchExtractor(HentaicafeBase, Extractor): - """Extractor for hentaicafe search results""" - subcategory = "search" - pattern = BASE_PATTERN + r"/(?:page/(\d+)/?)?\?s=([^&#]+)" - test = ("https://hentai.cafe/?s=benimura", { - "pattern": HentaicafeMangaExtractor.pattern, - "count": ">= 10", - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page_start, self.search = match.groups() - - def items(self): - fmt = "{}/page/{}?s={}".format - return self._pagination(lambda pnum: fmt(self.root, pnum, self.search)) - - -class HentaicafeTagExtractor(HentaicafeBase, Extractor): - """Extractor for hentaicafe tag/artist searches""" - subcategory = "tag" - pattern = (BASE_PATTERN + - r"/hc\.fyi/(tag|artist|category)/([^/?#]+)(?:/page/(\d+))?") - test = ( - ("https://hentai.cafe/hc.fyi/tag/vanilla"), - ("https://hentai.cafe/hc.fyi/category/book/page/5"), - ("https://hentai.cafe/hc.fyi/artist/benimura-karu", { - "pattern": HentaicafeMangaExtractor.pattern, - "count": ">= 10", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.type, self.search, self.page_start = match.groups() - - def items(self): - fmt = "{}/hc.fyi/{}/{}/page/{}".format - return self._pagination( - lambda pnum: fmt(self.root, self.type, self.search, pnum)) diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index 093f3fe..a5bebdd 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,6 +10,7 @@ from .common import GalleryExtractor, Extractor, Message from .. import text +import json class HentaifoxBase(): @@ -21,61 +22,84 @@ class HentaifoxBase(): class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): """Extractor for image galleries on hentaifox.com""" pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" - test = ("https://hentaifox.com/gallery/56622/", { - "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", - "keyword": "b7ff141331d0c7fc711ab28d45dfbb013a83d8e9", - "count": 24, - }) + test = ( + ("https://hentaifox.com/gallery/56622/", { + "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", + "keyword": "bcd6b67284f378e5cc30b89b761140e3e60fcd92", + "count": 24, + }), + # 'split_tag' element (#1378) + ("https://hentaifox.com/gallery/630/", { + "keyword": { + "artist": ["beti", "betty", "magi", "mimikaki"], + "characters": [ + "aerith gainsborough", + "tifa lockhart", + "yuffie kisaragi" + ], + "count": 32, + "gallery_id": 630, + "group": ["cu-little2"], + "parody": ["darkstalkers | vampire", "final fantasy vii"], + "tags": ["femdom", "fingering", "masturbation", "yuri"], + "title": "Cu-Little Bakanya~", + "type": "doujinshi", + }, + }), + ) def __init__(self, match): GalleryExtractor.__init__(self, match) self.gallery_id = match.group(2) - def metadata(self, page, split=text.split_html): + @staticmethod + def _split(txt): + return [ + text.remove_html(tag.partition(">")[2], "", "") + for tag in text.extract_iter( + txt, "class='tag_btn", "<span class='t_badge") + ] + + def metadata(self, page): extr = text.extract_from(page) + split = self._split return { "gallery_id": text.parse_int(self.gallery_id), "title" : text.unescape(extr("<h1>", "</h1>")), - "parody" : split(extr(">Parodies:" , "</ul>"))[::2], - "characters": split(extr(">Characters:", "</ul>"))[::2], - "tags" : split(extr(">Tags:" , "</ul>"))[::2], - "artist" : split(extr(">Artists:" , "</ul>"))[::2], - "group" : split(extr(">Groups:" , "</ul>"))[::2], + "parody" : split(extr(">Parodies:" , "</ul>")), + "characters": split(extr(">Characters:", "</ul>")), + "tags" : split(extr(">Tags:" , "</ul>")), + "artist" : split(extr(">Artists:" , "</ul>")), + "group" : split(extr(">Groups:" , "</ul>")), "type" : text.remove_html(extr(">Category:", "<span")), "language" : "English", "lang" : "en", } def images(self, page): - pos = page.find('id="load_all"') - if pos >= 0: - extr = text.extract - load_id = extr(page, 'id="load_id" value="', '"', pos)[0] - load_dir = extr(page, 'id="load_dir" value="', '"', pos)[0] - load_pages = extr(page, 'id="load_pages" value="', '"', pos)[0] - - url = self.root + "/includes/thumbs_loader.php" - data = { - "u_id" : self.gallery_id, - "g_id" : load_id, - "img_dir" : load_dir, - "visible_pages": "0", - "total_pages" : load_pages, - "type" : "2", - } - headers = { - "Origin": self.root, - "Referer": self.gallery_url, - "X-Requested-With": "XMLHttpRequest", - } - page = self.request( - url, method="POST", headers=headers, data=data).text - - return [ - (url.replace("t.", "."), None) - for url in text.extract_iter(page, 'data-src="', '"') - ] + cover, pos = text.extract(page, '<img src="', '"') + data , pos = text.extract(page, "$.parseJSON('", "');", pos) + path = "/".join(cover.split("/")[3:-1]) + + result = [] + append = result.append + extmap = {"j": "jpg", "p": "png", "g": "gif"} + urlfmt = ("/" + path + "/{}.{}").format + + server1 = "https://i.hentaifox.com" + server2 = "https://i2.hentaifox.com" + + for num, image in json.loads(data).items(): + ext, width, height = image.split(",") + path = urlfmt(num, extmap[ext]) + append((server1 + path, { + "width" : width, + "height" : height, + "_fallback": (server2 + path,), + })) + + return result class HentaifoxSearchExtractor(HentaifoxBase, Extractor): diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py deleted file mode 100644 index 6c1879c..0000000 --- a/gallery_dl/extractor/hentainexus.py +++ /dev/null @@ -1,185 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2019-2021 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://hentainexus.com/""" - -from .common import GalleryExtractor, Extractor, Message -from .. import text, util -import binascii -import json - - -class HentainexusGalleryExtractor(GalleryExtractor): - """Extractor for image galleries on hentainexus.com""" - category = "hentainexus" - root = "https://hentainexus.com" - pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" - r"/(?:view|read)/(\d+)") - test = ( - ("https://hentainexus.com/view/5688", { - "url": "f1761895fb7aca2f6ff9e09f839c0ee2fa7a5e54", - "keyword": "5e5bb4b1553b1c6e126b198f9ae017a1a5d0a5ad", - }), - ("https://hentainexus.com/read/5688"), - ) - - def __init__(self, match): - self.gallery_id = match.group(1) - url = "{}/view/{}".format(self.root, self.gallery_id) - GalleryExtractor.__init__(self, match, url) - - def metadata(self, page): - rmve = text.remove_html - extr = text.extract_from(page) - data = { - "gallery_id": text.parse_int(self.gallery_id), - "tags" : extr('"og:description" content="', '"').split(", "), - "thumbnail" : extr('"og:image" content="', '"'), - "title" : extr('<h1 class="title">', '</h1>'), - } - for key in ("Artist", "Book", "Circle", "Event", "Language", - "Magazine", "Parody", "Publisher", "Description"): - data[key.lower()] = rmve(extr( - 'viewcolumn">' + key + '</td>', '</td>')) - data["lang"] = util.language_to_code(data["language"]) - - if 'doujin' in data['tags']: - data['type'] = 'Doujinshi' - elif 'illustration' in data['tags']: - data['type'] = 'Illustration' - else: - data['type'] = 'Manga' - data["title_conventional"] = self._join_title(data) - return data - - def images(self, _): - url = "{}/read/{}".format(self.root, self.gallery_id) - page = self.request(url).text - data = json.loads(self._decode(text.extract( - page, 'initReader("', '"')[0])) - - headers = None - if not self.config("original", True): - headers = {"_http_headers": {"Accept": "image/webp,*/*"}} - - pages = data.get("pages") - if pages: - return [(page, headers) for page in pages] - - base = data["b"] + data["r"] - gid = data["i"] - return [ - ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), headers) - for page in data["f"] - ] - - @staticmethod - def _decode(data): - # https://hentainexus.com/static/js/reader.min.js?r=13 - primes = (2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53) - blob = binascii.a2b_base64(data) - key = blob[0:64] - - C = 0 - for k in key: - C = C ^ k - for _ in range(8): - if C & 1: - C = C >> 1 ^ 0xc - else: - C = C >> 1 - k = primes[C & 0x7] - - x = 0 - S = list(range(256)) - for i in range(256): - x = (x + S[i] + key[i % len(key)]) % 256 - S[i], S[x] = S[x], S[i] - - result = "" - a = c = m = x = 0 - for n in range(64, len(blob)): - a = (a + k) % 256 - x = (c + S[(x + S[a]) % 256]) % 256 - c = (c + a + S[a]) % 256 - - S[a], S[x] = S[x], S[a] - m = S[(x + S[(a + S[(m + c) % 256]) % 256]) % 256] - result += chr(blob[n] ^ m) - - return result - - @staticmethod - def _join_title(data): - event = data['event'] - artist = data['artist'] - circle = data['circle'] - title = data['title'] - parody = data['parody'] - book = data['book'] - magazine = data['magazine'] - - # a few galleries have a large number of artists or parodies, - # which get replaced with "Various" in the title string - if artist.count(',') >= 3: - artist = 'Various' - if parody.count(',') >= 3: - parody = 'Various' - - jt = '' - if event: - jt += '({}) '.format(event) - if circle: - jt += '[{} ({})] '.format(circle, artist) - else: - jt += '[{}] '.format(artist) - jt += title - if parody.lower() != 'original work': - jt += ' ({})'.format(parody) - if book: - jt += ' ({})'.format(book) - if magazine: - jt += ' ({})'.format(magazine) - return jt - - -class HentainexusSearchExtractor(Extractor): - """Extractor for search results on hentainexus.com""" - category = "hentainexus" - subcategory = "search" - root = "https://hentainexus.com" - pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" - r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$") - test = ( - ("https://hentainexus.com/?q=tag:%22heart+pupils%22%20tag:group", { - "pattern": HentainexusGalleryExtractor.pattern, - "count": ">= 50", - }), - ("https://hentainexus.com/page/3?q=tag:%22heart+pupils%22"), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.params = text.parse_query(match.group(1)) - - def items(self): - params = self.params - path = "/" - data = {"_extractor": HentainexusGalleryExtractor} - - while path: - page = self.request(self.root + path, params=params).text - extr = text.extract_from(page) - - while True: - gallery_id = extr('<a href="/view/', '"') - if not gallery_id: - break - yield Message.Queue, self.root + "/view/" + gallery_id, data - - path = extr('class="pagination-next" href="', '"') diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index abb6d10..d757e17 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -257,10 +257,11 @@ class ImgclickImageExtractor(ImagehostImageExtractor): category = "imgclick" pattern = r"(?:https?://)?((?:www\.)?imgclick\.net/([^/?#]+))" test = ("http://imgclick.net/4tbrre1oxew9/test-_-_.png.html", { - "url": "b967f2d372ffb9f5d3a927c6dd560e120b10a808", + "url": "140dcb250a325f2d26b2d918c18b8ac6a2a0f6ab", "keyword": "6895256143eab955622fc149aa367777a8815ba3", "content": "0c8768055e4e20e7c7259608b67799171b691140", }) + https = False params = "complex" def get_info(self, page): diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index f6e8f2d..7009c7a 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -377,16 +377,17 @@ class ImgurAPI(): return self._call(endpoint) def _call(self, endpoint, params=None): - try: - return self.extractor.request( - "https://api.imgur.com" + endpoint, - params=params, headers=self.headers, - ).json() - except exception.HttpError as exc: - if exc.status != 403 or b"capacity" not in exc.response.content: - raise - self.extractor.sleep(seconds=600) - return self._call(endpoint) + while True: + try: + return self.extractor.request( + "https://api.imgur.com" + endpoint, + params=params, headers=self.headers, + ).json() + except exception.HttpError as exc: + if exc.status not in (403, 429) or \ + b"capacity" not in exc.response.content: + raise + self.extractor.wait(seconds=600) def _pagination(self, endpoint, params=None, key=None): num = 0 diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 81355ce..74c6197 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -384,6 +384,7 @@ class InstagramUserExtractor(InstagramExtractor): (InstagramStoriesExtractor , stories), (InstagramHighlightsExtractor, base + "highlights/"), (InstagramPostsExtractor , base + "posts/"), + (InstagramReelsExtractor , base + "reels/"), (InstagramChannelExtractor , base + "channel/"), ), ("posts",)) diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index 8a4e413..6e5aec9 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -32,7 +32,7 @@ class KomikcastBase(): if manga: data["manga"] = manga.partition(" Chapter ")[0] - if title and title.lower() != "bahasa indonesia": + if title and not title.lower().startswith("bahasa indonesia"): data["title"] = title.strip() else: data["title"] = "" @@ -53,27 +53,23 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", }), (("https://komikcast.com/chapter/" - "tonari-no-kashiwagi-san-chapter-18b/"), { - "url": "aff90dd21dbb945a726778b10bdef522af7c42fe", - "keyword": "19b5783864c4299913de436513b124b028b557c1", - }), - (("https://komikcast.com/chapter/090-eko-to-issho-chapter-1/"), { - "url": "cda104a32ea2b06b3d6b096726622f519ed1fa33", + "solo-spell-caster-chapter-37-bahasa-indonesia/"), { + "url": "c3d30de6c796ff6ff36eb86e2e6fa2f8add8e829", + "keyword": "ed8a0ff73098776988bf66fb700381a2c748f910", }), ) def metadata(self, page): - info = text.extract(page, '<b>', "</b>")[0] + info = text.extract(page, "<title>", " – Komikcast<")[0] return self.parse_chapter_string(info) @staticmethod def images(page): readerarea = text.extract( - page, '<div id="readerarea"', '<div class="navig')[0] + page, '<div class="main-reading-area', '</div')[0] return [ (text.unescape(url), None) for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea) - if "/Banner-" not in url and "/WM-Sampingan." not in url ] @@ -95,7 +91,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): data = self.metadata(page) for item in text.extract_iter( - page, '<span class="leftoff"><a href="', '</a>'): + page, '<a class="chapter-link-item" href="', '</a'): url, _, chapter_string = item.rpartition('">Chapter ') self.parse_chapter_string(chapter_string, data) results.append((url, data.copy())) @@ -104,14 +100,15 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): @staticmethod def metadata(page): """Return a dict with general metadata""" - manga , pos = text.extract(page, "<title>" , "</title>") - genres, pos = text.extract(page, ">Genres:", "</span>", pos) + manga , pos = text.extract(page, "<title>" , " – Komikcast<") + genres, pos = text.extract( + page, 'class="komik_info-content-genre">', "</span>", pos) author, pos = text.extract(page, ">Author:", "</span>", pos) mtype , pos = text.extract(page, ">Type:" , "</span>", pos) return { - "manga": text.unescape(manga[:-12]), + "manga": text.unescape(manga), + "genres": text.split_html(genres), "author": text.remove_html(author), - "genres": text.split_html(genres)[::2], "type": text.remove_html(mtype), } diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py new file mode 100644 index 0000000..882031b --- /dev/null +++ b/gallery_dl/extractor/manganelo.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://manganelo.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +import re + + +class ManganeloBase(): + """Base class for manganelo extractors""" + category = "manganelo" + root = "https://manganelo.com" + + @staticmethod + def parse_page(page, data): + """Parse metadata on 'page' and add it to 'data'""" + text.extract_all(page, ( + ("manga" , '<h1>', '</h1>'), + ('author' , '</i>Author(s) :</td>', '</tr>'), + ), values=data) + data["author"] = text.remove_html(data["author"]) + return data + + +class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): + """Extractor for manga-chapters from manganelo.com""" + pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com" + r"(/chapter/\w+/chapter_[^/?#]+)") + test = ( + ("https://manganelo.com/chapter/gq921227/chapter_23", { + "pattern": r"https://s\d+\.\w+\.com/mangakakalot/g\d+/gq921227/" + r"vol3_chapter_23_24_yen/\d+\.jpg", + "keyword": "3748087cf41abc97f991530e6fd53b291490d6d0", + "count": 25, + }), + ("https://manganelo.com/chapter/gamers/chapter_15", { + "keyword": "8f59f88d516247011fe122e05746c27e203c8191", + "content": "fbec629c71f66b246bfa0604204407c0d1c8ae38", + "count": 39, + }), + ) + + def __init__(self, match): + self.path = match.group(1) + ChapterExtractor.__init__(self, match, self.root + self.path) + self.session.headers['Referer'] = self.root + + def metadata(self, page): + _ , pos = text.extract(page, '<a class="a-h" ', '/a>') + manga , pos = text.extract(page, '<a class="a-h" ', '/a>', pos) + info , pos = text.extract(page, '<a class="a-h" ', '/a>', pos) + author, pos = text.extract(page, '- Author(s) : ', '</p>', pos) + + manga, _ = text.extract(manga, '">', '<') + info , _ = text.extract(info , '">', '<') + match = re.match( + r"(?:[Vv]ol\. *(\d+) )?" + r"[Cc]hapter *([^:]*)" + r"(?:: *(.+))?", info) + volume, chapter, title = match.groups() if match else ("", "", info) + chapter, sep, minor = chapter.partition(".") + + return { + "manga" : text.unescape(manga), + "title" : text.unescape(title) if title else "", + "author" : text.unescape(author) if author else "", + "volume" : text.parse_int(volume), + "chapter" : text.parse_int(chapter), + "chapter_minor": sep + minor, + "lang" : "en", + "language" : "English", + } + + def images(self, page): + page = text.extract( + page, 'class="container-chapter-reader', '\n<div')[0] + return [ + (url, None) + for url in text.extract_iter(page, '<img src="', '"') + ] + + +class ManganeloMangaExtractor(ManganeloBase, MangaExtractor): + """Extractor for manga from manganelo.com""" + chapterclass = ManganeloChapterExtractor + pattern = (r"(?:https?://)?(?:www\.)?manganelo\.com" + r"(/(?:manga/|read_)\w+)") + test = ( + ("https://manganelo.com/manga/ol921234", { + "url": "8a1810edddbafcde993ecb3558a35c99d8d4f13e", + }), + ("https://manganelo.com/manga/read_otome_no_teikoku", { + "pattern": ManganeloChapterExtractor.pattern, + "count": ">= 40" + }), + ) + + def chapters(self, page): + results = [] + data = self.parse_page(page, {"lang": "en", "language": "English"}) + + needle = 'class="chapter-name text-nowrap" href="' + pos = page.index('<ul class="row-content-chapter">') + while True: + url, pos = text.extract(page, needle, '"', pos) + if not url: + return results + data["title"], pos = text.extract(page, '>', '</a>', pos) + data["date"] , pos = text.extract( + page, 'class="chapter-time text-nowrap" title="', '">', pos) + chapter, sep, minor = url.rpartition("/chapter_")[2].partition(".") + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + results.append((url, data.copy())) diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py deleted file mode 100644 index 30b8ce3..0000000 --- a/gallery_dl/extractor/mangareader.py +++ /dev/null @@ -1,95 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2015-2020 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://www.mangareader.net/""" - -from .common import ChapterExtractor, MangaExtractor -from .. import text -from ..cache import memcache -import json - - -class MangareaderBase(): - """Base class for mangareader extractors""" - category = "mangareader" - root = "https://www.mangareader.net" - - @memcache(keyarg=1) - def _manga_info(self, path, page=None): - if not page: - page = self.request(self.root + path).text - extr = text.extract_from(page) - data = { - "manga" : text.unescape(extr('class="name">', '<')), - "release" : text.unescape(extr('Year of Release :</td><td>', '<')), - "author" : text.unescape(text.unescape(extr( - 'Author :</td><td>', '<'))), - "artist" : text.unescape(text.unescape(extr( - 'Artist :</td><td>', '<'))), - "lang" : "en", - "language": "English", - } - - extr('<table', '>') - chapters = [] - while True: - url = extr('</i> <a href="', '"') - if not url: - return chapters - chapter = { - "chapter": text.parse_int(url.rpartition("/")[2]), - "title" : text.unescape(extr("</a> : ", "<")), - "date" : extr("<td>", "<"), - } - chapter.update(data) - chapters.append((self.root + url, chapter)) - - -class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): - """Extractor for manga-chapters from mangareader.net""" - archive_fmt = "{manga}_{chapter}_{page}" - pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?#]+)/(\d+))" - test = (("https://www.mangareader.net" - "/karate-shoukoushi-kohinata-minoru/11"), { - "url": "45ece5668d1e9f65cf2225237d78de58660b54e4", - "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6", - }) - - def __init__(self, match): - ChapterExtractor.__init__(self, match) - _, self.path, self.chapter = match.groups() - - def metadata(self, page): - chapter = text.parse_int(self.chapter) - return self._manga_info(self.path)[chapter-1][1] - - def images(self, page): - data = json.loads(text.extract( - page, 'document["mj"]=', '</script>')[0]) - return [ - (text.ensure_http_scheme(img["u"]), { - "width" : text.parse_int(img["w"]), - "height": text.parse_int(img["h"]), - }) - for img in data["im"] - ] - - -class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): - """Extractor for manga from mangareader.net""" - chapterclass = MangareaderChapterExtractor - reverse = False - pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?#]+)/?$" - test = ("https://www.mangareader.net/mushishi", { - "url": "bc203b858b4ad76e5d77e39118a7be0350e357da", - "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", - }) - - def chapters(self, page): - path = self.manga_url[len(self.root):] - return self._manga_info(path, page) diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py deleted file mode 100644 index 7ff0239..0000000 --- a/gallery_dl/extractor/mangastream.py +++ /dev/null @@ -1,54 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2015-2019 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extract manga-chapters from https://readms.net/""" - -from .common import ChapterExtractor -from .. import text - - -class MangastreamChapterExtractor(ChapterExtractor): - """Extractor for manga-chapters from mangastream.com""" - category = "mangastream" - archive_fmt = "{chapter_id}_{page}" - pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)" - r"/r(?:ead)?/([^/]*/([^/]+)/(\d+))") - test = ( - ("https://readms.net/r/onepunch_man/087/4874/1"), - ("https://mangastream.com/r/onepunch_man/087/4874/1"), - ) - root = "https://readms.net" - - def __init__(self, match): - self.part, self.chapter, self.chapter_id = match.groups() - url = "{}/r/{}".format(self.root, self.part) - ChapterExtractor.__init__(self, match, url) - - def metadata(self, page): - manga, pos = text.extract( - page, '<span class="hidden-xs hidden-sm">', "<") - pos = page.find(self.part, pos) - title, pos = text.extract(page, ' - ', '<', pos) - count, pos = text.extract(page, 'Last Page (', ')', pos) - return { - "manga": manga, - "chapter": text.unquote(self.chapter), - "chapter_id": text.parse_int(self.chapter_id), - "title": title, - "count": text.parse_int(count, 1), - "lang": "en", - "language": "English", - } - - def images(self, page): - while True: - pos = page.index(' class="page"') - next_url = text.extract(page, ' href="', '"', pos)[0] - image_url = text.extract(page, ' src="', '"', pos)[0] - yield text.urljoin(self.root, image_url), None - page = self.request(text.urljoin(self.root, next_url)).text diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 4eb3ee6..e1081da 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -69,12 +69,23 @@ class NozomiExtractor(Extractor): post["dataid"] = post["filename"] yield Message.Url, url, post + def posts(self): + url = "https://n.nozomi.la" + self.nozomi + offset = (text.parse_int(self.pnum, 1) - 1) * 256 + + while True: + headers = {"Range": "bytes={}-{}".format(offset, offset+255)} + response = self.request(url, headers=headers) + yield from decode_nozomi(response.content) + + offset += 256 + cr = response.headers.get("Content-Range", "").rpartition("/")[2] + if text.parse_int(cr, offset) <= offset: + return + def metadata(self): return {} - def posts(self): - return () - @staticmethod def _list(src): return [x["tagname_display"] for x in src] if src else () @@ -126,12 +137,29 @@ class NozomiPostExtractor(NozomiExtractor): return (self.post_id,) +class NozomiIndexExtractor(NozomiExtractor): + """Extractor for the nozomi.la index""" + subcategory = "index" + pattern = (r"(?:https?://)?nozomi\.la/" + r"(?:(index(?:-Popular)?)-(\d+)\.html)?(?:$|#|\?)") + test = ( + ("https://nozomi.la/"), + ("https://nozomi.la/index-2.html"), + ("https://nozomi.la/index-Popular-33.html"), + ) + + def __init__(self, match): + NozomiExtractor.__init__(self, match) + index, self.pnum = match.groups() + self.nozomi = "/{}.nozomi".format(index or "index") + + class NozomiTagExtractor(NozomiExtractor): """Extractor for posts from tag searches on nozomi.la""" subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{postid}" - pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-\d+\." + pattern = r"(?:https?://)?nozomi\.la/tag/([^/?#]+)-(\d+)\." test = ("https://nozomi.la/tag/3:1_aspect_ratio-1.html", { "pattern": r"^https://i.nozomi.la/\w/\w\w/\w+\.\w+$", "count": ">= 25", @@ -140,25 +168,13 @@ class NozomiTagExtractor(NozomiExtractor): def __init__(self, match): NozomiExtractor.__init__(self, match) - self.tags = text.unquote(match.group(1)).lower() + tags, self.pnum = match.groups() + self.tags = text.unquote(tags).lower() + self.nozomi = "/nozomi/{}.nozomi".format(self.tags) def metadata(self): return {"search_tags": self.tags} - def posts(self): - url = "https://n.nozomi.la/nozomi/{}.nozomi".format(self.tags) - i = 0 - - while True: - headers = {"Range": "bytes={}-{}".format(i, i+255)} - response = self.request(url, headers=headers) - yield from decode_nozomi(response.content) - - i += 256 - cr = response.headers.get("Content-Range", "").rpartition("/")[2] - if text.parse_int(cr, i) <= i: - return - class NozomiSearchExtractor(NozomiExtractor): """Extractor for search results on nozomi.la""" diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py new file mode 100644 index 0000000..f3c5ac2 --- /dev/null +++ b/gallery_dl/extractor/philomena.py @@ -0,0 +1,216 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Philomena sites""" + +from .booru import BooruExtractor +from .. import text, exception +import operator + + +class PhilomenaExtractor(BooruExtractor): + """Base class for philomena extractors""" + basecategory = "philomena" + filename_fmt = "{filename}.{extension}" + archive_fmt = "{id}" + request_interval = 1.0 + per_page = 50 + + _file_url = operator.itemgetter("view_url") + + @staticmethod + def _prepare(post): + post["date"] = text.parse_datetime(post["created_at"]) + + @staticmethod + def _extended_tags(post): + pass + + def _pagination(self, url, params): + params["page"] = 1 + params["per_page"] = self.per_page + + api_key = self.config("api-key") + if api_key: + params["key"] = api_key + + filter_id = self.config("filter") + if filter_id: + params["filter_id"] = filter_id + elif not api_key: + try: + params["filter_id"] = INSTANCES[self.category]["filter_id"] + except (KeyError, TypeError): + pass + + while True: + data = self.request(url, params=params).json() + yield from data["images"] + + if len(data["images"]) < self.per_page: + return + params["page"] += 1 + + +INSTANCES = { + "derpibooru": {"root": "https://derpibooru.org", + "filter_id": "56027"}, + "ponybooru" : {"root": "https://ponybooru.org", + "filter_id": "2"}, +} + +BASE_PATTERN = PhilomenaExtractor.update(INSTANCES) + + +class PhilomenaPostExtractor(PhilomenaExtractor): + """Extractor for single posts on a Philomena booru""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(?:images/)?(\d+)" + test = ( + ("https://derpibooru.org/images/1", { + "content": "88449eeb0c4fa5d3583d0b794f6bc1d70bf7f889", + "count": 1, + "keyword": { + "animated": False, + "aspect_ratio": 1.0, + "comment_count": int, + "created_at": "2012-01-02T03:12:33Z", + "date": "dt:2012-01-02 03:12:33", + "deletion_reason": None, + "description": "", + "downvotes": int, + "duplicate_of": None, + "duration": 0.04, + "extension": "png", + "faves": int, + "first_seen_at": "2012-01-02T03:12:33Z", + "format": "png", + "height": 900, + "hidden_from_users": False, + "id": 1, + "mime_type": "image/png", + "name": "1__safe_fluttershy_solo_cloud_happy_flying_upvotes+ga" + "lore_artist-colon-speccysy_get_sunshine", + "orig_sha512_hash": None, + "processed": True, + "representations": dict, + "score": int, + "sha512_hash": "f16c98e2848c2f1bfff3985e8f1a54375cc49f78125391" + "aeb80534ce011ead14e3e452a5c4bc98a66f56bdfcd07e" + "f7800663b994f3f343c572da5ecc22a9660f", + "size": 860914, + "source_url": "https://www.deviantart.com/speccysy/art" + "/Afternoon-Flight-215193985", + "spoilered": False, + "tag_count": 36, + "tag_ids": list, + "tags": list, + "thumbnails_generated": True, + "updated_at": "2020-05-28T13:14:07Z", + "uploader": "Clover the Clever", + "uploader_id": 211188, + "upvotes": int, + "view_url": str, + "width": 900, + "wilson_score": float, + }, + }), + ("https://derpibooru.org/1"), + ("https://ponybooru.org/images/1", { + "content": "bca26f58fafd791fe07adcd2a28efd7751824605", + }), + ) + + def __init__(self, match): + PhilomenaExtractor.__init__(self, match) + self.image_id = match.group(match.lastindex) + + def posts(self): + url = self.root + "/api/v1/json/images/" + self.image_id + return (self.request(url).json()["image"],) + + +class PhilomenaSearchExtractor(PhilomenaExtractor): + """Extractor for Philomena search results""" + subcategory = "search" + directory_fmt = ("{category}", "{search_tags}") + pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))" + test = ( + ("https://derpibooru.org/search?q=cute", { + "range": "40-60", + "count": 21, + }), + ("https://derpibooru.org/tags/cute", { + "range": "40-60", + "count": 21, + }), + ("https://ponybooru.org/search?q=cute", { + "range": "40-60", + "count": 21, + }), + ) + + def __init__(self, match): + PhilomenaExtractor.__init__(self, match) + groups = match.groups() + if groups[-1]: + self.params = {"q": groups[-1]} + else: + self.params = text.parse_query(groups[-2]) + + def metadata(self): + return {"search_tags": self.params.get("q", "")} + + def posts(self): + url = self.root + "/api/v1/json/search/images" + return self._pagination(url, self.params) + + +class PhilomenaGalleryExtractor(PhilomenaExtractor): + """Extractor for Philomena galleries""" + subcategory = "gallery" + directory_fmt = ("{category}", "galleries", + "{gallery[id]} {gallery[title]}") + pattern = BASE_PATTERN + r"/galleries/(\d+)" + test = ( + ("https://derpibooru.org/galleries/1", { + "pattern": r"https://derpicdn\.net/img/view/\d+/\d+/\d+/\d+[^/]+$", + "keyword": { + "gallery": { + "description": "Indexes start at 1 :P", + "id": 1, + "spoiler_warning": "", + "thumbnail_id": 1, + "title": "The Very First Gallery", + "user": "DeliciousBlackInk", + "user_id": 365446, + }, + }, + }), + ("https://ponybooru.org/galleries/27", { + "count": ">= 24", + }), + ) + + def __init__(self, match): + PhilomenaExtractor.__init__(self, match) + self.gallery_id = match.group(match.lastindex) + + def metadata(self): + url = self.root + "/api/v1/json/search/galleries" + params = {"q": "id:" + self.gallery_id} + galleries = self.request(url, params=params).json()["galleries"] + if not galleries: + raise exception.NotFoundError("gallery") + return {"gallery": galleries[0]} + + def posts(self): + gallery_id = "gallery_id:" + self.gallery_id + url = self.root + "/api/v1/json/search/images" + params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id} + return self._pagination(url, params) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index e5a0486..25344e8 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -220,6 +220,27 @@ class PinterestSectionExtractor(PinterestExtractor): return self.api.board_section_pins(self.section["id"]) +class PinterestSearchExtractor(PinterestExtractor): + """Extractor for Pinterest search results""" + subcategory = "search" + directory_fmt = ("{category}", "Search", "{search}") + pattern = BASE_PATTERN + r"/search/pins/?\?q=([^&#]+)" + test = ("https://www.pinterest.de/search/pins/?q=nature", { + "range": "1-50", + "count": ">= 50", + }) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.search = match.group(1) + + def metadata(self): + return {"search": self.search} + + def pins(self): + return self.api.search(self.search) + + class PinterestRelatedPinExtractor(PinterestPinExtractor): """Extractor for related pins of another pin from pinterest.com""" subcategory = "related-pin" @@ -296,7 +317,7 @@ class PinterestAPI(): "Accept-Language" : "en-US,en;q=0.5", "Referer" : BASE_URL + "/", "X-Requested-With" : "XMLHttpRequest", - "X-APP-VERSION" : "7a20185", + "X-APP-VERSION" : "31461e0", "X-CSRFToken" : None, "X-Pinterest-AppState": "active", "Origin" : BASE_URL, @@ -364,6 +385,11 @@ class PinterestAPI(): options = {"board_id": board_id, "add_vase": True} return self._pagination("BoardRelatedPixieFeed", options) + def search(self, query): + """Yield pins from searches""" + options = {"query": query, "scope": "pins", "rs": "typed"} + return self._pagination("BaseSearch", options) + def login(self): """Login and obtain session cookies""" username, password = self.extractor._get_auth_info() @@ -421,7 +447,10 @@ class PinterestAPI(): def _pagination(self, resource, options): while True: data = self._call(resource, options) - yield from data["resource_response"]["data"] + results = data["resource_response"]["data"] + if isinstance(results, dict): + results = results["results"] + yield from results try: bookmarks = data["resource"]["options"]["bookmarks"] diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index db49b90..ebbce67 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -254,8 +254,8 @@ class PixivFavoriteExtractor(PixivExtractor): "{user_bookmark[id]} {user_bookmark[account]}") archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?" - r"users/(\d+)/(bookmarks/artworks(?:/([^/?#]+))?|following)" - r"|bookmark\.php(?:\?([^#]*))?)") + r"users/(\d+)/(bookmarks/artworks|following)(?:/([^/?#]+))?" + r"|bookmark\.php)(?:\?([^#]*))?") test = ( ("https://www.pixiv.net/en/users/173530/bookmarks/artworks", { "url": "e717eb511500f2fa3497aaee796a468ecf685cc4", diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index a5f0138..ea5bb6d 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -55,6 +55,7 @@ class SankakuExtractor(BooruExtractor): post["created_at"] = post["created_at"]["s"] post["date"] = text.parse_timestamp(post["created_at"]) post["tags"] = [tag["name"] for tag in post["tags"]] + post["tag_string"] = " ".join(post["tags"]) def _extended_tags(self, post): tags = collections.defaultdict(list) @@ -63,6 +64,7 @@ class SankakuExtractor(BooruExtractor): tags[types[tag["type"]]].append(tag["name"]) for key, value in tags.items(): post["tags_" + key] = value + post["tag_string_" + key] = " ".join(value) class SankakuTagExtractor(SankakuExtractor): @@ -122,7 +124,13 @@ class SankakuPoolExtractor(SankakuExtractor): def metadata(self): pool = SankakuAPI(self).pools(self.pool_id) + pool["tags"] = [tag["name"] for tag in pool["tags"]] + pool["artist_tags"] = [tag["name"] for tag in pool["artist_tags"]] + self._posts = pool.pop("posts") + for num, post in enumerate(self._posts, 1): + post["num"] = num + return {"pool": pool} def posts(self): diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py new file mode 100644 index 0000000..ec1e044 --- /dev/null +++ b/gallery_dl/extractor/tapas.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://tapas.io/""" + +from .common import Extractor, Message +from .. import text, exception +from ..cache import cache + +BASE_PATTERN = r"(?:https?://)?tapas\.io" + + +class TapasExtractor(Extractor): + """Base class for tapas.io extractors""" + category = "tapas" + root = "https://tapas.io" + directory_fmt = ("{category}", "{series[title]}", "{id} {title}") + filename_fmt = "{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + cookiedomain = ".tapas.io" + cookienames = ("_cpc_",) + _cache = None + + def __init__(self, match): + Extractor.__init__(self, match) + if self._cache is None: + TapasExtractor._cache = {} + + def items(self): + self.login() + headers = {"Accept": "application/json, text/javascript, */*;"} + + for episode_id in self.episode_ids(): + url = "{}/episode/{}".format(self.root, episode_id) + data = self.request(url, headers=headers).json()["data"] + + episode = data["episode"] + if not episode.get("free") and not episode.get("unlocked"): + raise exception.StopExtraction( + "Episode '%s' not unlocked (ID %s) ", + episode["title"], episode_id) + + html = data["html"] + series_id = text.rextract(html, 'data-series-id="', '"')[0] + try: + episode["series"] = self._cache[series_id] + except KeyError: + url = "{}/series/{}".format(self.root, series_id) + episode["series"] = self._cache[series_id] = self.request( + url, headers=headers).json()["data"] + + episode["date"] = text.parse_datetime(episode["publish_date"]) + yield Message.Directory, episode + + if episode["book"]: + content, _ = text.extract( + html, '<div class="viewer">', '<div class="viewer-bottom') + episode["num"] = 1 + episode["extension"] = "html" + yield Message.Url, "text:" + content, episode + + else: # comic + for episode["num"], url in enumerate(text.extract_iter( + html, 'data-src="', '"'), 1): + yield Message.Url, url, text.nameext_from_url(url, episode) + + def login(self): + if not self._check_cookies(self.cookienames): + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + else: + sc = self.session.cookies.set + sc("birthDate" , "1981-02-03", domain=self.cookiedomain) + sc("adjustedBirthDate", "1981-02-03", domain=self.cookiedomain) + + @cache(maxage=14*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = self.root + "/account/authenticate" + headers = { + "Referer" : url, + } + data = { + "from" : "https://tapas.io/", + "email" : username, + "password": password, + } + response = self.request( + url, method="POST", headers=headers, data=data) + + if not response.history or \ + "/account/signin_fail" in response.history[-1].url: + raise exception.AuthenticationError() + + return {"_cpc_": response.history[0].cookies.get("_cpc_")} + + +class TapasSeriesExtractor(TapasExtractor): + subcategory = "series" + pattern = BASE_PATTERN + r"/series/([^/?#]+)" + test = ( + ("https://tapas.io/series/just-leave-me-be", { + "pattern": r"https://\w+\.cloudfront\.net/pc/\w\w/[0-9a-f-]+\.jpg", + "count": 127, + }), + ("https://tapas.io/series/yona", { # mature + "count": 26, + }), + ) + + def __init__(self, match): + TapasExtractor.__init__(self, match) + self.series_name = match.group(1) + + def episode_ids(self): + url = "{}/series/{}".format(self.root, self.series_name) + series_id, _, episode_id = text.extract( + self.request(url).text, 'content="tapastic://series/', '"', + )[0].partition("/episodes/") + + url = "{}/series/{}/episodes".format(self.root, series_id) + headers = {"Accept": "application/json, text/javascript, */*;"} + params = { + "eid" : episode_id, + "page" : 1, + "sort" : "OLDEST", + "last_access": "0", + "max_limit" : "20", + } + + while True: + data = self.request( + url, params=params, headers=headers).json()["data"] + yield from text.extract_iter( + data["body"], 'data-href="/episode/', '"') + + if not data["pagination"]["has_next"]: + return + params["page"] += 1 + + +class TapasEpisodeExtractor(TapasExtractor): + subcategory = "episode" + pattern = BASE_PATTERN + r"/episode/(\d+)" + test = ("https://tapas.io/episode/2068651", { + "url": "0e536117dfaa17972e83d2e0141e6f9e91a33611", + "pattern": "^text:", + "keyword": { + "book": True, + "comment_cnt": int, + "date": "dt:2021-02-23 16:02:07", + "early_access": False, + "escape_title": "You are a Tomb Raider (2)", + "free": True, + "id": 2068651, + "like_cnt": int, + "liked": bool, + "mature": False, + "next_ep_id": 2068652, + "nsfw": False, + "nu": False, + "num": 1, + "open_comments": True, + "pending_scene": 2, + "prev_ep_id": 2068650, + "publish_date": "2021-02-23T16:02:07Z", + "read": bool, + "related_ep_id": None, + "relative_publish_date": "Feb 23", + "scene": 2, + "scheduled": False, + "title": "You are a Tomb Raider (2)", + "unlock_cnt": 0, + "unlocked": False, + "view_cnt": int, + + "series": { + "genre": dict, + "has_book_cover": True, + "has_top_banner": True, + "id": 199931, + "premium": True, + "sale_type": "PAID", + "subscribed": bool, + "thumbsup_cnt": int, + "title": "Tomb Raider King", + "type": "BOOKS", + "url": "tomb-raider-king-novel", + }, + }, + }) + + def __init__(self, match): + TapasExtractor.__init__(self, match) + self.episode_id = match.group(1) + + def episode_ids(self): + return (self.episode_id,) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index cf57a4d..243710d 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -257,9 +257,6 @@ class TumblrPostExtractor(TumblrExtractor): ("https://mikf123.tumblr.com/post/167623351559/link-post", { "count": 2, }), - ("https://muyanna.tumblr.com/post/180692431632/answer-post", { - "count": 1, - }), ("https://mikf123.tumblr.com/post/167633596145/video-post", { "count": 2, }), diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index a7d2de5..c323fe0 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -113,19 +113,18 @@ class TwitterExtractor(Extractor): "url" : base + "orig", "width" : width, "height" : height, - "_fallback": self._image_fallback(base, url), + "_fallback": self._image_fallback(base, url + ":"), })) else: files.append({"url": media["media_url"]}) @staticmethod - def _image_fallback(base, url): - url += ":" - yield url + "orig" + def _image_fallback(new, old): + yield old + "orig" for size in ("large", "medium", "small"): - yield base + size - yield url + size + yield new + size + yield old + size def _extract_card(self, tweet, files): card = tweet["card"] @@ -139,7 +138,7 @@ class TwitterExtractor(Extractor): if key in bvals: files.append(bvals[key]["image_value"]) return - else: + elif self.videos: url = "ytdl:{}/i/web/status/{}".format(self.root, tweet["id_str"]) files.append({"url": url}) @@ -224,6 +223,22 @@ class TwitterExtractor(Extractor): } return cache[uid] + def _users_result(self, users): + userfmt = self.config("users") + if not userfmt or userfmt == "timeline": + cls = TwitterTimelineExtractor + fmt = (self.root + "/i/user/{rest_id}").format_map + elif userfmt == "media": + cls = TwitterMediaExtractor + fmt = (self.root + "/id:{rest_id}/media").format_map + else: + cls = None + fmt = userfmt.format_map + + for user in users: + user["_extractor"] = cls + yield Message.Queue, fmt(user), user + def metadata(self): """Return general metadata""" return {} @@ -261,6 +276,10 @@ class TwitterExtractor(Extractor): response = self.request( url, method="POST", cookies=cookies, data=data) + if "/account/login_verification" in response.url: + raise exception.AuthenticationError( + "Login with two-factor authentication is not supported") + cookies = { cookie.name: cookie.value for cookie in self.session.cookies @@ -320,6 +339,9 @@ class TwitterLikesExtractor(TwitterExtractor): pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/likes(?!\w)" test = ("https://twitter.com/supernaturepics/likes",) + def metadata(self): + return {"user_likes": self.user} + def tweets(self): return TwitterAPI(self).timeline_favorites(self.user) @@ -356,10 +378,7 @@ class TwitterListMembersExtractor(TwitterExtractor): def items(self): self.login() - for user in TwitterAPI(self).list_members(self.user): - user["_extractor"] = TwitterTimelineExtractor - url = "{}/i/user/{}".format(self.root, user["rest_id"]) - yield Message.Queue, url, user + return self._users_result(TwitterAPI(self).list_members(self.user)) class TwitterFollowingExtractor(TwitterExtractor): @@ -373,10 +392,7 @@ class TwitterFollowingExtractor(TwitterExtractor): def items(self): self.login() - for user in TwitterAPI(self).user_following(self.user): - user["_extractor"] = TwitterTimelineExtractor - url = "{}/i/user/{}".format(self.root, user["rest_id"]) - yield Message.Queue, url, user + return self._users_result(TwitterAPI(self).user_following(self.user)) class TwitterSearchExtractor(TwitterExtractor): @@ -485,6 +501,34 @@ class TwitterTweetExtractor(TwitterExtractor): return TwitterAPI(self).tweet(self.tweet_id) +class TwitterImageExtractor(Extractor): + category = "twitter" + subcategory = "image" + pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)" + test = ( + ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg%name=orig"), + ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.id, self.fmt = match.groups() + + def items(self): + base = "https://pbs.twimg.com/media/" + self.id + new = base + "?format=" + self.fmt + "&name=" + old = base + "." + self.fmt + ":" + + data = { + "filename": self.id, + "extension": self.fmt, + "_fallback": TwitterExtractor._image_fallback(new, old), + } + + yield Message.Directory, data + yield Message.Url, new + "orig", data + + class TwitterAPI(): def __init__(self, extractor): diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index c653c01..886353f 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -69,7 +69,7 @@ class UnsplashImageExtractor(UnsplashExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/photos/([^/?#]+)" test = ("https://unsplash.com/photos/lsoogGC_5dg", { - "url": "00accb0a64d5a0df0db911f8b425892718dce524", + "url": "ac9d194f58b3fc9aacdfc9784c1b69868f212b6e", "keyword": { "alt_description": "re:silhouette of trees near body of water ", "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz", diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py new file mode 100644 index 0000000..1ce1140 --- /dev/null +++ b/gallery_dl/extractor/vk.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://vk.com/""" + +from .common import Extractor, Message +from .. import text +import re + + +class VkPhotosExtractor(Extractor): + """Extractor for photos from a vk user""" + category = "vk" + subcategory = "photos" + directory_fmt = ("{category}", "{user[id]}") + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}" + root = "https://vk.com" + request_interval = 1.0 + pattern = r"(?:https://)?(?:www\.|m\.)?vk\.com/(?:albums|photos|id)(\d+)" + test = ( + ("https://vk.com/id398982326", { + "pattern": r"https://sun\d+-\d+\.userapi\.com/c\d+/v\d+" + r"/[0-9a-f]+/[\w-]+\.jpg", + "count": ">= 35", + }), + ("https://m.vk.com/albums398982326"), + ("https://www.vk.com/id398982326?profile=1"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user_id = match.group(1) + + def items(self): + user_id = self.user_id + + if self.config("metadata"): + url = "{}/id{}".format(self.root, user_id) + extr = text.extract_from(self.request(url).text) + data = {"user": { + "id" : user_id, + "nick": text.unescape(extr( + "<title>", " | VK<")), + "name": text.unescape(extr( + '<h1 class="page_name">', "<")).replace(" ", " "), + "info": text.unescape(text.remove_html(extr( + '<span class="current_text">', '</span'))) + }} + else: + data = {"user": {"id": user_id}} + + photos_url = "{}/photos{}".format(self.root, user_id) + headers = { + "X-Requested-With": "XMLHttpRequest", + "Origin" : self.root, + "Referer" : photos_url, + } + params = { + "al" : "1", + "al_ad" : "0", + "offset": 0, + "part" : "1", + } + + yield Message.Directory, data + sub = re.compile(r"/imp[fg]/").sub + needle = 'data-id="{}_'.format(user_id) + + while True: + offset, html = self.request( + photos_url, method="POST", headers=headers, data=params + ).json()["payload"][1] + + for cnt, photo in enumerate(text.extract_iter(html, needle, ')')): + data["id"] = photo[:photo.find('"')] + url = photo[photo.rindex("(")+1:] + url = sub("/", url.partition("?")[0]) + yield Message.Url, url, text.nameext_from_url(url, data) + + if cnt <= 40 or offset == params["offset"]: + return + params["offset"] = offset diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py index 1dd5b09..f8da191 100644 --- a/gallery_dl/extractor/weasyl.py +++ b/gallery_dl/extractor/weasyl.py @@ -203,7 +203,7 @@ class WeasylJournalsExtractor(WeasylExtractor): class WeasylFavoriteExtractor(WeasylExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{owner_login}", "Favorites") - pattern = BASE_PATTERN + r"favorites\?userid=(\d+)&feature=submit" + pattern = BASE_PATTERN + r"favorites\?userid=(\d+)" test = ("https://www.weasyl.com/favorites?userid=184616&feature=submit", { "count": ">= 5", }) diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index 428c6b5..7fd60b1 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -72,7 +72,7 @@ class WikiartArtistExtractor(WikiartExtractor): pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)/?$" test = ("https://www.wikiart.org/en/thomas-cole", { "url": "5ba2fbe6783fcce34e65014d16e5fbc581490c98", - "keyword": "6d92913c55675e05553f000cfee5daff0b4107cf", + "keyword": "eb5b141cf33e6d279afd1518aae24e61cc0adf81", }) def __init__(self, match): diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 0f40bb9..d3b4a90 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -42,7 +42,14 @@ class Job(): self.status = 0 self.pred_url = self._prepare_predicates("image", True) self.pred_queue = self._prepare_predicates("chapter", False) + self.kwdict = {} + # user-supplied metadata + kwdict = self.extractor.config("keywords") + if kwdict: + self.kwdict.update(kwdict) + + # data from parent job if parent: pextr = parent.extractor @@ -57,9 +64,6 @@ class Job(): # reuse connection adapters extr.session.adapters = pextr.session.adapters - # user-supplied metadata - self.userkwds = self.extractor.config("keywords") - def run(self): """Execute or run the job""" sleep = self.extractor.config("sleep-extractor") @@ -137,8 +141,8 @@ class Job(): extr = self.extractor kwdict["category"] = extr.category kwdict["subcategory"] = extr.subcategory - if self.userkwds: - kwdict.update(self.userkwds) + if self.kwdict: + kwdict.update(self.kwdict) def _prepare_predicates(self, target, skip=True): predicates = [] @@ -183,7 +187,7 @@ class Job(): class DownloadJob(Job): """Download images into appropriate directory/filename locations""" - def __init__(self, url, parent=None): + def __init__(self, url, parent=None, kwdict=None): Job.__init__(self, url, parent) self.log = self.get_logger("download") self.blacklist = None @@ -198,6 +202,11 @@ class DownloadJob(Job): pfmt = parent.pathfmt if pfmt and parent.extractor.config("parent-directory"): self.extractor._parentdir = pfmt.directory + if parent.extractor.config("parent-metadata"): + if parent.kwdict: + self.kwdict.update(parent.kwdict) + if kwdict: + self.kwdict.update(kwdict) else: self.visited = set() @@ -280,8 +289,9 @@ class DownloadJob(Job): return self.visited.add(url) - if "_extractor" in kwdict: - extr = kwdict["_extractor"].from_url(url) + cls = kwdict.get("_extractor") + if cls: + extr = cls.from_url(url) else: extr = extractor.find(url) if extr: @@ -291,7 +301,7 @@ class DownloadJob(Job): extr = None if extr: - self.status |= self.__class__(extr, self).run() + self.status |= self.__class__(extr, self, kwdict).run() else: self._write_unsupported(url) @@ -474,7 +484,9 @@ class DownloadJob(Job): class SimulationJob(DownloadJob): """Simulate the extraction process without downloading anything""" - def handle_url(self, url, kwdict, fallback=None): + def handle_url(self, url, kwdict): + if not kwdict["extension"]: + kwdict["extension"] = "jpg" self.pathfmt.set_filename(kwdict) self.out.skip(self.pathfmt.path) if self.sleep: diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 8b06384..a6a9105 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,33 +10,11 @@ import re import html -import os.path import datetime import urllib.parse - HTML_RE = re.compile("<[^>]+>") -INVALID_XML_CHARS = ( - "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", - "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12", - "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a", - "\x1b", "\x1c", "\x1d", "\x1e", "\x1f", -) - - -def clean_xml(xmldata, repl=""): - """Replace/Remove invalid control characters in 'xmldata'""" - if not isinstance(xmldata, str): - try: - xmldata = "".join(xmldata) - except TypeError: - return "" - for char in INVALID_XML_CHARS: - if char in xmldata: - xmldata = xmldata.replace(char, repl) - return xmldata - def remove_html(txt, repl=" ", sep=" "): """Remove html-tags from a string""" @@ -49,11 +27,12 @@ def remove_html(txt, repl=" ", sep=" "): return txt.strip() -def split_html(txt, sep=None): - """Split input string by html-tags""" +def split_html(txt): + """Split input string by HTML tags""" try: return [ - x.strip() for x in HTML_RE.split(txt) + unescape(x).strip() + for x in HTML_RE.split(txt) if x and not x.isspace() ] except TypeError: @@ -77,18 +56,22 @@ def filename_from_url(url): def ext_from_url(url): """Extract the filename extension of an URL""" - filename = filename_from_url(url) - ext = os.path.splitext(filename)[1] - return ext[1:].lower() + name, _, ext = filename_from_url(url).rpartition(".") + return ext.lower() if name else "" def nameext_from_url(url, data=None): """Extract the last part of an URL and fill 'data' accordingly""" if data is None: data = {} - name = unquote(filename_from_url(url)) - data["filename"], ext = os.path.splitext(name) - data["extension"] = ext[1:].lower() + + filename = unquote(filename_from_url(url)) + name, _, ext = filename.rpartition(".") + if name: + data["filename"], data["extension"] = name, ext.lower() + else: + data["filename"], data["extension"] = filename, "" + return data diff --git a/gallery_dl/version.py b/gallery_dl/version.py index f1c49e9..b75f444 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.17.0" +__version__ = "1.17.2" diff --git a/test/test_text.py b/test/test_text.py index 34585d1..1daefde 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,29 +23,6 @@ INVALID_ALT = ((), [], {}, None, "") class TestText(unittest.TestCase): - def test_clean_xml(self, f=text.clean_xml): - # standard usage - self.assertEqual(f(""), "") - self.assertEqual(f("foo"), "foo") - self.assertEqual(f("\tfoo\nbar\r"), "\tfoo\nbar\r") - self.assertEqual(f("<foo>\ab\ba\fr\v</foo>"), "<foo>bar</foo>") - - # 'repl' argument - repl = "#" - self.assertEqual(f("", repl), "") - self.assertEqual(f("foo", repl), "foo") - self.assertEqual(f("\tfoo\nbar\r", repl), "\tfoo\nbar\r") - self.assertEqual( - f("<foo>\ab\ba\fr\v</foo>", repl), "<foo>#b#a#r#</foo>") - - # removal of all illegal control characters - value = "".join(chr(x) for x in range(32)) - self.assertEqual(f(value), "\t\n\r") - - # 'invalid' arguments - for value in INVALID: - self.assertEqual(f(value), "") - def test_remove_html(self, f=text.remove_html): result = "Hello World." @@ -82,6 +59,10 @@ class TestText(unittest.TestCase): self.assertEqual( f("<div><b class='a'>Hello</b><i>World.</i></div>"), result) + # escaped HTML entities + self.assertEqual( + f("<i><foo></i> <i><bar> </i>"), ["<foo>", "<bar>"]) + # empty HTML self.assertEqual(f("<div></div>"), empty) self.assertEqual(f(" <div> </div> "), empty) @@ -142,8 +123,9 @@ class TestText(unittest.TestCase): # standard usage self.assertEqual(f(""), "") + self.assertEqual(f("filename"), "") self.assertEqual(f("filename.ext"), result) - self.assertEqual(f("/filename.ext"), result) + self.assertEqual(f("/filename.ExT"), result) self.assertEqual(f("example.org/filename.ext"), result) self.assertEqual(f("http://example.org/v2/filename.ext"), result) self.assertEqual( @@ -160,7 +142,7 @@ class TestText(unittest.TestCase): # standard usage self.assertEqual(f(""), empty) self.assertEqual(f("filename.ext"), result) - self.assertEqual(f("/filename.ext"), result) + self.assertEqual(f("/filename.ExT"), result) self.assertEqual(f("example.org/filename.ext"), result) self.assertEqual(f("http://example.org/v2/filename.ext"), result) self.assertEqual( |