From e6b82556343116256be047ab7099bedd9063f66a Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Thu, 22 Sep 2022 19:43:53 -0400
Subject: New upstream version 1.23.1.

---
 CHANGELOG.md                      |  25 +++++
 PKG-INFO                          |  10 +-
 README.rst                        |   8 +-
 data/man/gallery-dl.1             |   2 +-
 data/man/gallery-dl.conf.5        |  96 +++++++++++++++-
 docs/gallery-dl.conf              |   3 +-
 gallery_dl.egg-info/PKG-INFO      |  10 +-
 gallery_dl.egg-info/SOURCES.txt   |   1 +
 gallery_dl/__init__.py            |   4 +-
 gallery_dl/extractor/__init__.py  |   1 +
 gallery_dl/extractor/bunkr.py     |  12 +-
 gallery_dl/extractor/common.py    |   3 +-
 gallery_dl/extractor/exhentai.py  |  16 ++-
 gallery_dl/extractor/flickr.py    |  32 ++++--
 gallery_dl/extractor/hotleak.py   | 228 ++++++++++++++++++++++++++++++++++++++
 gallery_dl/extractor/instagram.py |   6 +
 gallery_dl/extractor/paheal.py    |  30 ++++-
 gallery_dl/extractor/poipiku.py   |  13 ++-
 gallery_dl/extractor/reddit.py    |   4 +-
 gallery_dl/extractor/redgifs.py   |   1 +
 gallery_dl/extractor/smugmug.py   |   6 +-
 gallery_dl/extractor/tumblr.py    |  81 +++++++++-----
 gallery_dl/extractor/twitter.py   |  77 +++++++++----
 gallery_dl/extractor/zerochan.py  |  55 ++++++---
 gallery_dl/postprocessor/zip.py   |  19 ++++
 gallery_dl/version.py             |   2 +-
 test/test_postprocessor.py        |  16 ++-
 27 files changed, 632 insertions(+), 129 deletions(-)
 create mode 100644 gallery_dl/extractor/hotleak.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61987d9..4f4fdf9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,30 @@
 # Changelog
 
+## 1.23.1 - 2022-09-18
+### Additions
+- [flickr] add support for `secure.flickr.com` URLs ([#2910](https://github.com/mikf/gallery-dl/issues/2910))
+- [hotleak] add hotleak extractors ([#2890](https://github.com/mikf/gallery-dl/issues/2890), [#2909](https://github.com/mikf/gallery-dl/issues/2909))
+- [instagram] add `highlight_title` and `date` metadata for highlight downloads ([#2879](https://github.com/mikf/gallery-dl/issues/2879))
+- [paheal] add support for videos ([#2892](https://github.com/mikf/gallery-dl/issues/2892))
+- [twitter] add general support for unified cards ([#2875](https://github.com/mikf/gallery-dl/issues/2875))
+- [twitter] implement `cards-blacklist` option ([#2875](https://github.com/mikf/gallery-dl/issues/2875))
+- [tumblr] fetch high-quality inline images ([#2877](https://github.com/mikf/gallery-dl/issues/2877))
+- [tumblr] implement `ratelimit` option ([#2919](https://github.com/mikf/gallery-dl/issues/2919))
+- [zerochan] add `metadata` option ([#2861](https://github.com/mikf/gallery-dl/issues/2861))
+- [postprocessor:zip] implement `files` option ([#2872](https://github.com/mikf/gallery-dl/issues/2872))
+### Fixes
+- [bunkr] fix extraction ([#2903](https://github.com/mikf/gallery-dl/issues/2903))
+- [bunkr] use `media-files` servers for `m4v` and `mov` downloads ([#2925](https://github.com/mikf/gallery-dl/issues/2925))
+- [exhentai] improve 509.gif detection ([#2901](https://github.com/mikf/gallery-dl/issues/2901))
+- [exhentai] guess extension for original files ([#2842](https://github.com/mikf/gallery-dl/issues/2842))
+- [poipiku] use `img-org.poipiku.com` as image domain ([#2796](https://github.com/mikf/gallery-dl/issues/2796))
+- [reddit] prevent exception with empty submission URLs ([#2913](https://github.com/mikf/gallery-dl/issues/2913))
+- [redgifs] fix download URLs ([#2884](https://github.com/mikf/gallery-dl/issues/2884))
+- [smugmug] update default API credentials ([#2881](https://github.com/mikf/gallery-dl/issues/2881))
+- [twitter] provide proper `date` for syndication results ([#2920](https://github.com/mikf/gallery-dl/issues/2920))
+- [twitter] fix new-style `/card_img/` URLs
+- remove all whitespace before comments after input file URLs ([#2808](https://github.com/mikf/gallery-dl/issues/2808))
+
 ## 1.23.0 - 2022-08-28
 ### Changes
 - [twitter] update `user` and `author` metadata fields
diff --git a/PKG-INFO b/PKG-INFO
index 60a798f..b15426c 100644
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.23.0
+Version: 1.23.1
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows `__
-- `Linux `__
+- `Windows `__
+- `Linux `__
 
 | Executables built from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -166,11 +166,11 @@ Get the direct URL of an image from a site supporting authentication with userna
 
     gallery-dl -g -u "" -p "" "https://twitter.com/i/web/status/604341487988576256"
 
 
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
 
 .. code:: bash
 
-    gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+    gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
 
 
 | Search a remote resource for URLs and download images from them:
diff --git a/README.rst b/README.rst
index 2b45b27..813d6d8 100644
--- a/README.rst
+++ b/README.rst
@@ -66,8 +66,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows `__
-- `Linux `__
+- `Windows `__
+- `Linux `__
 
 | Executables built from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -133,11 +133,11 @@ Get the direct URL of an image from a site supporting authentication with userna
 
     gallery-dl -g -u "" -p "" "https://twitter.com/i/web/status/604341487988576256"
 
 
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
 
 .. code:: bash
 
-    gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+    gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
 
 
 | Search a remote resource for URLs and download images from them:
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index d4efeed..e76a380 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2022-08-28" "1.23.0" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2022-09-18" "1.23.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 642cb78..f465d84 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2022-08-28" "1.23.0" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2022-09-18" "1.23.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -2575,13 +2575,30 @@ Search posts for inline images and videos.
 \f[I]true\f[]
 
 .IP "Description:" 4
-Download full-resolution \f[I]photo\f[] images.
+Download full-resolution \f[I]photo\f[] and \f[I]inline\f[] images.
 
 For each photo with "maximum" resolution
-(width equal to 2048 or height equal to 3072),
+(width equal to 2048 or height equal to 3072)
+or each inline image,
 use an extra HTTP request to find the URL to its full-resolution version.
 
+.SS extractor.tumblr.ratelimit
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"abort"\f[]
+
+.IP "Description:" 4
+Selects how to handle exceeding the daily API rate limit.
+
+.br
+* \f[I]"abort"\f[]: Raise an error and stop extraction
+.br
+* \f[I]"wait"\f[]: Wait until the rate limit resets
+
+
 .SS extractor.tumblr.reblogs
 .IP "Type:" 6
 \f[I]bool\f[] or \f[I]string\f[]
@@ -2664,6 +2681,26 @@ Controls how to handle \f[I]Twitter Cards\f[].
 * \f[I]"ytdl"\f[]: Additionally download video content from unsupported cards using \f[I]youtube-dl\f[]
 
 
+.SS extractor.twitter.cards-blacklist
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Example:" 4
+["summary", "youtube.com", "player:twitch.tv"]
+
+.IP "Description:" 4
+List of card types to ignore.
+
+Possible values are
+
+.br
+* card names
+.br
+* card domains
+.br
+* \f[I]<card name>:<card domain>\f[]
+
+
 .SS extractor.twitter.conversations
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -2672,8 +2709,11 @@ Controls how to handle \f[I]Twitter Cards\f[].
 \f[I]false\f[]
 
 .IP "Description:" 4
-Fetch media from all Tweets and replies in a \f[I]conversation
-\f[].
+For input URLs pointing to a single Tweet,
+e.g. https://twitter.com/i/web/status/<TweetID>,
+fetch media from all Tweets and replies in this \f[I]conversation
+\f[]
+or thread.
 
 
 .SS extractor.twitter.csrf
 .IP "Type:" 6
 \f[I]string\f[]
 
 .IP "Default:" 9
 \f[I]"cookies"\f[]
 
 .IP "Description:" 4
 Controls how to handle Cross Site Request Forgery (CSRF) tokens.
 
 .br
 * \f[I]"auto"\f[]: Always auto-generate a token.
 .br
 * \f[I]"cookies"\f[]: Use token given by the \f[I]ct0\f[] cookie if present.
 
 
+.SS extractor.twitter.expand
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+For each Tweet, return *all* Tweets from that initial Tweet's
+conversation or thread, i.e. *expand* all Twitter threads.
+
+Going through a timeline with this option enabled is essentially the same
+as running \f[I]gallery-dl https://twitter.com/i/web/status/<TweetID>\f[]
+with the \f[I]conversations\f[] option enabled
+for each Tweet in said timeline.
+
+Note: This requires at least 1 additional API call per initial Tweet.
+
+
 .SS extractor.twitter.size
 .IP "Type:" 6
 \f[I]list\f[] of \f[I]strings\f[]
@@ -3140,6 +3199,19 @@ Additional options specified as youtube-dl command-line arguments.
 
 Location of a youtube-dl configuration file to load options from.
 
 
+.SS extractor.zerochan.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract additional metadata (date, md5, tags, ...)
+
+Note: This requires 1-2 additional HTTP requests for each post.
+
+
 .SS extractor.[booru].tags
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -4225,6 +4297,20 @@ to prevent it from only being displayed for a very short amount of time.
 
 Filename extension for the created ZIP archive.
 
 
+.SS zip.files
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]Path\f[]
+
+.IP "Example:" 4
+["info.json"]
+
+.IP "Description:" 4
+List of extra files to be added to a ZIP archive.
+
+Note: Relative paths are relative to the current
+\f[I]download directory\f[].
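
Taken together, the new options documented above are all ordinary config-file settings. A sketch of how they might be combined in one configuration (illustrative values only; key names follow the sections above, with the `zip` post processor placed per extractor as in `docs/gallery-dl.conf`):

    {
        "extractor": {
            "tumblr":   {"ratelimit": "wait"},
            "twitter":  {"cards-blacklist": ["summary", "youtube.com", "player:twitch.tv"]},
            "zerochan": {"metadata": true},
            "postprocessors": [
                {"name": "zip", "files": ["info.json"]}
            ]
        }
    }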
+
+
 .SS zip.keep-files
 .IP "Type:" 6
 \f[I]bool\f[]
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1e485ee..6ba50f2 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -341,7 +341,8 @@
     "zerochan": {
         "username": null,
-        "password": null
+        "password": null,
+        "metadata": false
     },
 
     "booru": {
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 6b9d68b..ea2164a 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.23.0
+Version: 1.23.1
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows `__
-- `Linux `__
+- `Windows `__
+- `Linux `__
 
 | Executables built from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -166,11 +166,11 @@ Get the direct URL of an image from a site supporting authentication with userna
 
     gallery-dl -g -u "" -p "" "https://twitter.com/i/web/status/604341487988576256"
 
 
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
 
 .. code:: bash
 
-    gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+    gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
 
 
 | Search a remote resource for URLs and download images from them:
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 5f5084b..73cc80b 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -92,6 +92,7 @@ gallery_dl/extractor/hentaihand.py
 gallery_dl/extractor/hentaihere.py
 gallery_dl/extractor/hiperdex.py
 gallery_dl/extractor/hitomi.py
+gallery_dl/extractor/hotleak.py
 gallery_dl/extractor/idolcomplex.py
 gallery_dl/extractor/imagebam.py
 gallery_dl/extractor/imagechest.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 329e7ab..7504fa4 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -96,9 +96,9 @@ def parse_inputfile(file, log):
         else:
             # url
             if " #" in line:
-                line = line.partition(" #")[0]
+                line = line.partition(" #")[0].rstrip()
             elif "\t#" in line:
-                line = line.partition("\t#")[0]
+                line = line.partition("\t#")[0].rstrip()
 
             if gconf or lconf:
                 yield util.ExtendedUrl(line, gconf, lconf)
                 gconf = []
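
The `.rstrip()` added above matters when more than one space separates a URL from its trailing comment: `partition(" #")` only splits at the first space-and-hash pair and can leave extra whitespace behind. A quick illustration (hypothetical input line):

    line = "https://example.org/gallery   # my favorite"
    line = line.partition(" #")[0].rstrip()
    # without .rstrip(): "https://example.org/gallery  "
    # with    .rstrip(): "https://example.org/gallery"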
("https://bunkr.to/a/Lktg9Keq"), ) @@ -66,9 +71,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): data = json.loads(text.extract( self.request(url).text, 'id="__NEXT_DATA__" type="application/json">', '<')[0]) - props = data["props"]["pageProps"] - album = props["album"] - files = props["files"] + album = data["props"]["pageProps"]["album"] + files = album["files"] except Exception as exc: self.log.debug(exc.__class__.__name__, exc) self.root = self.root.replace("bunkr", "app.bunkr", 1) @@ -77,7 +81,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): for file in files: name = file["name"] cdn = file["cdn"] - if name.endswith(".mp4"): + if name.endswith((".mp4", ".m4v", ".mov")): cdn = cdn.replace("//cdn", "//media-files") file["file"] = cdn + "/" + name diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 1b41101..f7ee51f 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -152,7 +152,8 @@ class Extractor(): server = response.headers.get("Server") if server and server.startswith("cloudflare"): if code == 503 and \ - b"jschl-answer" in response.content: + (b"_cf_chl_opt" in response.content or + b"jschl-answer" in response.content): self.log.warning("Cloudflare IUAM challenge") break if code == 403 and \ diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 2720691..01ba03a 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -219,7 +219,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): if self.limits: self._check_limits(data) if "/fullimg.php" in url: - data["extension"] = "" data["_http_validate"] = _validate_response else: data["_http_validate"] = None @@ -328,8 +327,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["image_token"] = self.key["start"] = extr('var startkey="', '";') self.key["show"] = extr('var showkey="', '";') - if iurl.endswith("g/509.gif"): - self._report_limits(data) + self._check_509(iurl, data) return url, text.nameext_from_url(iurl, data) def images_from_api(self): @@ -365,8 +363,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): data["num"] = request["page"] data["image_token"] = imgkey - if imgurl.endswith("g/509.gif"): - self._report_limits(data) + self._check_509(imgurl, data) yield url, text.nameext_from_url(imgurl, data) request["imgkey"] = nextkey @@ -385,6 +382,15 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): if self._remaining <= 0: self._report_limits(data) + def _check_509(self, url, data): + # full 509.gif URLs + # - https://exhentai.org/img/509.gif + # - https://ehgt.org/g/509.gif + if url.endswith(("hentai.org/img/509.gif", + "ehgt.org/g/509.gif")): + self.log.debug(url) + self._report_limits(data) + def _update_limits(self): url = "https://e-hentai.org/home.php" cookies = { diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 2bd8c6b..e85d68a 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2020 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,6 +11,8 @@ from .common import Extractor, Message from .. 
import text, oauth, util, exception +BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com" + class FlickrExtractor(Extractor): """Base class for flickr extractors""" @@ -55,7 +57,7 @@ class FlickrImageExtractor(FlickrExtractor): """Extractor for individual images from flickr.com""" subcategory = "image" pattern = (r"(?:https?://)?(?:" - r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/" + r"(?:(?:www\.|secure\.|m\.)?flickr\.com/photos/[^/?#]+/" r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)" r"|flic\.kr/p/([A-Za-z1-9]+))") test = ( @@ -77,6 +79,10 @@ class FlickrImageExtractor(FlickrExtractor): "width": 1024, }, }), + ("https://secure.flickr.com/photos/departingyyz/16089302239"), + ("https://m.flickr.com/photos/departingyyz/16089302239"), + ("https://flickr.com/photos/departingyyz/16089302239"), + ("https://www.flickr.com/photos/145617051@N08/46733161535", { "count": 1, "keyword": {"media": "video"}, @@ -132,8 +138,7 @@ class FlickrAlbumExtractor(FlickrExtractor): directory_fmt = ("{category}", "{user[username]}", "Albums", "{album[id]} {album[title]}") archive_fmt = "a_{album[id]}_{id}" - pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/" - r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?") + pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?" test = ( (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), { "pattern": FlickrImageExtractor.pattern, @@ -143,6 +148,8 @@ class FlickrAlbumExtractor(FlickrExtractor): "pattern": pattern, "count": 2, }), + ("https://secure.flickr.com/photos/shona_s/albums"), + ("https://m.flickr.com/photos/shona_s/albums"), ) def __init__(self, match): @@ -180,8 +187,7 @@ class FlickrGalleryExtractor(FlickrExtractor): directory_fmt = ("{category}", "{user[username]}", "Galleries", "{gallery[gallery_id]} {gallery[title]}") archive_fmt = "g_{gallery[id]}_{id}" - pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/" - r"photos/([^/]+)/galleries/(\d+)") + pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)" test = (("https://www.flickr.com/photos/flickr/" "galleries/72157681572514792/"), { "pattern": FlickrImageExtractor.pattern, @@ -206,7 +212,7 @@ class FlickrGroupExtractor(FlickrExtractor): subcategory = "group" directory_fmt = ("{category}", "Groups", "{group[groupname]}") archive_fmt = "G_{group[nsid]}_{id}" - pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)" + pattern = BASE_PATTERN + r"/groups/([^/?#]+)" test = ("https://www.flickr.com/groups/bird_headshots/", { "pattern": FlickrImageExtractor.pattern, "count": "> 150", @@ -224,7 +230,7 @@ class FlickrUserExtractor(FlickrExtractor): """Extractor for the photostream of a flickr user""" subcategory = "user" archive_fmt = "u_{user[nsid]}_{id}" - pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$" + pattern = BASE_PATTERN + r"/photos/([^/?#]+)/?$" test = ("https://www.flickr.com/photos/shona_s/", { "pattern": FlickrImageExtractor.pattern, "count": 28, @@ -239,7 +245,7 @@ class FlickrFavoriteExtractor(FlickrExtractor): subcategory = "favorite" directory_fmt = ("{category}", "{user[username]}", "Favorites") archive_fmt = "f_{user[nsid]}_{id}" - pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites" + pattern = BASE_PATTERN + r"/photos/([^/?#]+)/favorites" test = ("https://www.flickr.com/photos/shona_s/favorites", { "pattern": FlickrImageExtractor.pattern, "count": 4, @@ -254,7 +260,7 @@ class FlickrSearchExtractor(FlickrExtractor): subcategory = "search" directory_fmt = ("{category}", "Search", "{search[text]}") archive_fmt = 
"s_{search}_{id}" - pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)" + pattern = BASE_PATTERN + r"/search/?\?([^#]+)" test = ( ("https://flickr.com/search/?text=mountain"), ("https://flickr.com/search/?text=tree%20cloud%20house" @@ -275,7 +281,11 @@ class FlickrSearchExtractor(FlickrExtractor): class FlickrAPI(oauth.OAuth1API): - """Minimal interface for the flickr API""" + """Minimal interface for the flickr API + + https://www.flickr.com/services/api/ + """ + API_URL = "https://api.flickr.com/services/rest/" API_KEY = "ac4fd7aa98585b9eee1ba761c209de68" API_SECRET = "3adb0f568dc68393" diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py new file mode 100644 index 0000000..d6575cf --- /dev/null +++ b/gallery_dl/extractor/hotleak.py @@ -0,0 +1,228 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hotleak.vip/""" + +from .common import Extractor, Message +from .. import text, exception + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip" + + +class HotleakExtractor(Extractor): + """Base class for hotleak extractors""" + category = "hotleak" + directory_fmt = ("{category}", "{creator}",) + filename_fmt = "{creator}_{id}.{extension}" + archive_fmt = "{type}_{creator}_{id}" + root = "https://hotleak.vip" + + def __init__(self, match): + Extractor.__init__(self, match) + self.session.headers["Referer"] = self.root + + def items(self): + for post in self.posts(): + yield Message.Directory, post + yield Message.Url, post["url"], post + + def posts(self): + """Return an iterable containing relevant posts""" + return () + + def _pagination(self, url, params): + params = text.parse_query(params) + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + page = self.request(url, params=params).text + if "" not in page: + return + + for item in text.extract_iter( + page, '
', '
')[0] + data = { + "id" : text.parse_int(self.id), + "creator": self.creator, + "type" : self.type, + } + + if self.type == "photo": + data["url"] = text.extract(page, 'data-src="', '"')[0] + text.nameext_from_url(data["url"], data) + + elif self.type == "video": + data["url"] = "ytdl:" + text.extract( + text.unescape(page), '"src":"', '"')[0] + text.nameext_from_url(data["url"], data) + data["extension"] = "mp4" + + return (data,) + + +class HotleakCreatorExtractor(HotleakExtractor): + """Extractor for all posts from a hotleak creator""" + subcategory = "creator" + pattern = BASE_PATTERN + r"/(?!hot|creators|videos|photos)([^/?#]+)/?$" + test = ( + ("https://hotleak.vip/kaiyakawaii", { + "range": "1-200", + "count": 200, + }), + ("https://hotleak.vip/stellaviolet", { + "count": "> 600" + }), + ("https://hotleak.vip/doesnotexist", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + HotleakExtractor.__init__(self, match) + self.creator = match.group(1) + + def posts(self): + url = "{}/{}".format(self.root, self.creator) + return self._pagination(url) + + def _pagination(self, url): + headers = {"X-Requested-With": "XMLHttpRequest"} + params = {"page": 1} + + while True: + try: + response = self.request( + url, headers=headers, params=params, notfound="creator") + except exception.HttpError as exc: + if exc.response.status_code == 429: + self.wait( + until=exc.response.headers.get("X-RateLimit-Reset")) + continue + + posts = response.json() + if not posts: + return + + data = {"creator": self.creator} + for post in posts: + data["id"] = text.parse_int(post["id"]) + + if post["type"] == 0: + data["type"] = "photo" + data["url"] = self.root + "/storage/" + post["image"] + text.nameext_from_url(data["url"], data) + + elif post["type"] == 1: + data["type"] = "video" + data["url"] = "ytdl:" + post["stream_url_play"] + text.nameext_from_url(data["url"], data) + data["extension"] = "mp4" + + yield data + params["page"] += 1 + + +class HotleakCategoryExtractor(HotleakExtractor): + """Extractor for hotleak categories""" + subcategory = "category" + pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?" 
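
The `_pagination` method above retries the current page after an HTTP 429 instead of aborting, sleeping until the time given in the `X-RateLimit-Reset` header. The same idea as a standalone sketch (assuming the header carries a Unix timestamp, as the `wait(until=...)` call suggests; `requests` is used here purely for illustration):

    import time
    import requests

    def get_with_ratelimit(url, **kwargs):
        """GET a URL, waiting out HTTP 429 responses before retrying."""
        while True:
            response = requests.get(url, **kwargs)
            if response.status_code != 429:
                return response
            reset = response.headers.get("X-RateLimit-Reset")
            # sleep until the advertised reset time, or 60s as a fallback
            time.sleep(max(float(reset) - time.time(), 1.0) if reset else 60.0)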
+
+
+class HotleakCategoryExtractor(HotleakExtractor):
+    """Extractor for hotleak categories"""
+    subcategory = "category"
+    pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?"
+    test = (
+        ("https://hotleak.vip/photos", {
+            "pattern": HotleakPostExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/videos"),
+        ("https://hotleak.vip/creators", {
+            "pattern": HotleakCreatorExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/hot"),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self._category, self.params = match.groups()
+
+    def items(self):
+        url = "{}/{}".format(self.root, self._category)
+
+        if self._category in ("hot", "creators"):
+            data = {"_extractor": HotleakCreatorExtractor}
+        elif self._category in ("videos", "photos"):
+            data = {"_extractor": HotleakPostExtractor}
+
+        for item in self._pagination(url, self.params):
+            yield Message.Queue, item, data
+
+
+class HotleakSearchExtractor(HotleakExtractor):
+    """Extractor for hotleak search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"
+    test = (
+        ("https://hotleak.vip/search?search=gallery-dl", {
+            "count": 0,
+        }),
+        ("https://hotleak.vip/search?search=hannah", {
+            "count": "> 30",
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.params = match.group(1)
+
+    def items(self):
+        data = {"_extractor": HotleakCreatorExtractor}
+        for creator in self._pagination(self.root + "/search", self.params):
+            yield Message.Queue, creator, data
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index d56af8b..8c98d2e 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -264,6 +264,12 @@ class InstagramExtractor(Extractor):
                 "post_id": reel_id,
                 "post_shortcode": shortcode_from_id(reel_id),
             }
+
+            if "title" in post:
+                data["highlight_title"] = post["title"]
+            if "created_at" in post:
+                data["date"] = text.parse_timestamp(post.get("created_at"))
+
         else:
             data = {
                 "post_id" : post["pk"],
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 0a6a6d3..56e3b39 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -49,7 +49,8 @@ class PahealExtractor(Extractor):
             "id"      : post_id,
             "tags"    : extr(": ", "<"),
             "md5"     : extr("/_thumbs/", "/"),
-            "file_url": extr("id='main_image' src='", "'"),
+            "file_url": (extr("id='main_image' src='", "'") or
+                         extr("", ">").split(" // ")
-        post["width"], _, post["height"] = dimensions.partition("x")
+        post["width"], _, height = dimensions.partition("x")
         post["size"] = text.parse_bytes(size[:-1])
+        post["height"], _, duration = height.partition(", ")
+        post["duration"] = text.parse_float(duration[:-1])
 
         return post
@@ -111,10 +114,12 @@ class PahealTagExtractor(PahealExtractor):
         tags, data, date = data.split("\n")
         dimensions, size, ext = data.split(" // ")
         width, _, height = dimensions.partition("x")
+        height, _, duration = height.partition(", ")
 
         return {
             "id": pid, "md5": md5, "file_url": url,
             "width": width, "height": height,
+            "duration": text.parse_float(duration[:-1]),
             "tags": text.unescape(tags),
             "size": text.parse_bytes(size[:-1]),
             "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
@@ -163,6 +168,27 @@ class PahealPostExtractor(PahealExtractor):
                 "width": 1200,
             },
         }),
+        # video
+        ("https://rule34.paheal.net/post/view/3864982", {
+            "pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d"
+                       r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_"
+                       r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm",
+            "keyword": {
+                "date": "dt:2020-09-06 01:59:03",
+                "duration": 30.0,
+                "extension": "webm",
+                "height": 2500,
+                "id": 3864982,
+                "md5": "7629fc0ff77e32637dde5bf4f992b2cb",
+                "size": 18454938,
+                "source": "https://twitter.com/VG_Worklog"
+                          "/status/1302407696294055936",
+                "tags": "Metal_Gear Metal_Gear_Solid_V Quiet "
+                        "Vg_erotica animated webm",
+                "uploader": "justausername",
+                "width": 1768,
+            },
+        }),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index 8203885..4283081 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -42,6 +42,7 @@ class PoipikuExtractor(Extractor):
             '', '")[2]),
             "description": text.unescape(extr(
                 'class="IllustItemDesc" >', '<')),
+            "_http_headers": {"Referer": post_url},
         }
         yield Message.Directory, post
@@ -54,7 +55,8 @@ class PoipikuExtractor(Extractor):
             elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):
                 continue
             post["num"] += 1
-            url = text.ensure_http_scheme(thumb[:-8])
+            url = text.ensure_http_scheme(thumb[:-8]).replace(
+                "//img.", "//img-org.", 1)
             yield Message.Url, url, text.nameext_from_url(url, post)
 
         if not extr('> show all', '<'):
@@ -80,7 +82,8 @@ class PoipikuExtractor(Extractor):
         for thumb in text.extract_iter(
                 page, 'class="IllustItemThumbImg" src="', '"'):
             post["num"] += 1
-            url = text.ensure_http_scheme(thumb[:-8])
+            url = text.ensure_http_scheme(thumb[:-8]).replace(
+                "//img.", "//img-org.", 1)
             yield Message.Url, url, text.nameext_from_url(url, post)
@@ -91,7 +94,7 @@ class PoipikuUserExtractor(PoipikuExtractor):
                r"(\d+)/?(?:$|[?&#])")
     test = (
         ("https://poipiku.com/25049/", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                        r"/\d+_\w+\.(jpe?g|png)$",
             "range": "1-10",
             "count": 10,
@@ -131,7 +134,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
     pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
     test = (
         ("https://poipiku.com/25049/5864576.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                        r"/005864576_EWN1Y65gQ\.png$",
             "keyword": {
                 "count": "1",
             },
         }),
@@ -146,7 +149,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
         ("https://poipiku.com/2166245/6411749.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/002166245"
                        r"/006411749_\w+\.jpeg$",
             "count": 4,
             "keyword": {
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index d35e24e..954a84f 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -46,10 +46,10 @@ class RedditExtractor(Extractor):
                     submission["created_utc"])
                 yield Message.Directory, submission
                 visited.add(submission["id"])
-                url = submission["url"]
                 submission["num"] = 0
 
-                if url.startswith("https://i.redd.it/"):
+                url = submission["url"]
+                if url and url.startswith("https://i.redd.it/"):
                     text.nameext_from_url(url, submission)
                     yield Message.Url, url, submission
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 2c3ed44..3a4fb0e 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -53,6 +53,7 @@ class RedgifsExtractor(Extractor):
             for fmt in self.formats:
                 url = urls.get(fmt)
                 if url:
+                    url = url.replace("//thumbs2.", "//thumbs3.", 1)
                     text.nameext_from_url(url, gif)
                     yield url
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 4010da3..2264fe4 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -209,9 +209,9 @@ class SmugmugPathExtractor(SmugmugExtractor):
 class SmugmugAPI(oauth.OAuth1API):
     """Minimal interface for the smugmug API v2"""
     API_DOMAIN = "api.smugmug.com"
-    API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK"
-    API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S"
-                  "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq")
+    API_KEY = "RCVHDGjcbc4Fhzq4qzqLdZmvwmwB6LM2"
+    API_SECRET = ("jGrdndvJqhTx8XSNs7TFTSSthhZHq92d"
+                  "dMpbpDpkDVNM7TDgnvLFMtfB5Mg5kH73")
     HEADERS = {"Accept": "application/json"}
 
     def album(self, album_id, expands=None):
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index b694fa0..6f53881 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -14,25 +14,6 @@ from datetime import datetime, timedelta
 import re
 
 
-def _original_inline_image(url):
-    return re.sub(
-        (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
-         r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
-        r"https://\1_1280.\2", url
-    )
-
-
-def _original_video(url):
-    return re.sub(
-        (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
-         r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
-        r"https://\1.\2", url
-    )
-
-
-POST_TYPES = frozenset((
-    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
-
 BASE_PATTERN = (
     r"(?:tumblr:(?:https?://)?([^/]+)|"
     r"(?:https?://)?"
@@ -40,6 +21,9 @@ BASE_PATTERN = (
     r"([\w-]+\.tumblr\.com)))"
 )
 
+POST_TYPES = frozenset((
+    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
 
 class TumblrExtractor(Extractor):
     """Base class for tumblr extractors"""
@@ -79,6 +63,18 @@ class TumblrExtractor(Extractor):
     def items(self):
         blog = None
 
+        # pre-compile regular expressions
+        self._sub_video = re.compile(
+            r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+            r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
+        if self.inline:
+            self._sub_image = re.compile(
+                r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+                r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
+            self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
+            _findall_image = re.compile('').findall
+
         for post in self.posts():
             if self.date_min > post["timestamp"]:
                 return
@@ -120,7 +116,7 @@ class TumblrExtractor(Extractor):
             if self.original and "/s2048x3072/" in photo["url"] and (
                     photo["width"] == 2048 or photo["height"] == 3072):
-                photo["url"] = self._original_image(photo["url"])
+                photo["url"] = self._original_photo(photo["url"])
                 del photo["original_size"]
                 del photo["alt_sizes"]
@@ -134,17 +130,18 @@ class TumblrExtractor(Extractor):
             url = post.get("video_url")  # type "video"
             if url:
-                posts.append(self._prepare(_original_video(url), post.copy()))
+                posts.append(self._prepare(
+                    self._original_video(url), post.copy()))
 
             if self.inline and "reblog" in post:  # inline media
                 # only "chat" posts are missing a "reblog" key in their
                 # API response, but they can't contain images/videos anyway
                 body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
-                for url in re.findall('