27 files changed, 632 insertions, 129 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 61987d9..4f4fdf9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,30 @@
 # Changelog
 
+## 1.23.1 - 2022-09-18
+### Additions
+- [flickr] add support for `secure.flickr.com` URLs ([#2910](https://github.com/mikf/gallery-dl/issues/2910))
+- [hotleak] add hotleak extractors ([#2890](https://github.com/mikf/gallery-dl/issues/2890), [#2909](https://github.com/mikf/gallery-dl/issues/2909))
+- [instagram] add `highlight_title` and `date` metadata for highlight downloads ([#2879](https://github.com/mikf/gallery-dl/issues/2879))
+- [paheal] add support for videos ([#2892](https://github.com/mikf/gallery-dl/issues/2892))
+- [twitter] add general support for unified cards ([#2875](https://github.com/mikf/gallery-dl/issues/2875))
+- [twitter] implement `cards-blacklist` option ([#2875](https://github.com/mikf/gallery-dl/issues/2875))
+- [tumblr] fetch high-quality inline images ([#2877](https://github.com/mikf/gallery-dl/issues/2877))
+- [tumblr] implement `ratelimit` option ([#2919](https://github.com/mikf/gallery-dl/issues/2919))
+- [zerochan] add `metadata` option ([#2861](https://github.com/mikf/gallery-dl/issues/2861))
+- [postprocessor:zip] implement `files` option ([#2872](https://github.com/mikf/gallery-dl/issues/2872))
+### Fixes
+- [bunkr] fix extraction ([#2903](https://github.com/mikf/gallery-dl/issues/2903))
+- [bunkr] use `media-files` servers for `m4v` and `mov` downloads ([#2925](https://github.com/mikf/gallery-dl/issues/2925))
+- [exhentai] improve 509.gif detection ([#2901](https://github.com/mikf/gallery-dl/issues/2901))
+- [exhentai] guess extension for original files ([#2842](https://github.com/mikf/gallery-dl/issues/2842))
+- [poipiku] use `img-org.poipiku.com` as image domain ([#2796](https://github.com/mikf/gallery-dl/issues/2796))
+- [reddit] prevent exception with empty submission URLs ([#2913](https://github.com/mikf/gallery-dl/issues/2913))
+- [redgifs] fix download URLs ([#2884](https://github.com/mikf/gallery-dl/issues/2884))
+- [smugmug] update default API credentials ([#2881](https://github.com/mikf/gallery-dl/issues/2881))
+- [twitter] provide proper `date` for syndication results ([#2920](https://github.com/mikf/gallery-dl/issues/2920))
+- [twitter] fix new-style `/card_img/` URLs
+- remove all whitespace before comments after input file URLs ([#2808](https://github.com/mikf/gallery-dl/issues/2808))
+
 ## 1.23.0 - 2022-08-28
 ### Changes
 - [twitter] update `user` and `author` metadata fields
diff --git a/PKG-INFO b/PKG-INFO
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.23.0
+Version: 1.23.1
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__
 
 | Executables built from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -166,11 +166,11 @@ Get the direct URL of an image from a site supporting authentication with userna
 
     gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256"
 
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
 
 .. code:: bash
 
-    gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+    gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
 
 | Search a remote resource for URLs and download images from them:
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -66,8 +66,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__
 
 | Executables built from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -133,11 +133,11 @@ Get the direct URL of an image from a site supporting authentication with userna
 
     gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256"
 
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
 
 .. code:: bash
 
-    gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+    gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
 
 | Search a remote resource for URLs and download images from them:
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index d4efeed..e76a380 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2022-08-28" "1.23.0" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2022-09-18" "1.23.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 642cb78..f465d84 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2022-08-28" "1.23.0" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2022-09-18" "1.23.1" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -2575,13 +2575,30 @@ Search posts for inline images and videos.
 \f[I]true\f[]
 
 .IP "Description:" 4
-Download full-resolution \f[I]photo\f[] images.
+Download full-resolution \f[I]photo\f[] and \f[I]inline\f[] images.
 
 For each photo with "maximum" resolution
-(width equal to 2048 or height equal to 3072),
+(width equal to 2048 or height equal to 3072)
+or each inline image,
 use an extra HTTP request to find the URL to its full-resolution version.
 
+.SS extractor.tumblr.ratelimit
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"abort"\f[]
+
+.IP "Description:" 4
+Selects how to handle exceeding the daily API rate limit.
+
+.br
+* \f[I]"abort"\f[]: Raise an error and stop extraction
+.br
+* \f[I]"wait"\f[]: Wait until the rate limit resets
+
+
 .SS extractor.tumblr.reblogs
 .IP "Type:" 6
 \f[I]bool\f[] or \f[I]string\f[]
@@ -2664,6 +2681,26 @@ Controls how to handle \f[I]Twitter Cards\f[].
 * \f[I]"ytdl"\f[]: Additionally download video content from unsupported cards using \f[I]youtube-dl\f[]
 
+.SS extractor.twitter.cards-blacklist
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Example:" 4
+["summary", "youtube.com", "player:twitch.tv"]
+
+.IP "Description:" 4
+List of card types to ignore.
+
+Possible values are
+
+.br
+* card names
+.br
+* card domains
+.br
+* \f[I]<card name>:<card domain>\f[]
+
+
 .SS extractor.twitter.conversations
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -2672,8 +2709,11 @@ Controls how to handle \f[I]Twitter Cards\f[].
 \f[I]false\f[]
 
 .IP "Description:" 4
-Fetch media from all Tweets and replies in a \f[I]conversation
-<https://help.twitter.com/en/using-twitter/twitter-conversations>\f[].
+For input URLs pointing to a single Tweet,
+e.g. https://twitter.com/i/web/status/<TweetID>,
+fetch media from all Tweets and replies in this \f[I]conversation
+<https://help.twitter.com/en/using-twitter/twitter-conversations>\f[]
+or thread.
 
 .SS extractor.twitter.csrf
@@ -2692,6 +2732,25 @@ Controls how to handle Cross Site Request Forgery (CSRF) tokens.
 * \f[I]"cookies"\f[]: Use token given by the \f[I]ct0\f[] cookie if present.
 
+.SS extractor.twitter.expand
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+For each Tweet, return *all* Tweets from that initial Tweet's
+conversation or thread, i.e. *expand* all Twitter threads.
+
+Going through a timeline with this option enabled is essentially the same
+as running \f[I]gallery-dl https://twitter.com/i/web/status/<TweetID>\f[]
+with the \f[I]conversations\f[] option enabled
+for each Tweet in said timeline.
+
+Note: This requires at least 1 additional API call per initial Tweet.
+
+
 .SS extractor.twitter.size
 .IP "Type:" 6
 \f[I]list\f[] of \f[I]strings\f[]
@@ -3140,6 +3199,19 @@ Additional options specified as youtube-dl command-line arguments.
 Location of a youtube-dl configuration file to load options from.
 
+.SS extractor.zerochan.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract additional metadata (date, md5, tags, ...)
+
+Note: This requires 1-2 additional HTTP requests for each post.
+
+
 .SS extractor.[booru].tags
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -4225,6 +4297,20 @@ to prevent it from only being displayed for a very short amount of time.
 Filename extension for the created ZIP archive.
 
+.SS zip.files
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]Path\f[]
+
+.IP "Example:" 4
+["info.json"]
+
+.IP "Description:" 4
+List of extra files to be added to a ZIP archive.
+
+Note: Relative paths are relative to the current
+\f[I]download directory\f[].
+
+
 .SS zip.keep-files
 .IP "Type:" 6
 \f[I]bool\f[]
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1e485ee..6ba50f2 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -341,7 +341,8 @@
     "zerochan":
     {
         "username": null,
-        "password": null
+        "password": null,
+        "metadata": false
     },
     "booru":
     {
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 6b9d68b..ea2164a 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.23.0
+Version: 1.23.1
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -99,8 +99,8 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.exe>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.1/gallery-dl.bin>`__
 
 | Executables built from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
@@ -166,11 +166,11 @@ Get the direct URL of an image from a site supporting authentication with userna
 
     gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256"
 
-Filter manga chapters by language and chapter number:
+Filter manga chapters by chapter number and language:
 
 .. code:: bash
 
-    gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/"
+    gallery-dl --chapter-filter "10 <= chapter < 20" -o "lang=fr" "https://mangadex.org/title/59793dd0-a2d8-41a2-9758-8197287a8539"
 
 | Search a remote resource for URLs and download images from them:
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 5f5084b..73cc80b 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -92,6 +92,7 @@ gallery_dl/extractor/hentaihand.py
 gallery_dl/extractor/hentaihere.py
 gallery_dl/extractor/hiperdex.py
 gallery_dl/extractor/hitomi.py
+gallery_dl/extractor/hotleak.py
 gallery_dl/extractor/idolcomplex.py
 gallery_dl/extractor/imagebam.py
 gallery_dl/extractor/imagechest.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 329e7ab..7504fa4 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -96,9 +96,9 @@ def parse_inputfile(file, log):
 
         else:
             # url
             if " #" in line:
-                line = line.partition(" #")[0]
+                line = line.partition(" #")[0].rstrip()
             elif "\t#" in line:
-                line = line.partition("\t#")[0]
+                line = line.partition("\t#")[0].rstrip()
             if gconf or lconf:
                 yield util.ExtendedUrl(line, gconf, lconf)
                 gconf = []
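The `.rstrip()` added above matters because an input-file line like `<URL>   # comment` would otherwise keep the spaces in front of `#` as part of the URL and fail to match any extractor. A minimal standalone sketch of the stripping rule (`strip_url_comment` is a hypothetical helper, not gallery-dl's actual API):

.. code:: python

    def strip_url_comment(line):
        # cut an inline " #" or "\t#" comment, then drop the
        # whitespace that separated it from the URL
        if " #" in line:
            line = line.partition(" #")[0].rstrip()
        elif "\t#" in line:
            line = line.partition("\t#")[0].rstrip()
        return line

    print(strip_url_comment("https://example.org/gallery   # saved later"))
    # -> 'https://example.org/gallery'
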
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9e4507a..fed6998 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -55,6 +55,7 @@ modules = [
     "hentaihere",
     "hiperdex",
     "hitomi",
+    "hotleak",
     "idolcomplex",
     "imagebam",
     "imagechest",
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 3091f57..2502411 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -37,6 +37,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
             "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
         }),
+        # cdn4
+        ("https://bunkr.is/a/iXTTc1o2", {
+            "pattern": r"https://(cdn|media-files)4\.bunkr\.is/",
+            "content": "da29aae371b7adc8c5ef8e6991b66b69823791e8",
+        }),
         ("https://bunkr.to/a/Lktg9Keq"),
     )
 
@@ -66,9 +71,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             data = json.loads(text.extract(
                 self.request(url).text,
                 'id="__NEXT_DATA__" type="application/json">', '<')[0])
-            props = data["props"]["pageProps"]
-            album = props["album"]
-            files = props["files"]
+            album = data["props"]["pageProps"]["album"]
+            files = album["files"]
         except Exception as exc:
             self.log.debug(exc.__class__.__name__, exc)
             self.root = self.root.replace("bunkr", "app.bunkr", 1)
@@ -77,7 +81,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         for file in files:
             name = file["name"]
             cdn = file["cdn"]
-            if name.endswith(".mp4"):
+            if name.endswith((".mp4", ".m4v", ".mov")):
                 cdn = cdn.replace("//cdn", "//media-files")
             file["file"] = cdn + "/" + name
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 1b41101..f7ee51f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -152,7 +152,8 @@ class Extractor():
             server = response.headers.get("Server")
             if server and server.startswith("cloudflare"):
                 if code == 503 and \
-                        b"jschl-answer" in response.content:
+                        (b"_cf_chl_opt" in response.content or
+                         b"jschl-answer" in response.content):
                     self.log.warning("Cloudflare IUAM challenge")
                     break
                 if code == 403 and \
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 2720691..01ba03a 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -219,7 +219,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             if self.limits:
                 self._check_limits(data)
             if "/fullimg.php" in url:
-                data["extension"] = ""
                 data["_http_validate"] = _validate_response
             else:
                 data["_http_validate"] = None
@@ -328,8 +327,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         data["image_token"] = self.key["start"] = extr('var startkey="', '";')
         self.key["show"] = extr('var showkey="', '";')
 
-        if iurl.endswith("g/509.gif"):
-            self._report_limits(data)
+        self._check_509(iurl, data)
         return url, text.nameext_from_url(iurl, data)
 
     def images_from_api(self):
@@ -365,8 +363,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             data["num"] = request["page"]
             data["image_token"] = imgkey
 
-            if imgurl.endswith("g/509.gif"):
-                self._report_limits(data)
+            self._check_509(imgurl, data)
             yield url, text.nameext_from_url(imgurl, data)
 
             request["imgkey"] = nextkey
@@ -385,6 +382,15 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         if self._remaining <= 0:
             self._report_limits(data)
 
+    def _check_509(self, url, data):
+        # full 509.gif URLs
+        # - https://exhentai.org/img/509.gif
+        # - https://ehgt.org/g/509.gif
+        if url.endswith(("hentai.org/img/509.gif",
+                         "ehgt.org/g/509.gif")):
+            self.log.debug(url)
+            self._report_limits(data)
+
     def _update_limits(self):
         url = "https://e-hentai.org/home.php"
         cookies = {
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 2bd8c6b..e85d68a 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2020 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -11,6 +11,8 @@
 from .common import Extractor, Message
 from .. import text, oauth, util, exception
 
+BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
+
 
 class FlickrExtractor(Extractor):
     """Base class for flickr extractors"""
@@ -55,7 +57,7 @@ class FlickrImageExtractor(FlickrExtractor):
     """Extractor for individual images from flickr.com"""
     subcategory = "image"
     pattern = (r"(?:https?://)?(?:"
-               r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
+               r"(?:(?:www\.|secure\.|m\.)?flickr\.com/photos/[^/?#]+/"
               r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
               r"|flic\.kr/p/([A-Za-z1-9]+))")
     test = (
@@ -77,6 +79,10 @@ class FlickrImageExtractor(FlickrExtractor):
                 "width": 1024,
             },
         }),
+        ("https://secure.flickr.com/photos/departingyyz/16089302239"),
+        ("https://m.flickr.com/photos/departingyyz/16089302239"),
+        ("https://flickr.com/photos/departingyyz/16089302239"),
+
         ("https://www.flickr.com/photos/145617051@N08/46733161535", {
             "count": 1,
             "keyword": {"media": "video"},
@@ -132,8 +138,7 @@ class FlickrAlbumExtractor(FlickrExtractor):
     directory_fmt = ("{category}", "{user[username]}", "Albums",
                      "{album[id]} {album[title]}")
     archive_fmt = "a_{album[id]}_{id}"
-    pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
-               r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?")
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?"
     test = (
         (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), {
             "pattern": FlickrImageExtractor.pattern,
             "count": 6,
         }),
         (("https://www.flickr.com/photos/shona_s/albums"), {
             "pattern": pattern,
             "count": 2,
         }),
+        ("https://secure.flickr.com/photos/shona_s/albums"),
+        ("https://m.flickr.com/photos/shona_s/albums"),
     )
 
     def __init__(self, match):
@@ -180,8 +187,7 @@ class FlickrGalleryExtractor(FlickrExtractor):
     directory_fmt = ("{category}", "{user[username]}", "Galleries",
                      "{gallery[gallery_id]} {gallery[title]}")
     archive_fmt = "g_{gallery[id]}_{id}"
-    pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/"
-               r"photos/([^/]+)/galleries/(\d+)")
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)"
     test = (("https://www.flickr.com/photos/flickr/"
              "galleries/72157681572514792/"), {
         "pattern": FlickrImageExtractor.pattern,
@@ -206,7 +212,7 @@ class FlickrGroupExtractor(FlickrExtractor):
     subcategory = "group"
     directory_fmt = ("{category}", "Groups", "{group[groupname]}")
     archive_fmt = "G_{group[nsid]}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)"
+    pattern = BASE_PATTERN + r"/groups/([^/?#]+)"
     test = ("https://www.flickr.com/groups/bird_headshots/", {
         "pattern": FlickrImageExtractor.pattern,
         "count": "> 150",
@@ -224,7 +230,7 @@ class FlickrUserExtractor(FlickrExtractor):
     """Extractor for the photostream of a flickr user"""
     subcategory = "user"
     archive_fmt = "u_{user[nsid]}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$"
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/?$"
     test = ("https://www.flickr.com/photos/shona_s/", {
         "pattern": FlickrImageExtractor.pattern,
         "count": 28,
@@ -239,7 +245,7 @@ class FlickrFavoriteExtractor(FlickrExtractor):
     subcategory = "favorite"
     directory_fmt = ("{category}", "{user[username]}", "Favorites")
     archive_fmt = "f_{user[nsid]}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites"
+    pattern = BASE_PATTERN + r"/photos/([^/?#]+)/favorites"
     test = ("https://www.flickr.com/photos/shona_s/favorites", {
         "pattern": FlickrImageExtractor.pattern,
         "count": 4,
@@ -254,7 +260,7 @@ class FlickrSearchExtractor(FlickrExtractor):
     subcategory = "search"
     directory_fmt = ("{category}", "Search", "{search[text]}")
     archive_fmt = "s_{search}_{id}"
-    pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)"
+    pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
     test = (
         ("https://flickr.com/search/?text=mountain"),
         ("https://flickr.com/search/?text=tree%20cloud%20house"
@@ -275,7 +281,11 @@ class FlickrSearchExtractor(FlickrExtractor):
 
 
 class FlickrAPI(oauth.OAuth1API):
-    """Minimal interface for the flickr API"""
+    """Minimal interface for the flickr API
+
+    https://www.flickr.com/services/api/
+    """
+
     API_URL = "https://api.flickr.com/services/rest/"
     API_KEY = "ac4fd7aa98585b9eee1ba761c209de68"
     API_SECRET = "3adb0f568dc68393"
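With the host part factored into `BASE_PATTERN`, every Flickr extractor now accepts `www.`, `secure.`, and `m.` subdomains from one place. A quick self-contained check of how the composed album pattern behaves (patterns copied from the diff; the comments show what `re` returns):

.. code:: python

    import re

    BASE_PATTERN = r"(?:https?://)?(?:www\.|secure\.|m\.)?flickr\.com"
    album = re.compile(
        BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?")

    m = album.match("https://secure.flickr.com/photos/shona_s/albums")
    print(m.group(1), m.group(2))   # shona_s None

    m = album.match(
        "https://www.flickr.com/photos/shona_s/albums/72157633471741607")
    print(m.group(1), m.group(2))   # shona_s 72157633471741607
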
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
new file mode 100644
index 0000000..d6575cf
--- /dev/null
+++ b/gallery_dl/extractor/hotleak.py
@@ -0,0 +1,228 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hotleak.vip/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"
+
+
+class HotleakExtractor(Extractor):
+    """Base class for hotleak extractors"""
+    category = "hotleak"
+    directory_fmt = ("{category}", "{creator}",)
+    filename_fmt = "{creator}_{id}.{extension}"
+    archive_fmt = "{type}_{creator}_{id}"
+    root = "https://hotleak.vip"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.session.headers["Referer"] = self.root
+
+    def items(self):
+        for post in self.posts():
+            yield Message.Directory, post
+            yield Message.Url, post["url"], post
+
+    def posts(self):
+        """Return an iterable containing relevant posts"""
+        return ()
+
+    def _pagination(self, url, params):
+        params = text.parse_query(params)
+        params["page"] = text.parse_int(params.get("page"), 1)
+
+        while True:
+            page = self.request(url, params=params).text
+            if "</article>" not in page:
+                return
+
+            for item in text.extract_iter(
+                    page, '<article class="movie-item', '</article>'):
+                yield text.extract(item, '<a href="', '"')[0]
+
+            params["page"] += 1
+
+
+class HotleakPostExtractor(HotleakExtractor):
+    """Extractor for individual posts on hotleak"""
+    subcategory = "post"
+    pattern = (BASE_PATTERN + r"/(?!hot|creators|videos|photos)"
+               r"([^/]+)/(photo|video)/(\d+)")
+    test = (
+        ("https://hotleak.vip/kaiyakawaii/photo/1617145", {
+            "pattern": r"https://hotleak\.vip/storage/images/3625"
+                       r"/1617145/fefdd5988dfcf6b98cc9e11616018868\.jpg",
+            "keyword": {
+                "id": 1617145,
+                "creator": "kaiyakawaii",
+                "type": "photo",
+                "filename": "fefdd5988dfcf6b98cc9e11616018868",
+                "extension": "jpg",
+            },
+        }),
+        ("https://hotleak.vip/lilmochidoll/video/1625538", {
+            "pattern": r"ytdl:https://cdn8-leak\.camhdxx\.com"
+                       r"/1661/1625538/index\.m3u8",
+            "keyword": {
+                "id": 1625538,
+                "creator": "lilmochidoll",
+                "type": "video",
+                "filename": "index",
+                "extension": "mp4",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.creator, self.type, self.id = match.groups()
+
+    def posts(self):
+        url = "{}/{}/{}/{}".format(
+            self.root, self.creator, self.type, self.id)
+        page = self.request(url).text
+        page = text.extract(
+            page, '<div class="movie-image thumb">', '</article>')[0]
+        data = {
+            "id"     : text.parse_int(self.id),
+            "creator": self.creator,
+            "type"   : self.type,
+        }
+
+        if self.type == "photo":
+            data["url"] = text.extract(page, 'data-src="', '"')[0]
+            text.nameext_from_url(data["url"], data)
+
+        elif self.type == "video":
+            data["url"] = "ytdl:" + text.extract(
+                text.unescape(page), '"src":"', '"')[0]
+            text.nameext_from_url(data["url"], data)
+            data["extension"] = "mp4"
+
+        return (data,)
+
+
+class HotleakCreatorExtractor(HotleakExtractor):
+    """Extractor for all posts from a hotleak creator"""
+    subcategory = "creator"
+    pattern = BASE_PATTERN + r"/(?!hot|creators|videos|photos)([^/?#]+)/?$"
+    test = (
+        ("https://hotleak.vip/kaiyakawaii", {
+            "range": "1-200",
+            "count": 200,
+        }),
+        ("https://hotleak.vip/stellaviolet", {
+            "count": "> 600"
+        }),
+        ("https://hotleak.vip/doesnotexist", {
+            "exception": exception.NotFoundError,
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.creator = match.group(1)
+
+    def posts(self):
+        url = "{}/{}".format(self.root, self.creator)
+        return self._pagination(url)
+
+    def _pagination(self, url):
+        headers = {"X-Requested-With": "XMLHttpRequest"}
+        params = {"page": 1}
+
+        while True:
+            try:
+                response = self.request(
+                    url, headers=headers, params=params, notfound="creator")
+            except exception.HttpError as exc:
+                if exc.response.status_code == 429:
+                    self.wait(
+                        until=exc.response.headers.get("X-RateLimit-Reset"))
+                    continue
+                raise
+
+            posts = response.json()
+            if not posts:
+                return
+
+            data = {"creator": self.creator}
+            for post in posts:
+                data["id"] = text.parse_int(post["id"])
+
+                if post["type"] == 0:
+                    data["type"] = "photo"
+                    data["url"] = self.root + "/storage/" + post["image"]
+                    text.nameext_from_url(data["url"], data)
+
+                elif post["type"] == 1:
+                    data["type"] = "video"
+                    data["url"] = "ytdl:" + post["stream_url_play"]
+                    text.nameext_from_url(data["url"], data)
+                    data["extension"] = "mp4"
+
+                yield data
+            params["page"] += 1
+
+
+class HotleakCategoryExtractor(HotleakExtractor):
+    """Extractor for hotleak categories"""
+    subcategory = "category"
+    pattern = BASE_PATTERN + r"/(hot|creators|videos|photos)(?:/?\?([^#]+))?"
+    test = (
+        ("https://hotleak.vip/photos", {
+            "pattern": HotleakPostExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/videos"),
+        ("https://hotleak.vip/creators", {
+            "pattern": HotleakCreatorExtractor.pattern,
+            "range": "1-50",
+            "count": 50,
+        }),
+        ("https://hotleak.vip/hot"),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self._category, self.params = match.groups()
+
+    def items(self):
+        url = "{}/{}".format(self.root, self._category)
+
+        if self._category in ("hot", "creators"):
+            data = {"_extractor": HotleakCreatorExtractor}
+        elif self._category in ("videos", "photos"):
+            data = {"_extractor": HotleakPostExtractor}
+
+        for item in self._pagination(url, self.params):
+            yield Message.Queue, item, data
+
+
+class HotleakSearchExtractor(HotleakExtractor):
+    """Extractor for hotleak search results"""
+    subcategory = "search"
+    pattern = BASE_PATTERN + r"/search(?:/?\?([^#]+))"
+    test = (
+        ("https://hotleak.vip/search?search=gallery-dl", {
+            "count": 0,
+        }),
+        ("https://hotleak.vip/search?search=hannah", {
+            "count": "> 30",
+        }),
+    )
+
+    def __init__(self, match):
+        HotleakExtractor.__init__(self, match)
+        self.params = match.group(1)
+
+    def items(self):
+        data = {"_extractor": HotleakCreatorExtractor}
+        for creator in self._pagination(self.root + "/search", self.params):
+            yield Message.Queue, creator, data
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index d56af8b..8c98d2e 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -264,6 +264,12 @@ class InstagramExtractor(Extractor):
                 "post_id"       : reel_id,
                 "post_shortcode": shortcode_from_id(reel_id),
             }
+
+            if "title" in post:
+                data["highlight_title"] = post["title"]
+            if "created_at" in post:
+                data["date"] = text.parse_timestamp(post.get("created_at"))
+
         else:
             data = {
                 "post_id" : post["pk"],
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 0a6a6d3..56e3b39 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -49,7 +49,8 @@ class PahealExtractor(Extractor):
             "id"      : post_id,
             "tags"    : extr(": ", "<"),
             "md5"     : extr("/_thumbs/", "/"),
-            "file_url": extr("id='main_image' src='", "'"),
+            "file_url": (extr("id='main_image' src='", "'") or
+                         extr("<source src='", "'")),
             "uploader": text.unquote(extr(
                 "class='username' href='/user/", "'")),
             "date"    : text.parse_datetime(
@@ -59,8 +60,10 @@
         }
 
         dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
-        post["width"], _, post["height"] = dimensions.partition("x")
+        post["width"], _, height = dimensions.partition("x")
         post["size"] = text.parse_bytes(size[:-1])
+        post["height"], _, duration = height.partition(", ")
+        post["duration"] = text.parse_float(duration[:-1])
 
         return post
 
@@ -111,10 +114,12 @@ class PahealTagExtractor(PahealExtractor):
         tags, data, date = data.split("\n")
         dimensions, size, ext = data.split(" // ")
         width, _, height = dimensions.partition("x")
+        height, _, duration = height.partition(", ")
 
         return {
             "id": pid, "md5": md5, "file_url": url,
             "width": width, "height": height,
+            "duration": text.parse_float(duration[:-1]),
             "tags": text.unescape(tags),
             "size": text.parse_bytes(size[:-1]),
             "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
@@ -163,6 +168,27 @@ class PahealPostExtractor(PahealExtractor):
                 "width": 1200,
             },
         }),
+        # video
+        ("https://rule34.paheal.net/post/view/3864982", {
+            "pattern": r"https://[\w]+\.paheal\.net/_images/7629fc0ff77e32637d"
+                       r"de5bf4f992b2cb/3864982%20-%20Metal_Gear%20Metal_Gear_"
+                       r"Solid_V%20Quiet%20Vg_erotica%20animated%20webm\.webm",
+            "keyword": {
+                "date": "dt:2020-09-06 01:59:03",
+                "duration": 30.0,
+                "extension": "webm",
+                "height": 2500,
+                "id": 3864982,
+                "md5": "7629fc0ff77e32637dde5bf4f992b2cb",
+                "size": 18454938,
+                "source": "https://twitter.com/VG_Worklog"
+                          "/status/1302407696294055936",
+                "tags": "Metal_Gear Metal_Gear_Solid_V Quiet "
+                        "Vg_erotica animated webm",
+                "uploader": "justausername",
+                "width": 1768,
+            },
+        }),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index 8203885..4283081 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -42,6 +42,7 @@ class PoipikuExtractor(Extractor):
                 '<h2 class="UserInfoUserName">', '</').rpartition(">")[2]),
             "description": text.unescape(extr(
                 'class="IllustItemDesc" >', '<')),
+            "_http_headers": {"Referer": post_url},
         }
 
         yield Message.Directory, post
@@ -54,7 +55,8 @@
             elif thumb.startswith(("//img.poipiku.com/img/", "/img/")):
                 continue
             post["num"] += 1
-            url = text.ensure_http_scheme(thumb[:-8])
+            url = text.ensure_http_scheme(thumb[:-8]).replace(
+                "//img.", "//img-org.", 1)
             yield Message.Url, url, text.nameext_from_url(url, post)
 
         if not extr('> show all', '<'):
@@ -80,7 +82,8 @@
         for thumb in text.extract_iter(
                 page, 'class="IllustItemThumbImg" src="', '"'):
             post["num"] += 1
-            url = text.ensure_http_scheme(thumb[:-8])
+            url = text.ensure_http_scheme(thumb[:-8]).replace(
+                "//img.", "//img-org.", 1)
             yield Message.Url, url, text.nameext_from_url(url, post)
 
 
@@ -91,7 +94,7 @@ class PoipikuUserExtractor(PoipikuExtractor):
              r"(\d+)/?(?:$|[?&#])")
     test = (
         ("https://poipiku.com/25049/", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                        r"/\d+_\w+\.(jpe?g|png)$",
            "range": "1-10",
            "count": 10,
@@ -131,7 +134,7 @@ class PoipikuPostExtractor(PoipikuExtractor):
     pattern = BASE_PATTERN + r"/(\d+)/(\d+)"
     test = (
         ("https://poipiku.com/25049/5864576.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/000025049"
                        r"/005864576_EWN1Y65gQ\.png$",
             "keyword": {
                 "count": "1",
@@ -146,7 +149,7 @@
             },
         }),
         ("https://poipiku.com/2166245/6411749.html", {
-            "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245"
+            "pattern": r"https://img-org\.poipiku\.com/user_img\d+/002166245"
                        r"/006411749_\w+\.jpeg$",
             "count": 4,
             "keyword": {
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index d35e24e..954a84f 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -46,10 +46,10 @@ class RedditExtractor(Extractor):
                         submission["created_utc"])
                     yield Message.Directory, submission
                     visited.add(submission["id"])
-                    url = submission["url"]
                     submission["num"] = 0
 
-                    if url.startswith("https://i.redd.it/"):
+                    url = submission["url"]
+                    if url and url.startswith("https://i.redd.it/"):
                         text.nameext_from_url(url, submission)
                         yield Message.Url, url, submission
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 2c3ed44..3a4fb0e 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -53,6 +53,7 @@ class RedgifsExtractor(Extractor):
             for fmt in self.formats:
                 url = urls.get(fmt)
                 if url:
+                    url = url.replace("//thumbs2.", "//thumbs3.", 1)
                     text.nameext_from_url(url, gif)
                     yield url
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 4010da3..2264fe4 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -209,9 +209,9 @@ class SmugmugPathExtractor(SmugmugExtractor):
 class SmugmugAPI(oauth.OAuth1API):
     """Minimal interface for the smugmug API v2"""
     API_DOMAIN = "api.smugmug.com"
-    API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK"
-    API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S"
-                  "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq")
+    API_KEY = "RCVHDGjcbc4Fhzq4qzqLdZmvwmwB6LM2"
+    API_SECRET = ("jGrdndvJqhTx8XSNs7TFTSSthhZHq92d"
+                  "dMpbpDpkDVNM7TDgnvLFMtfB5Mg5kH73")
     HEADERS = {"Accept": "application/json"}
 
     def album(self, album_id, expands=None):
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index b694fa0..6f53881 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -14,25 +14,6 @@
 from datetime import datetime, timedelta
 import re
 
-
-def _original_inline_image(url):
-    return re.sub(
-        (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
-         r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
-        r"https://\1_1280.\2", url
-    )
-
-
-def _original_video(url):
-    return re.sub(
-        (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
-         r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"),
-        r"https://\1.\2", url
-    )
-
-
-POST_TYPES = frozenset((
-    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
-
 BASE_PATTERN = (
     r"(?:tumblr:(?:https?://)?([^/]+)|"
     r"(?:https?://)?"
@@ -40,6 +21,9 @@ BASE_PATTERN = (
     r"([\w-]+\.tumblr\.com)))"
 )
 
+POST_TYPES = frozenset((
+    "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+
 
 class TumblrExtractor(Extractor):
     """Base class for tumblr extractors"""
@@ -79,6 +63,18 @@ class TumblrExtractor(Extractor):
     def items(self):
         blog = None
 
+        # pre-compile regular expressions
+        self._sub_video = re.compile(
+            r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
+            r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)").sub
+        if self.inline:
+            self._sub_image = re.compile(
+                r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
+                r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)").sub
+            self._subn_orig_image = re.compile(r"/s\d+x\d+/").subn
+            _findall_image = re.compile('<img src="([^"]+)"').findall
+            _findall_video = re.compile('<source src="([^"]+)"').findall
+
         for post in self.posts():
             if self.date_min > post["timestamp"]:
                 return
@@ -120,7 +116,7 @@
                 if self.original and "/s2048x3072/" in photo["url"] and (
                         photo["width"] == 2048 or photo["height"] == 3072):
-                    photo["url"] = self._original_image(photo["url"])
+                    photo["url"] = self._original_photo(photo["url"])
                     del photo["original_size"]
                     del photo["alt_sizes"]
@@ -134,17 +130,18 @@
             url = post.get("video_url")  # type "video"
             if url:
-                posts.append(self._prepare(_original_video(url), post.copy()))
+                posts.append(self._prepare(
+                    self._original_video(url), post.copy()))
 
             if self.inline and "reblog" in post:  # inline media
                 # only "chat" posts are missing a "reblog" key in their
                 # API response, but they can't contain images/videos anyway
                 body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
-                for url in re.findall('<img src="([^"]+)"', body):
-                    url = _original_inline_image(url)
+                for url in _findall_image(body):
+                    url = self._original_inline_image(url)
                     posts.append(self._prepare_image(url, post.copy()))
-                for url in re.findall('<source src="([^"]+)"', body):
-                    url = _original_video(url)
+                for url in _findall_video(body):
+                    url = self._original_video(url)
                     posts.append(self._prepare(url, post.copy()))
 
             if self.external:  # external links
@@ -220,8 +217,21 @@
     def _skip_reblog_same_blog(self, post):
         return self.blog != post.get("reblogged_root_uuid")
 
-    def _original_image(self, url):
-        url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
+    def _original_photo(self, url):
+        return self._update_image_token(
+            url.replace("/s2048x3072/", "/s99999x99999/", 1))
+
+    def _original_inline_image(self, url):
+        if self.original:
+            url, n = self._subn_orig_image("/s99999x99999/", url, 1)
+            if n:
+                return self._update_image_token(url)
+        return self._sub_image(r"https://\1_1280.\2", url)
+
+    def _original_video(self, url):
+        return self._sub_video(r"https://\1.\2", url)
+
+    def _update_image_token(self, url):
         headers = {"Accept": "text/html,*/*;q=0.8"}
         response = self.request(url, headers=headers)
         return text.extract(response.text, '" src="', '"')[0]
@@ -305,6 +315,14 @@ class TumblrPostExtractor(TumblrExtractor):
         ("https://mikf123.tumblr.com/post/181022380064/chat-post", {
             "count": 0,
         }),
+        ("https://kichatundk.tumblr.com/post/654953419288821760", {
+            "count": 2,  # high-quality images (#1846)
+            "content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
+        }),
+        ("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
+            "count": 2,  # high-quality images (#1344)
+            "content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
+        }),
         ("https://mikf123.tumblr.com/image/689860196535762944", {
             "pattern": r"^https://\d+\.media\.tumblr\.com"
                        r"/134791621559a79793563b636b5fe2c6"
@@ -446,10 +464,8 @@ class TumblrAPI(oauth.OAuth1API):
 
         # daily rate limit
         if response.headers.get("x-ratelimit-perday-remaining") == "0":
+            self.log.info("Daily API rate limit exceeded")
             reset = response.headers.get("x-ratelimit-perday-reset")
-            t = (datetime.now() + timedelta(seconds=float(reset))).time()
-
-            self.log.error("Daily API rate limit exceeded")
 
             api_key = self.api_key or self.session.auth.consumer_key
             if api_key == self.API_KEY:
@@ -459,6 +475,11 @@
                     "ter/docs/configuration.rst#extractortumblra"
                     "pi-key--api-secret")
 
+            if self.extractor.config("ratelimit") == "wait":
+                self.extractor.wait(seconds=reset)
+                return self._call(blog, endpoint, params)
+
+            t = (datetime.now() + timedelta(seconds=float(reset))).time()
             raise exception.StopExtraction(
                 "Aborting - Rate limit will reset at %s",
                 "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second))
("https://twitter.com/bang_dream_1242/status/1561548715348746241", { + "options": (("cards", True),), + "pattern": r"https://video\.twimg\.com/amplify_video" + r"/1560607284333449216/vid/720x720/\w+\.mp4", + }), # unified_card without type ("https://twitter.com/i/web/status/1466183847628865544", { "count": 0, }), + # 'cards-blacklist' option + ("https://twitter.com/i/web/status/1571141912295243776", { + "options": (("cards", "ytdl"), + ("cards-blacklist", ("twitch.tv",))), + "count": 0, + }), # original retweets (#1026) ("https://twitter.com/jessica_3978/status/1296304589591810048", { "options": (("retweets", "original"),), @@ -776,12 +795,20 @@ class TwitterTweetExtractor(TwitterExtractor): # age-restricted (#2354) ("https://twitter.com/mightbecursed/status/1492954264909479936", { "options": (("syndication", True),), + "keywords": {"date": "dt:2022-02-13 20:10:09"}, "count": 1, }), # media alt texts / descriptions (#2617) ("https://twitter.com/my0nruri/status/1528379296041299968", { "keyword": {"description": "oc"} }), + # '?format=...&name=...'-style URLs + ("https://twitter.com/poco_dandy/status/1150646424461176832", { + "options": (("cards", True),), + "pattern": r"https://pbs.twimg.com/card_img/157\d+/\w+" + r"\?format=(jpg|png)&name=orig$", + "range": "1-2", + }), ) def __init__(self, match): @@ -1442,6 +1469,10 @@ class TwitterAPI(): else: retweet_id = None + tweet["created_at"] = text.parse_datetime( + tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime( + "%a %b %d %H:%M:%S +0000 %Y") + if "video" in tweet: video = tweet["video"] video["variants"] = (max( diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 2b5acd8..72cf438 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -11,6 +11,8 @@ from .booru import BooruExtractor from ..cache import cache from .. 
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 2b5acd8..72cf438 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -11,6 +11,8 @@
 from .booru import BooruExtractor
 from ..cache import cache
 from .. import text, exception
 
+from xml.etree import ElementTree
+
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
 
@@ -54,7 +56,7 @@ class ZerochanExtractor(BooruExtractor):
 
         return response.cookies
 
-    def _parse_entry_page(self, entry_id):
+    def _parse_entry_html(self, entry_id):
         url = "{}/{}".format(self.root, entry_id)
         extr = text.extract_from(self.request(url).text)
 
@@ -66,10 +68,26 @@
                 '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
             "width" : extr('"width": "', ' '),
             "height": extr('"height": "', ' '),
-            "size"  : extr('"contentSize": "', 'B'),
+            "size"  : text.parse_bytes(extr('"contentSize": "', 'B')),
             "path"  : text.split_html(extr(
                 'class="breadcrumbs', '</p>'))[3::2],
-            "tags"  : extr('alt="Tags: ', '"').split(", ")
+            "tags"  : extr('alt="Tags: Anime, ', '"').split(", ")
         }
+
+    def _parse_entry_xml(self, entry_id):
+        url = "{}/{}?xml".format(self.root, entry_id)
+        item = ElementTree.fromstring(self.request(url).text)[0][-1]
+        # content = item[4].attrib
+
+        return {
+            # "id"    : entry_id,
+            # "file_url": content["url"],
+            # "width" : content["width"],
+            # "height": content["height"],
+            # "size"  : content["filesize"],
+            "name"  : item[2].text,
+            "tags"  : item[5].text.lstrip().split(", "),
+            "md5"   : item[6].text,
+        }
 
@@ -105,6 +123,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
         url = self.root + "/" + self.search_tag
         params = text.parse_query(self.query)
         params["p"] = text.parse_int(params.get("p"), 1)
+        metadata = self.config("metadata")
 
         while True:
             page = self.request(url, params=params).text
@@ -115,15 +134,22 @@
                 post = extr('<li class="', '>')
                 if not post:
                     break
-                yield {
-                    "id"    : extr('href="/', '"'),
-                    "name"  : extr('alt="', '"'),
-                    "width" : extr('title="', 'x'),
-                    "height": extr('', ' '),
-                    "size"  : extr('', 'B'),
-                    "file_url": "https://static." + extr(
-                        '<a href="https://static.', '"'),
-                }
+
+                if metadata:
+                    entry_id = extr('href="/', '"')
+                    post = self._parse_entry_html(entry_id)
+                    post.update(self._parse_entry_xml(entry_id))
+                    yield post
+                else:
+                    yield {
+                        "id"    : extr('href="/', '"'),
+                        "name"  : extr('alt="', '"'),
+                        "width" : extr('title="', 'x'),
+                        "height": extr('', ' '),
+                        "size"  : extr('', 'B'),
+                        "file_url": "https://static." + extr(
+                            '<a href="https://static.', '"'),
+                    }
 
             if 'rel="next"' not in page:
                 break
@@ -153,4 +179,7 @@ class ZerochanImageExtractor(ZerochanExtractor):
         self.image_id = match.group(1)
 
     def posts(self):
-        return (self._parse_entry_page(self.image_id),)
+        post = self._parse_entry_html(self.image_id)
+        if self.config("metadata"):
+            post.update(self._parse_entry_xml(self.image_id))
+        return (post,)
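`_parse_entry_xml` above indexes positionally into the per-post `?xml` feed (`item[2]` for the name, `item[5]` for tags, `item[6]` for the md5 hash), which quietly depends on the feed's element order. The sketch below shows what that indexing extracts; the sample document is made up, and its element order is merely assumed to match what the code expects:

.. code:: python

    from xml.etree import ElementTree

    # stand-in for https://www.zerochan.net/<id>?xml (assumed layout)
    SAMPLE = """<rss><channel><language>en</language>
    <item>
      <guid>1234567</guid>
      <link>https://www.zerochan.net/1234567</link>
      <title>Example Character</title>
      <pubDate>Sun, 18 Sep 2022 00:00:00 GMT</pubDate>
      <enclosure url="https://static.zerochan.net/ex.full.1234567.jpg"/>
      <description> Example Series, solo, long hair</description>
      <hash>0123456789abcdef0123456789abcdef</hash>
    </item>
    </channel></rss>"""

    item = ElementTree.fromstring(SAMPLE)[0][-1]  # channel -> last <item>
    print(item[2].text)                           # name
    print(item[5].text.lstrip().split(", "))      # tags
    print(item[6].text)                           # md5
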
+ options.get("extension", "zip") algorithm = options.get("compression", "store") if algorithm not in self.COMPRESSION_ALGORITHMS: @@ -56,6 +57,9 @@ class ZipPP(PostProcessor): # 'NameToInfo' is not officially documented, but it's available # for all supported Python versions and using it directly is a lot # faster than calling getinfo() + if self.files: + self.write_extra(pathfmt, zfile, self.files) + self.files = None if pathfmt.filename not in zfile.NameToInfo: zfile.write(pathfmt.temppath, pathfmt.filename) pathfmt.delete = self.delete @@ -69,6 +73,21 @@ class ZipPP(PostProcessor): with self.open() as zfile: self.write(pathfmt, zfile) + def write_extra(self, pathfmt, zfile, files): + for path in map(util.expand_path, files): + if not os.path.isabs(path): + path = os.path.join(pathfmt.realdirectory, path) + try: + zfile.write(path, os.path.basename(path)) + except OSError as exc: + self.log.warning( + "Unable to write %s to %s", path, zfile.filename) + self.log.debug("%s: %s", exc, exc.__class__.__name__) + pass + else: + if self.delete: + util.remove_file(path) + def finalize(self, pathfmt, status): if self.zfile: self.zfile.close() diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d12d088..ce018fe 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.23.0" +__version__ = "1.23.1" diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index 42babd3..af8b0af 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -452,9 +452,11 @@ class ZipTest(BasePostprocessorTest): self.assertTrue(pp.args[0].endswith("/test.cbz")) def test_zip_write(self): - pp = self._create() - with tempfile.NamedTemporaryFile("w", dir=self.dir.name) as file: + pp = self._create({"files": [file.name, "_info_.json"], + "keep-files": True}) + + filename = os.path.basename(file.name) file.write("foobar\n") # write dummy file with 3 different names @@ -466,18 +468,19 @@ class ZipTest(BasePostprocessorTest): self._trigger() nti = pp.zfile.NameToInfo - self.assertEqual(len(nti), i+1) + self.assertEqual(len(nti), i+2) self.assertIn(name, nti) # check file contents - self.assertEqual(len(nti), 3) + self.assertEqual(len(nti), 4) self.assertIn("file0.ext", nti) self.assertIn("file1.ext", nti) self.assertIn("file2.ext", nti) + self.assertIn(filename, nti) # write the last file a second time (will be skipped) self._trigger() - self.assertEqual(len(pp.zfile.NameToInfo), 3) + self.assertEqual(len(pp.zfile.NameToInfo), 4) # close file self._trigger(("finalize",), 0) @@ -485,10 +488,11 @@ class ZipTest(BasePostprocessorTest): # reopen to check persistence with zipfile.ZipFile(pp.zfile.filename) as file: nti = file.NameToInfo - self.assertEqual(len(pp.zfile.NameToInfo), 3) + self.assertEqual(len(pp.zfile.NameToInfo), 4) self.assertIn("file0.ext", nti) self.assertIn("file1.ext", nti) self.assertIn("file2.ext", nti) + self.assertIn(filename, nti) os.unlink(pp.zfile.filename) |
