From f98ab7aaca3c4acbd5a793267791749740330e9c Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Tue, 25 Apr 2023 21:32:02 -0400 Subject: New upstream version 1.25.2. --- CHANGELOG.md | 27 ++++++++ PKG-INFO | 6 +- README.rst | 4 +- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 18 ++++- docs/gallery-dl-example.conf | 4 ++ docs/gallery-dl.conf | 3 + gallery_dl.egg-info/PKG-INFO | 6 +- gallery_dl.egg-info/SOURCES.txt | 1 + gallery_dl/downloader/http.py | 4 ++ gallery_dl/extractor/__init__.py | 1 + gallery_dl/extractor/blogger.py | 2 +- gallery_dl/extractor/bunkr.py | 14 ++-- gallery_dl/extractor/danbooru.py | 26 +++++-- gallery_dl/extractor/deviantart.py | 31 ++++++--- gallery_dl/extractor/gelbooru_v02.py | 26 ++++++- gallery_dl/extractor/generic.py | 58 +++++++++------- gallery_dl/extractor/hentaifoundry.py | 6 +- gallery_dl/extractor/hotleak.py | 5 +- gallery_dl/extractor/imagechest.py | 15 ++-- gallery_dl/extractor/mastodon.py | 11 +++ gallery_dl/extractor/nitter.py | 5 ++ gallery_dl/extractor/sexcom.py | 17 +++-- gallery_dl/extractor/shopify.py | 13 ++-- gallery_dl/extractor/twitter.py | 124 ++++++++++++++++++++++++---------- gallery_dl/extractor/urlshortener.py | 69 +++++++++++++++++++ gallery_dl/version.py | 2 +- test/test_downloader.py | 4 ++ 28 files changed, 385 insertions(+), 119 deletions(-) create mode 100644 gallery_dl/extractor/urlshortener.py diff --git a/CHANGELOG.md b/CHANGELOG.md index d312557..a67e3ab 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,32 @@ # Changelog +## 1.25.2 - 2023-04-15 +### Additions +- [deviantart] add `public` option +- [nitter] extract videos from `source` elements ([#3912](https://github.com/mikf/gallery-dl/issues/3912)) +- [twitter] add `date_liked` and `date_bookmarked` metadata for liked and bookmarked Tweets ([#3816](https://github.com/mikf/gallery-dl/issues/3816)) +- [urlshortener] add support for bit.ly & t.co ([#3841](https://github.com/mikf/gallery-dl/issues/3841)) +- [downloader:http] add MIME type and signature for `.heic` files ([#3915](https://github.com/mikf/gallery-dl/issues/3915)) +### Fixes +- [blogger] update regex to get the highest resolution URLs ([#3863](https://github.com/mikf/gallery-dl/issues/3863), [#3870](https://github.com/mikf/gallery-dl/issues/3870)) +- [bunkr] update domain to `bunkr.la` ([#3813](https://github.com/mikf/gallery-dl/issues/3813), [#3877](https://github.com/mikf/gallery-dl/issues/3877)) +- [deviantart] keep using private access tokens when requesting download URLs ([#3845](https://github.com/mikf/gallery-dl/issues/3845), [#3857](https://github.com/mikf/gallery-dl/issues/3857), [#3896](https://github.com/mikf/gallery-dl/issues/3896)) +- [hentaifoundry] fix content filters ([#3887](https://github.com/mikf/gallery-dl/issues/3887)) +- [hotleak] fix downloading of creators whose name starts with a category name ([#3871](https://github.com/mikf/gallery-dl/issues/3871)) +- [imagechest] fix extraction ([#3914](https://github.com/mikf/gallery-dl/issues/3914)) +- [realbooru] fix extraction ([#2530](https://github.com/mikf/gallery-dl/issues/2530)) +- [sexcom] fix pagination ([#3906](https://github.com/mikf/gallery-dl/issues/3906)) +- [sexcom] fix HD video extraction +- [shopify] fix `collection` extractor ([#3866](https://github.com/mikf/gallery-dl/issues/3866), [#3868](https://github.com/mikf/gallery-dl/issues/3868)) +- [twitter] update to bookmark timeline v2 ([#3859](https://github.com/mikf/gallery-dl/issues/3859), [#3854](https://github.com/mikf/gallery-dl/issues/3854)) +- [twitter] warn about "withheld" Tweets and users ([#3864](https://github.com/mikf/gallery-dl/issues/3864)) +### Improvements +- [danbooru] reduce number of API requests when fetching extended `metadata` +- [deviantart:search] detect login redirects ([#3860](https://github.com/mikf/gallery-dl/issues/3860)) +- [generic] write regular expressions without `x` flags +- [mastodon] try to get account IDs without access token +- [twitter] calculate `date` from Tweet IDs + ## 1.25.1 - 2023-03-25 ### Additions - [nitter] support nitter.it ([#3819](https://github.com/mikf/gallery-dl/issues/3819)) diff --git a/PKG-INFO b/PKG-INFO index 1156e79..cb01fca 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.25.1 +Version: 1.25.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -106,9 +106,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/README.rst b/README.rst index e4fd1c6..8472d2d 100644 --- a/README.rst +++ b/README.rst @@ -69,9 +69,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 8b96657..8aa419d 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2023-03-25" "1.25.1" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2023-04-15" "1.25.2" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index fd32eb1..63d78f0 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2023-03-25" "1.25.1" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2023-04-15" "1.25.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -1346,7 +1346,7 @@ It is possible to specify a custom list of metadata includes. See \f[I]available_includes\f[] for possible field names. \f[I]aibooru\f[] also supports \f[I]ai_metadata\f[]. -Note: This requires 1 additional HTTP request per post. +Note: This requires 1 additional HTTP request per 200-post batch. .SS extractor.{Danbooru].threshold @@ -1602,6 +1602,20 @@ Controls when to stop paginating over API results. * \f[I]"manual"\f[]: Disregard \f[I]has_more\f[] and only stop when a batch of results is empty. +.SS extractor.deviantart.public +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Use a public access token for API requests. + +Disable this option to *force* using a private token for all requests +when a \f[I]refresh token\f[] is provided. + + .SS extractor.deviantart.refresh-token .IP "Type:" 6 \f[I]string\f[] diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf index ef7b3b5..da386dd 100644 --- a/docs/gallery-dl-example.conf +++ b/docs/gallery-dl-example.conf @@ -317,6 +317,10 @@ "archive": "~/gallery-dl/custom-archive-file-for-TBIB.db", "filename": "{id}_{md5}.{extension}", "sleep-request": [0, 1.2] + }, + + "urlshortener": { + "tinyurl": {"root": "https://tinyurl.com"} } }, diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 7564e5b..09d9e80 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -74,6 +74,7 @@ { "client-id": null, "client-secret": null, + "refresh-token": null, "auto-watch": false, "auto-unwatch": false, "comments": false, @@ -86,6 +87,8 @@ "mature": true, "metadata": false, "original": true, + "pagination": "api", + "public": true, "wait-min": 0 }, "e621": diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index f836313..25c9619 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.25.1 +Version: 1.25.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -106,9 +106,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index 9827944..bb2ff51 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -199,6 +199,7 @@ gallery_dl/extractor/twibooru.py gallery_dl/extractor/twitter.py gallery_dl/extractor/unsplash.py gallery_dl/extractor/uploadir.py +gallery_dl/extractor/urlshortener.py gallery_dl/extractor/vanillarock.py gallery_dl/extractor/vichan.py gallery_dl/extractor/vk.py diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index e977320..88e86e9 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -353,6 +353,8 @@ MIME_TYPES = { "image/x-ms-bmp": "bmp", "image/webp" : "webp", "image/avif" : "avif", + "image/heic" : "heic", + "image/heif" : "heif", "image/svg+xml" : "svg", "image/ico" : "ico", "image/icon" : "ico", @@ -399,6 +401,8 @@ SIGNATURE_CHECKS = { "webp": lambda s: (s[0:4] == b"RIFF" and s[8:12] == b"WEBP"), "avif": lambda s: s[4:11] == b"ftypavi" and s[11] in b"fs", + "heic": lambda s: (s[4:10] == b"ftyphe" and s[10:12] in ( + b"ic", b"im", b"is", b"ix", b"vc", b"vm", b"vs")), "svg" : lambda s: s[0:5] == b"-QjgneIQv\.png", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { @@ -52,6 +52,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "num": int, }, }), + ("https://bunkr.la/a/Lktg9Keq"), + ("https://bunkr.su/a/Lktg9Keq"), + ("https://bunkr.ru/a/Lktg9Keq"), + ("https://bunkr.is/a/Lktg9Keq"), ("https://bunkr.to/a/Lktg9Keq"), ) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index f104556..326b53b 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -26,6 +26,7 @@ class DanbooruExtractor(BaseExtractor): BaseExtractor.__init__(self, match) self.ugoira = self.config("ugoira", False) self.external = self.config("external", False) + self.includes = False threshold = self.config("threshold") if isinstance(threshold, int): @@ -54,6 +55,7 @@ class DanbooruExtractor(BaseExtractor): includes = ",".join(includes) elif not isinstance(includes, str): includes = "artist_commentary,children,notes,parent,uploader" + self.includes = includes + ",id" data = self.metadata() for post in self.posts(): @@ -77,11 +79,6 @@ class DanbooruExtractor(BaseExtractor): url = post["large_file_url"] post["extension"] = "webm" - if includes: - meta_url = "{}/posts/{}.json?only={}".format( - self.root, post["id"], includes) - post.update(self.request(meta_url).json()) - if url[0] == "/": url = self.root + url @@ -104,6 +101,19 @@ class DanbooruExtractor(BaseExtractor): posts = self.request(url, params=params).json() if "posts" in posts: posts = posts["posts"] + + if self.includes and posts: + if not pages and "only" not in params: + params["page"] = "b{}".format(posts[0]["id"] + 1) + params["only"] = self.includes + data = { + meta["id"]: meta + for meta in self.request(url, params=params).json() + } + for post in posts: + post.update(data[post["id"]]) + params["only"] = None + yield from posts if len(posts) < self.threshold: @@ -255,7 +265,11 @@ class DanbooruPostExtractor(DanbooruExtractor): def posts(self): url = "{}/posts/{}.json".format(self.root, self.post_id) - return (self.request(url).json(),) + post = self.request(url).json() + if self.includes: + params = {"only": self.includes} + post.update(self.request(url, params=params).json()) + return (post,) class DanbooruPopularExtractor(DanbooruExtractor): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 37475df..f532a97 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -320,7 +320,7 @@ class DeviantartExtractor(Extractor): yield url, folder def _update_content_default(self, deviation, content): - public = "premium_folder_data" not in deviation + public = False if "premium_folder_data" in deviation else None data = self.api.deviation_download(deviation["deviationid"], public) content.update(data) @@ -1180,7 +1180,11 @@ class DeviantartSearchExtractor(DeviantartExtractor): } while True: - page = self.request(url, params=params).text + response = self.request(url, params=params) + + if response.history and "/users/login" in response.url: + raise exception.StopExtraction("HTTP redirect to login page") + page = response.text items , pos = text.rextract(page, r'\"items\":[', ']') cursor, pos = text.extract(page, r'\"cursor\":\"', '\\', pos) @@ -1280,6 +1284,7 @@ class DeviantartOAuthAPI(): self.folders = extractor.config("folders", False) self.metadata = extractor.extra or extractor.config("metadata", False) self.strategy = extractor.config("pagination") + self.public = extractor.config("public", True) self.client_id = extractor.config("client-id") if self.client_id: @@ -1385,7 +1390,7 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination_list(endpoint, params=params, key="thread") - def deviation(self, deviation_id, public=True): + def deviation(self, deviation_id, public=None): """Query and return info about a single Deviation""" endpoint = "/deviation/" + deviation_id deviation = self._call(endpoint, public=public) @@ -1395,7 +1400,7 @@ class DeviantartOAuthAPI(): self._folders((deviation,)) return deviation - def deviation_content(self, deviation_id, public=True): + def deviation_content(self, deviation_id, public=None): """Get extended content of a single Deviation""" endpoint = "/deviation/content" params = {"deviationid": deviation_id} @@ -1408,7 +1413,7 @@ class DeviantartOAuthAPI(): self.log.warning("Private Journal") return content - def deviation_download(self, deviation_id, public=True): + def deviation_download(self, deviation_id, public=None): """Get the original file download (if allowed)""" endpoint = "/deviation/download/" + deviation_id params = {"mature_content": self.mature} @@ -1423,7 +1428,7 @@ class DeviantartOAuthAPI(): params = {"mature_content": self.mature} return self._call(endpoint, params=params)["metadata"] - def gallery(self, username, folder_id, offset=0, extend=True, public=True): + def gallery(self, username, folder_id, offset=0, extend=True, public=None): """Yield all Deviation-objects contained in a gallery folder""" endpoint = "/gallery/" + folder_id params = {"username": username, "offset": offset, "limit": 24, @@ -1513,11 +1518,14 @@ class DeviantartOAuthAPI(): refresh_token_key, data["refresh_token"]) return "Bearer " + data["access_token"] - def _call(self, endpoint, fatal=True, public=True, **kwargs): + def _call(self, endpoint, fatal=True, public=None, **kwargs): """Call an API endpoint""" url = "https://www.deviantart.com/api/v1/oauth2" + endpoint kwargs["fatal"] = None + if public is None: + public = self.public + while True: if self.delay: self.extractor.sleep(self.delay, "api") @@ -1559,8 +1567,13 @@ class DeviantartOAuthAPI(): return data def _pagination(self, endpoint, params, - extend=True, public=True, unpack=False, key="results"): + extend=True, public=None, unpack=False, key="results"): warn = True + if public is None: + public = self.public + elif not public: + self.public = False + while True: data = self._call(endpoint, params=params, public=public) if key not in data: @@ -1575,7 +1588,7 @@ class DeviantartOAuthAPI(): if public and len(results) < params["limit"]: if self.refresh_token_key: self.log.debug("Switching to private access token") - public = False + self.public = public = False continue elif data["has_more"] and warn: warn = False diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index facd3db..958c4b5 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -30,7 +30,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.api_root = self.root if self.category == "realbooru": - self._file_url = self._file_url_realbooru + self.items = self._items_realbooru self._tags = self._tags_realbooru def _api_request(self, params): @@ -129,6 +129,28 @@ class GelbooruV02Extractor(booru.BooruExtractor): self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2]) return url + def _items_realbooru(self): + from .common import Message + data = self.metadata() + + for post in self.posts(): + try: + html = self._html(post) + url = post["file_url"] = text.rextract( + html, 'href="', '"', html.index(">Original<"))[0] + except Exception: + self.log.debug("Unable to fetch download URL for post %s " + "(md5: %s)", post.get("id"), post.get("md5")) + continue + + text.nameext_from_url(url, post) + post.update(data) + self._prepare(post) + self._tags(post, html) + + yield Message.Directory, post + yield Message.Url, url, post + def _tags_realbooru(self, post, page): tag_container = text.extr(page, 'id="tagLink"', '') tags = collections.defaultdict(list) @@ -404,7 +426,7 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor): }, }), ("https://realbooru.com/index.php?page=post&s=view&id=668483", { - "pattern": r"https://realbooru\.com/images/dc/b5" + "pattern": r"https://realbooru\.com//?images/dc/b5" r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg", "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", "options": (("tags", True),), diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 9999283..4ab26ae 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -1,16 +1,19 @@ # -*- coding: utf-8 -*- -"""Extractor for images in a generic web page.""" +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Generic information extractor""" from .common import Extractor, Message from .. import config, text -import re import os.path +import re class GenericExtractor(Extractor): """Extractor for images in a generic web page.""" - category = "generic" directory_fmt = ("{category}", "{pageurl}") archive_fmt = "{imageurl}" @@ -18,19 +21,19 @@ class GenericExtractor(Extractor): # By default, the generic extractor is disabled # and the "g(eneric):" prefix in url is required. # If the extractor is enabled, make the prefix optional - pattern = r"(?ix)(?Pg(?:eneric)?:)" + pattern = r"(?i)(?Pg(?:eneric)?:)" if config.get(("extractor", "generic"), "enabled"): pattern += r"?" # The generic extractor pattern should match (almost) any valid url # Based on: https://tools.ietf.org/html/rfc3986#appendix-B - pattern += r""" - (?Phttps?://)? # optional http(s) scheme - (?P[-\w\.]+) # required domain - (?P/[^?#]*)? # optional path - (?:\?(?P[^#]*))? # optional query - (?:\#(?P.*))? # optional fragment - """ + pattern += ( + r"(?Phttps?://)?" # optional http(s) scheme + r"(?P[-\w\.]+)" # required domain + r"(?P/[^?#]*)?" # optional path + r"(?:\?(?P[^#]*))?" # optional query + r"(?:\#(?P.*))?" # optional fragment + ) test = ( ("generic:https://www.nongnu.org/lzip/", { @@ -49,19 +52,20 @@ class GenericExtractor(Extractor): "count": 2, "pattern": "^https://räksmörgås.josefsson.org/", }), + ("g:https://en.wikipedia.org/Main_Page"), + ("g:https://example.org/path/to/file?que=1?&ry=2/#fragment"), + ("g:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"), ("generic:https://en.wikipedia.org/Main_Page"), ("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"), ("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"), ) def __init__(self, match): - """Init.""" Extractor.__init__(self, match) # Strip the "g(eneric):" prefix # and inform about "forced" or "fallback" mode if match.group('generic'): - self.log.info("Forcing use of generic information extractor.") self.url = match.group(0).partition(":")[2] else: self.log.info("Falling back on generic information extractor.") @@ -93,7 +97,6 @@ class GenericExtractor(Extractor): pass images = enumerate(imgs, 1) - yield Message.Version, 1 yield Message.Directory, data for data["num"], (url, imgdata) in images: @@ -158,11 +161,13 @@ class GenericExtractor(Extractor): image urls; this pattern matches only the first url; remaining urls will be matched by the "imageurl_pattern_ext" pattern below. """ - imageurl_pattern_src = r"""(?ix) - <(?:img|video|source)\s.*? # ,