author     2023-04-25 21:32:02 -0400
committer  2023-04-25 21:32:02 -0400
commit     f98ab7aaca3c4acbd5a793267791749740330e9c
tree       72e3d3312a8ff2cdb24353b1d7be6fb8301f431c
parent     09e426350409d45e7f7a8ff369f8d8aa9eec0fe4

New upstream version 1.25.2 (tag: upstream/1.25.2)
28 files changed, 385 insertions, 119 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index d312557..a67e3ab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,32 @@
 # Changelog
 
+## 1.25.2 - 2023-04-15
+### Additions
+- [deviantart] add `public` option
+- [nitter] extract videos from `source` elements ([#3912](https://github.com/mikf/gallery-dl/issues/3912))
+- [twitter] add `date_liked` and `date_bookmarked` metadata for liked and bookmarked Tweets ([#3816](https://github.com/mikf/gallery-dl/issues/3816))
+- [urlshortener] add support for bit.ly & t.co ([#3841](https://github.com/mikf/gallery-dl/issues/3841))
+- [downloader:http] add MIME type and signature for `.heic` files ([#3915](https://github.com/mikf/gallery-dl/issues/3915))
+### Fixes
+- [blogger] update regex to get the highest resolution URLs ([#3863](https://github.com/mikf/gallery-dl/issues/3863), [#3870](https://github.com/mikf/gallery-dl/issues/3870))
+- [bunkr] update domain to `bunkr.la` ([#3813](https://github.com/mikf/gallery-dl/issues/3813), [#3877](https://github.com/mikf/gallery-dl/issues/3877))
+- [deviantart] keep using private access tokens when requesting download URLs ([#3845](https://github.com/mikf/gallery-dl/issues/3845), [#3857](https://github.com/mikf/gallery-dl/issues/3857), [#3896](https://github.com/mikf/gallery-dl/issues/3896))
+- [hentaifoundry] fix content filters ([#3887](https://github.com/mikf/gallery-dl/issues/3887))
+- [hotleak] fix downloading of creators whose name starts with a category name ([#3871](https://github.com/mikf/gallery-dl/issues/3871))
+- [imagechest] fix extraction ([#3914](https://github.com/mikf/gallery-dl/issues/3914))
+- [realbooru] fix extraction ([#2530](https://github.com/mikf/gallery-dl/issues/2530))
+- [sexcom] fix pagination ([#3906](https://github.com/mikf/gallery-dl/issues/3906))
+- [sexcom] fix HD video extraction
+- [shopify] fix `collection` extractor ([#3866](https://github.com/mikf/gallery-dl/issues/3866), [#3868](https://github.com/mikf/gallery-dl/issues/3868))
+- [twitter] update to bookmark timeline v2 ([#3859](https://github.com/mikf/gallery-dl/issues/3859), [#3854](https://github.com/mikf/gallery-dl/issues/3854))
+- [twitter] warn about "withheld" Tweets and users ([#3864](https://github.com/mikf/gallery-dl/issues/3864))
+### Improvements
+- [danbooru] reduce number of API requests when fetching extended `metadata`
+- [deviantart:search] detect login redirects ([#3860](https://github.com/mikf/gallery-dl/issues/3860))
+- [generic] write regular expressions without `x` flags
+- [mastodon] try to get account IDs without access token
+- [twitter] calculate `date` from Tweet IDs
+
 ## 1.25.1 - 2023-03-25
 ### Additions
 - [nitter] support nitter.it ([#3819](https://github.com/mikf/gallery-dl/issues/3819))
diff --git a/PKG-INFO b/PKG-INFO
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.25.1
+Version: 1.25.2
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -106,9 +106,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.2/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.2/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -69,9 +69,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.2/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.2/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 8b96657..8aa419d 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2023-03-25" "1.25.1" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2023-04-15" "1.25.2" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index fd32eb1..63d78f0 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2023-03-25" "1.25.1" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2023-04-15" "1.25.2" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -1346,7 +1346,7 @@
 It is possible to specify a custom list of metadata includes.
 See \f[I]available_includes\f[] for possible field names.
 \f[I]aibooru\f[] also supports \f[I]ai_metadata\f[].
 
-Note: This requires 1 additional HTTP request per post.
+Note: This requires 1 additional HTTP request per 200-post batch.
 
 .SS extractor.danbooru.threshold
@@ -1602,6 +1602,20 @@
 Controls when to stop paginating over API results.
 * \f[I]"manual"\f[]: Disregard \f[I]has_more\f[] and only stop
 when a batch of results is empty.
+.SS extractor.deviantart.public
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Use a public access token for API requests.
+
+Disable this option to *force* using a private token for all requests
+when a \f[I]refresh token\f[] is provided.
+
+
 .SS extractor.deviantart.refresh-token
 .IP "Type:" 6
 \f[I]string\f[]
diff --git a/docs/gallery-dl-example.conf b/docs/gallery-dl-example.conf
index ef7b3b5..da386dd 100644
--- a/docs/gallery-dl-example.conf
+++ b/docs/gallery-dl-example.conf
@@ -317,6 +317,10 @@
             "archive": "~/gallery-dl/custom-archive-file-for-TBIB.db",
             "filename": "{id}_{md5}.{extension}",
             "sleep-request": [0, 1.2]
+        },
+
+        "urlshortener": {
+            "tinyurl": {"root": "https://tinyurl.com"}
         }
     },
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 7564e5b..09d9e80 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -74,6 +74,7 @@
         {
             "client-id": null,
             "client-secret": null,
+            "refresh-token": null,
             "auto-watch": false,
             "auto-unwatch": false,
             "comments": false,
@@ -86,6 +87,8 @@
             "mature": true,
             "metadata": false,
             "original": true,
+            "pagination": "api",
+            "public": true,
             "wait-min": 0
         },
         "e621":
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index f836313..25c9619 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.25.1
+Version: 1.25.2
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -106,9 +106,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.25.2/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.25.2/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 9827944..bb2ff51 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -199,6 +199,7 @@ gallery_dl/extractor/twibooru.py
 gallery_dl/extractor/twitter.py
 gallery_dl/extractor/unsplash.py
 gallery_dl/extractor/uploadir.py
+gallery_dl/extractor/urlshortener.py
 gallery_dl/extractor/vanillarock.py
 gallery_dl/extractor/vichan.py
 gallery_dl/extractor/vk.py
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index e977320..88e86e9 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -353,6 +353,8 @@ MIME_TYPES = {
     "image/x-ms-bmp": "bmp",
     "image/webp"    : "webp",
     "image/avif"    : "avif",
+    "image/heic"    : "heic",
+    "image/heif"    : "heif",
     "image/svg+xml" : "svg",
     "image/ico"     : "ico",
     "image/icon"    : "ico",
@@ -399,6 +401,8 @@ SIGNATURE_CHECKS = {
     "webp": lambda s: (s[0:4] == b"RIFF" and
                        s[8:12] == b"WEBP"),
     "avif": lambda s: s[4:11] == b"ftypavi" and s[11] in b"fs",
+    "heic": lambda s: (s[4:10] == b"ftyphe" and s[10:12] in (
+        b"ic", b"im", b"is", b"ix", b"vc", b"vm", b"vs")),
     "svg" : lambda s: s[0:5] == b"<?xml",
     "ico" : lambda s: s[0:4] == b"\x00\x00\x01\x00",
     "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00",
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 3968d72..553a110 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -153,6 +153,7 @@ modules = [
    "twitter",
    "unsplash",
    "uploadir",
+   "urlshortener",
    "vanillarock",
    "vichan",
    "vk",
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 56010c2..eafc8af 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -40,7 +40,7 @@ class BloggerExtractor(Extractor):
         blog["date"] = text.parse_datetime(blog["published"])
         del blog["selfLink"]
 
-        sub = re.compile(r"(/|=)(?:s\d+|w\d+-h\d+)(?=/|$)").sub
+        sub = re.compile(r"(/|=)(?:[sw]\d+|w\d+-h\d+)(?=/|$)").sub
         findall_image = re.compile(
             r'src="(https?://(?:'
             r'blogger\.googleusercontent\.com/img|'
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 17d066d..7c66fb0 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,19 +6,19 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://bunkr.su/"""
+"""Extractors for https://bunkr.la/"""
 
 from .lolisafe import LolisafeAlbumExtractor
 from .. import text
 
 
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
-    """Extractor for bunkr.su albums"""
+    """Extractor for bunkr.la albums"""
     category = "bunkr"
-    root = "https://bunkr.su"
-    pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:[sr]u|is|to)/a/([^/?#]+)"
+    root = "https://bunkr.la"
+    pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:la|[sr]u|is|to)/a/([^/?#]+)"
     test = (
-        ("https://bunkr.su/a/Lktg9Keq", {
+        ("https://bunkr.la/a/Lktg9Keq", {
             "pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png",
             "content": "0c8768055e4e20e7c7259608b67799171b691140",
             "keyword": {
@@ -52,6 +52,10 @@
                 "num": int,
             },
         }),
+        ("https://bunkr.la/a/Lktg9Keq"),
+        ("https://bunkr.su/a/Lktg9Keq"),
+        ("https://bunkr.ru/a/Lktg9Keq"),
+        ("https://bunkr.is/a/Lktg9Keq"),
         ("https://bunkr.to/a/Lktg9Keq"),
     )
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index f104556..326b53b 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -26,6 +26,7 @@ class DanbooruExtractor(BaseExtractor):
         BaseExtractor.__init__(self, match)
         self.ugoira = self.config("ugoira", False)
         self.external = self.config("external", False)
+        self.includes = False
 
         threshold = self.config("threshold")
         if isinstance(threshold, int):
@@ -54,6 +55,7 @@
             includes = ",".join(includes)
         elif not isinstance(includes, str):
             includes = "artist_commentary,children,notes,parent,uploader"
+        self.includes = includes + ",id"
 
         data = self.metadata()
         for post in self.posts():
@@ -77,11 +79,6 @@
                     url = post["large_file_url"]
                     post["extension"] = "webm"
 
-            if includes:
-                meta_url = "{}/posts/{}.json?only={}".format(
-                    self.root, post["id"], includes)
-                post.update(self.request(meta_url).json())
-
             if url[0] == "/":
                 url = self.root + url
 
@@ -104,6 +101,19 @@
             posts = self.request(url, params=params).json()
             if "posts" in posts:
                 posts = posts["posts"]
+
+            if self.includes and posts:
+                if not pages and "only" not in params:
+                    params["page"] = "b{}".format(posts[0]["id"] + 1)
+                params["only"] = self.includes
+                data = {
+                    meta["id"]: meta
+                    for meta in self.request(url, params=params).json()
+                }
+                for post in posts:
+                    post.update(data[post["id"]])
+                params["only"] = None
+
             yield from posts
 
             if len(posts) < self.threshold:
@@ -255,7 +265,11 @@ class DanbooruPostExtractor(DanbooruExtractor):
     def posts(self):
         url = "{}/posts/{}.json".format(self.root, self.post_id)
-        return (self.request(url).json(),)
+        post = self.request(url).json()
+        if self.includes:
+            params = {"only": self.includes}
+            post.update(self.request(url, params=params).json())
+        return (post,)
 
 
 class DanbooruPopularExtractor(DanbooruExtractor):
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 37475df..f532a97 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -320,7 +320,7 @@ class DeviantartExtractor(Extractor):
             yield url, folder
 
     def _update_content_default(self, deviation, content):
-        public = "premium_folder_data" not in deviation
+        public = False if "premium_folder_data" in deviation else None
         data = self.api.deviation_download(deviation["deviationid"], public)
         content.update(data)
 
@@ -1180,7 +1180,11 @@ class DeviantartSearchExtractor(DeviantartExtractor):
         }
 
         while True:
-            page = self.request(url, params=params).text
+            response = self.request(url, params=params)
+
+            if response.history and "/users/login" in response.url:
+                raise exception.StopExtraction("HTTP redirect to login page")
+            page = response.text
 
             items , pos = text.rextract(page, r'\"items\":[', ']')
             cursor, pos = text.extract(page, r'\"cursor\":\"', '\\', pos)
@@ -1280,6 +1284,7 @@ class DeviantartOAuthAPI():
         self.folders = extractor.config("folders", False)
         self.metadata = extractor.extra or extractor.config("metadata", False)
         self.strategy = extractor.config("pagination")
+        self.public = extractor.config("public", True)
 
         self.client_id = extractor.config("client-id")
         if self.client_id:
@@ -1385,7 +1390,7 @@
             "mature_content": self.mature}
         return self._pagination_list(endpoint, params=params, key="thread")
 
-    def deviation(self, deviation_id, public=True):
+    def deviation(self, deviation_id, public=None):
         """Query and return info about a single Deviation"""
         endpoint = "/deviation/" + deviation_id
         deviation = self._call(endpoint, public=public)
@@ -1395,7 +1400,7 @@
             self._folders((deviation,))
         return deviation
 
-    def deviation_content(self, deviation_id, public=True):
+    def deviation_content(self, deviation_id, public=None):
         """Get extended content of a single Deviation"""
         endpoint = "/deviation/content"
         params = {"deviationid": deviation_id}
@@ -1408,7 +1413,7 @@
             self.log.warning("Private Journal")
         return content
 
-    def deviation_download(self, deviation_id, public=True):
+    def deviation_download(self, deviation_id, public=None):
         """Get the original file download (if allowed)"""
         endpoint = "/deviation/download/" + deviation_id
         params = {"mature_content": self.mature}
@@ -1423,7 +1428,7 @@
         params = {"mature_content": self.mature}
         return self._call(endpoint, params=params)["metadata"]
 
-    def gallery(self, username, folder_id, offset=0, extend=True, public=True):
+    def gallery(self, username, folder_id, offset=0, extend=True, public=None):
         """Yield all Deviation-objects contained in a gallery folder"""
         endpoint = "/gallery/" + folder_id
         params = {"username": username, "offset": offset, "limit": 24,
@@ -1513,11 +1518,14 @@
                 refresh_token_key, data["refresh_token"])
         return "Bearer " + data["access_token"]
 
-    def _call(self, endpoint, fatal=True, public=True, **kwargs):
+    def _call(self, endpoint, fatal=True, public=None, **kwargs):
         """Call an API endpoint"""
         url = "https://www.deviantart.com/api/v1/oauth2" + endpoint
         kwargs["fatal"] = None
 
+        if public is None:
+            public = self.public
+
         while True:
             if self.delay:
                 self.extractor.sleep(self.delay, "api")
@@ -1559,8 +1567,13 @@
         return data
 
     def _pagination(self, endpoint, params,
-                    extend=True, public=True, unpack=False, key="results"):
+                    extend=True, public=None, unpack=False, key="results"):
         warn = True
+        if public is None:
+            public = self.public
+        elif not public:
+            self.public = False
+
         while True:
             data = self._call(endpoint, params=params, public=public)
             if key not in data:
@@ -1575,7 +1588,7 @@
                 if public and len(results) < params["limit"]:
                     if self.refresh_token_key:
                         self.log.debug("Switching to private access token")
-                        public = False
+                        self.public = public = False
                         continue
                 elif data["has_more"] and warn:
                     warn = False
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index facd3db..958c4b5 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -30,7 +30,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         self.api_root = self.root
 
         if self.category == "realbooru":
-            self._file_url = self._file_url_realbooru
+            self.items = self._items_realbooru
             self._tags = self._tags_realbooru
 
     def _api_request(self, params):
@@ -129,6 +129,28 @@
             self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
         return url
 
+    def _items_realbooru(self):
+        from .common import Message
+        data = self.metadata()
+
+        for post in self.posts():
+            try:
+                html = self._html(post)
+                url = post["file_url"] = text.rextract(
+                    html, 'href="', '"', html.index(">Original<"))[0]
+            except Exception:
+                self.log.debug("Unable to fetch download URL for post %s "
+                               "(md5: %s)", post.get("id"), post.get("md5"))
+                continue
+
+            text.nameext_from_url(url, post)
+            post.update(data)
+            self._prepare(post)
+            self._tags(post, html)
+
+            yield Message.Directory, post
+            yield Message.Url, url, post
+
     def _tags_realbooru(self, post, page):
         tag_container = text.extr(page, 'id="tagLink"', '</div>')
         tags = collections.defaultdict(list)
@@ -404,7 +426,7 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
             },
         }),
         ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
-            "pattern": r"https://realbooru\.com/images/dc/b5"
+            "pattern": r"https://realbooru\.com//?images/dc/b5"
                        r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
             "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
             "options": (("tags", True),),
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 9999283..4ab26ae 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -1,16 +1,19 @@
 # -*- coding: utf-8 -*-
 
-"""Extractor for images in a generic web page."""
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Generic information extractor"""
 
 from .common import Extractor, Message
 from .. import config, text
-import re
 import os.path
+import re
 
 
 class GenericExtractor(Extractor):
     """Extractor for images in a generic web page."""
-
     category = "generic"
     directory_fmt = ("{category}", "{pageurl}")
     archive_fmt = "{imageurl}"
@@ -18,19 +21,19 @@
     # By default, the generic extractor is disabled
     # and the "g(eneric):" prefix in url is required.
     # If the extractor is enabled, make the prefix optional
-    pattern = r"(?ix)(?P<generic>g(?:eneric)?:)"
+    pattern = r"(?i)(?P<generic>g(?:eneric)?:)"
     if config.get(("extractor", "generic"), "enabled"):
         pattern += r"?"
 
     # The generic extractor pattern should match (almost) any valid url
     # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
-    pattern += r"""
-        (?P<scheme>https?://)?          # optional http(s) scheme
-        (?P<domain>[-\w\.]+)            # required domain
-        (?P<path>/[^?#]*)?              # optional path
-        (?:\?(?P<query>[^#]*))?         # optional query
-        (?:\#(?P<fragment>.*))?         # optional fragment
-        """
+    pattern += (
+        r"(?P<scheme>https?://)?"   # optional http(s) scheme
+        r"(?P<domain>[-\w\.]+)"     # required domain
+        r"(?P<path>/[^?#]*)?"       # optional path
+        r"(?:\?(?P<query>[^#]*))?"  # optional query
+        r"(?:\#(?P<fragment>.*))?"  # optional fragment
+    )
 
     test = (
         ("generic:https://www.nongnu.org/lzip/", {
@@ -49,19 +52,20 @@
             "count": 2,
             "pattern": "^https://räksmörgås.josefsson.org/",
         }),
+        ("g:https://en.wikipedia.org/Main_Page"),
+        ("g:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
+        ("g:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
         ("generic:https://en.wikipedia.org/Main_Page"),
         ("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
         ("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
     )
 
     def __init__(self, match):
-        """Init."""
         Extractor.__init__(self, match)
 
         # Strip the "g(eneric):" prefix
         # and inform about "forced" or "fallback" mode
         if match.group('generic'):
-            self.log.info("Forcing use of generic information extractor.")
             self.url = match.group(0).partition(":")[2]
         else:
             self.log.info("Falling back on generic information extractor.")
@@ -93,7 +97,6 @@
             pass
 
         images = enumerate(imgs, 1)
-        yield Message.Version, 1
         yield Message.Directory, data
 
         for data["num"], (url, imgdata) in images:
@@ -158,11 +161,13 @@
         image urls; this pattern matches only the first url; remaining
         urls will be matched by the "imageurl_pattern_ext" pattern below.
         """
-        imageurl_pattern_src = r"""(?ix)
-            <(?:img|video|source)\s.*?      # <img>, <video> or <source>
-            src(?:set)?=["']?               # src or srcset attributes
-            (?P<URL>[^"'\s>]+)              # url
-            """
+
+        imageurl_pattern_src = (
+            r"(?i)"
+            r"<(?:img|video|source)\s[^>]*"  # <img>, <video> or <source>
+            r"src(?:set)?=[\"']?"            # src or srcset attributes
+            r"(?P<URL>[^\"'\s>]+)"           # url
+        )
 
         """
         2: Look anywhere for urls containing common image/video extensions
@@ -176,12 +181,13 @@
         urls in html tags.
         """
-        imageurl_pattern_ext = r"""(?ix)
-            (?:[^?&#"'>\s]+)                    # anything until dot+extension
-            \.(?:jpe?g|jpe|png|gif
-               |web[mp]|mp4|mkv|og[gmv]|opus)   # dot + image/video extensions
-            (?:[^"'<>\s]*)?                     # optional query and fragment
-            """
+        imageurl_pattern_ext = (
+            r"(?i)"
+            r"(?:[^?&#\"'>\s]+)"  # anything until dot+extension
+            # dot + image/video extensions
+            r"\.(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus)"
+            r"(?:[^\"'<>\s]*)?"   # optional query and fragment
+        )
 
         imageurls_src = re.findall(imageurl_pattern_src, page)
         imageurls_ext = re.findall(imageurl_pattern_ext, page)
@@ -221,7 +227,7 @@
                 absimageurls.append(self.baseurl + '/' + u)
 
         # Remove duplicates
-        absimageurls = set(absimageurls)
+        absimageurls = dict.fromkeys(absimageurls)
 
         # Create the image metadata dict and add imageurl to it
         # (image filename and extension are added by items())
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 2dfc721..e01a4ed 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -123,6 +123,9 @@
 
     def _init_site_filters(self):
         """Set site-internal filters to show all images"""
+        if self.session.cookies.get("PHPSESSID", domain=self.cookiedomain):
+            return
+
         url = self.root + "/?enterAgree=1"
         self.request(url, method="HEAD")
 
@@ -153,7 +156,6 @@
             "rating_scat"    : "1",
             "rating_incest"  : "1",
             "rating_rape"    : "1",
-            "filter_media"   : "A",
             "filter_order"   : "date_new",
             "filter_type"    : "0",
             "YII_CSRF_TOKEN" : text.unquote(text.extr(
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index 7c656be..30158b4 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -58,7 +58,7 @@ def decode_video_url(url):
 class HotleakPostExtractor(HotleakExtractor):
     """Extractor for individual posts on hotleak"""
     subcategory = "post"
-    pattern = (BASE_PATTERN + r"/(?!hot|creators|videos|photos)"
+    pattern = (BASE_PATTERN + r"/(?!(?:hot|creators|videos|photos)(?:$|/))"
               r"([^/]+)/(photo|video)/(\d+)")
     test = (
         ("https://hotleak.vip/kaiyakawaii/photo/1617145", {
@@ -117,7 +117,8 @@ class HotleakPostExtractor(HotleakExtractor):
 class HotleakCreatorExtractor(HotleakExtractor):
     """Extractor for all posts from a hotleak creator"""
     subcategory = "creator"
-    pattern = BASE_PATTERN + r"/(?!hot|creators|videos|photos)([^/?#]+)/?$"
+    pattern = (BASE_PATTERN + r"/(?!(?:hot|creators|videos|photos)(?:$|/))"
+               r"([^/?#]+)/?$")
     test = (
         ("https://hotleak.vip/kaiyakawaii", {
             "range": "1-200",
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
index 14aa16f..8b18d5e 100644
--- a/gallery_dl/extractor/imagechest.py
+++ b/gallery_dl/extractor/imagechest.py
@@ -1,12 +1,13 @@
 # -*- coding: utf-8 -*-
 
 # Copyright 2020 Leonid "Bepis" Pavel
+# Copyright 2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from galleries at https://imgchest.com/"""
+"""Extractors for https://imgchest.com/"""
 
 from .common import GalleryExtractor
 from .. import text, exception
@@ -19,7 +20,14 @@ class ImagechestGalleryExtractor(GalleryExtractor):
     pattern = r"(?:https?://)?(?:www\.)?imgchest\.com/p/([A-Za-z0-9]{11})"
     test = (
         ("https://imgchest.com/p/3na7kr3by8d", {
-            "url": "f095b4f78c051e5a94e7c663814d1e8d4c93c1f7",
+            "pattern": r"https://cdn\.imgchest\.com/files/\w+\.(jpg|png)",
+            "keyword": {
+                "count": 3,
+                "gallery_id": "3na7kr3by8d",
+                "num": int,
+                "title": "Wizardry - Video Game From The Mid 80's",
+            },
+            "url": "7328ca4ec2459378d725e3be19f661d2b045feda",
             "content": "076959e65be30249a2c651fbe6090dc30ba85193",
             "count": 3
         }),
@@ -43,6 +51,5 @@
     def images(self, page):
         return [
             (url, None)
-            for url in text.extract_iter(
-                page, 'property="og:image" content="', '"')
+            for url in text.extract_iter(page, 'data-url="', '"')
         ]
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index e49d29a..e190c7e 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -227,6 +227,12 @@ class MastodonAPI():
         if username.startswith("id:"):
             return username[3:]
 
+        try:
+            return self.account_lookup(username)["id"]
+        except Exception:
+            # fall back to account search
+            pass
+
         if "@" in username:
             handle = "@" + username
         else:
@@ -246,6 +252,11 @@
         endpoint = "/v1/accounts/{}/following".format(account_id)
         return self._pagination(endpoint, None)
 
+    def account_lookup(self, username):
+        endpoint = "/v1/accounts/lookup"
+        params = {"acct": username}
+        return self._call(endpoint, params).json()
+
     def account_search(self, query, limit=40):
         """Search for accounts"""
         endpoint = "/v1/accounts/search"
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index 725788a..5f4ceea 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -93,6 +93,11 @@ class NitterExtractor(BaseExtractor):
                         "filename" : name.rpartition(".")[0],
                         "extension": "mp4",
                     })
+
+                for url in text.extract_iter(
+                        attachments, '<source src="', '"'):
+                    append(text.nameext_from_url(url, {"url": url}))
+
             else:
                 files = ()
             tweet["count"] = len(files)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 486bf92..c6588de 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -35,6 +35,7 @@ class SexcomExtractor(Extractor):
     def _pagination(self, url):
         while True:
             extr = text.extract_from(self.request(url).text)
+            url = extr('<link rel="next" href="', '"')
 
             while True:
                 href = extr('<a class="image_wrapper" href="', '"')
@@ -42,11 +43,9 @@
                     break
                 yield self.root + href
 
-            pager = extr('id="pagenum"', '</div>')
-            url = text.extr(pager, ' href="', '"')
             if not url:
                 return
-            url = text.urljoin(self.root, url)
+            url = text.urljoin(self.root, text.unescape(url))
 
     def _parse_pin(self, url):
         response = self.request(url, fatal=False)
@@ -71,9 +70,12 @@
         info = extr("player.updateSrc(", ");")
 
         if info:
-            path = text.extr(info, "src: '", "'")
-            data["filename"] = path.rpartition("/")[2]
-            data["extension"] = "mp4"
+            try:
+                path, _ = text.rextract(
+                    info, "src: '", "'", info.index("label: 'HD'"))
+            except ValueError:
+                path = text.extr(info, "src: '", "'")
+            text.nameext_from_url(path, data)
             data["url"] = path
         else:
             iframe = extr('<iframe', '>')
@@ -132,7 +134,8 @@ class SexcomPinExtractor(SexcomExtractor):
         }),
         # video
         ("https://www.sex.com/pin/55748341/", {
-            "pattern": "https://www.sex.com/video/stream/776229/hd",
+            "pattern": r"https://cdn\.sex\.com/videos/pinporn"
+                       r"/2018/02/10/776229_hd\.mp4",
             "content": "e1a5834869163e2c4d1ca2677f5b7b367cf8cfff",
         }),
         # pornhub embed
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index 278ad14..f6e8bc0 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -119,15 +119,14 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
     def products(self):
         url = self.item_url + "/products.json"
+        params = {"page": 1}
 
-        while url:
-            response = self.request(url)
-            yield from response.json()["products"]
-
-            url = response.links.get("next")
-            if not url:
+        while True:
+            data = self.request(url, params=params).json()["products"]
+            if not data:
                 return
-            url = url["url"]
+            yield from data
+            params["page"] += 1
 
 
 class ShopifyProductExtractor(ShopifyExtractor):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 89d96d7..2ccc7e5 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -105,6 +105,10 @@ class TwitterExtractor(Extractor):
                 continue
             seen_tweets.add(data["id_str"])
 
+            if "withheld_scope" in data:
+                txt = data.get("full_text") or data.get("text") or ""
+                self.log.warning("'%s' (%s)", txt, data["id_str"])
+
             files = []
             if "extended_entities" in data:
                 self._extract_media(
@@ -256,19 +260,26 @@
         if "legacy" in tweet:
             tweet = tweet["legacy"]
 
+        tweet_id = int(tweet["id_str"])
+        if tweet_id >= 300000000000000:
+            date = text.parse_timestamp(
+                ((tweet_id >> 22) + 1288834974657) // 1000)
+        else:
+            date = text.parse_datetime(
+                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y")
+
         tget = tweet.get
         tdata = {
-            "tweet_id"      : text.parse_int(tweet["id_str"]),
+            "tweet_id"      : tweet_id,
             "retweet_id"    : text.parse_int(
                 tget("retweeted_status_id_str")),
             "quote_id"      : text.parse_int(
                 tget("quoted_by_id_str")),
             "reply_id"      : text.parse_int(
                 tget("in_reply_to_status_id_str")),
-            "date"          : text.parse_datetime(
-                tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"),
-            "user"          : self._user or author,
+            "date"          : date,
             "author"        : author,
+            "user"          : self._user or author,
             "lang"          : tweet["lang"],
             "favorite_count": tget("favorite_count"),
             "quote_count"   : tget("quote_count"),
@@ -321,8 +332,10 @@
             user = self.api.user_by_screen_name(user["screen_name"])["legacy"]
 
         uget = user.get
-        entities = user["entities"]
+        if uget("withheld_scope"):
+            self.log.warning("'%s'", uget("description"))
 
+        entities = user["entities"]
         self._user_cache[uid] = udata = {
             "id"            : text.parse_int(uid),
             "name"          : user["screen_name"],
@@ -398,10 +411,8 @@
             except Exception:
                 yield tweet
 
-    def _make_tweet(self, user, id_str, url, timestamp):
+    def _make_tweet(self, user, url, id_str):
         return {
-            "created_at": text.parse_timestamp(timestamp).strftime(
-                "%a %b %d %H:%M:%S +0000 %Y"),
             "id_str": id_str,
             "lang": None,
             "user": user,
@@ -564,6 +575,12 @@ class TwitterLikesExtractor(TwitterExtractor):
     def tweets(self):
         return self.api.user_likes(self.user)
 
+    def _transform_tweet(self, tweet):
+        tdata = TwitterExtractor._transform_tweet(self, tweet)
+        tdata["date_liked"] = text.parse_timestamp(
+            (int(tweet["sortIndex"]) >> 20) // 1000)
+        return tdata
+
 
 class TwitterBookmarkExtractor(TwitterExtractor):
     """Extractor for bookmarked tweets"""
@@ -574,6 +591,12 @@
     def tweets(self):
         return self.api.user_bookmarks()
 
+    def _transform_tweet(self, tweet):
+        tdata = TwitterExtractor._transform_tweet(self, tweet)
+        tdata["date_bookmarked"] = text.parse_timestamp(
+            (int(tweet["sortIndex"]) >> 20) // 1000)
+        return tdata
+
 
 class TwitterListExtractor(TwitterExtractor):
     """Extractor for Twitter lists"""
@@ -593,7 +616,11 @@ class TwitterListMembersExtractor(TwitterExtractor):
     """Extractor for members of a Twitter list"""
     subcategory = "list-members"
     pattern = BASE_PATTERN + r"/i/lists/(\d+)/members"
-    test = ("https://twitter.com/i/lists/784214683683127296/members",)
+    test = ("https://twitter.com/i/lists/784214683683127296/members", {
+        "pattern": TwitterTimelineExtractor.pattern,
+        "range": "1-40",
+        "count": 40,
+    })
 
     def items(self):
         self.login()
@@ -780,6 +807,16 @@ class TwitterTweetExtractor(TwitterExtractor):
                        ("cards-blacklist", ("twitch.tv",))),
             "count": 0,
         }),
+        # retweet
+        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
+            "options": (("retweets", True),),
+            "count": 2,
+            "keyword": {
+                "tweet_id"  : 1296304589591810048,
+                "retweet_id": 1296296016002547713,
+                "date"      : "dt:2020-08-20 04:34:32",
+            },
+        }),
         # original retweets (#1026)
         ("https://twitter.com/jessica_3978/status/1296304589591810048", {
             "options": (("retweets", "original"),),
@@ -915,9 +952,8 @@ class TwitterAvatarExtractor(TwitterExtractor):
         url = url.replace("_normal.", ".")
         id_str = url.rsplit("/", 2)[1]
-        timestamp = ((int(id_str) >> 22) + 1288834974657) // 1000
 
-        return (self._make_tweet(user, id_str, url, timestamp),)
+        return (self._make_tweet(user, url, id_str),)
 
 
 class TwitterBackgroundExtractor(TwitterExtractor):
@@ -932,7 +968,7 @@ class TwitterBackgroundExtractor(TwitterExtractor):
             "keyword": {
                 "date": "dt:2015-01-12 10:29:43",
                 "filename": "1421058583",
-                "tweet_id": 0,
+                "tweet_id": 554586009367478272,
             },
         }),
         ("https://twitter.com/User16/header_photo", {
@@ -950,7 +986,8 @@
         except (KeyError, ValueError):
             return ()
 
-        return (self._make_tweet(user, None, url, timestamp),)
+        id_str = str((int(timestamp) * 1000 - 1288834974657) << 22)
+        return (self._make_tweet(user, url, id_str),)
 
 
 class TwitterImageExtractor(Extractor):
@@ -1008,9 +1045,6 @@ class TwitterAPI():
 
         auth_token = cookies.get("auth_token", domain=cookiedomain)
 
-        if not auth_token:
-            self.user_media = self.user_media_legacy
-
         self.headers = {
             "Accept": "*/*",
             "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
@@ -1071,6 +1105,7 @@
             "withReactionsPerspective": False,
         }
         self.features = {
+            "blue_business_profile_image_shape_enabled": False,
             "responsive_web_twitter_blue_verified_badge_is_enabled": True,
             "responsive_web_graphql_exclude_directive_enabled": True,
             "verified_phone_label_enabled": False,
@@ -1079,6 +1114,7 @@
             "responsive_web_graphql_timeline_navigation_enabled": True,
         }
         self.features_pagination = {
+            "blue_business_profile_image_shape_enabled": False,
             "responsive_web_twitter_blue_verified_badge_is_enabled": True,
             "responsive_web_graphql_exclude_directive_enabled": True,
             "verified_phone_label_enabled": False,
@@ -1103,7 +1139,7 @@
         }
 
     def tweet_detail(self, tweet_id):
-        endpoint = "/graphql/zXaXQgfyR4GxE21uwYQSyA/TweetDetail"
+        endpoint = "/graphql/AV_lPTkN6Fc6LgerQpK8Zg/TweetDetail"
         variables = {
             "focalTweetId": tweet_id,
             "referrer": "profile",
@@ -1121,7 +1157,7 @@
             endpoint, variables,
             ("threaded_conversation_with_injections_v2",))
 
     def user_tweets(self, screen_name):
-        endpoint = "/graphql/9rys0A7w1EyqVd2ME0QCJg/UserTweets"
+        endpoint = "/graphql/BeHK76TOCY3P8nO-FWocjA/UserTweets"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1133,7 +1169,7 @@
         return self._pagination_tweets(endpoint, variables)
 
     def user_tweets_and_replies(self, screen_name):
-        endpoint = "/graphql/ehMCHF3Mkgjsfz_aImqOsg/UserTweetsAndReplies"
+        endpoint = "/graphql/eZVlZu_1gwb6hMUDXBnZoQ/UserTweetsAndReplies"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1145,7 +1181,7 @@
         return self._pagination_tweets(endpoint, variables)
 
     def user_media(self, screen_name):
-        endpoint = "/graphql/MA_EP2a21zpzNWKRkaPBMg/UserMedia"
+        endpoint = "/graphql/d_ONZLUHGCsErBCriRsLXg/UserMedia"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1178,7 +1214,7 @@
             features=False)
 
     def user_likes(self, screen_name):
-        endpoint = "/graphql/XbHBYpgURwtklXj8NNxTDw/Likes"
+        endpoint = "/graphql/fN4-E0MjFJ9Cn7IYConL7g/Likes"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1191,15 +1227,18 @@
         return self._pagination_tweets(endpoint, variables)
 
     def user_bookmarks(self):
-        endpoint = "/graphql/Xq0wQSWHlcfnXARLJGqTxg/Bookmarks"
+        endpoint = "/graphql/RV1g3b8n_SGOHwkqKYSCFw/Bookmarks"
         variables = {
             "count": 100,
         }
+        features = self.features_pagination.copy()
+        features["graphql_timeline_v2_bookmark_timeline"] = True
         return self._pagination_tweets(
-            endpoint, variables, ("bookmark_timeline", "timeline"), False)
+            endpoint, variables, ("bookmark_timeline_v2", "timeline"), False,
+            features=features)
 
     def list_latest_tweets_timeline(self, list_id):
-        endpoint = "/graphql/FDI9EiIp54KxEOWGiv3B4A/ListLatestTweetsTimeline"
+        endpoint = "/graphql/5DAiJG3bD77SiWEs4xViBw/ListLatestTweetsTimeline"
         variables = {
             "listId": list_id,
             "count": 100,
@@ -1234,7 +1273,7 @@
             ["twitter_objects"]["live_events"][event_id])
 
     def list_by_rest_id(self, list_id):
-        endpoint = "/graphql/KlGpwq5CAt9tCfHkV2mwYQ/ListByRestId"
+        endpoint = "/graphql/D0EoyrDcct2MEqC-LnPzFg/ListByRestId"
         params = {
             "variables": self._json_dumps({
                 "listId": list_id,
@@ -1248,7 +1287,7 @@
             raise exception.NotFoundError("list")
 
     def list_members(self, list_id):
-        endpoint = "/graphql/XsAJX17RLgLYU8GALIWg2g/ListMembers"
+        endpoint = "/graphql/tzsIIbGUH9RyFCVmtO2W2w/ListMembers"
         variables = {
             "listId": list_id,
             "count": 100,
@@ -1258,7 +1297,7 @@
             endpoint, variables,
             ("list", "members_timeline", "timeline"))
 
     def user_following(self, screen_name):
-        endpoint = "/graphql/vTZwBbd_gz6aI8v6Wze21A/Following"
+        endpoint = "/graphql/FaBzCqZXuQCb4PhB0RHqHw/Following"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1267,7 +1306,7 @@
         return self._pagination_users(endpoint, variables)
 
     def user_by_rest_id(self, rest_id):
-        endpoint = "/graphql/QPSxc9lxrmrwnBzYkJI8eA/UserByRestId"
+        endpoint = "/graphql/S2BkcAyFMG--jef2N6Dgzw/UserByRestId"
         params = {
             "variables": self._json_dumps({
                 "userId": rest_id,
@@ -1278,7 +1317,7 @@
         return self._call(endpoint, params)["data"]["user"]["result"]
 
     def user_by_screen_name(self, screen_name):
-        endpoint = "/graphql/nZjSkpOpSL5rWyIVdsKeLA/UserByScreenName"
+        endpoint = "/graphql/k26ASEiniqy4eXMdknTSoQ/UserByScreenName"
         params = {
             "variables": self._json_dumps({
                 "screen_name": screen_name,
@@ -1451,15 +1490,17 @@
                 params["cursor"] = cursor
 
     def _pagination_tweets(self, endpoint, variables,
-                           path=None, stop_tweets=True, features=True):
+                           path=None, stop_tweets=True, features=None):
         extr = self.extractor
         variables.update(self.variables)
         original_retweets = (extr.retweets == "original")
         pinned_tweet = extr.pinned
 
         params = {"variables": None}
+        if features is None:
+            features = self.features_pagination
         if features:
-            params["features"] = self._json_dumps(self.features_pagination)
+            params["features"] = self._json_dumps(features)
 
         while True:
             params["variables"] = self._json_dumps(variables)
@@ -1550,6 +1591,7 @@
                     if "tweet" in tweet:
                         tweet = tweet["tweet"]
                     legacy = tweet["legacy"]
+                    tweet["sortIndex"] = entry.get("sortIndex")
                 except KeyError:
                     extr.log.debug(
                         "Skipping %s (deleted)",
@@ -1574,10 +1616,17 @@
                             retweet["rest_id"]
                         tweet["author"] = \
                             retweet["core"]["user_results"]["result"]
-                        if "extended_entities" in retweet["legacy"] and \
+
+                        rtlegacy = retweet["legacy"]
+                        if "extended_entities" in rtlegacy and \
                                 "extended_entities" not in legacy:
                             legacy["extended_entities"] = \
-                                retweet["legacy"]["extended_entities"]
+                                rtlegacy["extended_entities"]
+                        if "withheld_scope" in rtlegacy and \
+                                "withheld_scope" not in legacy:
+                            legacy["withheld_scope"] = \
+                                rtlegacy["withheld_scope"]
+                        legacy["full_text"] = rtlegacy["full_text"]
                     except KeyError:
                         pass
 
@@ -1590,6 +1639,8 @@
                             tweet["core"]["user_results"]["result"]
                             ["legacy"]["screen_name"])
                         quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
+                        quoted["sortIndex"] = entry.get("sortIndex")
+
                         yield quoted
                     except KeyError:
                         extr.log.debug(
@@ -1679,9 +1730,10 @@
                 "in_reply_to_status_id_str" not in tweet:
             tweet["conversation_id_str"] = tweet["id_str"]
 
-        tweet["created_at"] = text.parse_datetime(
-            tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime(
-            "%a %b %d %H:%M:%S +0000 %Y")
+        if int(tweet_id) < 300000000000000:
+            tweet["created_at"] = text.parse_datetime(
+                tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime(
+                "%a %b %d %H:%M:%S +0000 %Y")
 
         if "video" in tweet:
             video = tweet["video"]
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
new file mode 100644
index 0000000..1a39b5b
--- /dev/null
+++ b/gallery_dl/extractor/urlshortener.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for general-purpose URL shorteners"""
+
+from .common import BaseExtractor, Message
+from .. import exception
+
+
+class UrlshortenerExtractor(BaseExtractor):
+    """Base class for URL shortener extractors"""
+    basecategory = "urlshortener"
+
+
+INSTANCES = {
+    "bitly": {
+        "root": "https://bit.ly",
+        "pattern": r"bit\.ly",
+    },
+    "tco": {
+        # t.co sends 'http-equiv="refresh"' (200) when using browser UA
+        "headers": {"User-Agent": None},
+        "root": "https://t.co",
+        "pattern": r"t\.co",
+    },
+}
+
+BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES)
+
+
+class UrlshortenerLinkExtractor(UrlshortenerExtractor):
+    """Extractor for general-purpose URL shorteners"""
+    subcategory = "link"
+    pattern = BASE_PATTERN + r"/([^/?&#]+)"
+    test = (
+        ("https://bit.ly/3cWIUgq", {
+            "count": 1,
+            "pattern": "^https://gumroad.com/l/storm_b1",
+        }),
+        ("https://t.co/bCgBY8Iv5n", {
+            "count": 1,
+            "pattern": "^https://twitter.com/elonmusk/status/"
+                       "1421395561324896257/photo/1",
+        }),
+        ("https://t.co/abcdefghij", {
+            "exception": exception.NotFoundError,
+        }),
+    )
+
+    def __init__(self, match):
+        UrlshortenerExtractor.__init__(self, match)
+        self.id = match.group(match.lastindex)
+
+        try:
+            self.headers = INSTANCES[self.category]["headers"]
+        except Exception:
+            self.headers = None
+
+    def items(self):
+        response = self.request(
+            "{}/{}".format(self.root, self.id), headers=self.headers,
+            method="HEAD", allow_redirects=False, notfound="URL")
+
+        try:
+            yield Message.Queue, response.headers["location"], {}
+        except KeyError:
+            raise exception.StopExtraction("Unable to resolve short URL")
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 93a9148..c40736a 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.25.1"
+__version__ = "1.25.2"
diff --git a/test/test_downloader.py b/test/test_downloader.py
index bbee0f4..c65be95 100644
--- a/test/test_downloader.py
+++ b/test/test_downloader.py
@@ -289,6 +289,10 @@ SAMPLES = {
     ("webp", b"RIFF????WEBP"),
     ("avif", b"????ftypavif"),
    ("avif", b"????ftypavis"),
+    ("heic", b"????ftypheic"),
+    ("heic", b"????ftypheim"),
+    ("heic", b"????ftypheis"),
+    ("heic", b"????ftypheix"),
     ("svg" , b"<?xml"),
     ("ico" , b"\x00\x00\x01\x00"),
     ("cur" , b"\x00\x00\x02\x00"),
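A note on the danbooru.py change above: instead of one metadata request per post, the extractor now re-requests the 200-post page it just fetched, restricted to the extra fields via Danbooru's "only" parameter, and merges the results back by id. A rough standalone sketch of the same pattern follows (the tag search is an arbitrary example; gallery-dl routes this through its own request layer):

import requests

# Fetch a page of posts (newest first, so posts[0] has the highest ID)
posts = requests.get(
    "https://danbooru.donmai.us/posts.json",
    params={"limit": 200, "tags": "houseki_no_kuni"},
).json()

# Re-request the same batch: page "b<id>" selects posts with IDs below
# <id>, so starting just above the newest ID reselects these 200 posts,
# while "only" limits the response to the extended metadata fields.
meta = requests.get(
    "https://danbooru.donmai.us/posts.json",
    params={
        "limit": 200,
        "page": "b{}".format(posts[0]["id"] + 1),
        "only": "id,artist_commentary,children,notes,parent,uploader",
    },
).json()

# Merge the metadata back into the posts by ID
by_id = {m["id"]: m for m in meta}
for post in posts:
    post.update(by_id[post["id"]])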
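The new `.heic` handling in downloader/http.py keys off the ISO-BMFF "ftyp" box: the first four bytes of the file are the box size, so the brand string starts at offset 4. A minimal sketch of the same check, using the sample signatures added to test/test_downloader.py (where "?" stands for the arbitrary size bytes):

HEIC_BRANDS = (b"ic", b"im", b"is", b"ix", b"vc", b"vm", b"vs")

def is_heic(signature: bytes) -> bool:
    # bytes 4-9 hold "ftyphe", bytes 10-11 complete the brand name
    return signature[4:10] == b"ftyphe" and signature[10:12] in HEIC_BRANDS

assert is_heic(b"????ftypheic")
assert is_heic(b"????ftypheix")
assert not is_heic(b"????ftypavif")  # AVIF is matched by its own check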
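The date-from-Tweet-ID computation in twitter.py relies on Snowflake IDs, which store a millisecond offset from Twitter's epoch (1288834974657) in the bits above the low 22. A minimal sketch:

from datetime import datetime, timezone

TWITTER_EPOCH_MS = 1288834974657  # 2010-11-04 01:42:54.657 UTC

def date_from_tweet_id(tweet_id):
    # The low 22 bits are worker/sequence numbers; the rest is a
    # millisecond timestamp relative to Twitter's epoch.
    ms = (tweet_id >> 22) + TWITTER_EPOCH_MS
    return datetime.fromtimestamp(ms // 1000, timezone.utc)

# Matches the "date" keyword in the retweet test above:
print(date_from_tweet_id(1296304589591810048))  # 2020-08-20 04:34:32+00:00

The `>= 300000000000000` guard in the diff keeps the old `created_at` parsing for pre-Snowflake IDs, and the new `date_liked`/`date_bookmarked` values apply the same idea to a timeline entry's `sortIndex` (shifted by 20 bits, with no epoch offset).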
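Finally, the new urlshortener extractor resolves bit.ly and t.co links with a single HEAD request and reads the Location header itself instead of following redirects (per the comment in the diff, t.co returns an HTML refresh page to browser user agents, hence the stripped User-Agent). A minimal requests-based equivalent:

import requests

def resolve(short_url, headers=None):
    # Ask for the redirect target without following it
    response = requests.head(short_url, headers=headers,
                             allow_redirects=False)
    try:
        return response.headers["location"]
    except KeyError:
        raise RuntimeError("unable to resolve " + short_url)

# e.g. resolve("https://bit.ly/3cWIUgq") -> "https://gumroad.com/l/storm_b1"
# per the extractor's test case above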
