| field | value |
|---|---|
| author | 2025-09-07 20:40:52 -0400 |
| committer | 2025-09-07 20:40:52 -0400 |
| commit | 88f40b9b0dc47fa22a209e8246d97a43f4b60cb2 (patch) |
| tree | 5764999c5d1ce99ccebd92dddb7a3914b55e43c2 |
| parent | 7ac1b3bb04430b981f4f796fd765499cdc8b67ec (diff) |
| parent | 243b2597edb922fe7e0b0d887e80bb7ebbe72ab7 (diff) |
Update upstream source from tag 'upstream/1.30.6'
Update to upstream version '1.30.6'
with Debian dir 9f14996b07ee3246bdcde2ec12796c77da2a3060
36 files changed, 995 insertions, 226 deletions
```diff
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 227e251..8fb17a4 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,36 +1,43 @@
-## 1.30.5 - 2025-08-24
+## 1.30.6 - 2025-09-06
 ### Extractors
 #### Additions
-- [shimmie2] support `noz.rip/booru` ([#8101](https://github.com/mikf/gallery-dl/issues/8101))
-- [sizebooru] add support ([#7667](https://github.com/mikf/gallery-dl/issues/7667))
-- [twitter] add `highlights` extractor ([#7826](https://github.com/mikf/gallery-dl/issues/7826))
-- [twitter] add `home` extractor ([#7974](https://github.com/mikf/gallery-dl/issues/7974))
+- [chevereto] add `video` extractor ([#8149](https://github.com/mikf/gallery-dl/issues/8149))
+- [comick] add `covers` extractor
+- [fansly] add support ([#4401](https://github.com/mikf/gallery-dl/issues/4401))
+- [instagram] add `stories-tray` extractor ([#6582](https://github.com/mikf/gallery-dl/issues/6582))
+- [shimmie2] support `co.llection.pics` ([#8166](https://github.com/mikf/gallery-dl/issues/8166))
+- [tungsten] add support ([#8061](https://github.com/mikf/gallery-dl/issues/8061))
+- [vk] add `wall-post` extractor ([#474](https://github.com/mikf/gallery-dl/issues/474) [#6378](https://github.com/mikf/gallery-dl/issues/6378) [#8159](https://github.com/mikf/gallery-dl/issues/8159))
 #### Fixes
-- [aryion] fix pagination ([#8091](https://github.com/mikf/gallery-dl/issues/8091))
-- [rule34] support using `api-key` & `user-id` ([#8077](https://github.com/mikf/gallery-dl/issues/8077) [#8088](https://github.com/mikf/gallery-dl/issues/8088) [#8098](https://github.com/mikf/gallery-dl/issues/8098))
-- [tumblr:search] fix `ValueError: not enough values to unpack` ([#8079](https://github.com/mikf/gallery-dl/issues/8079))
-- [twitter] handle `KeyError: 'result'` for retweets ([#8072](https://github.com/mikf/gallery-dl/issues/8072))
-- [zerochan] expect `500 Internal Server Error` responses for HTML requests ([#8097](https://github.com/mikf/gallery-dl/issues/8097))
+- [bunkr] fix downloading albums with more than 100 files ([#8150](https://github.com/mikf/gallery-dl/issues/8150) [#8155](https://github.com/mikf/gallery-dl/issues/8155) [#8175](https://github.com/mikf/gallery-dl/issues/8175))
+- [chevereto:user] fix names starting with an `a` ([#8149](https://github.com/mikf/gallery-dl/issues/8149))
+- [common] prevent exception when using empty `user-agent` ([#8116](https://github.com/mikf/gallery-dl/issues/8116))
+- [deviantart:search] fix extraction ([#8083](https://github.com/mikf/gallery-dl/issues/8083))
+- [hentaifoundry:story] fix `src` & `description` extraction ([#8163](https://github.com/mikf/gallery-dl/issues/8163))
+- [imagebam] update guard page bypass cookies ([#8123](https://github.com/mikf/gallery-dl/issues/8123))
+- [kemono] fix `.bin` archive files not being added to archives list ([#8156](https://github.com/mikf/gallery-dl/issues/8156))
+- [reddit] fix `TypeError` when processing comments ([#8139](https://github.com/mikf/gallery-dl/issues/8139))
+- [tumblr] fix pagination when using `date-max`
+- [twitter] prevent exceptions in `_transform_community()` ([#8134](https://github.com/mikf/gallery-dl/issues/8134))
+- [twitter] prevent `KeyError: 'name'` in `_transform_user()` ([#8154](https://github.com/mikf/gallery-dl/issues/8154))
+- [twitter] fix `KeyError: 'core'` when processing communities ([#8141](https://github.com/mikf/gallery-dl/issues/8141))
+- [zerochan] fix `500 Internal Server Error` during login ([#8097](https://github.com/mikf/gallery-dl/issues/8097) [#8114](https://github.com/mikf/gallery-dl/issues/8114))
 #### Improvements
-- [civitai:search] add `token` option ([#8093](https://github.com/mikf/gallery-dl/issues/8093))
-- [instagram] warn about lower quality video downloads ([#7921](https://github.com/mikf/gallery-dl/issues/7921) [#8078](https://github.com/mikf/gallery-dl/issues/8078))
-- [instagram] remove `candidates` warning ([#7921](https://github.com/mikf/gallery-dl/issues/7921) [#7989](https://github.com/mikf/gallery-dl/issues/7989) [#8071](https://github.com/mikf/gallery-dl/issues/8071))
-- [oauth] improve error messages ([#8086](https://github.com/mikf/gallery-dl/issues/8086))
-- [pixiv] distinguish empty from deleted profiles ([#8066](https://github.com/mikf/gallery-dl/issues/8066))
-- [twitter] update API endpoint query hashes & parameters
+- [comick] detect broken chapters ([#8054](https://github.com/mikf/gallery-dl/issues/8054))
+- [erome] handle reposts on user profiles ([#6582](https://github.com/mikf/gallery-dl/issues/6582))
+- [instagram] improve video quality warning regex ([#8078](https://github.com/mikf/gallery-dl/issues/8078))
+- [jpgfish] update domain to `jpg6.su`
+- [reddit] add `api` & `limit` options ([#7997](https://github.com/mikf/gallery-dl/issues/7997) [#8012](https://github.com/mikf/gallery-dl/issues/8012) [#8092](https://github.com/mikf/gallery-dl/issues/8092))
+- [reddit] support video embeds ([#8139](https://github.com/mikf/gallery-dl/issues/8139))
+- [tumblr:tagged] support `/archive/tagged/` URLs ([#8160](https://github.com/mikf/gallery-dl/issues/8160))
 #### Metadata
-- [batoto] extract more metadata ([#7994](https://github.com/mikf/gallery-dl/issues/7994))
-- [instagram:highlights] extract `author` & `owner` & `user` metadata ([#7846](https://github.com/mikf/gallery-dl/issues/7846))
-- [newgrounds] extract `slug` metadata ([#8064](https://github.com/mikf/gallery-dl/issues/8064))
-- [twitter] extract `community` metadata ([#7424](https://github.com/mikf/gallery-dl/issues/7424))
-#### Removals
-- [shimmie2] remove `sizechangebooru.com` ([#7667](https://github.com/mikf/gallery-dl/issues/7667))
-- [zzup] remove module ([#4604](https://github.com/mikf/gallery-dl/issues/4604))
+- [khinsider] extract `description` metadata
+- [tumblr:tagged] provide `search_tags` metadata ([#8160](https://github.com/mikf/gallery-dl/issues/8160))
+- [vk] parse `date` & `description` metadata ([#8029](https://github.com/mikf/gallery-dl/issues/8029))
+- [vk:album] extract more metadata ([#8029](https://github.com/mikf/gallery-dl/issues/8029))
 ### Downloaders
-- [ytdl] improve playlist handling ([#8085](https://github.com/mikf/gallery-dl/issues/8085))
-### Scripts
-- implement `rm` helper script
-- add `-g/--git` command-line options
-- [util] add `git()` & `lines()` helper functions
+- [ytdl] implement `_ytdl_manifest_cookies`
 ### Miscellaneous
-- [config] add `conf` argument to `config.load()` ([#8084](https://github.com/mikf/gallery-dl/issues/8084))
+- [formatter] add `R` conversion - extract URLs ([#8125](https://github.com/mikf/gallery-dl/issues/8125))
+- [options] add `-a` as short option for `--user-agent`
+- [scripts/init] implement `-s/--subcategory`
```
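Before the file-by-file diff below, a quick orientation: the new per-extractor options introduced in this release are plain config keys. A minimal `gallery-dl.conf` sketch based on the `docs/gallery-dl.conf` defaults further down (the fansly token value is a placeholder; `"reposts": true` and `"pagination": "before"` are shown as non-default illustrations):

```json
{
    "extractor": {
        "erome" : {"user": {"reposts": true}},
        "fansly": {
            "token" : "<fansly 'authorization' header value>",
            "format": [303, 302, 1, 2, 4]
        },
        "tumblr": {"pagination": "before"}
    }
}
```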
```diff
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gallery_dl
-Version: 1.30.5
+Version: 1.30.6
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -139,9 +139,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.6/gallery-dl.bin>`__
 
 Nightly Builds
diff --git a/README.rst b/README.rst
@@ -79,9 +79,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.6/gallery-dl.bin>`__
 
 Nightly Builds
diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl
index 07cfcd9..539ec1b 100644
--- a/data/completion/_gallery-dl
+++ b/data/completion/_gallery-dl
@@ -11,7 +11,7 @@ _arguments -s -S \
 {-d,--destination}'[Target location for file downloads]':'<path>' \
 {-D,--directory}'[Exact location for file downloads]':'<path>' \
 {-X,--extractors}'[Load external extractors from PATH]':'<path>' \
---user-agent'[User-Agent request header]':'<ua>' \
+{-a,--user-agent}'[User-Agent request header]':'<ua>' \
 --clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'<module>' \
 --compat'[Restore legacy '\''category'\'' names]' \
 {-U,--update-check}'[Check if a newer version is available]' \
diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish
index b7e4fe4..84ff5b5 100644
--- a/data/completion/gallery-dl.fish
+++ b/data/completion/gallery-dl.fish
@@ -5,7 +5,7 @@ complete -c gallery-dl -x -s 'f' -l 'filename' -d 'Filename format string for do
 complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'd' -l 'destination' -d 'Target location for file downloads'
 complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'D' -l 'directory' -d 'Exact location for file downloads'
 complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'X' -l 'extractors' -d 'Load external extractors from PATH'
-complete -c gallery-dl -x -l 'user-agent' -d 'User-Agent request header'
+complete -c gallery-dl -x -s 'a' -l 'user-agent' -d 'User-Agent request header'
 complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)'
 complete -c gallery-dl -l 'compat' -d 'Restore legacy "category" names'
 complete -c gallery-dl -s 'U' -l 'update-check' -d 'Check if a newer version is available'
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 89d7116..3cda42a 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2025-08-24" "1.30.5" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2025-09-06" "1.30.6" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
@@ -35,7 +35,7 @@ Exact location for file downloads
 .B "\-X, \-\-extractors" \f[I]PATH\f[]
 Load external extractors from PATH
 .TP
-.B "\-\-user\-agent" \f[I]UA\f[]
+.B "\-a, \-\-user\-agent" \f[I]UA\f[]
 User-Agent request header
 .TP
 .B "\-\-clear\-cache" \f[I]MODULE\f[]
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 4870130..335195f 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2025-08-24" "1.30.5" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2025-09-06" "1.30.6" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -3020,6 +3020,17 @@ greater than the per-page limit,
 gallery-dl will stop after the first batch.
 
 The value cannot be less than 1.
 
 
+.SS extractor.erome.user.reposts
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Include reposts when extracting albums from a user profile.
+
+
 .SS extractor.exhentai.domain
 .IP "Type:" 6
 \f[I]string\f[]
@@ -3321,6 +3332,32 @@ Note: \f[I]comments\f[] can also be enabled via
 \f[I]fanbox.comments\f[]
 
 
+.SS extractor.fansly.format
+.IP "Type:" 6
+\f[I]list\f[] of \f[I]integers\f[]
+
+.IP "Default:" 9
+\f[I][303, 302, 1, 2, 4]\f[]
+
+.IP "Description:" 4
+Selects the file format to extract.
+
+When more than one format is given, the first available one is selected.
+
+
+.SS extractor.fansly.token
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Example:" 4
+"kX7pL9qW3zT2rY8mB5nJ4vC6xF1tA0hD8uE2wG9yR3sQ7iZ4oM5jN6cP8lV0bK2tU9aL1eW"
+
+.IP "Description:" 4
+\f[I]authorization\f[] header value
+used for requests to \f[I]https://apiv3.fansly.com/api\f[]
+to access locked content.
+
+
 .SS extractor.flickr.access-token & .access-token-secret
 .IP "Type:" 6
 \f[I]string\f[]
@@ -5311,6 +5348,33 @@ Sets the \f[I]quality\f[] query parameter of issue pages. (\f[I]"lq"\f[] or
 \f[I]"hq"\f[] if not present.
 
 
+.SS extractor.reddit.api
+.IP "Type:" 6
+\f[I]string\f[]
+
+.IP "Default:" 9
+\f[I]"oauth"\f[]
+
+.IP "Description:" 4
+Selects which API endpoints to use.
+
+\f[I]"oauth"\f[]
+Use the OAuth API at \f[I]https://oauth.reddit.com\f[]
+
+Requires
+\f[I]client-id & user-agent\f[]
+and uses a
+\f[I]refresh token\f[]
+for authentication.
+
+\f[I]"rest"\f[]
+Use the REST API at \f[I]https://www.reddit.com\f[]
+
+Uses
+\f[I]cookies\f[]
+for authentication.
+
+
 .SS extractor.reddit.comments
 .IP "Type:" 6
 \f[I]integer\f[]
@@ -5378,6 +5442,23 @@ Ignore all submissions posted before/after this date.
 Ignore all submissions posted before/after the submission with this ID.
 
 
+.SS extractor.reddit.limit
+.IP "Type:" 6
+\f[I]integer\f[]
+
+.IP "Default:" 9
+\f[I]null\f[]
+
+.IP "Description:" 4
+Number of results to return in a single API query.
+
+This value specifies the \f[I]limit\f[] parameter
+used for API requests when retrieving paginated results.
+
+\f[I]null\f[] means not including this parameter at all
+and letting Reddit choose a default.
+
+
 .SS extractor.reddit.previews
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -6181,7 +6262,10 @@ use an extra HTTP request to find the URL to its full-resolution version.
 \f[I]string\f[]
 
 .IP "Default:" 9
-\f[I]"offset"\f[]
+.br
+* \f[I]"before"\f[] if \f[I]date-max\f[] is set
+.br
+* \f[I]"offset"\f[] otherwise
 
 .IP "Description:" 4
 Controls how to paginate over blog posts.
```
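Putting the two new Reddit options documented above together: switching extraction to the cookie-authenticated REST endpoints would look like this in `gallery-dl.conf` — a minimal sketch; the limit of 100 simply restores the page size the code previously hardcoded (see the `reddit.py` diff below), and the default remains `"oauth"` with `limit` unset:

```json
{
    "extractor": {
        "reddit": {
            "api"  : "rest",
            "limit": 100
        }
    }
}
```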
```diff
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 1890b72..f1e3833 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -287,6 +287,12 @@
             "metadata": false
         }
     },
+    "erome":
+    {
+        "user": {
+            "reposts": false
+        }
+    },
     "exhentai":
     {
         "username": "",
@@ -322,6 +328,12 @@
         "fee-max" : null,
         "metadata": false
     },
+    "fansly":
+    {
+        "token": "",
+
+        "format": [303, 302, 1, 2, 4]
+    },
     "flickr":
     {
         "access-token" : null,
@@ -617,6 +629,7 @@
         "user-agent"   : null,
         "refresh-token": null,
+        "api"          : "oauth",
         "comments"     : 0,
         "morecomments" : false,
         "embeds"       : true,
@@ -625,6 +638,7 @@
         "date-format"  : "%Y-%m-%dT%H:%M:%S",
         "id-min"       : null,
         "id-max"       : null,
+        "limit"        : null,
         "previews"     : true,
         "recursion"    : 0,
         "selftext"     : null,
@@ -771,7 +785,7 @@
         "inline"    : true,
         "offset"    : 0,
         "original"  : true,
-        "pagination": "offset",
+        "pagination": null,
         "posts"     : "all",
         "ratelimit" : "abort",
         "reblogs"   : true,
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 559c580..353bfae 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: gallery_dl
-Version: 1.30.5
+Version: 1.30.6
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -139,9 +139,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.30.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.30.6/gallery-dl.bin>`__
 
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 6f7a991..1a0bc19 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -94,6 +94,7 @@ gallery_dl/extractor/everia.py
 gallery_dl/extractor/exhentai.py
 gallery_dl/extractor/facebook.py
 gallery_dl/extractor/fanbox.py
+gallery_dl/extractor/fansly.py
 gallery_dl/extractor/fantia.py
 gallery_dl/extractor/fapachi.py
 gallery_dl/extractor/fapello.py
@@ -240,6 +241,7 @@ gallery_dl/extractor/toyhouse.py
 gallery_dl/extractor/tsumino.py
 gallery_dl/extractor/tumblr.py
 gallery_dl/extractor/tumblrgallery.py
+gallery_dl/extractor/tungsten.py
 gallery_dl/extractor/twibooru.py
 gallery_dl/extractor/twitter.py
 gallery_dl/extractor/unsplash.py
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 9ef8816..a56a6be 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -11,6 +11,7 @@
 from .common import DownloaderBase
 from .. import ytdl, text
 from xml.etree import ElementTree
+from http.cookiejar import Cookie
 import os
@@ -85,7 +86,8 @@ class YoutubeDLDownloader(DownloaderBase):
                 info_dict = self._extract_manifest(
                     ytdl_instance, url, manifest,
                     kwdict.pop("_ytdl_manifest_data", None),
-                    kwdict.pop("_ytdl_manifest_headers", None))
+                    kwdict.pop("_ytdl_manifest_headers", None),
+                    kwdict.pop("_ytdl_manifest_cookies", None))
             else:
                 info_dict = self._extract_info(ytdl_instance, url)
         except Exception as exc:
@@ -194,10 +196,21 @@ class YoutubeDLDownloader(DownloaderBase):
         return ytdl.extract_info(url, download=False)
 
     def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None,
-                          headers=None):
+                          headers=None, cookies=None):
         extr = ytdl.get_info_extractor("Generic")
         video_id = extr._generic_id(url)
 
+        if cookies is not None:
+            if isinstance(cookies, dict):
+                cookies = cookies.items()
+            set_cookie = ytdl.cookiejar.set_cookie
+            for name, value in cookies:
+                set_cookie(Cookie(
+                    0, name, value, None, False,
+                    "", False, False, "/", False,
+                    False, None, False, None, None, {},
+                ))
+
         if manifest_type == "hls":
             if manifest_data is None:
                 try:
```
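The new `_ytdl_manifest_cookies` hook above accepts either a dict or an iterable of `(name, value)` pairs. A minimal sketch of the kwdict an extractor yields alongside a `ytdl:` URL (placeholder values; the fansly extractor further down uses exactly this shape for CloudFront-signed manifests):

```python
# The ytdl downloader pops "_ytdl_manifest_cookies" from this kwdict and loads
# each (name, value) pair into yt-dlp's cookiejar before fetching the manifest.
file = {
    "url": "ytdl:https://cdn.example.com/video/manifest.mpd",  # placeholder URL
    "_ytdl_manifest": "dash",  # or "hls"
    "_ytdl_manifest_cookies": (
        ("CloudFront-Key-Pair-Id", "<key-pair-id>"),
        ("CloudFront-Signature",   "<signature>"),
        ("CloudFront-Policy",      "<policy>"),
    ),
}
```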
```diff
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index aabaa93..574d1e2 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -56,6 +56,7 @@ modules = [
     "exhentai",
     "facebook",
     "fanbox",
+    "fansly",
     "fantia",
     "fapello",
     "fapachi",
@@ -190,6 +191,7 @@ modules = [
     "tsumino",
     "tumblr",
     "tumblrgallery",
+    "tungsten",
     "twibooru",
     "twitter",
     "urlgalleries",
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index eba1678..cf5bce1 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -62,7 +62,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
     root = "https://bunkr.si"
     root_dl = "https://get.bunkrr.su"
     root_api = "https://apidl.bunkr.ru"
-    archive_fmt = "{album_id}_{id|id_url}"
+    archive_fmt = "{album_id}_{id|id_url|slug}"
     pattern = BASE_PATTERN + r"/a/([^/?#]+)"
     example = "https://bunkr.si/a/ID"
@@ -134,13 +134,13 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
 
     def fetch_album(self, album_id):
         # album metadata
-        page = self.request(self.root + "/a/" + album_id).text
+        page = self.request(f"{self.root}/a/{album_id}?advanced=1").text
         title = text.unescape(text.unescape(text.extr(
             page, 'property="og:title" content="', '"')))
 
         # files
-        items = list(text.extract_iter(
-            page, '<div class="grid-images_box', "</a>"))
+        items = text.extr(
+            page, "window.albumFiles = [", "</script>").split("\n},\n")
 
         return self._extract_files(items), {
             "album_id"   : album_id,
@@ -156,30 +156,29 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         for item in items:
             try:
-                url = text.unescape(text.extr(item, ' href="', '"'))
-                if url[0] == "/":
-                    url = self.root + url
-
-                file = self._extract_file(url)
-                info = text.split_html(item)
-                if not file["name"]:
-                    file["name"] = info[-3]
-                file["size"] = info[-2]
-                file["date"] = text.parse_datetime(
-                    info[-1], "%H:%M:%S %d/%m/%Y")
+                data_id = text.extr(item, " id: ", ",").strip()
+                file = self._extract_file(data_id)
+
+                file["name"] = util.json_loads(text.extr(
+                    item, 'original:', ',\n').replace("\\'", "'"))
+                file["slug"] = util.json_loads(text.extr(
+                    item, 'slug: ', ',\n'))
+                file["uuid"] = text.extr(
+                    item, 'name: "', ".")
+                file["size"] = text.parse_int(text.extr(
+                    item, "size: ", " ,\n"))
+                file["date"] = text.parse_datetime(text.extr(
+                    item, 'timestamp: "', '"'), "%H:%M:%S %d/%m/%Y")
 
                 yield file
             except exception.ControlException:
                 raise
             except Exception as exc:
                 self.log.error("%s: %s", exc.__class__.__name__, exc)
-                self.log.debug("", exc_info=exc)
-
-    def _extract_file(self, webpage_url):
-        page = self.request(webpage_url).text
-        data_id = text.extr(page, 'data-file-id="', '"')
-        referer = self.root_dl + "/file/" + data_id
+                self.log.debug("%s", item, exc_info=exc)
 
+    def _extract_file(self, data_id):
+        referer = f"{self.root_dl}/file/{data_id}"
         headers = {"Referer": referer, "Origin": self.root_dl}
         data = self.request_json(self.endpoint, method="POST",
                                  headers=headers, json={"id": data_id})
@@ -190,14 +189,9 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         else:
             file_url = data["url"]
 
-        file_name = text.extr(page, "<h1", "<").rpartition(">")[2]
-        fallback = text.extr(page, 'property="og:url" content="', '"')
-
         return {
             "file"          : file_url,
-            "name"          : text.unescape(file_name),
             "id_url"        : data_id,
-            "_fallback"     : (fallback,) if fallback else (),
             "_http_headers" : {"Referer": referer},
             "_http_validate": self._validate,
         }
@@ -222,7 +216,13 @@ class BunkrMediaExtractor(BunkrAlbumExtractor):
 
     def fetch_album(self, album_id):
         try:
-            file = self._extract_file(self.root + album_id)
+            page = self.request(f"{self.root}{album_id}").text
+            data_id = text.extr(page, 'data-file-id="', '"')
+            file = self._extract_file(data_id)
+            file["name"] = text.unquote(text.unescape(text.extr(
+                page, "<h1", "<").rpartition(">")[2]))
+            file["slug"] = album_id.rpartition("/")[2]
+            file["uuid"] = text.extr(page, "/thumbs/", ".")
         except Exception as exc:
             self.log.error("%s: %s", exc.__class__.__name__, exc)
             return (), {}
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 1da7e23..6ba4d08 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -15,7 +15,7 @@ from .. import text, util
 class CheveretoExtractor(BaseExtractor):
     """Base class for chevereto extractors"""
     basecategory = "chevereto"
-    directory_fmt = ("{category}", "{user}", "{album}",)
+    directory_fmt = ("{category}", "{user}", "{album}")
     archive_fmt = "{id}"
 
     def _init(self):
@@ -39,7 +39,7 @@ class CheveretoExtractor(BaseExtractor):
 BASE_PATTERN = CheveretoExtractor.update({
     "jpgfish": {
-        "root": "https://jpg5.su",
+        "root": "https://jpg6.su",
         "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
     },
     "imgkiwi": {
@@ -54,7 +54,7 @@ BASE_PATTERN = CheveretoExtractor.update({
 
 class CheveretoImageExtractor(CheveretoExtractor):
-    """Extractor for chevereto Images"""
+    """Extractor for chevereto images"""
     subcategory = "image"
     pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
     example = "https://jpg2.su/img/TITLE.ID"
@@ -74,22 +74,66 @@ class CheveretoImageExtractor(CheveretoExtractor):
             url, b"seltilovessimpcity@simpcityhatesscrapers",
             fromhex=True)
 
-        image = {
+        file = {
             "id"   : self.path.rpartition(".")[2],
             "url"  : url,
-            "album": text.extr(extr("Added to <a", "/a>"), ">", "<"),
+            "album": text.remove_html(extr(
+                "Added to <a", "</a>").rpartition(">")[2]),
             "date" : text.parse_datetime(extr(
                 '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
             "user" : extr('username: "', '"'),
         }
 
-        text.nameext_from_url(image["url"], image)
-        yield Message.Directory, image
-        yield Message.Url, image["url"], image
+        text.nameext_from_url(file["url"], file)
+        yield Message.Directory, file
+        yield Message.Url, file["url"], file
+
+
+class CheveretoVideoExtractor(CheveretoExtractor):
+    """Extractor for chevereto videos"""
+    subcategory = "video"
+    pattern = BASE_PATTERN + r"(/video/[^/?#]+)"
+    example = "https://imagepond.net/video/TITLE.ID"
+
+    def items(self):
+        url = self.root + self.path
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        file = {
+            "id"       : self.path.rpartition(".")[2],
+            "title"    : text.unescape(extr(
+                'property="og:title" content="', '"')),
+            "thumbnail": extr(
+                'property="og:image" content="', '"'),
+            "url"      : extr(
+                'property="og:video" content="', '"'),
+            "width"    : text.parse_int(extr(
+                'property="video:width" content="', '"')),
+            "height"   : text.parse_int(extr(
+                'property="video:height" content="', '"')),
+            "duration" : extr(
+                'class="far fa-clock"></i>', "—"),
+            "album"    : text.remove_html(extr(
+                "Added to <a", "</a>").rpartition(">")[2]),
+            "date"     : text.parse_datetime(extr(
+                '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
+            "user"     : extr('username: "', '"'),
+        }
+
+        try:
+            min, _, sec = file["duration"].partition(":")
+            file["duration"] = int(min) * 60 + int(sec)
+        except Exception:
+            pass
+
+        text.nameext_from_url(file["url"], file)
+        yield Message.Directory, file
+        yield Message.Url, file["url"], file
 
 
 class CheveretoAlbumExtractor(CheveretoExtractor):
-    """Extractor for chevereto Albums"""
+    """Extractor for chevereto albums"""
     subcategory = "album"
     pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
     example = "https://jpg2.su/album/TITLE.ID"
@@ -109,9 +153,9 @@ class CheveretoAlbumExtractor(CheveretoExtractor):
 
 class CheveretoUserExtractor(CheveretoExtractor):
-    """Extractor for chevereto Users"""
+    """Extractor for chevereto users"""
     subcategory = "user"
-    pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)"
+    pattern = BASE_PATTERN + r"(/[^/?#]+(?:/albums)?)"
     example = "https://jpg2.su/USER"
 
     def items(self):
@@ -119,8 +163,11 @@ class CheveretoUserExtractor(CheveretoExtractor):
         if self.path.endswith("/albums"):
             data = {"_extractor": CheveretoAlbumExtractor}
+            for url in self._pagination(url):
+                yield Message.Queue, url, data
         else:
-            data = {"_extractor": CheveretoImageExtractor}
-
-        for url in self._pagination(url):
-            yield Message.Queue, url, data
+            data_image = {"_extractor": CheveretoImageExtractor}
+            data_video = {"_extractor": CheveretoVideoExtractor}
+            for url in self._pagination(url):
+                data = data_video if "/video/" in url else data_image
+                yield Message.Queue, url, data
diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py
index 6c54156..a6aec38 100644
--- a/gallery_dl/extractor/comick.py
+++ b/gallery_dl/extractor/comick.py
@@ -8,7 +8,7 @@
 
 """Extractors for https://comick.io/"""
 
-from .common import ChapterExtractor, MangaExtractor, Message
+from .common import GalleryExtractor, ChapterExtractor, MangaExtractor, Message
 from .. import text
 from ..cache import memcache
 
@@ -21,6 +21,42 @@ class ComickBase():
     root = "https://comick.io"
 
 
+class ComickCoversExtractor(ComickBase, GalleryExtractor):
+    """Extractor for comick.io manga covers"""
+    subcategory = "covers"
+    directory_fmt = ("{category}", "{manga}", "Covers")
+    filename_fmt = "{volume:>02}_{lang}.{extension}"
+    archive_fmt = "c_{id}"
+    pattern = BASE_PATTERN + r"/comic/([\w-]+)/cover"
+    example = "https://comick.io/comic/MANGA/cover"
+
+    def metadata(self, page):
+        manga = _manga_info(self, self.groups[0])
+        self.slug = manga['manga_slug']
+        return manga
+
+    def images(self, page):
+        url = f"{self.root}/comic/{self.slug}/cover"
+        page = self.request(url).text
+        data = self._extract_nextdata(page)
+
+        covers = data["props"]["pageProps"]["comic"]["md_covers"]
+        covers.reverse()
+
+        return [
+            (f"https://meo.comick.pictures/{cover['b2key']}", {
+                "id"    : cover["id"],
+                "width" : cover["w"],
+                "height": cover["h"],
+                "size"  : cover["s"],
+                "lang"  : cover["locale"],
+                "volume": text.parse_int(cover["vol"]),
+                "cover" : cover,
+            })
+            for cover in covers
+        ]
+
+
 class ComickChapterExtractor(ComickBase, ChapterExtractor):
     """Extractor for comick.io manga chapters"""
     archive_fmt = "{chapter_hid}_{page}"
@@ -60,8 +96,15 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor):
         }
 
     def images(self, page):
+        if not self._images[0].get("b2key") and all(
+                not img.get("b2key") for img in self._images):
+            self.log.error(
+                "%s: Broken Chapter (missing 'b2key' for all pages)",
+                self.groups[1])
+            return ()
+
         return [
-            ("https://meo.comick.pictures/" + img["b2key"], {
+            (f"https://meo.comick.pictures/{img['b2key']}", {
                 "width"  : img["w"],
                 "height" : img["h"],
                 "size"   : img["s"],
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 719fc62..568f435 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -462,7 +462,7 @@ class Extractor():
             headers["Referer"] = self.root + "/"
 
         custom_ua = self.config("user-agent")
-        if custom_ua is None or custom_ua == "auto":
+        if not custom_ua or custom_ua == "auto":
             pass
         elif custom_ua == "browser":
             headers["User-Agent"] = _browser_useragent(None)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index d900f4c..39690da 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1356,7 +1356,8 @@ class DeviantartSearchExtractor(DeviantartExtractor):
 
     def _search_html(self, params):
         url = self.root + "/search"
-
+        find = text.re(r'''href="https://www.deviantart.com/([^/?#]+)'''
+                       r'''/(art|journal)/(?:[^"]+-)?(\d+)''').findall
         while True:
             response = self.request(url, params=params)
 
@@ -1364,12 +1365,11 @@ class DeviantartSearchExtractor(DeviantartExtractor):
                 raise exception.AbortExtraction("HTTP redirect to login page")
 
             page = response.text
-            for dev in DeviantartDeviationExtractor.pattern.findall(
-                    page)[2::3]:
+            for user, type, did in find(page)[:-3:3]:
                 yield {
-                    "deviationId": dev[3],
-                    "author": {"username": dev[0]},
-                    "isJournal": dev[2] == "journal",
+                    "deviationId": did,
+                    "author": {"username": user},
+                    "isJournal": type == "journal",
                 }
 
             cursor = text.extr(page, r'\"cursor\":\"', '\\',)
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 7beeac5..68cfdbc 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -48,10 +48,13 @@ class EromeExtractor(Extractor):
             self.sleep(5.0, "check")
 
     def _pagination(self, url, params):
-        for params["page"] in itertools.count(1):
+        find_albums = EromeAlbumExtractor.pattern.findall
+
+        for params["page"] in itertools.count(
+                text.parse_int(params.get("page"), 1)):
             page = self.request(url, params=params).text
 
-            album_ids = EromeAlbumExtractor.pattern.findall(page)[::2]
+            album_ids = find_albums(page)[::2]
             yield from album_ids
 
             if len(album_ids) < 36:
@@ -114,12 +117,18 @@ class EromeAlbumExtractor(EromeExtractor):
 
 class EromeUserExtractor(EromeExtractor):
     subcategory = "user"
-    pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)"
+    pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)(?:/?\?([^#]+))?"
     example = "https://www.erome.com/USER"
 
     def albums(self):
-        url = f"{self.root}/{self.groups[0]}"
-        return self._pagination(url, {})
+        user, qs = self.groups
+        url = f"{self.root}/{user}"
+
+        params = text.parse_query(qs)
+        if "t" not in params and not self.config("reposts", False):
+            params["t"] = "posts"
+
+        return self._pagination(url, params)
 
 
 class EromeSearchExtractor(EromeExtractor):
@@ -128,7 +137,7 @@ class EromeSearchExtractor(EromeExtractor):
     example = "https://www.erome.com/search?q=QUERY"
 
     def albums(self):
-        url = self.root + "/search"
+        url = f"{self.root}/search"
         params = text.parse_query(self.groups[0])
         return self._pagination(url, params)
```
```diff
diff --git a/gallery_dl/extractor/fansly.py b/gallery_dl/extractor/fansly.py
new file mode 100644
index 0000000..31d242f
--- /dev/null
+++ b/gallery_dl/extractor/fansly.py
@@ -0,0 +1,318 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://fansly.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+import time
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?fansly\.com"
+
+
+class FanslyExtractor(Extractor):
+    """Base class for fansly extractors"""
+    category = "fansly"
+    root = "https://fansly.com"
+    directory_fmt = ("{category}", "{account[username]} ({account[id]})")
+    filename_fmt = "{id}_{num}_{file[id]}.{extension}"
+    archive_fmt = "{file[id]}"
+
+    def _init(self):
+        self.api = FanslyAPI(self)
+        self.formats = self.config("format") or (303, 302, 1, 2, 4)
+
+    def items(self):
+        for post in self.posts():
+            files = self._extract_files(post)
+            post["count"] = len(files)
+            post["date"] = text.parse_timestamp(post["createdAt"])
+
+            yield Message.Directory, post
+            for post["num"], file in enumerate(files, 1):
+                post.update(file)
+                url = file["url"]
+                yield Message.Url, url, text.nameext_from_url(url, post)
+
+    def _extract_files(self, post):
+        files = []
+        for attachment in post.pop("attachments"):
+            try:
+                self._extract_attachment(files, post, attachment)
+            except Exception as exc:
+                self.log.debug("", exc_info=exc)
+                self.log.error(
+                    "%s/%s, Failed to extract media (%s: %s)",
+                    post["id"], attachment.get("id"),
+                    exc.__class__.__name__, exc)
+        return files
+
+    def _extract_attachment(self, files, post, attachment):
+        media = attachment["media"]
+        variants = {
+            variant["type"]: variant
+            for variant in media.pop("variants", ())
+        }
+        variants[media["type"]] = media
+
+        for fmt in self.formats:
+            if fmt in variants and (variant := variants[fmt]).get("locations"):
+                break
+        else:
+            return self.log.warning(
+                "%s/%s: Requested format not available",
+                post["id"], attachment["id"])
+
+        mime = variant["mimetype"]
+        location = variant.pop("locations")[0]
+        if "metadata" in variant:
+            try:
+                variant.update(util.json_loads(variant.pop("metadata")))
+            except Exception:
+                pass
+
+        file = {
+            **variant,
+            "format": fmt,
+            "date": text.parse_timestamp(media["createdAt"]),
+            "date_updated": text.parse_timestamp(media["updatedAt"]),
+        }
+
+        if "metadata" in location:
+            # manifest
+            meta = location["metadata"]
+
+            file["type"] = "video"
+            files.append({
+                "file": file,
+                "url": f"ytdl:{location['location']}",
+                # "_fallback": (media["locations"][0]["location"],),
+                "_ytdl_manifest":
+                    "dash" if mime == "application/dash+xml" else "hls",
+                "_ytdl_manifest_cookies": (
+                    ("CloudFront-Key-Pair-Id", meta["Key-Pair-Id"]),
+                    ("CloudFront-Signature" , meta["Signature"]),
+                    ("CloudFront-Policy"    , meta["Policy"]),
+                ),
+            })
+        else:
+            file["type"] = "image" if mime.startswith("image/") else "video"
+            files.append({
+                "file": file,
+                "url" : location["location"],
+            })
+
+
+class FanslyPostExtractor(FanslyExtractor):
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}/post/(\d+)"
+    example = "https://fansly.com/post/1234567890"
+
+    def posts(self):
+        return self.api.post(self.groups[0])
+
+
+class FanslyHomeExtractor(FanslyExtractor):
+    subcategory = "home"
+    pattern = rf"{BASE_PATTERN}/home(?:/(?:subscribed()|list/(\d+)))?"
+    example = "https://fansly.com/home"
+
+    def posts(self):
+        subscribed, list_id = self.groups
+        if subscribed is not None:
+            mode = "1"
+        elif list_id is not None:
+            mode = None
+        else:
+            mode = "0"
+        return self.api.timeline_home(mode, list_id)
+
+
+class FanslyListExtractor(FanslyExtractor):
+    subcategory = "list"
+    pattern = rf"{BASE_PATTERN}/lists/(\d+)"
+    example = "https://fansly.com/lists/1234567890"
+
+    def items(self):
+        base = f"{self.root}/"
+        for account in self.api.lists_itemsnew(self.groups[0]):
+            account["_extractor"] = FanslyCreatorPostsExtractor
+            url = f"{base}{account['username']}/posts"
+            yield Message.Queue, url, account
+
+
+class FanslyListsExtractor(FanslyExtractor):
+    subcategory = "lists"
+    pattern = rf"{BASE_PATTERN}/lists"
+    example = "https://fansly.com/lists"
+
+    def items(self):
+        base = f"{self.root}/lists/"
+        for list in self.api.lists_account():
+            list["_extractor"] = FanslyListExtractor
+            url = f"{base}{list['id']}#{list['label']}"
+            yield Message.Queue, url, list
+
+
+class FanslyCreatorPostsExtractor(FanslyExtractor):
+    subcategory = "creator-posts"
+    pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts"
+    example = "https://fansly.com/CREATOR/posts"
+
+    def posts(self):
+        creator = self.groups[0]
+        if creator.startswith("id:"):
+            account = self.api.account_by_id(creator[3:])
+        else:
+            account = self.api.account(creator)
+        wall_id = account["walls"][0]["id"]
+        return self.api.timeline_new(account["id"], wall_id)
+
+
+class FanslyAPI():
+    ROOT = "https://apiv3.fansly.com"
+
+    def __init__(self, extractor):
+        self.extractor = extractor
+
+        token = extractor.config("token")
+        if not token:
+            self.extractor.log.warning("No 'token' provided")
+
+        self.headers = {
+            "fansly-client-ts": None,
+            "Origin"          : extractor.root,
+            "authorization"   : token,
+        }
+
+    def account(self, username):
+        endpoint = "/v1/account"
+        params = {"usernames": username}
+        return self._call(endpoint, params)[0]
+
+    def account_by_id(self, account_id):
+        endpoint = "/v1/account"
+        params = {"ids": account_id}
+        return self._call(endpoint, params)[0]
+
+    def accounts_by_id(self, account_ids):
+        endpoint = "/v1/account"
+        params = {"ids": ",".join(map(str, account_ids))}
+        return self._call(endpoint, params)
+
+    def lists_account(self):
+        endpoint = "/v1/lists/account"
+        params = {"itemId": ""}
+        return self._call(endpoint, params)
+
+    def lists_itemsnew(self, list_id, sort="3"):
+        endpoint = "/v1/lists/itemsnew"
+        params = {
+            "listId"  : list_id,
+            "limit"   : 50,
+            "after"   : None,
+            "sortMode": sort,
+        }
+        return self._pagination(endpoint, params)
+
+    def post(self, post_id):
+        endpoint = "/v1/post"
+        params = {"ids": post_id}
+        return self._update_posts(self._call(endpoint, params))
+
+    def timeline_home(self, mode="0", list_id=None):
+        endpoint = "/v1/timeline/home"
+        params = {"before": "0", "after": "0"}
+        if list_id is None:
+            params["mode"] = mode
+        else:
+            params["listId"] = list_id
+        return self._pagination(endpoint, params)
+
+    def timeline_new(self, account_id, wall_id):
+        endpoint = f"/v1/timelinenew/{account_id}"
+        params = {
+            "before"       : "0",
+            "after"        : "0",
+            "wallId"       : wall_id,
+            "contentSearch": "",
+        }
+        return self._pagination(endpoint, params)
+
+    def _update_posts(self, response):
+        accounts = {
+            account["id"]: account
+            for account in response["accounts"]
+        }
+        media = {
+            media["id"]: media
+            for media in response["accountMedia"]
+        }
+        bundles = {
+            bundle["id"]: bundle
+            for bundle in response["accountMediaBundles"]
+        }
+
+        posts = response["posts"]
+        for post in posts:
+            post["account"] = accounts[post.pop("accountId")]
+
+            attachments = []
+            for attachment in post["attachments"]:
+                cid = attachment["contentId"]
+                if cid in media:
+                    attachments.append(media[cid])
+                elif cid in bundles:
+                    bundle = bundles[cid]["bundleContent"]
+                    bundle.sort(key=lambda c: c["pos"])
+                    attachments.extend(
+                        media[m["accountMediaId"]]
+                        for m in bundle
+                        if m["accountMediaId"] in media
+                    )
+                else:
+                    self.extractor.log.warning(
+                        "%s: Unhandled 'contentId' %s",
+                        post["id"], cid)
+            post["attachments"] = attachments
+        return posts
+
+    def _update_items(self, items):
+        ids = [item["id"] for item in items]
+        accounts = {
+            account["id"]: account
+            for account in self.accounts_by_id(ids)
+        }
+        return [accounts[id] for id in ids]
+
+    def _call(self, endpoint, params):
+        url = f"{self.ROOT}/api{endpoint}"
+        params["ngsw-bypass"] = "true"
+        headers = self.headers.copy()
+        headers["fansly-client-ts"] = str(int(time.time() * 1000))
+
+        data = self.extractor.request_json(
+            url, params=params, headers=headers)
+        return data["response"]
+
+    def _pagination(self, endpoint, params):
+        while True:
+            response = self._call(endpoint, params)
+
+            if isinstance(response, list):
+                if not response:
+                    return
+                yield from self._update_items(response)
+                params["after"] = response[-1]["sortId"]
+
+            else:
+                if not response.get("posts"):
+                    return
+                posts = self._update_posts(response)
+                yield from posts
+                params["before"] = min(p["id"] for p in posts)
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 91bcd38..a08f7bb 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -136,8 +136,9 @@ class HentaifoundryExtractor(Extractor):
             ">" + c + ":</span>", "<").replace(",", ""))
 
         data["description"] = text.unescape(extr(
-            "class='storyDescript'>", "<div"))
-        path = extr('href="', '"')
+            "class='storyDescript'>", '<div class="storyRead">')).replace(
+            "\r\n", "\n")
+        path = extr('class="pdfLink" href="', '"')
         data["src"] = self.root + path
         data["index"] = text.parse_int(path.rsplit("/", 2)[1])
         data["ratings"] = [text.unescape(r) for r in text.extract_iter(extr(
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index 171feea..abba9df 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -23,6 +23,7 @@ class ImagebamExtractor(Extractor):
 
     def _init(self):
         self.cookies.set("nsfw_inter", "1", domain="www.imagebam.com")
+        self.cookies.set("sfw_inter", "1", domain="www.imagebam.com")
 
     def _parse_image_page(self, path):
         page = self.request(self.root + path).text
```
rf"{BASE_PATTERN}/stories/me/?$()" + example = "https://www.instagram.com/stories/me/" + + def items(self): + base = f"{self.root}/stories/id:" + for story in self.api.reels_tray(): + story["date"] = text.parse_timestamp(story["latest_reel_media"]) + story["_extractor"] = InstagramStoriesExtractor + yield Message.Queue, f"{base}{story['id']}/", story + + class InstagramStoriesExtractor(InstagramExtractor): """Extractor for Instagram stories""" subcategory = "stories" @@ -793,7 +806,11 @@ class InstagramRestAPI(): try: return self._call(endpoint, params=params)["reels_media"] except KeyError: - raise exception.AuthorizationError("Login required") + raise exception.AuthRequired("authenticated cookies") + + def reels_tray(self): + endpoint = "/v1/feed/reels_tray/" + return self._call(endpoint)["tray"] def tags_media(self, tag): for section in self.tags_sections(tag): diff --git a/gallery_dl/extractor/kemono.py b/gallery_dl/extractor/kemono.py index a5e1f6d..fc5972c 100644 --- a/gallery_dl/extractor/kemono.py +++ b/gallery_dl/extractor/kemono.py @@ -151,7 +151,8 @@ class KemonoExtractor(Extractor): file["extension"] = ext elif ext == "txt" and file["extension"] != "txt": file["_http_validate"] = _validate - elif ext in exts_archive: + elif ext in exts_archive or \ + ext == "bin" and file["extension"] in exts_archive: file["type"] = "archive" if archives: try: diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index 9c33d4f..f22d54e 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -63,6 +63,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): "date" : extr("Date Added: <b>", "<"), "type" : text.remove_html(extr("Album type: <b>", "</b>")), "uploader": text.remove_html(extr("Uploaded by: ", "</")), + "description": extr("<h2>Description</h2>", "<h2>").strip(), }} def _extract_tracks(self, page): diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index c87430b..9febda9 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -17,7 +17,7 @@ class RedditExtractor(Extractor): """Base class for reddit extractors""" category = "reddit" directory_fmt = ("{category}", "{subreddit}") - filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}" + filename_fmt = "{id}{num:? 
//>02} {title|link_title:[:220]}.{extension}" archive_fmt = "{filename}" cookies_domain = ".reddit.com" request_interval = 0.6 @@ -84,6 +84,12 @@ class RedditExtractor(Extractor): text.nameext_from_url(url, submission) yield Message.Url, url, submission + elif embeds and "media_metadata" in media: + for embed in self._extract_embed(submission): + submission["num"] += 1 + text.nameext_from_url(embed, submission) + yield Message.Url, embed, submission + elif media["is_video"]: if videos: text.nameext_from_url(url, submission) @@ -102,6 +108,12 @@ class RedditExtractor(Extractor): urls.append((url, submission)) if self.api.comments: + if comments and not submission: + submission = comments[0] + submission.setdefault("num", 0) + if not parentdir: + yield Message.Directory, submission + for comment in comments: html = comment["body_html"] or "" href = (' href="' in html) @@ -193,19 +205,26 @@ class RedditExtractor(Extractor): return for mid, data in meta.items(): - if data["status"] != "valid" or "s" not in data: + if data["status"] != "valid": self.log.warning( "embed %s: skipping item %s (status: %s)", submission["id"], mid, data.get("status")) continue - src = data["s"] - if url := src.get("u") or src.get("gif") or src.get("mp4"): - yield url.partition("?")[0].replace("/preview.", "/i.", 1) - else: - self.log.error( - "embed %s: unable to fetch download URL for item %s", - submission["id"], mid) - self.log.debug(src) + + if src := data.get("s"): + if url := src.get("u") or src.get("gif") or src.get("mp4"): + yield url.partition("?")[0].replace("/preview.", "/i.", 1) + else: + self.log.error( + "embed %s: unable to fetch download URL for item %s", + submission["id"], mid) + self.log.debug(src) + elif url := data.get("dashUrl"): + submission["_ytdl_manifest"] = "dash" + yield f"ytdl:{url}" + elif url := data.get("hlsUrl"): + submission["_ytdl_manifest"] = "hls" + yield f"ytdl:{url}" def _extract_video_ytdl(self, submission): return "https://www.reddit.com" + submission["permalink"] @@ -361,6 +380,7 @@ class RedditAPI(): Ref: https://www.reddit.com/dev/api/ """ + ROOT = "https://oauth.reddit.com" CLIENT_ID = "6N9uN0krSDE-ig" USER_AGENT = "Python:gallery-dl:0.8.4 (by /u/mikf1)" @@ -369,41 +389,50 @@ class RedditAPI(): self.log = extractor.log config = extractor.config + self.comments = text.parse_int(config("comments", 0)) self.morecomments = config("morecomments", False) + self._warn_429 = False - client_id = config("client-id") - if client_id is None: - self.client_id = self.CLIENT_ID - self.headers = {"User-Agent": self.USER_AGENT} + if config("api") == "rest": + self.root = "https://www.reddit.com" + self.headers = None + self.authenticate = util.noop + self.log.debug("Using REST API") else: - self.client_id = client_id - self.headers = {"User-Agent": config("user-agent")} + self.root = self.ROOT - if self.client_id == self.CLIENT_ID: - client_id = self.client_id - self._warn_429 = True - kind = "default" - else: - client_id = client_id[:5] + "*" * (len(client_id)-5) - self._warn_429 = False - kind = "custom" + client_id = config("client-id") + if client_id is None: + self.client_id = self.CLIENT_ID + self.headers = {"User-Agent": self.USER_AGENT} + else: + self.client_id = client_id + self.headers = {"User-Agent": config("user-agent")} - self.log.debug( - "Using %s API credentials (client-id %s)", kind, client_id) + if self.client_id == self.CLIENT_ID: + client_id = self.client_id + self._warn_429 = True + kind = "default" + else: + client_id = client_id[:5] + "*" * (len(client_id)-5) + 
kind = "custom" - token = config("refresh-token") - if token is None or token == "cache": - key = "#" + self.client_id - self.refresh_token = _refresh_token_cache(key) - else: - self.refresh_token = token + self.log.debug( + "Using %s API credentials (client-id %s)", kind, client_id) - if not self.refresh_token: - # allow downloading from quarantined subreddits (#2180) - extractor.cookies.set( - "_options", '%7B%22pref_quarantine_optin%22%3A%20true%7D', - domain=extractor.cookies_domain) + token = config("refresh-token") + if token is None or token == "cache": + key = "#" + self.client_id + self.refresh_token = _refresh_token_cache(key) + else: + self.refresh_token = token + + if not self.refresh_token: + # allow downloading from quarantined subreddits (#2180) + extractor.cookies.set( + "_options", '%7B%22pref_quarantine_optin%22%3A%20true%7D', + domain=extractor.cookies_domain) def submission(self, submission_id): """Fetch the (submission, comments)=-tuple for a submission id""" @@ -416,13 +445,11 @@ class RedditAPI(): def submissions_subreddit(self, subreddit, params): """Collect all (submission, comments)-tuples of a subreddit""" endpoint = subreddit + "/.json" - params["limit"] = 100 return self._pagination(endpoint, params) def submissions_user(self, user, params): """Collect all (submission, comments)-tuples posted by a user""" endpoint = "/user/" + user + "/.json" - params["limit"] = 100 return self._pagination(endpoint, params) def morechildren(self, link_id, children): @@ -477,7 +504,7 @@ class RedditAPI(): return "Bearer " + data["access_token"] def _call(self, endpoint, params): - url = "https://oauth.reddit.com" + endpoint + url = f"{self.root}{endpoint}" params["raw_json"] = "1" while True: @@ -522,6 +549,9 @@ class RedditAPI(): id_max = float("inf") date_min, date_max = self.extractor._get_date_min_max(0, 253402210800) + if limit := self.extractor.config("limit"): + params["limit"] = limit + while True: data = self._call(endpoint, params)["data"] diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py index b988646..36b083b 100644 --- a/gallery_dl/extractor/shimmie2.py +++ b/gallery_dl/extractor/shimmie2.py @@ -79,9 +79,12 @@ BASE_PATTERN = Shimmie2Extractor.update({ }, "nozrip": { "root": "https://noz.rip/booru", - "base": "https://noz.rip", "pattern": r"noz\.rip/booru", }, + "thecollectionS": { + "root": "https://co.llection.pics", + "pattern": r"co\.llection\.pics", + }, }) + r"/(?:index\.php\?q=/?)?" 
```diff
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index b988646..36b083b 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -79,9 +79,12 @@ BASE_PATTERN = Shimmie2Extractor.update({
     },
     "nozrip": {
         "root": "https://noz.rip/booru",
-        "base": "https://noz.rip",
         "pattern": r"noz\.rip/booru",
     },
+    "thecollection": {
+        "root": "https://co.llection.pics",
+        "pattern": r"co\.llection\.pics",
+    },
 }) + r"/(?:index\.php\?q=/?)?"
@@ -160,10 +163,12 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
 
     def posts(self):
         post_id = self.groups[-1]
-        url = f"{self.root}/post/view/{post_id}"
+        root = self.root
+        base = root if (pos := root.find("/", 8)) < 0 else root[:pos]
+
+        url = f"{root}/post/view/{post_id}"
         page = self.request(url).text
         extr = text.extract_from(page)
-        base = self.config_instance("base", self.root)
         qt = self._quote_type(page)
 
         post = {
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 6eea76c..92fc831 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -31,15 +31,12 @@ class TumblrExtractor(Extractor):
     filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}"
     archive_fmt = "{id}_{num}"
 
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-
-        if name := match[2]:
-            self.blog = name + ".tumblr.com"
+    def _init(self):
+        if name := self.groups[1]:
+            self.blog = f"{name}.tumblr.com"
         else:
-            self.blog = match[1] or match[3]
+            self.blog = self.groups[0] or self.groups[2]
 
-    def _init(self):
         self.api = TumblrAPI(self)
         self.types = self._setup_posttypes()
         self.avatar = self.config("avatar", False)
@@ -287,14 +284,10 @@ class TumblrPostExtractor(TumblrExtractor):
     pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)"
     example = "https://www.tumblr.com/BLOG/12345"
 
-    def __init__(self, match):
-        TumblrExtractor.__init__(self, match)
-        self.post_id = match[4]
+    def posts(self):
         self.reblogs = True
         self.date_min = 0
-
-    def posts(self):
-        return self.api.posts(self.blog, {"id": self.post_id})
+        return self.api.posts(self.blog, {"id": self.groups[3]})
 
     def _setup_posttypes(self):
         return POST_TYPES
@@ -303,15 +296,13 @@ class TumblrPostExtractor(TumblrExtractor):
 class TumblrTagExtractor(TumblrExtractor):
     """Extractor for Tumblr user's posts by tag"""
     subcategory = "tag"
-    pattern = BASE_PATTERN + r"/tagged/([^/?#]+)"
+    pattern = BASE_PATTERN + r"(?:/archive)?/tagged/([^/?#]+)"
     example = "https://www.tumblr.com/BLOG/tagged/TAG"
 
-    def __init__(self, match):
-        TumblrExtractor.__init__(self, match)
-        self.tag = text.unquote(match[4].replace("-", " "))
-
     def posts(self):
-        return self.api.posts(self.blog, {"tag": self.tag})
+        self.kwdict["search_tags"] = tag = text.unquote(
+            self.groups[3].replace("-", " "))
+        return self.api.posts(self.blog, {"tag": tag})
 
 
 class TumblrDayExtractor(TumblrExtractor):
@@ -320,21 +311,13 @@ class TumblrDayExtractor(TumblrExtractor):
     pattern = BASE_PATTERN + r"/day/(\d\d\d\d/\d\d/\d\d)"
     example = "https://www.tumblr.com/BLOG/day/1970/01/01"
 
-    def __init__(self, match):
-        TumblrExtractor.__init__(self, match)
-        year, month, day = match[4].split("/")
-        self.ordinal = date(int(year), int(month), int(day)).toordinal()
-
-    def _init(self):
-        TumblrExtractor._init(self)
-
-        self.date_min = (
-            # 719163 == date(1970, 1, 1).toordinal()
-            (self.ordinal - 719163) * 86400)
+    def posts(self):
+        year, month, day = self.groups[3].split("/")
+        ordinal = date(int(year), int(month), int(day)).toordinal()
+        # 719163 == date(1970, 1, 1).toordinal()
+        self.date_min = (ordinal - 719163) * 86400
         self.api.before = self.date_min + 86400
-
-    def posts(self):
         return self.api.posts(self.blog, {})
 
@@ -550,8 +533,11 @@ class TumblrAPI(oauth.OAuth1API):
             params["api_key"] = self.api_key
 
         strategy = self.extractor.config("pagination")
-        if not strategy and "offset" not in params:
-            strategy = "api"
+        if not strategy:
+            if params.get("before"):
+                strategy = "before"
+            elif "offset" not in params:
+                strategy = "api"
 
         while True:
             data = self._call(endpoint, params)
@@ -573,10 +559,9 @@ class TumblrAPI(oauth.OAuth1API):
                     endpoint = data["_links"]["next"]["href"]
                 except KeyError:
                     return
-
-                params = None
-                if self.api_key:
-                    endpoint += "&api_key=" + self.api_key
+                if params is not None and self.api_key:
+                    endpoint = f"{endpoint}&api_key={self.api_key}"
+                params = None
 
             elif strategy == "before":
                 if not posts:
diff --git a/gallery_dl/extractor/tungsten.py b/gallery_dl/extractor/tungsten.py
new file mode 100644
index 0000000..20d5a59
--- /dev/null
+++ b/gallery_dl/extractor/tungsten.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://tungsten.run/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?tungsten\.run"
+
+
+class TungstenExtractor(Extractor):
+    """Base class for tungsten extractors"""
+    category = "tungsten"
+    root = "https://tungsten.run"
+    directory_fmt = ("{category}", "{user[username]}")
+    filename_fmt = "{date} {title:?/ /}{uuid}.{extension}"
+    archive_fmt = "{uuid}"
+
+    def items(self):
+        for post in self.posts():
+            url = post["original_url"]
+            post["date"] = text.parse_datetime(post["created_at"])
+            post["filename"] = url[url.rfind("/")+1:]
+            post["extension"] = "webp"
+            yield Message.Directory, post
+            yield Message.Url, url, post
+
+    def _pagination(self, url, params):
+        params["page"] = 1
+        params["per_page"] = 40
+
+        headers = {
+            "Origin": self.root,
+            "Sec-Fetch-Dest": "empty",
+            "Sec-Fetch-Mode": "cors",
+            "Sec-Fetch-Site": "same-site",
+        }
+
+        while True:
+            data = self.request_json(url, params=params, headers=headers)
+
+            yield from data
+
+            if len(data) < params["per_page"]:
+                break
+            params["page"] += 1
+
+
+class TungstenPostExtractor(TungstenExtractor):
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}/post/(\w+)"
+    example = "https://tungsten.run/post/AbCdEfGhIjKlMnOp"
+
+    def posts(self):
+        url = f"{self.root}/post/{self.groups[0]}"
+        page = self.request(url).text
+        data = self._extract_nextdata(page)
+        return (data["props"]["pageProps"]["post"],)
+
+
+class TungstenModelExtractor(TungstenExtractor):
+    subcategory = "model"
+    pattern = rf"{BASE_PATTERN}/model/(\w+)(?:/?\?model_version=(\w+))?"
+    example = "https://tungsten.run/model/AbCdEfGhIjKlM"
+
+    def posts(self):
+        uuid_model, uuid_version = self.groups
+
+        if uuid_version is None:
+            url = f"{self.root}/model/{uuid_model}/"
+            page = self.request(url).text
+            uuid_version = text.extr(page, '"modelVersionUUID":"', '"')
+
+        url = "https://api.tungsten.run/v1/posts"
+        params = {
+            "sort"          : "top_all_time",
+            "tweakable_only": "false",
+            "following"     : "false",
+            "model_version_uuid": uuid_version,
+        }
+        return self._pagination(url, params)
+
+
+class TungstenUserExtractor(TungstenExtractor):
+    subcategory = "user"
+    pattern = rf"{BASE_PATTERN}/user/([^/?#]+)"
+    example = "https://tungsten.run/user/USER/posts"
+
+    def posts(self):
+        url = f"{self.root}/user/{self.groups[0]}"
+        page = self.request(url).text
+        uuid_user = text.extr(page, '"user":{"uuid":"', '"')
+
+        url = f"https://api.tungsten.run/v1/users/{uuid_user}/posts"
+        params = {"sort": "top_all_time"}
+        return self._pagination(url, params)
```
"<")) photo.update(data) text.nameext_from_url(url, photo) @@ -108,6 +112,10 @@ class VkExtractor(Extractor): total = payload[1] photos = payload[3] + for i in range(len(photos)): + photos[i]["num"] = self.offset + i + 1 + photos[i]["count"] = total + offset_next = self.offset + len(photos) if offset_next >= total: # the last chunk of photos also contains the first few photos @@ -128,7 +136,7 @@ class VkPhotosExtractor(VkExtractor): subcategory = "photos" pattern = (BASE_PATTERN + r"/(?:" r"(?:albums|photos|id)(-?\d+)" - r"|(?!(?:album|tag)-?\d+_?)([^/?#]+))") + r"|(?!(?:album|tag|wall)-?\d+_?)([^/?#]+))") example = "https://vk.com/id12345" def __init__(self, match): @@ -179,17 +187,40 @@ class VkAlbumExtractor(VkExtractor): pattern = BASE_PATTERN + r"/album(-?\d+)_(\d+)$" example = "https://vk.com/album12345_00" - def __init__(self, match): - VkExtractor.__init__(self, match) - self.user_id, self.album_id = match.groups() - def photos(self): - return self._pagination(f"album{self.user_id}_{self.album_id}") + user_id, album_id = self.groups + return self._pagination(f"album{user_id}_{album_id}") def metadata(self): + user_id, album_id = self.groups + + url = f"{self.root}/album{user_id}_{album_id}" + page = self.request(url).text + desc = text.extr(page, 'name="og:description" value="', '"') + try: + album_name, user_name, photos = desc.rsplit(" - ", 2) + except ValueError: + if msg := text.extr( + page, '<div class="message_page_title">Error</div>', + "</div>"): + msg = f" ('{text.remove_html(msg)[:-5]}')" + self.log.warning("%s_%s: Failed to extract metadata%s", + user_id, album_id, msg) + return {"user": {"id": user_id}, "album": {"id": album_id}} + return { - "user": {"id": self.user_id}, - "album": {"id": self.album_id}, + "user": { + "id" : user_id, + "nick" : text.unescape(user_name), + "name" : text.unescape(text.extr( + page, 'class="ui_crumb" href="/', '"')), + "group": user_id[0] == "-", + }, + "album": { + "id" : album_id, + "name" : text.unescape(album_name), + "count": text.parse_int(photos[:-7]) + }, } @@ -209,3 +240,35 @@ class VkTaggedExtractor(VkExtractor): def metadata(self): return {"user": {"id": self.user_id}} + + +class VkWallPostExtractor(VkExtractor): + """Extractor for a vk wall post""" + subcategory = "wall-post" + directory_fmt = ("{category}", "{user[id]}", "wall") + filename_fmt = "{wall[id]}_{num}.{extension}" + pattern = BASE_PATTERN + r"/wall(-?\d+)_(\d+)" + example = "https://vk.com/wall12345_123" + + def photos(self): + user_id, wall_id = self.groups + return self._pagination(f"wall{user_id}_{wall_id}") + + def metadata(self): + user_id, wall_id = self.groups + + url = f"{self.root}/wall{user_id}_{wall_id}" + page = self.request(url).text + desc = text.unescape( + text.extr(page, 'data-testid="post_description">', "</div>") or + text.extr(page, 'name="description" content="', '"')) + + return { + "user": { + "id": user_id, + }, + "wall": { + "id": wall_id, + "description": desc, + }, + } diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index fca8911..e1b4897 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -55,7 +55,8 @@ class ZerochanExtractor(BooruExtractor): "login" : "Login", } - response = self.request(url, method="POST", headers=headers, data=data) + response = self.request( + url, method="POST", headers=headers, data=data, expected=(500,)) if not response.history: raise exception.AuthenticationError() diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index 
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index 7a49049..b09203f 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -565,6 +565,7 @@ _CONVERSIONS = {
     "U": text.unescape,
     "H": lambda s: text.unescape(text.remove_html(s)),
     "g": text.slugify,
+    "R": text.re(r"https?://[^\s\"']+").findall,
     "W": text.sanitize_whitespace,
     "S": util.to_string,
     "s": str,
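The new `R` conversion maps a format-string field to the list of `http(s)` URLs contained in its value; its core is just a precompiled `re.findall`. A standalone approximation with the stdlib (gallery-dl routes the compilation through its own `text.re()` helper):

```python
import re

# Match http(s) URLs up to the first whitespace or quote character.
find_urls = re.compile(r"https?://[^\s\"']+").findall

html = ('<p><a href="http://www.example.com">Lorem ipsum</a>. '
        '<a href="http://blog.example.org">dolor</a>.</p>')
print(find_urls(html))
# ['http://www.example.com', 'http://blog.example.org']
```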
+</p>""", "u": "'< / >'", "t": 1262304000, "ds": "2010-01-01T01:00:00+01:00", @@ -72,6 +76,9 @@ class TestFormatter(unittest.TestCase): self._run_test("{h!H}", "foo & bar") self._run_test("{u!H}", "'< / >'") self._run_test("{n!H}", "") + self._run_test("{h!R}", []) + self._run_test("{H!R}", ["http://www.example.com", + "http://blog.example.org"]) self._run_test("{a!s}", self.kwdict["a"]) self._run_test("{a!r}", f"'{self.kwdict['a']}'") self._run_test("{a!a}", f"'{self.kwdict['a']}'") @@ -590,10 +597,11 @@ def gentext(kwdict): def lengths(kwdict): a = 0 for k, v in kwdict.items(): - try: - a += len(v) - except TypeError: - pass + if k == k.lower(): + try: + a += len(v) + except TypeError: + pass return format(a) def noarg(): @@ -616,10 +624,10 @@ def noarg(): fmt4 = formatter.parse(f"\fM {path}:lengths") self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt2.format_map(self.kwdict), "168") + self.assertEqual(fmt2.format_map(self.kwdict), "139") self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt4.format_map(self.kwdict), "168") + self.assertEqual(fmt4.format_map(self.kwdict), "139") with self.assertRaises(TypeError): self.assertEqual(fmt0.format_map(self.kwdict), "") diff --git a/test/test_results.py b/test/test_results.py index 05b98bf..7e024b8 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -361,6 +361,12 @@ class TestExtractorResults(unittest.TestCase): msg = f"{path} / ISO 639-1" self.assertIsInstance(value, str, msg=msg) self.assertRegex(value, r"^[a-z]{2}(-\w+)?$", msg=msg) + elif iso in ("uuid", "11578", "11578:1996", "4122"): + msg = f"{path} / ISO 11578:1996" + pat = (r"(?i)[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-" + r"[0-9a-f]{4}-[0-9a-f]{12}") + self.assertIsInstance(value, str, msg=msg) + self.assertRegex(value, pat, msg=msg) else: self.fail(f"Unsupported ISO test '{test}'") else: |
