39 files changed, 879 insertions(+), 665 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 484ddeb..257f47b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,15 +1,43 @@
-## 1.29.2 - 2025-03-15
+## 1.29.3 - 2025-03-29
 ### Extractors
 #### Additions
-- [arcalive] add support ([#5657](https://github.com/mikf/gallery-dl/issues/5657) [#7100](https://github.com/mikf/gallery-dl/issues/7100))
-- [furaffinity] add `folder` extractor ([#1817](https://github.com/mikf/gallery-dl/issues/1817) [#7159](https://github.com/mikf/gallery-dl/issues/7159))
+- [danbooru] add `favgroup` extractor
+- [imhentai] support `hentaienvy.com` and `hentaizap.com` ([#7192](https://github.com/mikf/gallery-dl/issues/7192) [#7218](https://github.com/mikf/gallery-dl/issues/7218))
 #### Fixes
-- [civitai] fix/improve query parameter handling ([#7138](https://github.com/mikf/gallery-dl/issues/7138))
-- [facebook] improve `date` extraction ([#7151](https://github.com/mikf/gallery-dl/issues/7151))
-- [sankaku] update API URLs ([#7154](https://github.com/mikf/gallery-dl/issues/7154) [#7155](https://github.com/mikf/gallery-dl/issues/7155) [#7163](https://github.com/mikf/gallery-dl/issues/7163))
-- [twitter] prevent exception in `_extract_components()` ([#7139](https://github.com/mikf/gallery-dl/issues/7139))
+- [bunkr] fix `filename` extraction ([#7237](https://github.com/mikf/gallery-dl/issues/7237))
+- [deviantart:stash] fix legacy `sta.sh` links ([#7181](https://github.com/mikf/gallery-dl/issues/7181))
+- [hitomi] fix extractors ([#7230](https://github.com/mikf/gallery-dl/issues/7230))
+- [mangapark] fix extractors ([#4999](https://github.com/mikf/gallery-dl/issues/4999) [#5883](https://github.com/mikf/gallery-dl/issues/5883) [#6507](https://github.com/mikf/gallery-dl/issues/6507) [#6908](https://github.com/mikf/gallery-dl/issues/6908) [#7232](https://github.com/mikf/gallery-dl/issues/7232))
+- [nozomi] fix extractors ([#7242](https://github.com/mikf/gallery-dl/issues/7242))
+- [patreon] include subdomains in `session_id` cookie check ([#7188](https://github.com/mikf/gallery-dl/issues/7188))
+- [patreon] do not match `/messages` URLs as creator ([#7187](https://github.com/mikf/gallery-dl/issues/7187))
+- [pinterest] handle `story_pin_static_sticker_block` blocks ([#7251](https://github.com/mikf/gallery-dl/issues/7251))
+- [sexcom] fix `gif` pin extraction ([#7239](https://github.com/mikf/gallery-dl/issues/7239))
+- [skeb] make exceptions when extracting posts non-fatal ([#7250](https://github.com/mikf/gallery-dl/issues/7250))
+- [zerochan] parse `JSON-LD` data ([#7178](https://github.com/mikf/gallery-dl/issues/7178))
 #### Improvements
-- [batoto] add `domain` option ([#7174](https://github.com/mikf/gallery-dl/issues/7174))
-- [furaffinity] extract `scraps` metadata ([#7015](https://github.com/mikf/gallery-dl/issues/7015))
-- [tiktok] implement audio extraction without `yt-dlp`
-- [wikimedia] add `subcategories` option ([#2340](https://github.com/mikf/gallery-dl/issues/2340))
+- [arcalive] extend `gifs` option
+- [deviantart] support multiple images for single posts ([#6653](https://github.com/mikf/gallery-dl/issues/6653) [#7261](https://github.com/mikf/gallery-dl/issues/7261))
+- [deviantart] add subfolder support ([#4988](https://github.com/mikf/gallery-dl/issues/4988) [#7185](https://github.com/mikf/gallery-dl/issues/7185) [#7220](https://github.com/mikf/gallery-dl/issues/7220))
+- [deviantart] match `/gallery/recommended-for-you` URLs ([#7168](https://github.com/mikf/gallery-dl/issues/7168) [#7243](https://github.com/mikf/gallery-dl/issues/7243))
+- [instagram] extract videos from `video_dash_manifest` data ([#6379](https://github.com/mikf/gallery-dl/issues/6379) [#7006](https://github.com/mikf/gallery-dl/issues/7006))
+- [mangapark] support mirror domains
+- [mangapark] support v3 URLs ([#2072](https://github.com/mikf/gallery-dl/issues/2072))
+- [mastodon] support `/statuses` URLs ([#7255](https://github.com/mikf/gallery-dl/issues/7255))
+- [sexcom] support new-style `/gifs` and `/videos` URLs ([#7239](https://github.com/mikf/gallery-dl/issues/7239))
+- [subscribestar] detect redirects to `/age_confirmation_warning` pages
+- [tiktok] add retry mechanism to rehydration data extraction ([#7191](https://github.com/mikf/gallery-dl/issues/7191))
+#### Metadata
+- [bbc] extract more metadata ([#6582](https://github.com/mikf/gallery-dl/issues/6582))
+- [kemonoparty] extract `archives` metadata ([#7195](https://github.com/mikf/gallery-dl/issues/7195))
+- [kemonoparty] enable `username`/`user_profile` metadata by default
+- [kemonoparty:discord] always provide `channel_name` metadata ([#7245](https://github.com/mikf/gallery-dl/issues/7245))
+- [sexcom] extract `date_url` metadata ([#7239](https://github.com/mikf/gallery-dl/issues/7239))
+- [subscribestar] extract `title` metadata ([#7219](https://github.com/mikf/gallery-dl/issues/7219))
+### Downloaders
+- [ytdl] support processing inline HLS/DASH manifest data ([#6379](https://github.com/mikf/gallery-dl/issues/6379) [#7006](https://github.com/mikf/gallery-dl/issues/7006))
+### Miscellaneous
+- [aes] simplify `block_count` calculation
+- [common] add `subdomains` argument to `cookies_check()` ([#7188](https://github.com/mikf/gallery-dl/issues/7188))
+- [config] fix using the same key multiple times with `apply` ([#7127](https://github.com/mikf/gallery-dl/issues/7127))
+- [tests] implement expected failures
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: gallery_dl
-Version: 1.29.2
+Version: 1.29.3
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -46,6 +46,7 @@ Dynamic: download-url
 Dynamic: home-page
 Dynamic: keywords
 Dynamic: license
+Dynamic: license-file
 Dynamic: maintainer
 Dynamic: maintainer-email
 Dynamic: provides-extra
@@ -132,9 +133,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.2/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.3/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.2/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.3/gallery-dl.bin>`__
 
 
 Nightly Builds
@@ -77,9 +77,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.2/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.3/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.2/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.3/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 8c34ff3..5b0e7e7 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2025-03-15" "1.29.2" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2025-03-29" "1.29.3" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index aaf94b3..d032f25 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2025-03-15" "1.29.2" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2025-03-29" "1.29.3" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -1651,14 +1651,26 @@ Download emoticon images.
 
 
 .SS extractor.arcalive.gifs
 .IP "Type:" 6
-\f[I]bool\f[]
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]string\f[]
 
 .IP "Default:" 9
 \f[I]true\f[]
 
 .IP "Description:" 4
-Check if \f[I].mp4\f[] videos have a \f[I].gif\f[] version
-and download those instead.
+Try to download \f[I].gif\f[] versions of \f[I].mp4\f[] videos.
+
+\f[I]true\f[] | \f[I]"fallback"\f[]
+Use the \f[I].gif\f[] version as primary URL
+and provide the \f[I].mp4\f[] one as
+\f[I]fallback\f[].
+\f[I]"check"\f[]
+Check whether a \f[I].gif\f[] version is available
+by sending an extra HEAD request.
+\f[I]false\f[]
+Always download the \f[I].mp4\f[] version.
 
 
 .SS extractor.artstation.external
@@ -2219,7 +2231,7 @@ For unavailable or restricted posts,
 follow the \f[I]source\f[] and download from there if possible.
 
 
-.SS extractor.[Danbooru].pool.order-posts
+.SS extractor.[Danbooru].favgroup.order-posts
 .IP "Type:" 6
 \f[I]string\f[]
@@ -2227,7 +2239,7 @@ follow the \f[I]source\f[] and download from there if possible.
 \f[I]"pool"\f[]
 
 .IP "Description:" 4
-Controls the order in which pool posts are returned.
+Controls the order in which \f[I]pool\f[]/\f[I]favgroup\f[] posts are returned.
 
 \f[I]"pool"\f[] | \f[I]"pool_asc"\f[] | \f[I]"asc"\f[] | \f[I]"asc_pool"\f[]
 Pool order
@@ -2689,6 +2701,17 @@ Leave \f[I]SIZE\f[] empty to download the regular, small avatar format.
 .br
 
 
+.SS extractor.deviantart.folder.subfolders
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Also extract subfolder content.
+
+
 .SS extractor.discord.embeds
 .IP "Type:" 6
 \f[I]list\f[] of \f[I]strings\f[]
@@ -3293,9 +3316,6 @@ Selects which image format to download.
 
 Available formats are
 \f[I]"webp"\f[] and \f[I]"avif"\f[].
 
-\f[I]"original"\f[] will try to download the original \f[I]jpg\f[] or \f[I]png\f[] versions,
-but is most likely going to fail with \f[I]403 Forbidden\f[] errors.
-
 
 .SS extractor.imagechest.access-token
 .IP "Type:" 6
@@ -3513,13 +3533,23 @@ Download video previews.
 
 
 .SS extractor.instagram.videos
 .IP "Type:" 6
-\f[I]bool\f[]
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]string\f[]
 
 .IP "Default:" 9
 \f[I]true\f[]
 
 .IP "Description:" 4
-Download video files.
+Controls video download behavior.
+
+\f[I]true\f[] | \f[I]"dash"\f[] | \f[I]"ytdl"\f[]
+Download videos from \f[I]video_dash_manifest\f[] data using \f[I]ytdl\f[]
+\f[I]"merged"\f[]
+Download pre-merged video formats
+\f[I]false\f[]
+Do not download videos
 
 
 .SS extractor.itaku.videos
@@ -3533,6 +3563,20 @@ Download video files.
 Download video files.
 
 
+.SS extractor.kemonoparty.archives
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract additional metadata for \f[I]archives\f[] files, including
+\f[I]file\f[], \f[I]file_list\f[], and \f[I]password\f[].
+
+Note: This requires 1 additional HTTP request per \f[I]archives\f[] file.
+
+
 .SS extractor.kemonoparty.comments
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -3626,10 +3670,10 @@ Limit the number of posts to download.
 \f[I]bool\f[]
 
 .IP "Default:" 9
-\f[I]false\f[]
+\f[I]true\f[]
 
 .IP "Description:" 4
-Extract \f[I]username\f[] metadata.
+Extract \f[I]username\f[] and \f[I]user_profile\f[] metadata.
 
 
 .SS extractor.kemonoparty.revisions
@@ -5187,31 +5231,31 @@ tried until a format is found.
 Possible formats include
 
 .br
-* \f[I]"gif"\f[]
+* \f[I]gif\f[]
 .br
-* \f[I]"gif_transparent"\f[]
+* \f[I]gif_transparent\f[]
 .br
-* \f[I]"gifpreview"\f[]
+* \f[I]mediumgif\f[]
 .br
-* \f[I]"mediumgif"\f[]
+* \f[I]gifpreview\f[]
 .br
-* \f[I]"tinygif"\f[]
+* \f[I]tinygif\f[]
 .br
-* \f[I]"tinygif_transparent"\f[]
+* \f[I]tinygif_transparent\f[]
 .br
-* \f[I]"mp4"\f[]
+* \f[I]mp4\f[]
 .br
-* \f[I]"tinymp4"\f[]
+* \f[I]tinymp4\f[]
 .br
-* \f[I]"webm"\f[]
+* \f[I]webm\f[]
 .br
-* \f[I]"webp"\f[]
+* \f[I]webp\f[]
 .br
-* \f[I]"webp_transparent"\f[]
+* \f[I]webp_transparent\f[]
 .br
-* \f[I]"tinywebp"\f[]
+* \f[I]tinywebp\f[]
 .br
-* \f[I]"tinywebp_transparent"\f[]
+* \f[I]tinywebp_transparent\f[]
 
 
 .SS extractor.tiktok.audio
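Editor's note: the man-page entries above document several new or changed options. The sketch below shows one way to opt into them from Python instead of `gallery-dl.conf`; the key names and defaults are taken from the diff above, and `config.set(path, key, value)` is gallery-dl's own config API. This is a hedged example, not part of the release.

```python
# Sketch: enabling options added/changed in 1.29.3 via gallery_dl.config.
from gallery_dl import config

config.set(("extractor", "kemonoparty"), "archives", True)   # archive metadata; +1 request per file
config.set(("extractor", "arcalive"), "gifs", "check")       # probe .gif versions with a HEAD request
config.set(("extractor", "instagram"), "videos", "merged")   # pre-merged formats instead of DASH/ytdl
config.set(("extractor", "danbooru", "favgroup"), "order-posts", "pool")
```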
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 7887fd5..8ede568 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -235,6 +235,9 @@
         "avatar": {
             "formats": null
+        },
+        "folder": {
+            "subfolders": true
         }
     },
     "exhentai":
@@ -368,13 +371,14 @@
         "password": "",
 
         "announcements": false,
+        "archives" : false,
         "comments" : false,
         "dms" : false,
         "duplicates" : false,
         "favorites" : "artist",
         "files" : ["attachments", "file", "inline"],
         "max-posts" : null,
-        "metadata" : false,
+        "metadata" : true,
         "revisions" : false,
         "order-revisions": "desc"
     },
@@ -788,6 +792,9 @@
         "threshold": "auto",
         "ugoira" : false,
+        "favgroup": {
+            "order-posts": "pool"
+        },
         "pool": {
             "order-posts": "pool"
         }
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 1d71036..4481e14 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: gallery_dl
-Version: 1.29.2
+Version: 1.29.3
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -46,6 +46,7 @@ Dynamic: download-url
 Dynamic: home-page
 Dynamic: keywords
 Dynamic: license
+Dynamic: license-file
 Dynamic: maintainer
 Dynamic: maintainer-email
 Dynamic: provides-extra
@@ -132,9 +133,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.2/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.29.3/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.2/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.29.3/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 3e8f365..2f4a87c 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -108,7 +108,6 @@ gallery_dl/extractor/hatenablog.py
 gallery_dl/extractor/hentai2read.py
 gallery_dl/extractor/hentaicosplays.py
 gallery_dl/extractor/hentaifoundry.py
-gallery_dl/extractor/hentaifox.py
 gallery_dl/extractor/hentaihand.py
 gallery_dl/extractor/hentaihere.py
 gallery_dl/extractor/hentainexus.py
diff --git a/gallery_dl/aes.py b/gallery_dl/aes.py
index 6727541..3fd1d5e 100644
--- a/gallery_dl/aes.py
+++ b/gallery_dl/aes.py
@@ -78,7 +78,7 @@ def aes_ecb_encrypt(data, key, iv=None):
     @returns {int[]} encrypted data
     """
     expanded_key = key_expansion(key)
-    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+    block_count = ceil(len(data) / BLOCK_SIZE_BYTES)
 
     encrypted_data = []
     for i in range(block_count):
@@ -99,7 +99,7 @@ def aes_ecb_decrypt(data, key, iv=None):
     @returns {int[]} decrypted data
     """
     expanded_key = key_expansion(key)
-    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+    block_count = ceil(len(data) / BLOCK_SIZE_BYTES)
 
     encrypted_data = []
     for i in range(block_count):
@@ -132,7 +132,7 @@ def aes_ctr_encrypt(data, key, iv):
     @returns {int[]} encrypted data
     """
     expanded_key = key_expansion(key)
-    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+    block_count = ceil(len(data) / BLOCK_SIZE_BYTES)
     counter = iter_vector(iv)
 
     encrypted_data = []
@@ -158,7 +158,7 @@ def aes_cbc_decrypt(data, key, iv):
     @returns {int[]} decrypted data
     """
     expanded_key = key_expansion(key)
-    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+    block_count = ceil(len(data) / BLOCK_SIZE_BYTES)
 
     decrypted_data = []
     previous_cipher_block = iv
@@ -184,7 +184,7 @@ def aes_cbc_encrypt(data, key, iv):
     @returns {int[]} encrypted data
     """
     expanded_key = key_expansion(key)
-    block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES))
+    block_count = ceil(len(data) / BLOCK_SIZE_BYTES)
 
     encrypted_data = []
     previous_cipher_block = iv
diff --git a/gallery_dl/config.py b/gallery_dl/config.py
index f932e3a..92e55d3 100644
--- a/gallery_dl/config.py
+++ b/gallery_dl/config.py
@@ -322,6 +322,7 @@ class apply():
             set(path, key, value)
 
     def __exit__(self, exc_type, exc_value, traceback):
+        self.original.reverse()
         for path, key, value in self.original:
             if value is util.SENTINEL:
                 unset(path, key)
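Editor's note on the `config.py` fix above: when `apply` overrides the same `(path, key)` twice, the saved originals are `[(path, key, TRUE_ORIGINAL), (path, key, FIRST_OVERRIDE)]`; restoring them in insertion order would leave the first override in place, while restoring in reverse ends at the true original. A minimal sketch, assuming `apply()` takes an iterable of `(path, key, value)` triples as the loop in `__exit__` suggests:

```python
from gallery_dl import config

path = ("extractor", "twitter")
config.set(path, "retweets", False)           # the pre-apply value

with config.apply(((path, "retweets", True),
                   (path, "retweets", "original"))):
    # the last override wins inside the block
    assert config.get(path, "retweets") == "original"

# with the reverse() fix, the pre-apply value is restored on exit
assert config.get(path, "retweets") is False
```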
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 1242098..9d653b3 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -10,6 +10,7 @@ from .common import DownloaderBase
 from .. import ytdl, text
+from xml.etree import ElementTree
 import os
 
@@ -76,7 +77,8 @@ class YoutubeDLDownloader(DownloaderBase):
             manifest = kwdict.pop("_ytdl_manifest", None)
             if manifest:
                 info_dict = self._extract_manifest(
-                    ytdl_instance, url, manifest)
+                    ytdl_instance, url, manifest,
+                    kwdict.pop("_ytdl_manifest_data", None))
             else:
                 info_dict = self._extract_info(ytdl_instance, url)
         except Exception as exc:
@@ -154,37 +156,55 @@ class YoutubeDLDownloader(DownloaderBase):
     def _extract_info(self, ytdl, url):
         return ytdl.extract_info(url, download=False)
 
-    def _extract_manifest(self, ytdl, url, manifest):
+    def _extract_manifest(self, ytdl, url, manifest_type, manifest_data=None):
         extr = ytdl.get_info_extractor("Generic")
         video_id = extr._generic_id(url)
 
-        if manifest == "hls":
-            try:
-                formats, subtitles = extr._extract_m3u8_formats_and_subtitles(
-                    url, video_id, "mp4")
-            except AttributeError:
-                formats = extr._extract_m3u8_formats(url, video_id, "mp4")
-                subtitles = None
-
-        elif manifest == "dash":
-            try:
-                formats, subtitles = extr._extract_mpd_formats_and_subtitles(
-                    url, video_id)
-            except AttributeError:
-                formats = extr._extract_mpd_formats(url, video_id)
-                subtitles = None
+        if manifest_type == "hls":
+            if manifest_data is None:
+                try:
+                    fmts, subs = extr._extract_m3u8_formats_and_subtitles(
+                        url, video_id, "mp4")
+                except AttributeError:
+                    fmts = extr._extract_m3u8_formats(url, video_id, "mp4")
+                    subs = None
+            else:
+                try:
+                    fmts, subs = extr._parse_m3u8_formats_and_subtitles(
+                        url, video_id, "mp4")
+                except AttributeError:
+                    fmts = extr._parse_m3u8_formats(url, video_id, "mp4")
+                    subs = None
+
+        elif manifest_type == "dash":
+            if manifest_data is None:
+                try:
+                    fmts, subs = extr._extract_mpd_formats_and_subtitles(
+                        url, video_id)
+                except AttributeError:
+                    fmts = extr._extract_mpd_formats(url, video_id)
+                    subs = None
+            else:
+                if isinstance(manifest_data, str):
+                    manifest_data = ElementTree.fromstring(manifest_data)
+                try:
+                    fmts, subs = extr._parse_mpd_formats_and_subtitles(
+                        manifest_data, mpd_id="dash")
+                except AttributeError:
+                    fmts = extr._parse_mpd_formats(
+                        manifest_data, mpd_id="dash")
+                    subs = None
 
         else:
-            self.log.error("Unsupported manifest type '%s'", manifest)
+            self.log.error("Unsupported manifest type '%s'", manifest_type)
             return None
 
         info_dict = {
             "id"       : video_id,
             "title"    : video_id,
-            "formats"  : formats,
-            "subtitles": subtitles,
+            "formats"  : fmts,
+            "subtitles": subs,
         }
 
-        # extr._extra_manifest_info(info_dict, url)
         return ytdl.process_ie_result(info_dict, download=False)
 
     def _progress_hook(self, info):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 8198619..87c3798 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -68,7 +68,6 @@ modules = [
     "hentai2read",
     "hentaicosplays",
     "hentaifoundry",
-    "hentaifox",
     "hentaihand",
     "hentaihere",
     "hentainexus",
diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py
index 8e832fe..8c44256 100644
--- a/gallery_dl/extractor/arcalive.py
+++ b/gallery_dl/extractor/arcalive.py
@@ -41,7 +41,9 @@ class ArcalivePostExtractor(ArcaliveExtractor):
 
     def items(self):
         self.emoticons = self.config("emoticons", False)
-        self.gifs = self.config("gifs", True)
+        self.gifs = gifs = self.config("gifs", True)
+        if gifs:
+            self.gifs_fallback = (gifs != "check")
 
         post = self.api.post(self.groups[0])
         files = self._extract_files(post)
@@ -90,11 +92,15 @@ class ArcalivePostExtractor(ArcaliveExtractor):
                     url = path + "." + orig
             elif video and self.gifs:
                 url_gif = url.rpartition(".")[0] + ".gif"
-                response = self.request(
-                    url_gif + "?type=orig", method="HEAD", fatal=False)
-                if response.status_code < 400:
+                if self.gifs_fallback:
                     fallback = (url + "?type=orig",)
                     url = url_gif
+                else:
+                    response = self.request(
+                        url_gif + "?type=orig", method="HEAD", fatal=False)
+                    if response.status_code < 400:
+                        fallback = (url + "?type=orig",)
+                        url = url_gif
 
             files.append({
                 "url"      : url + "?type=orig",
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index 113a669..b398152 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -27,7 +27,12 @@ class BbcGalleryExtractor(GalleryExtractor):
 
     def metadata(self, page):
         data = self._extract_jsonld(page)
+
         return {
+            "title": text.unescape(text.extr(
+                page, "<h1>", "</h1>").rpartition("</span>")[2]),
+            "description": text.unescape(text.extr(
+                page, 'property="og:description" content="', '"')),
             "programme": self.gallery_url.split("/")[4],
             "path": list(util.unique_sequence(
                 element["name"]
@@ -40,11 +45,20 @@ class BbcGalleryExtractor(GalleryExtractor):
         width = width - width % 16 if width else 1920
         dimensions = "/{}xn/".format(width)
 
-        return [
-            (src.replace("/320x180_b/", dimensions),
-             {"_fallback": self._fallback_urls(src, width)})
-            for src in text.extract_iter(page, 'data-image-src="', '"')
-        ]
+        results = []
+        for img in text.extract_iter(page, 'class="gallery__thumbnail', ">"):
+            src = text.extr(img, 'data-image-src="', '"')
+            results.append((
+                src.replace("/320x180_b/", dimensions),
+                {
+                    "title_image": text.unescape(text.extr(
+                        img, 'data-gallery-title="', '"')),
+                    "synopsis": text.unescape(text.extr(
+                        img, 'data-gallery-synopsis="', '"')),
+                    "_fallback": self._fallback_urls(src, width),
+                },
+            ))
+        return results
 
     @staticmethod
     def _fallback_urls(src, max_width):
@@ -62,14 +76,11 @@ class BbcProgrammeExtractor(Extractor):
     pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"
     example = "https://www.bbc.co.uk/programmes/ID/galleries"
 
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.path, self.page = match.groups()
-
     def items(self):
+        path, pnum = self.groups
         data = {"_extractor": BbcGalleryExtractor}
-        params = {"page": text.parse_int(self.page, 1)}
-        galleries_url = self.root + self.path
+        params = {"page": text.parse_int(pnum, 1)}
+        galleries_url = self.root + path
 
         while True:
             page = self.request(galleries_url, params=params).text
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index d74f59c..481e962 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -189,8 +189,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         else:
             file_url = data["url"]
 
-        file_name = (text.extr(page, 'property="og:title" content="', '"') or
-                     text.extr(page, "<title>", " | Bunkr<"))
+        file_name = text.extr(page, "<h1", "<").rpartition(">")[2]
         fallback = text.extr(page, 'property="og:url" content="', '"')
 
         return {
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index a85eedd..995505f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -539,7 +539,7 @@ class Extractor():
         for name, value in cookiedict.items():
             set_cookie(name, value, domain=domain)
 
-    def cookies_check(self, cookies_names, domain=None):
+    def cookies_check(self, cookies_names, domain=None, subdomains=False):
         """Check if all 'cookies_names' are in the session's cookiejar"""
         if not self.cookies:
             return False
@@ -550,26 +550,31 @@ class Extractor():
         now = time.time()
 
         for cookie in self.cookies:
-            if cookie.name in names and (
-                    not domain or cookie.domain == domain):
-
-                if cookie.expires:
-                    diff = int(cookie.expires - now)
-
-                    if diff <= 0:
-                        self.log.warning(
-                            "Cookie '%s' has expired", cookie.name)
-                        continue
-
-                    elif diff <= 86400:
-                        hours = diff // 3600
-                        self.log.warning(
-                            "Cookie '%s' will expire in less than %s hour%s",
-                            cookie.name, hours + 1, "s" if hours else "")
-
-                names.discard(cookie.name)
-                if not names:
-                    return True
+            if cookie.name not in names:
+                continue
+
+            if not domain or cookie.domain == domain:
+                pass
+            elif not subdomains or not cookie.domain.endswith(domain):
+                continue
+
+            if cookie.expires:
+                diff = int(cookie.expires - now)
+
+                if diff <= 0:
+                    self.log.warning(
+                        "Cookie '%s' has expired", cookie.name)
+                    continue
+
+                elif diff <= 86400:
+                    hours = diff // 3600
+                    self.log.warning(
+                        "Cookie '%s' will expire in less than %s hour%s",
+                        cookie.name, hours + 1, "s" if hours else "")
+
+            names.discard(cookie.name)
+            if not names:
+                return True
 
         return False
 
     def _extract_jsonld(self, page):
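Editor's note on the `cookies_check()` change above: it is what lets the Patreon extractor accept a `session_id` cookie stored for `www.patreon.com`. A self-contained sketch of the new matching rule; `domain_matches()` is a hypothetical stand-in for the inline logic, not a gallery-dl function:

```python
# With subdomains=True, a cookie whose domain merely ends with the
# checked domain is now accepted; exact matches work as before.
def domain_matches(cookie_domain, domain, subdomains=False):
    if not domain or cookie_domain == domain:
        return True
    return subdomains and cookie_domain.endswith(domain)

assert domain_matches("www.patreon.com", "patreon.com", subdomains=True)
assert not domain_matches("www.patreon.com", "patreon.com")
```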
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 8d00728..741800c 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -175,6 +175,51 @@ class DanbooruExtractor(BaseExtractor):
         return [{"file": fmt(index), "delay": delay}
                 for index, delay in enumerate(delays)]
 
+    def _collection_posts(self, cid, ctype):
+        reverse = prefix = None
+
+        order = self.config("order-posts")
+        if not order or order in {"asc", "pool", "pool_asc", "asc_pool"}:
+            params = {"tags": "ord{}:{}".format(ctype, cid)}
+        elif order in {"id", "desc_id", "id_desc"}:
+            params = {"tags": "{}:{}".format(ctype, cid)}
+            prefix = "b"
+        elif order in {"desc", "desc_pool", "pool_desc"}:
+            params = {"tags": "ord{}:{}".format(ctype, cid)}
+            reverse = True
+        elif order in {"asc_id", "id_asc"}:
+            params = {"tags": "{}:{}".format(ctype, cid)}
+            reverse = True
+
+        posts = self._pagination("/posts.json", params, prefix)
+        if reverse:
+            self.log.info("Collecting posts of %s %s", ctype, cid)
+            return self._collection_enumerate_reverse(posts)
+        else:
+            return self._collection_enumerate(posts)
+
+    def _collection_metadata(self, cid, ctype, cname=None):
+        url = "{}/{}s/{}.json".format(self.root, cname or ctype, cid)
+        collection = self.request(url).json()
+        collection["name"] = collection["name"].replace("_", " ")
+        self.post_ids = collection.pop("post_ids", ())
+        return {ctype: collection}
+
+    def _collection_enumerate(self, posts):
+        pid_to_num = {pid: num for num, pid in enumerate(self.post_ids, 1)}
+        for post in posts:
+            post["num"] = pid_to_num[post["id"]]
+            yield post
+
+    def _collection_enumerate_reverse(self, posts):
+        posts = list(posts)
+        posts.reverse()
+
+        pid_to_num = {pid: num for num, pid in enumerate(self.post_ids, 1)}
+        for post in posts:
+            post["num"] = pid_to_num[post["id"]]
+        return posts
+
 
 BASE_PATTERN = DanbooruExtractor.update({
     "danbooru": {
@@ -228,7 +273,7 @@ class DanbooruTagExtractor(DanbooruExtractor):
 
 
 class DanbooruPoolExtractor(DanbooruExtractor):
-    """Extractor for posts from danbooru pools"""
+    """Extractor for Danbooru pools"""
     subcategory = "pool"
     directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
     filename_fmt = "{num:>04}_{id}_{filename}.{extension}"
@@ -237,50 +282,28 @@ class DanbooruPoolExtractor(DanbooruExtractor):
     example = "https://danbooru.donmai.us/pools/12345"
 
     def metadata(self):
-        self.pool_id = self.groups[-1]
-        url = "{}/pools/{}.json".format(self.root, self.pool_id)
-        pool = self.request(url).json()
-        pool["name"] = pool["name"].replace("_", " ")
-        self.post_ids = pool.pop("post_ids", ())
-        return {"pool": pool}
+        return self._collection_metadata(self.groups[-1], "pool")
 
     def posts(self):
-        reverse = prefix = None
-
-        order = self.config("order-posts")
-        if not order or order in ("asc", "pool", "pool_asc", "asc_pool"):
-            params = {"tags": "ordpool:" + self.pool_id}
-        elif order in ("id", "desc_id", "id_desc"):
-            params = {"tags": "pool:" + self.pool_id}
-            prefix = "b"
-        elif order in ("desc", "desc_pool", "pool_desc"):
-            params = {"tags": "ordpool:" + self.pool_id}
-            reverse = True
-        elif order in ("asc_id", "id_asc"):
-            params = {"tags": "pool:" + self.pool_id}
-            reverse = True
-
-        posts = self._pagination("/posts.json", params, prefix)
-        if reverse:
-            return self._enumerate_posts_reverse(posts)
-        else:
-            return self._enumerate_posts(posts)
-
-    def _enumerate_posts(self, posts):
-        pid_to_num = {pid: num+1 for num, pid in enumerate(self.post_ids)}
-        for post in posts:
-            post["num"] = pid_to_num[post["id"]]
-            yield post
+class DanbooruFavgroupExtractor(DanbooruExtractor):
+    """Extractor for Danbooru favorite groups"""
+    subcategory = "favgroup"
+    directory_fmt = ("{category}", "Favorite Groups",
+                     "{favgroup[id]} {favgroup[name]}")
+    filename_fmt = "{num:>04}_{id}_{filename}.{extension}"
+    archive_fmt = "fg_{favgroup[id]}_{id}"
+    pattern = BASE_PATTERN + r"/favorite_group(?:s|/show)/(\d+)"
+    example = "https://danbooru.donmai.us/favorite_groups/12345"
 
-    def _enumerate_posts_reverse(self, posts):
-        self.log.info("Collecting posts of pool %s", self.pool_id)
-        posts = list(posts)
-        posts.reverse()
+    def metadata(self):
+        return self._collection_metadata(
+            self.groups[-1], "favgroup", "favorite_group")
 
-        pid_to_num = {pid: num+1 for num, pid in enumerate(self.post_ids)}
-        for post in posts:
-            post["num"] = pid_to_num[post["id"]]
-        return posts
+    def posts(self):
+        return self._collection_posts(self.groups[-1], "favgroup")
 
 
 class DanbooruPostExtractor(DanbooruExtractor):
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 59b2d6d..3a862c1 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -687,10 +687,18 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
             for folder in folders:
                 if match(folder["name"]):
                     return folder
+                elif folder["has_subfolders"]:
+                    for subfolder in folder["subfolders"]:
+                        if match(subfolder["name"]):
+                            return subfolder
         else:
             for folder in folders:
                 if folder["folderid"] == uuid:
                     return folder
+                elif folder["has_subfolders"]:
+                    for subfolder in folder["subfolders"]:
+                        if subfolder["folderid"] == uuid:
+                            return subfolder
         raise exception.NotFoundError("folder")
 
     def _folder_urls(self, folders, category, extractor):
@@ -891,7 +899,8 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
     """Extractor for all deviations from an artist's gallery"""
     subcategory = "gallery"
     archive_fmt = "g_{_username}_{index}.{extension}"
-    pattern = BASE_PATTERN + r"/gallery(?:/all|/?\?catpath=)?/?$"
+    pattern = (BASE_PATTERN + r"/gallery"
+               r"(?:/all|/recommended-for-you|/?\?catpath=)?/?$")
     example = "https://www.deviantart.com/USER/gallery/"
 
     def deviations(self):
@@ -987,13 +996,36 @@ class DeviantartFolderExtractor(DeviantartExtractor):
     def deviations(self):
         folders = self.api.gallery_folders(self.user)
         folder = self._find_folder(folders, self.folder_name, self.folder_id)
+
+        # Leaving this here for backwards compatibility
         self.folder = {
             "title": folder["name"],
             "uuid" : folder["folderid"],
             "index": self.folder_id,
             "owner": self.user,
+            "parent_uuid": folder["parent"],
         }
-        return self.api.gallery(self.user, folder["folderid"], self.offset)
+
+        if folder.get("subfolder"):
+            self.folder["parent_folder"] = folder["parent_folder"]
+            self.archive_fmt = "F_{folder[parent_uuid]}_{index}.{extension}"
+
+            if self.flat:
+                self.directory_fmt = ("{category}", "{username}",
+                                      "{folder[parent_folder]}")
+            else:
+                self.directory_fmt = ("{category}", "{username}",
+                                      "{folder[parent_folder]}",
+                                      "{folder[title]}")
+
+        if folder.get("has_subfolders") and self.config("subfolders", True):
+            for subfolder in folder["subfolders"]:
+                subfolder["parent_folder"] = folder["name"]
+                subfolder["subfolder"] = True
+            yield from self._folder_urls(
+                folder["subfolders"], "gallery", DeviantartFolderExtractor)
+
+        yield from self.api.gallery(self.user, folder["folderid"], self.offset)
 
     def prepare(self, deviation):
         DeviantartExtractor.prepare(self, deviation)
@@ -1004,7 +1036,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
     """Extractor for sta.sh-ed deviations"""
     subcategory = "stash"
     archive_fmt = "{index}.{extension}"
-    pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)"
+    pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.s(h))"
               r"/([a-z0-9]+)")
     example = "https://www.deviantart.com/stash/abcde"
 
@@ -1016,9 +1048,18 @@ class DeviantartStashExtractor(DeviantartExtractor):
     def deviations(self, stash_id=None):
         if stash_id is None:
-            stash_id = self.groups[0]
-        url = "https://www.deviantart.com/stash/" + stash_id
-        page = self._limited_request(url).text
+            legacy_url, stash_id = self.groups
+        else:
+            legacy_url = False
+
+        if legacy_url and stash_id[0] == "2":
+            url = "https://sta.sh/" + stash_id
+            response = self._limited_request(url)
+            stash_id = response.url.rpartition("/")[2]
+            page = response.text
+        else:
+            url = "https://www.deviantart.com/stash/" + stash_id
+            page = self._limited_request(url).text
 
         if stash_id[0] == "0":
             uuid = text.extr(page, '//deviation/', '"')
@@ -1235,7 +1276,34 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
             deviation = self.api.deviation(uuid)
             deviation["_page"] = page
-            return (deviation,)
+
+            _dev_info = text.extr(
+                page, '\\"deviationExtended\\":', ',\\"deviation\\":', None)
+            # Clean up escaped quotes
+            _json_str = re.sub(
+                r'(?<!\\)\\{1}"', '"', _dev_info).replace("\\'", "'")
+            _extended_info = util.json_loads(_json_str)[self.deviation_id]
+            additional_media = _extended_info.get("additionalMedia") or ()
+
+            if additional_media:
+                self.filename_fmt = ("{category}_{index}_{index_file}_{title}_"
+                                     "{num:>02}.{extension}")
+                self.archive_fmt = ("g_{_username}_{index}{index_file:?_//}."
+                                    "{extension}")
+
+            deviation["index_file"] = 0
+            deviation["count"] = 1 + len(additional_media)
+            deviation["num"] = 1
+            yield deviation
+
+            for index, post in enumerate(additional_media):
+                uri = post["media"]["baseUri"].encode().decode("unicode-escape")
+                deviation["content"]["src"] = uri
+                deviation["num"] += 1
+                deviation["index_file"] = post["fileId"]
+                # Download only works on purchased materials - no way to check
+                deviation["is_downloadable"] = False
+                yield deviation
 
 
 class DeviantartScrapsExtractor(DeviantartExtractor):
@@ -1366,7 +1434,7 @@ class DeviantartOAuthAPI():
     def __init__(self, extractor):
         self.extractor = extractor
         self.log = extractor.log
-        self.headers = {"dA-minor-version": "20200519"}
+        self.headers = {"dA-minor-version": "20210526"}
         self._warn_429 = True
         self.delay = extractor.config("wait-min", 0)
diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py
deleted file mode 100644
index 31a302d..0000000
--- a/gallery_dl/extractor/hentaifox.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://hentaifox.com/"""
-
-from .common import GalleryExtractor, Extractor, Message
-from .. import text, util
-
-
-class HentaifoxBase():
-    """Base class for hentaifox extractors"""
-    category = "hentaifox"
-    root = "https://hentaifox.com"
-
-
-class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
-    """Extractor for image galleries on hentaifox.com"""
-    pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
-    example = "https://hentaifox.com/gallery/12345/"
-
-    def __init__(self, match):
-        GalleryExtractor.__init__(self, match)
-        self.gallery_id = match.group(2)
-
-    @staticmethod
-    def _split(txt):
-        return [
-            text.remove_html(tag.partition(">")[2], "", "")
-            for tag in text.extract_iter(
-                txt, "class='tag_btn", "<span class='t_badge")
-        ]
-
-    def metadata(self, page):
-        extr = text.extract_from(page)
-        split = self._split
-
-        return {
-            "gallery_id": text.parse_int(self.gallery_id),
-            "parody"    : split(extr(">Parodies:"  , "</ul>")),
-            "characters": split(extr(">Characters:", "</ul>")),
-            "tags"      : split(extr(">Tags:"      , "</ul>")),
-            "artist"    : split(extr(">Artists:"   , "</ul>")),
-            "group"     : split(extr(">Groups:"    , "</ul>")),
-            "type"      : text.remove_html(extr(">Category:", "<span")),
-            "title"     : text.unescape(extr(
-                'id="gallery_title" value="', '"')),
-            "language"  : "English",
-            "lang"      : "en",
-        }
-
-    def images(self, page):
-        cover, pos = text.extract(page, '<img src="', '"')
-        data , pos = text.extract(page, "$.parseJSON('", "');", pos)
-        path = "/".join(cover.split("/")[3:-1])
-
-        result = []
-        append = result.append
-        extmap = {"j": "jpg", "p": "png", "g": "gif"}
-        urlfmt = ("/" + path + "/{}.{}").format
-
-        server1 = "https://i.hentaifox.com"
-        server2 = "https://i2.hentaifox.com"
-
-        for num, image in util.json_loads(data).items():
-            ext, width, height = image.split(",")
-            path = urlfmt(num, extmap[ext])
-            append((server1 + path, {
-                "width"    : width,
-                "height"   : height,
-                "_fallback": (server2 + path,),
-            }))
-
-        return result
-
-
-class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
-    """Extractor for search results and listings on hentaifox.com"""
-    subcategory = "search"
-    pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
-               r"(/(?:parody|tag|artist|character|search|group)/[^/?%#]+)")
-    example = "https://hentaifox.com/tag/TAG/"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.path = match.group(1)
-
-    def items(self):
-        for gallery in self.galleries():
-            yield Message.Queue, gallery["url"], gallery
-
-    def galleries(self):
-        num = 1
-
-        while True:
-            url = "{}{}/pag/{}/".format(self.root, self.path, num)
-            page = self.request(url).text
-
-            for info in text.extract_iter(
-                    page, 'class="g_title"><a href="', '</a>'):
-                url, _, title = info.partition('">')
-
-                yield {
-                    "url"       : text.urljoin(self.root, url),
-                    "gallery_id": text.parse_int(
-                        url.strip("/").rpartition("/")[2]),
-                    "title"     : text.unescape(title),
-                    "_extractor": HentaifoxGalleryExtractor,
-                }
-
-            pos = page.find(">Next<")
-            url = text.rextract(page, "href=", ">", pos)[0]
-            if pos == -1 or "/pag" not in url:
-                return
-            num += 1
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index e15e13c..086b77c 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -16,19 +16,25 @@ import string
 import re
 
 
-class HitomiGalleryExtractor(GalleryExtractor):
-    """Extractor for image galleries from hitomi.la"""
+class HitomiExtractor(Extractor):
+    """Base class for hitomi extractors"""
     category = "hitomi"
     root = "https://hitomi.la"
+    domain = "gold-usergeneratedcontent.net"
+
+
+class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
+    """Extractor for hitomi.la galleries"""
     pattern = (r"(?:https?://)?hitomi\.la"
                r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)"
                r"/(?:[^/?#]+-)?(\d+)")
     example = "https://hitomi.la/manga/TITLE-867789.html"
 
     def __init__(self, match):
-        self.gid = match.group(1)
-        url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gid)
-        GalleryExtractor.__init__(self, match, url)
+        GalleryExtractor.__init__(self, match, False)
+        self.gid = gid = self.groups[0]
+        self.gallery_url = "https://ltn.{}/galleries/{}.js".format(
+            self.domain, gid)
 
     def _init(self):
         self.session.headers["Referer"] = "{}/reader/{}.html".format(
@@ -71,43 +77,34 @@ class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
     def images(self, _):
-        # see https://ltn.hitomi.la/gg.js
+        # https://ltn.gold-usergeneratedcontent.net/gg.js
         gg_m, gg_b, gg_default = _parse_gg(self)
 
-        fmt = self.config("format") or "webp"
-        if fmt == "original":
-            subdomain, path, ext, check = "b", "images", None, False
-        else:
-            subdomain, path, ext, check = "a", fmt, fmt, (fmt != "webp")
+        fmt = ext = self.config("format") or "webp"
+        check = (fmt != "webp")
 
         result = []
         for image in self.info["files"]:
             if check:
-                if image.get("has" + fmt):
-                    path = ext = fmt
-                else:
-                    path = ext = "webp"
+                ext = fmt if image.get("has" + fmt) else "webp"
 
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])
             idata["extension_original"] = idata["extension"]
-            if ext:
-                idata["extension"] = ext
+            idata["extension"] = ext
 
-            # see https://ltn.hitomi.la/common.js
+            # https://ltn.gold-usergeneratedcontent.net/common.js
             inum = int(ihash[-1] + ihash[-3:-1], 16)
-            url = "https://{}{}.hitomi.la/{}/{}/{}/{}.{}".format(
-                chr(97 + gg_m.get(inum, gg_default)),
-                subdomain, path, gg_b, inum, ihash, idata["extension"],
+            url = "https://{}{}.{}/{}/{}/{}.{}".format(
+                ext[0], gg_m.get(inum, gg_default) + 1, self.domain,
+                gg_b, inum, ihash, ext,
             )
             result.append((url, idata))
         return result
 
 
-class HitomiTagExtractor(Extractor):
+class HitomiTagExtractor(HitomiExtractor):
     """Extractor for galleries from tag searches on hitomi.la"""
-    category = "hitomi"
     subcategory = "tag"
-    root = "https://hitomi.la"
     pattern = (r"(?:https?://)?hitomi\.la"
                r"/(tag|artist|group|series|type|character)"
                r"/([^/?#]+)\.html")
@@ -126,8 +123,8 @@ class HitomiTagExtractor(HitomiExtractor):
             "_extractor": HitomiGalleryExtractor,
             "search_tags": text.unquote(self.tag.rpartition("-")[0]),
         }
-        nozomi_url = "https://ltn.hitomi.la/{}/{}.nozomi".format(
-            self.type, self.tag)
+        nozomi_url = "https://ltn.{}/{}/{}.nozomi".format(
+            self.domain, self.type, self.tag)
         headers = {
             "Origin": self.root,
             "Cache-Control": "max-age=0",
@@ -166,8 +163,8 @@ class HitomiIndexExtractor(HitomiTagExtractor):
     def items(self):
         data = {"_extractor": HitomiGalleryExtractor}
-        nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(
-            self.tag, self.language)
+        nozomi_url = "https://ltn.{}/{}-{}.nozomi".format(
+            self.domain, self.tag, self.language)
         headers = {
             "Origin": self.root,
             "Cache-Control": "max-age=0",
@@ -194,11 +191,9 @@ class HitomiIndexExtractor(HitomiTagExtractor):
         return
 
 
-class HitomiSearchExtractor(Extractor):
+class HitomiSearchExtractor(HitomiExtractor):
     """Extractor for galleries from multiple tag searches on hitomi.la"""
-    category = "hitomi"
     subcategory = "search"
-    root = "https://hitomi.la"
     pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)"
     example = "https://hitomi.la/search.html?QUERY"
 
@@ -224,11 +219,11 @@ class HitomiSearchExtractor(HitomiExtractor):
         area, tag, language = self.get_nozomi_args(full_tag)
 
         if area:
-            nozomi_url = "https://ltn.hitomi.la/n/{}/{}-{}.nozomi".format(
-                area, tag, language)
+            nozomi_url = "https://ltn.{}/n/{}/{}-{}.nozomi".format(
+                self.domain, area, tag, language)
         else:
-            nozomi_url = "https://ltn.hitomi.la/n/{}-{}.nozomi".format(
-                tag, language)
+            nozomi_url = "https://ltn.{}/n/{}-{}.nozomi".format(
+                self.domain, tag, language)
 
         headers = {
             "Origin": self.root,
@@ -257,7 +252,7 @@ class HitomiSearchExtractor(HitomiExtractor):
 
 @memcache(maxage=1800)
 def _parse_gg(extr):
-    page = extr.request("https://ltn.hitomi.la/gg.js").text
+    page = extr.request("https://ltn.gold-usergeneratedcontent.net/gg.js").text
 
     m = {}
 
@@ -280,4 +275,4 @@ def _parse_gg(extr):
     d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page)
     b = re.search(r"b:\s*[\"'](.+)[\"']", page)
 
-    return m, b.group(1).strip("/"), int(d.group(1)) if d else 1
+    return m, b.group(1).strip("/"), int(d.group(1)) if d else 0
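Editor's note: the hitomi fix above replaces the `chr(97 + ...)`/`hitomi.la` image URLs with a `gold-usergeneratedcontent.net` scheme. A worked example of the new URL construction, following the code in `images()`; the hash and gg.js values here are made up, real ones come from a gallery's file list and from `https://ltn.gold-usergeneratedcontent.net/gg.js`:

```python
ihash = "a1b2c3d4e5f6"                        # fake image hash
inum = int(ihash[-1] + ihash[-3:-1], 16)      # int("65f", 16) == 1631
ext = "webp"
gg_m, gg_b, gg_default = {}, "1742241600", 0  # normally parsed from gg.js

url = "https://{}{}.{}/{}/{}/{}.{}".format(
    ext[0], gg_m.get(inum, gg_default) + 1, "gold-usergeneratedcontent.net",
    gg_b, inum, ihash, ext)
print(url)
# https://w1.gold-usergeneratedcontent.net/1742241600/1631/a1b2c3d4e5f6.webp
```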
badge: + tag = tag.rpartition(" ")[0] results.append(tag) + results.sort() return results def images(self, page): @@ -132,9 +154,9 @@ class ImhentaiTagExtractor(ImhentaiExtractor): class ImhentaiSearchExtractor(ImhentaiExtractor): """Extractor for imhentai search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search/?\?([^#]+)" + pattern = BASE_PATTERN + r"/search(/?\?[^#]+|/[^/?#]+/?)" example = "https://imhentai.xxx/search/?key=QUERY" def items(self): - url = self.root + "/search/?" + self.groups[-1] + url = self.root + "/search" + self.groups[-1] return self._pagination(url) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index e344b2f..aa26408 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -56,9 +56,11 @@ class InstagramExtractor(Extractor): data = self.metadata() videos = self.config("videos", True) + if videos: + videos_dash = (videos != "merged") + videos_headers = {"User-Agent": "Mozilla/5.0"} previews = self.config("previews", False) max_posts = self.config("max-posts") - video_headers = {"User-Agent": "Mozilla/5.0"} order = self.config("order-files") reverse = order[0] in ("r", "d") if order else False @@ -92,8 +94,12 @@ class InstagramExtractor(Extractor): url = file.get("video_url") if url: if videos: - file["_http_headers"] = video_headers + file["_http_headers"] = videos_headers text.nameext_from_url(url, file) + if videos_dash: + file["_fallback"] = (url,) + file["_ytdl_manifest"] = "dash" + url = "ytdl:dash" yield Message.Url, url, file if previews: file["media_id"] += "p" @@ -246,6 +252,7 @@ class InstagramExtractor(Extractor): "video_url" : video["url"] if video else None, "width" : media["width"], "height" : media["height"], + "_ytdl_manifest_data": item.get("video_dash_manifest"), } if "expiring_at" in item: diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 788b5d9..860e771 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -57,11 +57,13 @@ class KemonopartyExtractor(Extractor): find_hash = re.compile(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) announcements = True if self.config("announcements") else None + archives = True if self.config("archives") else False comments = True if self.config("comments") else False duplicates = True if self.config("duplicates") else False dms = True if self.config("dms") else None max_posts = self.config("max-posts") - creator_info = {} if self.config("metadata") else None + creator_info = {} if self.config("metadata", True) else None + exts_archive = {"zip", "rar", "7z"} # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} @@ -115,6 +117,7 @@ class KemonopartyExtractor(Extractor): files = [] hashes = set() + post_archives = post["archives"] = [] for file in itertools.chain.from_iterable( g(post) for g in generators): @@ -129,31 +132,45 @@ class KemonopartyExtractor(Extractor): continue hashes.add(hash) else: - file["hash"] = "" + file["hash"] = hash = "" + + if url[0] == "/": + url = self.root + "/data" + url + elif url.startswith(self.root): + url = self.root + "/data" + url[20:] + file["url"] = url + + text.nameext_from_url(file.get("name", url), file) + ext = text.ext_from_url(url) + if not file["extension"]: + file["extension"] = ext + elif ext == "txt" and file["extension"] != "txt": + file["_http_validate"] = _validate + elif ext in exts_archive: + file["type"] = "archive" 
+ if archives: + try: + data = self.api.posts_archives(file["hash"]) + data.update(file) + post_archives.append(data) + except Exception as exc: + self.log.warning( + "%s: Failed to retrieve archive metadata of " + "'%s' (%s: %s)", post["id"], file.get("name"), + exc.__class__.__name__, exc) + post_archives.append(file.copy()) + else: + post_archives.append(file.copy()) files.append(file) post["count"] = len(files) yield Message.Directory, post - for post["num"], file in enumerate(files, 1): - post["_http_validate"] = None - post["hash"] = file["hash"] - post["type"] = file["type"] - url = file["path"] - - text.nameext_from_url(file.get("name", url), post) - ext = text.ext_from_url(url) - if not post["extension"]: - post["extension"] = ext - elif ext == "txt" and post["extension"] != "txt": - post["_http_validate"] = _validate - - if url[0] == "/": - url = self.root + "/data" + url - elif url.startswith(self.root): - url = self.root + "/data" + url[20:] - yield Message.Url, url, post + if "id" in file: + del file["id"] + post.update(file) + yield Message.Url, file["url"], post def login(self): username, password = self._get_auth_info() @@ -368,17 +385,18 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): key = "id" else: key = "name" + else: + key = "id" + channel = channel_id + if not channel_name or not channel_id: for ch in self.api.discord_server(server_id): if ch[key] == channel: break else: raise exception.NotFoundError("channel") - channel_id = ch["id"] channel_name = ch["name"] - elif channel_name is None: - channel_name = "" find_inline = re.compile( r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" @@ -503,6 +521,10 @@ class KemonoAPI(): params = {"q": query, "o": offset, "tag": tags} return self._pagination(endpoint, params, 50, "posts") + def posts_archives(self, file_hash): + endpoint = "/posts/archives/" + file_hash + return self._call(endpoint)["archive"] + def creator_posts(self, service, creator_id, offset=0, query=None): endpoint = "/{}/user/{}".format(service, creator_id) params = {"q": query, "o": offset} diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 6f7a238..b11f81d 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -10,9 +10,13 @@ from .common import ChapterExtractor, Extractor, Message from .. 
import text, util, exception +from ..cache import memcache import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangapark\.(?:net|com|org|io|me)" +BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:" + r"(?:manga|comic|read)park\.(?:com|net|org|me|io|to)|" + r"parkmanga\.(?:com|net|org)|" + r"mpark\.to)") class MangaparkBase(): @@ -31,57 +35,87 @@ class MangaparkBase(): match = self._match_title(title) return match.groups() if match else (0, 0, "", "") + @memcache(keyarg=1) + def _extract_manga(self, manga_id): + variables = { + "getComicNodeId": manga_id, + } + return self._request_graphql("Get_comicNode", variables)["data"] + + def _extract_chapter(self, chapter_id): + variables = { + "getChapterNodeId": chapter_id, + } + return self._request_graphql("Get_chapterNode", variables)["data"] + + def _extract_chapters_all(self, manga_id): + variables = { + "comicId": manga_id, + } + return self._request_graphql("Get_comicChapterList", variables) + + def _extract_chapters_source(self, source_id): + variables = { + "sourceId": source_id, + } + return self._request_graphql( + "get_content_source_chapterList", variables) + + def _request_graphql(self, opname, variables): + url = self.root + "/apo/" + data = { + "query" : QUERIES[opname], + "variables" : variables, + "operationName": opname, + } + return self.request( + url, method="POST", json=data).json()["data"].popitem()[1] + class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" - pattern = BASE_PATTERN + r"/title/[^/?#]+/(\d+)" + pattern = (BASE_PATTERN + + r"/(?:title/[^/?#]+/|comic/\d+/[^/?#]+/[^/?#]+-i)(\d+)") example = "https://mangapark.net/title/MANGA/12345-en-ch.01" def __init__(self, match): self.root = text.root_from_url(match.group(0)) - url = "{}/title/_/{}".format(self.root, match.group(1)) - ChapterExtractor.__init__(self, match, url) - - def metadata(self, page): - data = self._extract_nextdata(page) - chapter = (data["props"]["pageProps"]["dehydratedState"] - ["queries"][0]["state"]["data"]["data"]) - manga = chapter["comicNode"]["data"] - source = chapter["sourceNode"]["data"] - - self._urls = chapter["imageSet"]["httpLis"] - self._params = chapter["imageSet"]["wordLis"] + ChapterExtractor.__init__(self, match, False) + + def metadata(self, _): + chapter = self._extract_chapter(self.groups[0]) + manga = self._extract_manga(chapter["comicNode"]["id"]) + + self._urls = chapter["imageFile"]["urlList"] vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + lang = chapter.get("lang") or "en" return { "manga" : manga["name"], - "manga_id" : manga["id"], - "artist" : source["artists"], - "author" : source["authors"], - "genre" : source["genres"], + "manga_id" : text.parse_int(manga["id"]), + "artist" : manga["artists"], + "author" : manga["authors"], + "genre" : manga["genres"], "volume" : text.parse_int(vol), "chapter" : text.parse_int(ch), "chapter_minor": minor, - "chapter_id": chapter["id"], - "title" : chapter["title"] or title or "", - "lang" : chapter["lang"], - "language" : util.code_to_language(chapter["lang"]), - "source" : source["srcTitle"], - "source_id" : source["id"], + "chapter_id": text.parse_int(chapter["id"]), + "title" : title or "", + "lang" : lang, + "language" : util.code_to_language(lang), + "source" : chapter["srcTitle"], + "source_id" : chapter["sourceId"], "date" : text.parse_timestamp(chapter["dateCreate"] // 1000), } - def images(self, page): - return [ - (url + "?" 
+ params, None) - for url, params in zip(self._urls, self._params) - ] + def images(self, _): + return [(url, None) for url in self._urls] class MangaparkMangaExtractor(MangaparkBase, Extractor): """Extractor for manga from mangapark.net""" subcategory = "manga" - pattern = BASE_PATTERN + r"/title/(\d+)(?:-[^/?#]*)?/?$" + pattern = BASE_PATTERN + r"/(?:title|comic)/(\d+)(?:[/-][^/?#]*)?/?$" example = "https://mangapark.net/title/12345-MANGA" def __init__(self, match): @@ -95,6 +129,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): url = self.root + chapter["urlPath"] vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + lang = chapter.get("lang") or "en" + data = { "manga_id" : self.manga_id, "volume" : text.parse_int(vol), @@ -102,8 +138,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): "chapter_minor": minor, "chapter_id": chapter["id"], "title" : chapter["title"] or title or "", - "lang" : chapter["lang"], - "language" : util.code_to_language(chapter["lang"]), + "lang" : lang, + "language" : util.code_to_language(lang), "source" : chapter["srcTitle"], "source_id" : chapter["sourceId"], "date" : text.parse_timestamp( @@ -114,45 +150,12 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): def chapters(self): source = self.config("source") - if not source: - return self.chapters_all() - - source_id = self._select_source(source) - self.log.debug("Requesting chapters for source_id %s", source_id) - return self.chapters_source(source_id) - - def chapters_all(self): - pnum = 0 - variables = { - "select": { - "comicId": self.manga_id, - "range" : None, - "isAsc" : not self.config("chapter-reverse"), - } - } - - while True: - data = self._request_graphql( - "get_content_comicChapterRangeList", variables) - - for item in data["items"]: - yield from item["chapterNodes"] - - if not pnum: - pager = data["pager"] - pnum += 1 - - try: - variables["select"]["range"] = pager[pnum] - except IndexError: - return - - def chapters_source(self, source_id): - variables = { - "sourceId": source_id, - } - chapters = self._request_graphql( - "get_content_source_chapterList", variables) + if source: + source_id = self._select_source(source) + self.log.debug("Requesting chapters for source_id %s", source_id) + chapters = self._extract_chapters_source(source_id) + else: + chapters = self._extract_chapters_all(self.groups[0]) if self.config("chapter-reverse"): chapters.reverse() @@ -180,101 +183,58 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): raise exception.StopExtraction( "'%s' does not match any available source", source) - def _request_graphql(self, opname, variables): - url = self.root + "/apo/" - data = { - "query" : QUERIES[opname], - "variables" : util.json_dumps(variables), - "operationName": opname, - } - return self.request( - url, method="POST", json=data).json()["data"][opname] - QUERIES = { - "get_content_comicChapterRangeList": """ - query get_content_comicChapterRangeList($select: Content_ComicChapterRangeList_Select) { - get_content_comicChapterRangeList( - select: $select - ) { - reqRange{x y} - missing - pager {x y} - items{ - serial - chapterNodes { - - id - data { - - - id - sourceId - - dbStatus - isNormal - isHidden - isDeleted - isFinal - - dateCreate - datePublic - dateModify - lang - volume - serial - dname - title - urlPath - - srcTitle srcColor - - count_images - - stat_count_post_child - stat_count_post_reply - stat_count_views_login - stat_count_views_guest - - userId - userNode { - - id - data { - -id -name -uniq -avatarUrl 
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 5b354ac..5e78ad4 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -196,7 +196,8 @@ class MastodonFollowingExtractor(MastodonExtractor):
 class MastodonStatusExtractor(MastodonExtractor):
     """Extractor for images from a status"""
     subcategory = "status"
-    pattern = BASE_PATTERN + r"/@[^/?#]+/(?!following)([^/?#]+)"
+    pattern = (BASE_PATTERN + r"/(?:@[^/?#]+|(?:users/[^/?#]+/)?statuses)"
+               r"/(?!following)([^/?#]+)")
     example = "https://mastodon.social/@USER/12345"
 
     def statuses(self):
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 851f663..3d1722a 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -21,6 +21,7 @@ class NozomiExtractor(Extractor):
     """Base class for nozomi extractors"""
     category = "nozomi"
     root = "https://nozomi.la"
+    domain = "gold-usergeneratedcontent.net"
     filename_fmt = "{postid} {dataid}.{extension}"
     archive_fmt = "{dataid}"
 
@@ -31,8 +32,8 @@ class NozomiExtractor(Extractor):
         data = self.metadata()
 
         for post_id in map(str, self.posts()):
-            url = "https://j.nozomi.la/post/{}/{}/{}.json".format(
-                post_id[-1], post_id[-3:-1], post_id)
+            url = "https://j.{}/post/{}/{}/{}.json".format(
+                self.domain, post_id[-1], post_id[-3:-1], post_id)
             response = self.request(url, fatal=False)
 
             if response.status_code >= 400:
@@ -76,8 +77,8 @@ class NozomiExtractor(Extractor):
                 ext = "webp"
 
             post["extension"] = ext
-            post["url"] = url = "https://{}.nozomi.la/{}/{}/{}.{}".format(
-                subdomain, did[-1], did[-3:-1], did, ext)
+            post["url"] = url = "https://{}.{}/{}/{}/{}.{}".format(
+                subdomain, self.domain, did[-1], did[-3:-1], did, ext)
             yield Message.Url, url, post
 
     def posts(self):
@@ -168,7 +169,7 @@ class NozomiSearchExtractor(NozomiExtractor):
         negative = []
 
         def nozomi(path):
-            url = "https://j.nozomi.la/" + path + ".nozomi"
+            url = "https://j.{}/{}.nozomi".format(self.domain, path)
             return decode_nozomi(self.request(url).content)
 
         for tag in self.tags:
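Note: the nozomi fix is a domain swap; the path-sharding scheme is unchanged and worth spelling out: the last digit of a post ID and the two digits before it pick the directory. A self-contained sketch (the sample ID is made up):

    def nozomi_post_url(post_id, domain="gold-usergeneratedcontent.net"):
        # Same sharding as NozomiExtractor.items() above:
        # .../post/<last digit>/<two digits before it>/<id>.json
        pid = str(post_id)
        return "https://j.{}/post/{}/{}/{}.json".format(
            domain, pid[-1], pid[-3:-1], pid)

    print(nozomi_post_url(26905680))
    # https://j.gold-usergeneratedcontent.net/post/0/68/26905680.json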
"creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" - r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" + r"/(?!(?:home|create|login|signup|search|posts|messages)" + r"(?:$|[/?#]))" r"(?:profile/creators|(?:c/)?([^/?#]+)(?:/posts)?)" r"/?(?:\?([^#]+))?") - example = "https://www.patreon.com/USER" + example = "https://www.patreon.com/c/USER" def posts(self): creator, query = self.groups @@ -370,7 +371,7 @@ class PatreonCreatorExtractor(PatreonExtractor): data = None data = self._extract_bootstrap(page) return data["campaign"]["data"]["id"] - except (KeyError, ValueError) as exc: + except Exception as exc: if data: self.log.debug(data) raise exception.StopExtraction( diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 121c7bf..1a299c1 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -132,6 +132,9 @@ class PinterestExtractor(Extractor): "extension": "txt", "media_id": block.get("id")} + elif type == "story_pin_static_sticker_block": + continue + else: self.log.warning("%s: Unsupported story block '%s'", pin.get("id"), type) diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 7708b5c..9e7d75d 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -10,6 +10,9 @@ from .common import Extractor, Message from .. import text +from datetime import datetime + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com" class SexcomExtractor(Extractor): @@ -23,8 +26,20 @@ class SexcomExtractor(Extractor): def items(self): yield Message.Directory, self.metadata() for pin in map(self._parse_pin, self.pins()): - if pin: - yield Message.Url, pin["url"], pin + if not pin: + continue + + url = pin["url"] + parts = url.rsplit("/", 4) + try: + pin["date_url"] = dt = datetime( + int(parts[1]), int(parts[2]), int(parts[3])) + if "date" not in pin: + pin["date"] = dt + except Exception: + pass + + yield Message.Url, url, pin def metadata(self): return {} @@ -53,10 +68,18 @@ class SexcomExtractor(Extractor): self.log.warning('Unable to fetch %s ("%s %s")', url, response.status_code, response.reason) return None + + if "/pin/" in response.url: + return self._parse_pin_legacy(response) + if "/videos/" in response.url: + return self._parse_pin_video(response) + return self._parse_pin_gifs(response) + + def _parse_pin_legacy(self, response): extr = text.extract_from(response.text) data = {} - data["_http_headers"] = {"Referer": url} + data["_http_headers"] = {"Referer": response.url} data["thumbnail"] = extr('itemprop="thumbnail" content="', '"') data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower() data["title"] = text.unescape(extr('itemprop="name">' , '<')) @@ -82,7 +105,8 @@ class SexcomExtractor(Extractor): src = (text.extr(iframe, ' src="', '"') or text.extr(iframe, " src='", "'")) if not src: - self.log.warning("Unable to fetch media from %s", url) + self.log.warning( + "Unable to fetch media from %s", response.url) return None data["extension"] = None data["url"] = "ytdl:" + src @@ -100,27 +124,60 @@ class SexcomExtractor(Extractor): return data + def _parse_pin_gifs(self, response): + extr = text.extract_from(response.text) + + data = { + "_http_headers": {"Referer": response.url}, + "type": "gif", + "url": extr(' href="', '"'), + "title": text.unescape(extr("<title>", " Gif | Sex.com<")), + "pin_id": text.parse_int(extr( + 'rel="canonical" href="', '"').rpartition("/")[2]), + "tags": text.split_html(extr("</h1>", "</section>")), + } + + return 
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 121c7bf..1a299c1 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -132,6 +132,9 @@ class PinterestExtractor(Extractor):
                          "extension": "txt",
                          "media_id": block.get("id")}
 
+            elif type == "story_pin_static_sticker_block":
+                continue
+
             else:
                 self.log.warning("%s: Unsupported story block '%s'",
                                  pin.get("id"), type)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 7708b5c..9e7d75d 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -10,6 +10,9 @@
 from .common import Extractor, Message
 from .. import text
+from datetime import datetime
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com"
 
 
 class SexcomExtractor(Extractor):
@@ -23,8 +26,20 @@ class SexcomExtractor(Extractor):
     def items(self):
         yield Message.Directory, self.metadata()
         for pin in map(self._parse_pin, self.pins()):
-            if pin:
-                yield Message.Url, pin["url"], pin
+            if not pin:
+                continue
+
+            url = pin["url"]
+            parts = url.rsplit("/", 4)
+            try:
+                pin["date_url"] = dt = datetime(
+                    int(parts[1]), int(parts[2]), int(parts[3]))
+                if "date" not in pin:
+                    pin["date"] = dt
+            except Exception:
+                pass
+
+            yield Message.Url, url, pin
 
     def metadata(self):
         return {}
@@ -53,10 +68,18 @@ class SexcomExtractor(Extractor):
             self.log.warning('Unable to fetch %s ("%s %s")',
                              url, response.status_code, response.reason)
             return None
+
+        if "/pin/" in response.url:
+            return self._parse_pin_legacy(response)
+        if "/videos/" in response.url:
+            return self._parse_pin_video(response)
+        return self._parse_pin_gifs(response)
+
+    def _parse_pin_legacy(self, response):
         extr = text.extract_from(response.text)
 
         data = {}
-        data["_http_headers"] = {"Referer": url}
+        data["_http_headers"] = {"Referer": response.url}
         data["thumbnail"] = extr('itemprop="thumbnail" content="', '"')
         data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower()
         data["title"] = text.unescape(extr('itemprop="name">' , '<'))
@@ -82,7 +105,8 @@
             src = (text.extr(iframe, ' src="', '"') or
                    text.extr(iframe, " src='", "'"))
             if not src:
-                self.log.warning("Unable to fetch media from %s", url)
+                self.log.warning(
+                    "Unable to fetch media from %s", response.url)
                 return None
             data["extension"] = None
             data["url"] = "ytdl:" + src
@@ -100,27 +124,60 @@
 
         return data
 
+    def _parse_pin_gifs(self, response):
+        extr = text.extract_from(response.text)
+
+        data = {
+            "_http_headers": {"Referer": response.url},
+            "type": "gif",
+            "url": extr(' href="', '"'),
+            "title": text.unescape(extr("<title>", " Gif | Sex.com<")),
+            "pin_id": text.parse_int(extr(
+                'rel="canonical" href="', '"').rpartition("/")[2]),
+            "tags": text.split_html(extr("</h1>", "</section>")),
+        }
+
+        return text.nameext_from_url(data["url"], data)
+
+    def _parse_pin_video(self, response):
+        extr = text.extract_from(response.text)
+
+        if not self.cookies.get("CloudFront-Key-Pair-Id", domain=".sex.com"):
+            self.log.warning("CloudFront cookies required for video downloads")
+
+        data = {
+            "_ytdl_manifest": "hls",
+            "extension": "mp4",
+            "type": "video",
+            "title": text.unescape(extr("<title>", " | Sex.com<")),
+            "pin_id": text.parse_int(extr(
+                'rel="canonical" href="', '"').rpartition("/")[2]),
+            "tags": text.split_html(extr(
+                'event_name="video_tags_click"', "<div data-testid=")
+                .partition(">")[2]),
+            "url": "ytdl:" + extr('<source src="', '"'),
+        }
+
+        return data
+
 
 class SexcomPinExtractor(SexcomExtractor):
     """Extractor for a pinned image or video on www.sex.com"""
     subcategory = "pin"
     directory_fmt = ("{category}",)
-    pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)(?!.*#related$)"
+    pattern = (BASE_PATTERN +
+               r"(/(?:pin|\w\w/(?:gif|video)s)/\d+/?)(?!.*#related$)")
     example = "https://www.sex.com/pin/12345-TITLE/"
 
-    def __init__(self, match):
-        SexcomExtractor.__init__(self, match)
-        self.pin_id = match.group(1)
-
     def pins(self):
-        return ("{}/pin/{}/".format(self.root, self.pin_id),)
+        return (self.root + self.groups[0],)
 
 
 class SexcomRelatedPinExtractor(SexcomPinExtractor):
     """Extractor for related pins on www.sex.com"""
     subcategory = "related-pin"
     directory_fmt = ("{category}", "related {original_pin[pin_id]}")
-    pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$"
+    pattern = BASE_PATTERN + r"(/pin/(\d+)/?).*#related$"
     example = "https://www.sex.com/pin/12345#related"
 
     def metadata(self):
@@ -129,7 +186,7 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor):
 
     def pins(self):
         url = "{}/pin/related?pinId={}&limit=24&offset=0".format(
-            self.root, self.pin_id)
+            self.root, self.groups[1])
         return self._pagination(url)
 
 
@@ -137,18 +194,14 @@ class SexcomPinsExtractor(SexcomExtractor):
     """Extractor for a user's pins on www.sex.com"""
     subcategory = "pins"
     directory_fmt = ("{category}", "{user}")
-    pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/pins/"
+    pattern = BASE_PATTERN + r"/user/([^/?#]+)/pins/"
     example = "https://www.sex.com/user/USER/pins/"
 
-    def __init__(self, match):
-        SexcomExtractor.__init__(self, match)
-        self.user = match.group(1)
-
     def metadata(self):
-        return {"user": text.unquote(self.user)}
+        return {"user": text.unquote(self.groups[0])}
 
     def pins(self):
-        url = "{}/user/{}/pins/".format(self.root, self.user)
+        url = "{}/user/{}/pins/".format(self.root, self.groups[0])
         return self._pagination(url)
 
 
@@ -156,18 +209,14 @@ class SexcomLikesExtractor(SexcomExtractor):
     """Extractor for a user's liked pins on www.sex.com"""
     subcategory = "likes"
     directory_fmt = ("{category}", "{user}", "Likes")
-    pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/"
+    pattern = BASE_PATTERN + r"/user/([^/?#]+)/likes/"
     example = "https://www.sex.com/user/USER/likes/"
 
-    def __init__(self, match):
-        SexcomExtractor.__init__(self, match)
-        self.user = match.group(1)
-
     def metadata(self):
-        return {"user": text.unquote(self.user)}
+        return {"user": text.unquote(self.groups[0])}
 
     def pins(self):
-        url = "{}/user/{}/likes/".format(self.root, self.user)
+        url = "{}/user/{}/likes/".format(self.root, self.groups[0])
         return self._pagination(url)
 
 
@@ -175,15 +224,12 @@ class SexcomBoardExtractor(SexcomExtractor):
     """Extractor for pins from a board on www.sex.com"""
     subcategory = "board"
     directory_fmt = ("{category}", "{user}", "{board}")
-    pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user"
+    pattern = (BASE_PATTERN + r"/user"
               r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)")
     example = "https://www.sex.com/user/USER/BOARD/"
 
-    def __init__(self, match):
-        SexcomExtractor.__init__(self, match)
-        self.user, self.board = match.groups()
-
     def metadata(self):
+        self.user, self.board = self.groups
         return {
             "user" : text.unquote(self.user),
             "board": text.unquote(self.board),
@@ -198,19 +244,18 @@ class SexcomSearchExtractor(SexcomExtractor):
     """Extractor for search results on www.sex.com"""
     subcategory = "search"
     directory_fmt = ("{category}", "search", "{search[query]}")
-    pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:"
+    pattern = (BASE_PATTERN + r"/((?:"
               r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s"
              r")/?(?:\?([^#]+))?)")
     example = "https://www.sex.com/search/pics?query=QUERY"
 
-    def __init__(self, match):
-        SexcomExtractor.__init__(self, match)
-        self.path = match.group(1)
+    def _init(self):
+        self.path, t1, query_alt, t2, query = self.groups
 
-        self.search = text.parse_query(match.group(5))
-        self.search["type"] = match.group(2) or match.group(4)
+        self.search = text.parse_query(query)
+        self.search["type"] = t1 or t2
         if "query" not in self.search:
-            self.search["query"] = match.group(3) or ""
+            self.search["query"] = query_alt or ""
 
     def metadata(self):
         return {"search": self.search}
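Note: the new date_url logic derives a timestamp from the year/month/day segments that sex.com embeds in its media URLs. The same idea in isolation (the sample URL is invented, only to show the expected shape):

    from datetime import datetime

    def date_from_url(url):
        # ".../2024/07/09/clip.gif".rsplit("/", 4)
        #   -> [prefix, "2024", "07", "09", "clip.gif"]
        parts = url.rsplit("/", 4)
        try:
            return datetime(int(parts[1]), int(parts[2]), int(parts[3]))
        except (IndexError, ValueError):
            return None  # URL does not carry a date

    print(date_from_url("https://cdn.example.com/media/2024/07/09/clip.gif"))
    # 2024-07-09 00:00:00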
(r"(?:https?://)?(?:www\.)?sex\.com/user" + pattern = (BASE_PATTERN + r"/user" r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)") example = "https://www.sex.com/user/USER/BOARD/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.user, self.board = match.groups() - def metadata(self): + self.user, self.board = self.groups return { "user" : text.unquote(self.user), "board": text.unquote(self.board), @@ -198,19 +244,18 @@ class SexcomSearchExtractor(SexcomExtractor): """Extractor for search results on www.sex.com""" subcategory = "search" directory_fmt = ("{category}", "search", "{search[query]}") - pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:" + pattern = (BASE_PATTERN + r"/((?:" r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s" r")/?(?:\?([^#]+))?)") example = "https://www.sex.com/search/pics?query=QUERY" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.path = match.group(1) + def _init(self): + self.path, t1, query_alt, t2, query = self.groups - self.search = text.parse_query(match.group(5)) - self.search["type"] = match.group(2) or match.group(4) + self.search = text.parse_query(query) + self.search["type"] = t1 or t2 if "query" not in self.search: - self.search["query"] = match.group(3) or "" + self.search["query"] = query_alt or "" def metadata(self): return {"search": self.search} diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 07c9b21..cdccd4c 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -48,7 +48,12 @@ class SkebExtractor(Extractor): def items(self): metadata = self.metadata() for user_name, post_num in self.posts(): - response, post = self._get_post_data(user_name, post_num) + try: + response, post = self._get_post_data(user_name, post_num) + except Exception as exc: + self.log.error("@%s/%s: %s: %s", user_name, post_num, + exc.__class__.__name__, exc) + continue if metadata: post.update(metadata) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 6c43941..5d0ec46 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -39,6 +39,8 @@ class SubscribestarExtractor(Extractor): for post_html in self.posts(): media = self._media_from_post(post_html) data = self._data_from_post(post_html) + data["title"] = text.unescape(text.extr( + data["content"], "<h1>", "</h1>")) yield Message.Directory, data for num, item in enumerate(media, 1): item.update(data) @@ -55,7 +57,9 @@ class SubscribestarExtractor(Extractor): while True: response = Extractor.request(self, url, **kwargs) - if response.history and "/verify_subscriber" in response.url: + if response.history and ( + "/verify_subscriber" in response.url or + "/age_confirmation_warning" in response.url): raise exception.StopExtraction( "HTTP redirect to %s", response.url) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index 30f310d..4c1da7a 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -25,14 +25,8 @@ class TiktokExtractor(Extractor): def _init(self): self.audio = self.config("audio", True) self.video = self.config("videos", True) - if not self.config("avatar", True): - self.avatar = util.false def items(self): - # We assume that all of the URLs served by urls() come from the same - # author. 
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 30f310d..4c1da7a 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -25,14 +25,8 @@ class TiktokExtractor(Extractor):
     def _init(self):
         self.audio = self.config("audio", True)
         self.video = self.config("videos", True)
-        if not self.config("avatar", True):
-            self.avatar = util.false
 
     def items(self):
-        # We assume that all of the URLs served by urls() come from the same
-        # author.
-        downloaded_avatar = not self.avatar()
-
         for tiktok_url in self.urls():
             tiktok_url = self._sanitize_url(tiktok_url)
             data = self._extract_rehydration_data(tiktok_url)
@@ -49,18 +43,10 @@ class TiktokExtractor(Extractor):
             post = video_detail["itemInfo"]["itemStruct"]
             author = post["author"]
-            post["user"] = user = author["uniqueId"]
+            post["user"] = author["uniqueId"]
             post["date"] = text.parse_timestamp(post["createTime"])
             original_title = title = post["desc"]
 
-            if not downloaded_avatar:
-                avatar_url = author["avatarLarger"]
-                avatar = self._generate_avatar(
-                    avatar_url, post, user, author["id"])
-                yield Message.Directory, avatar
-                yield Message.Url, avatar_url, avatar
-                downloaded_avatar = True
-
             yield Message.Directory, post
 
             ytdl_media = False
@@ -111,44 +97,29 @@ class TiktokExtractor(Extractor):
                 })
                 yield Message.Url, "ytdl:" + tiktok_url, post
 
-        # If we couldn't download the avatar because the given user has no
-        # posts, we'll need to make a separate request for the user's page
-        # and download the avatar that way.
-        if not downloaded_avatar:
-            user_name = self.avatar()
-            profile_url = "https://www.tiktok.com/@{}".format(user_name)
-            data = self._extract_rehydration_data(profile_url)
-            data = data["webapp.user-detail"]["userInfo"]["user"]
-            data["user"] = user_name
-            avatar_url = data["avatarLarger"]
-            avatar = self._generate_avatar(
-                avatar_url, data, user_name, data["id"])
-            yield Message.Directory, avatar
-            yield Message.Url, avatar_url, avatar
-
-    def avatar(self):
-        return False
-
-    def _generate_avatar(self, avatar_url, data, user_name, user_id):
-        avatar = text.nameext_from_url(avatar_url, data.copy())
-        avatar.update({
-            "type"  : "avatar",
-            "title" : "@" + user_name,
-            "id"    : user_id,
-            "img_id": avatar["filename"].partition("~")[0],
-            "num"   : 0,
-        })
-        return avatar
-
     def _sanitize_url(self, url):
         return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
 
     def _extract_rehydration_data(self, url):
-        html = self.request(url).text
-        data = text.extr(
-            html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
-            'type="application/json">', '</script>')
-        return util.json_loads(data)["__DEFAULT_SCOPE__"]
+        tries = 0
+        while True:
+            try:
+                html = self.request(url).text
+                data = text.extr(
+                    html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
+                    'type="application/json">', '</script>')
+                return util.json_loads(data)["__DEFAULT_SCOPE__"]
+            except ValueError:
+                # We failed to retrieve rehydration data. This happens
+                # relatively frequently when making many requests, so
+                # retry.
+                if tries >= self._retries:
+                    raise
+                tries += 1
+                self.log.warning("%s: Failed to retrieve rehydration data "
+                                 "(%s/%s)", url.rpartition("/")[2], tries,
+                                 self._retries)
+                self.sleep(self._timeout, "retry")
 
     def _extract_audio(self, post):
         audio = post["music"]
@@ -179,7 +150,7 @@ class TiktokExtractor(Extractor):
         elif status == 10204:
             self.log.error("%s: Requested post not available", url)
         elif status == 10231:
-            self.log.error("%s: Region locked - Try downloading with a"
+            self.log.error("%s: Region locked - Try downloading with a "
                            "VPN/proxy connection", url)
         else:
             self.log.error(
@@ -230,7 +201,10 @@ class TiktokUserExtractor(TiktokExtractor):
     pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)"
     example = "https://www.tiktok.com/@USER"
 
-    def urls(self):
+    def _init(self):
+        self.avatar = self.config("avatar", True)
+
+    def items(self):
         """Attempt to use yt-dlp/youtube-dl to extract links from a
            user's page"""
 
@@ -263,19 +237,39 @@ class TiktokUserExtractor(TiktokExtractor):
         ytdl_instance = ytdl.construct_YoutubeDL(
             module, self, user_opts, extr_opts)
 
-        # transfer cookies to ytdl
+        # Transfer cookies to ytdl.
         if self.cookies:
             set_cookie = ytdl_instance.cookiejar.set_cookie
             for cookie in self.cookies:
                 set_cookie(cookie)
 
+        user_name = self.groups[0]
+        profile_url = "{}/@{}".format(self.root, user_name)
+        if self.avatar:
+            avatar_url, avatar = self._generate_avatar(user_name, profile_url)
+            yield Message.Directory, avatar
+            yield Message.Url, avatar_url, avatar
+
         with ytdl_instance as ydl:
             info_dict = ydl._YoutubeDL__extract_info(
-                "{}/@{}".format(self.root, self.groups[0]),
-                ydl.get_info_extractor("TikTokUser"),
+                profile_url, ydl.get_info_extractor("TikTokUser"),
                 False, {}, True)
             # This should include video and photo posts in /video/ URL form.
-            return [video["url"] for video in info_dict["entries"]]
-
-    def avatar(self):
-        return self.groups[0]
+            for video in info_dict["entries"]:
+                data = {"_extractor": TiktokPostExtractor}
+                yield Message.Queue, video["url"].partition("?")[0], data
+
+    def _generate_avatar(self, user_name, profile_url):
+        data = self._extract_rehydration_data(profile_url)
+        data = data["webapp.user-detail"]["userInfo"]["user"]
+        data["user"] = user_name
+        avatar_url = data["avatarLarger"]
+        avatar = text.nameext_from_url(avatar_url, data.copy())
+        avatar.update({
+            "type"  : "avatar",
+            "title" : "@" + user_name,
+            "id"    : data["id"],
+            "img_id": avatar["filename"].partition("~")[0],
+            "num"   : 0,
+        })
+        return (avatar_url, avatar)
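Note: the retry loop added to _extract_rehydration_data() guards the one step that fails intermittently in practice: slicing the __UNIVERSAL_DATA_FOR_REHYDRATION__ JSON out of the page. Stripped down to stdlib form (fetching is stubbed out; the retry count and delay are illustrative):

    import json
    import time

    MARKER = ('<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
              'type="application/json">')

    def extract_rehydration_data(fetch, url, retries=3, delay=5.0):
        for attempt in range(retries + 1):
            _, _, rest = fetch(url).partition(MARKER)
            payload = rest.partition("</script>")[0]
            try:
                # json.loads("") raises ValueError when the marker is
                # missing - the exact case the new code retries on.
                return json.loads(payload)["__DEFAULT_SCOPE__"]
            except ValueError:
                if attempt >= retries:
                    raise
                time.sleep(delay)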
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index bc135ad..ac1400e 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -64,16 +64,22 @@ class ZerochanExtractor(BooruExtractor):
 
     def _parse_entry_html(self, entry_id):
         url = "{}/{}".format(self.root, entry_id)
-        extr = text.extract_from(self.request(url).text)
+        page = self.request(url).text
+        try:
+            jsonld = self._extract_jsonld(page)
+        except Exception:
+            return {"id": entry_id}
+
+        extr = text.extract_from(page)
 
         data = {
             "id"      : text.parse_int(entry_id),
-            "author"  : text.parse_unicode_escapes(extr(' "name": "', '"')),
-            "file_url": extr('"contentUrl": "', '"'),
-            "date"    : text.parse_datetime(extr('"datePublished": "', '"')),
-            "width"   : text.parse_int(extr('"width": "', ' ')),
-            "height"  : text.parse_int(extr('"height": "', ' ')),
-            "size"    : text.parse_bytes(extr('"contentSize": "', 'B')),
+            "author"  : jsonld["author"]["name"],
+            "file_url": jsonld["contentUrl"],
+            "date"    : text.parse_datetime(jsonld["datePublished"]),
+            "width"   : text.parse_int(jsonld["width"][:-3]),
+            "height"  : text.parse_int(jsonld["height"][:-3]),
+            "size"    : text.parse_bytes(jsonld["contentSize"][:-1]),
             "path"    : text.split_html(extr(
                 'class="breadcrumbs', '</nav>'))[2:],
             "uploader": extr('href="/user/', '"'),
@@ -86,7 +92,7 @@ class ZerochanExtractor(BooruExtractor):
         tags = data["tags"] = []
         for tag in html.split("<li class=")[1:]:
             category = text.extr(tag, '"', '"')
-            name = text.extr(tag, 'data-tag="', '"')
+            name = text.unescape(text.extr(tag, 'data-tag="', '"'))
             tags.append(category.partition(" ")[0].capitalize() + ":" + name)
 
         return data
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 06a580b..76e6517 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -686,8 +686,10 @@ class CustomNone():
 
 
 # v128.0 release on 2024-07-09 has ordinal 739076
+# v137.0 release on 2025-04-01 has ordinal 739342
 # 735492 == 739076 - 128 * 28
-_ff_ver = (datetime.date.today().toordinal() - 735492) // 28
+# 735506 == 739342 - 137 * 28
+_ff_ver = (datetime.date.today().toordinal() - 735506) // 28
 
 NONE = CustomNone()
 EPOCH = datetime.datetime(1970, 1, 1)
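Note: the _ff_ver update re-calibrates the Firefox-version estimate: Firefox ships a major release roughly every 28 days, so the current version is the number of 28-day periods since a known release ordinal. The arithmetic can be checked directly against the ordinals quoted in the diff's comments:

    import datetime

    BASE = 739342 - 137 * 28   # == 735506; v137.0 released 2025-04-01

    def firefox_major(today=None):
        # One major version per 28-day cycle since the calibrated base.
        ordinal = (today or datetime.date.today()).toordinal()
        return (ordinal - BASE) // 28

    assert datetime.date(2025, 4, 1).toordinal() == 739342
    assert firefox_major(datetime.date(2025, 4, 1)) == 137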
-__version__ = "1.29.2" +__version__ = "1.29.3" __variant__ = None diff --git a/test/test_config.py b/test/test_config.py index 1d49d77..be58456 100644 --- a/test/test_config.py +++ b/test/test_config.py @@ -168,6 +168,7 @@ class TestConfig(unittest.TestCase): options = ( (("b",) , "c", [1, 2, 3]), (("e", "f"), "g", 234), + (("e", "f"), "g", 234), ) self.assertEqual(config.get(("b",) , "c"), "text") diff --git a/test/test_cookies.py b/test/test_cookies.py index 60c83ff..9ba562c 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -153,19 +153,49 @@ class TestCookieUtils(unittest.TestCase): self.assertFalse(extr.cookies_check(("a",))) self.assertFalse(extr.cookies_check(("a", "b"))) - extr.cookies.set("a", "1") - self.assertFalse(extr.cookies_check(("a",))) + extr.cookies.set("nd_a", "1") + self.assertFalse(extr.cookies_check(("nd_a",))) - extr.cookies.set("a", "1", domain=extr.cookies_domain) - self.assertTrue(extr.cookies_check(("a",))) + extr.cookies.set("cd_a", "1", domain=extr.cookies_domain) + self.assertTrue(extr.cookies_check(("cd_a",))) - extr.cookies.set("a", "1", domain="www" + extr.cookies_domain) + extr.cookies.set("wd_a", "1", domain="www" + extr.cookies_domain) + self.assertFalse(extr.cookies_check(("wd_a",))) self.assertEqual(len(extr.cookies), 3) - self.assertTrue(extr.cookies_check(("a",))) - extr.cookies.set("b", "2", domain=extr.cookies_domain) - extr.cookies.set("c", "3", domain=extr.cookies_domain) - self.assertTrue(extr.cookies_check(("a", "b", "c"))) + extr.cookies.set("cd_b", "2", domain=extr.cookies_domain) + extr.cookies.set("cd_c", "3", domain=extr.cookies_domain) + self.assertFalse(extr.cookies_check(("nd_a", "cd_b", "cd_c"))) + self.assertTrue(extr.cookies_check(("cd_a", "cd_b", "cd_c"))) + self.assertFalse(extr.cookies_check(("wd_a", "cd_b", "cd_c"))) + self.assertEqual(len(extr.cookies), 5) + + def test_check_cookies_domain_sub(self): + extr = _get_extractor("test") + self.assertFalse(extr.cookies, "empty") + extr.cookies_domain = ".example.org" + + self.assertFalse(extr.cookies_check(("a",), subdomains=True)) + self.assertFalse(extr.cookies_check(("a", "b"), subdomains=True)) + + extr.cookies.set("nd_a", "1") + self.assertFalse(extr.cookies_check(("nd_a",), subdomains=True)) + + extr.cookies.set("cd_a", "1", domain=extr.cookies_domain) + self.assertTrue(extr.cookies_check(("cd_a",), subdomains=True)) + + extr.cookies.set("wd_a", "1", domain="www" + extr.cookies_domain) + self.assertTrue(extr.cookies_check(("wd_a",), subdomains=True)) + + extr.cookies.set("cd_b", "2", domain=extr.cookies_domain) + extr.cookies.set("cd_c", "3", domain=extr.cookies_domain) + self.assertEqual(len(extr.cookies), 5) + self.assertFalse(extr.cookies_check( + ("nd_a", "cd_b", "cd_c"), subdomains=True)) + self.assertTrue(extr.cookies_check( + ("cd_a", "cd_b", "cd_c"), subdomains=True)) + self.assertTrue(extr.cookies_check( + ("wd_a", "cd_b", "cd_c"), subdomains=True)) def test_check_cookies_expires(self): extr = _get_extractor("test") diff --git a/test/test_extractor.py b/test/test_extractor.py index cc85fb2..dfc5ff8 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -104,27 +104,39 @@ class TestExtractorModule(unittest.TestCase): @unittest.skipIf(not results, "no test data") def test_categories(self): for result in results.all(): - url = result["#url"] - cls = result["#class"] - try: - extr = cls.from_url(url) - except ImportError as exc: - if exc.name in ("youtube_dl", "yt_dlp"): - print("Skipping '{}' category checks".format(cls.category)) - continue - 
diff --git a/test/test_extractor.py b/test/test_extractor.py
index cc85fb2..dfc5ff8 100644
--- a/test/test_extractor.py
+++ b/test/test_extractor.py
@@ -104,27 +104,39 @@ class TestExtractorModule(unittest.TestCase):
     @unittest.skipIf(not results, "no test data")
     def test_categories(self):
         for result in results.all():
-            url = result["#url"]
-            cls = result["#class"]
-            try:
-                extr = cls.from_url(url)
-            except ImportError as exc:
-                if exc.name in ("youtube_dl", "yt_dlp"):
-                    print("Skipping '{}' category checks".format(cls.category))
-                    continue
-                raise
-            self.assertTrue(extr, url)
-
-            categories = result.get("#category")
-            if categories:
-                base, cat, sub = categories
+            if result.get("#fail"):
+                try:
+                    self.assertCategories(result)
+                except AssertionError:
+                    pass
+                else:
+                    self.fail(result["#url"] + ": Test did not fail")
             else:
-                cat = cls.category
-                sub = cls.subcategory
-                base = cls.basecategory
-            self.assertEqual(extr.category, cat, url)
-            self.assertEqual(extr.subcategory, sub, url)
-            self.assertEqual(extr.basecategory, base, url)
+                self.assertCategories(result)
+
+    def assertCategories(self, result):
+        url = result["#url"]
+        cls = result["#class"]
+
+        try:
+            extr = cls.from_url(url)
+        except ImportError as exc:
+            if exc.name in ("youtube_dl", "yt_dlp"):
+                print("Skipping '{}' category checks".format(cls.category))
+                return
+            raise
+        self.assertTrue(extr, url)
+
+        categories = result.get("#category")
+        if categories:
+            base, cat, sub = categories
+        else:
+            cat = cls.category
+            sub = cls.subcategory
+            base = cls.basecategory
+        self.assertEqual(extr.category, cat, url)
+        self.assertEqual(extr.subcategory, sub, url)
+        self.assertEqual(extr.basecategory, base, url)
 
     @unittest.skipIf(not results, "no test data")
     def test_unique_pattern_matches(self):
@@ -133,7 +145,8 @@ class TestExtractorModule(unittest.TestCase):
         append = test_urls.append
 
         for result in results.all():
-            append((result["#url"], result["#class"]))
+            if not result.get("#fail"):
+                append((result["#url"], result["#class"]))
 
         # iterate over all testcase URLs
         for url, extr1 in test_urls:
diff --git a/test/test_results.py b/test/test_results.py
index 3136743..28db6c3 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -92,6 +92,15 @@ class TestExtractorResults(unittest.TestCase):
             self.assertGreaterEqual(value, range.start, msg=msg)
 
     def _run_test(self, result):
+        if result.get("#fail"):
+            del result["#fail"]
+            try:
+                self._run_test(result)
+            except AssertionError:
+                return
+            else:
+                self.fail("Test did not fail")
+
         base, cat, sub = result_categories(result)
 
         result.pop("#comment", None)
         result.pop("#category", None)
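Note: both test-suite changes above implement the same convention: a result dict flagged with "#fail" must make its assertions raise, and the test errors out if they pass instead. Reduced to the bare idiom (assumes unittest; the check() body is a stand-in for the real assertions):

    import unittest

    class FailFlagExample(unittest.TestCase):
        def check(self, result):
            self.assertIn("#url", result)   # stand-in for real assertions

        def run_result(self, result):
            if result.get("#fail"):
                try:
                    self.check(result)
                except AssertionError:
                    return              # expected failure: test passes
                self.fail("Test did not fail")
            else:
                self.check(result)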
