| author    | 2024-10-14 03:02:11 -0400 |
|-----------|---------------------------|
| committer | 2024-10-14 03:02:11 -0400 |
| commit    | b28a9957b772b1d063bea4a50c0bbcb04cdef791 (patch) |
| tree      | bdf3b3bafb821af5cd41206d66c4a0b7a60e2a92 |
| parent    | 061cbaf29e92e57152175f877740d3d1a2157bd6 (diff) |
| parent    | 0db541f524e1774865efebcbe5653e9ad76ea2e8 (diff) |
Update upstream source from tag 'upstream/1.27.6'
Update to upstream version '1.27.6'
with Debian dir bbafbef8a09f7a005c8afa3ebb1c8527d67c172a
43 files changed, 1449 insertions(+), 452 deletions(-)
```diff
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab8f174..bc6a301 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,33 +1,57 @@
-## 1.27.5 - 2024-09-28
+## 1.27.6 - 2024-10-11
 ### Extractors
 #### Additions
-- [ao3] add support ([#6013](https://github.com/mikf/gallery-dl/issues/6013))
-- [civitai] add support ([#3706](https://github.com/mikf/gallery-dl/issues/3706), [#3787](https://github.com/mikf/gallery-dl/issues/3787), [#4129](https://github.com/mikf/gallery-dl/issues/4129), [#5995](https://github.com/mikf/gallery-dl/issues/5995), [#6220](https://github.com/mikf/gallery-dl/issues/6220))
-- [cohost] add support ([#4483](https://github.com/mikf/gallery-dl/issues/4483), [#6191](https://github.com/mikf/gallery-dl/issues/6191))
+- [ao3] add `subscriptions` extractor ([#6247](https://github.com/mikf/gallery-dl/issues/6247))
+- [boosty] add support ([#2387](https://github.com/mikf/gallery-dl/issues/2387))
+- [civitai] add `post` extractors ([#6279](https://github.com/mikf/gallery-dl/issues/6279))
+- [pixiv] support unlisted artworks ([#5162](https://github.com/mikf/gallery-dl/issues/5162))
 #### Fixes
-- [8chan] update `TOS` cookie name
-- [deviantart] work around OAuth API returning empty journal texts ([#6196](https://github.com/mikf/gallery-dl/issues/6196), [#6207](https://github.com/mikf/gallery-dl/issues/6207), [#5916](https://github.com/mikf/gallery-dl/issues/5916))
-- [weasyl:favorite] fix pagination ([#6113](https://github.com/mikf/gallery-dl/issues/6113))
+- [cohost] sanitize default filenames ([#6262](https://github.com/mikf/gallery-dl/issues/6262))
+  - limit `headline` length
+  - remove `plainTextBody`
+- [deviantart] fix & improve journal/literature extraction ([#6254](https://github.com/mikf/gallery-dl/issues/6254), [#6207](https://github.com/mikf/gallery-dl/issues/6207), [#6196](https://github.com/mikf/gallery-dl/issues/6196))
+  - extract journal HTML from webpage if possible
+  - support converting `tiptap` markup to HTML
+- [deviantart] fix `stash` folder extraction
+- [flickr] update default API credentials ([#6300](https://github.com/mikf/gallery-dl/issues/6300))
+- [flickr] fix `ZeroDivisionError` ([#6252](https://github.com/mikf/gallery-dl/issues/6252))
+- [imagefap] fix `{num}` in single image default filenames
+- [myhentaigallery] fix `tags` extraction
+- [patreon] extract `attachments_media` files ([#6241](https://github.com/mikf/gallery-dl/issues/6241), [#6268](https://github.com/mikf/gallery-dl/issues/6268))
+- [pixiv] implement workaround for `limit_sanity_level` works ([#4327](https://github.com/mikf/gallery-dl/issues/4327), [#4747](https://github.com/mikf/gallery-dl/issues/4747), [#5054](https://github.com/mikf/gallery-dl/issues/5054), [#5435](https://github.com/mikf/gallery-dl/issues/5435), [#5651](https://github.com/mikf/gallery-dl/issues/5651), [#5655](https://github.com/mikf/gallery-dl/issues/5655))
+- [pornhub] fix `KeyError` when album images are missing ([#6299](https://github.com/mikf/gallery-dl/issues/6299))
+- [rule34us] fix extraction ([#6289](https://github.com/mikf/gallery-dl/issues/6289))
+- [8chan] set TOS cookie for current and previous day
 #### Improvements
-- [bluesky] support video downloads ([#6183](https://github.com/mikf/gallery-dl/issues/6183))
-- [deviantart] add `previews` option ([#3782](https://github.com/mikf/gallery-dl/issues/3782), [#6124](https://github.com/mikf/gallery-dl/issues/6124))
-- [deviantart] warn about empty journal texts ([#5916](https://github.com/mikf/gallery-dl/issues/5916))
-- [inkbunny:favorite] update default directory ([#6115](https://github.com/mikf/gallery-dl/issues/6115))
-- [jpgfish] update domain to `jpg5.su` ([#6231](https://github.com/mikf/gallery-dl/issues/6231))
-- [skeb] prevent 429 errors and need for `request_key` cookie
-- [weasyl:favorite] support readable URL format ([#6113](https://github.com/mikf/gallery-dl/issues/6113))
-- [wikimedia] automatically detect API endpoint when none is defined
-- [zzup] support `up.zzup.com` galleries ([#6181](https://github.com/mikf/gallery-dl/issues/6181))
+- [bunkr] support `bunkr.pk` URLs ([#6272](https://github.com/mikf/gallery-dl/issues/6272))
+- [civitai] use tRPC API by default ([#6279](https://github.com/mikf/gallery-dl/issues/6279))
+- [civitai] improve default archive format ([#6302](https://github.com/mikf/gallery-dl/issues/6302))
+- [komikcast] update domain to `komikcast.cz`
+- [newgrounds] detect more comment embeds ([#6253](https://github.com/mikf/gallery-dl/issues/6253))
+- [newgrounds] add more fallback URL formats for `art-images` files
+- [oauth] prevent empty browser names
+- [patreon] use mobile UA ([#6241](https://github.com/mikf/gallery-dl/issues/6241), [#6239](https://github.com/mikf/gallery-dl/issues/6239), [#6140](https://github.com/mikf/gallery-dl/issues/6140))
+- [patreon] handle suspended accounts
+- [pixiv] detect works requiring `My pixiv` access
+#### Metadata
+- [civitai] ensure image files have an `id` ([#6251](https://github.com/mikf/gallery-dl/issues/6251))
+- [gelbooru_v02] unescape HTML entities in categorized tags
+- [generic] ensure `path` metadata is always defined
+- [pixiv] retrieve `caption` from AJAX API when empty ([#4327](https://github.com/mikf/gallery-dl/issues/4327), [#5191](https://github.com/mikf/gallery-dl/issues/5191))
+#### Options
+- [fanbox] add `comments` option, extend `metadata` option ([#6287](https://github.com/mikf/gallery-dl/issues/6287))
+- [pixiv] add `comments` option ([#6287](https://github.com/mikf/gallery-dl/issues/6287))
+#### Removals
+- [blogger] remove `micmicidol.club`
+- [chevereto] remove `deltaporno.com`
+- [lolisafe] remove `xbunkr.com`
+- [pururin] remove module
+- [shimmie2] remove `loudbooru.com`
 ### Post Processors
-- [ugoira] implement storing "original" frames in ZIP archives ([#6147](https://github.com/mikf/gallery-dl/issues/6147))
-- [ugoira] fix `KeyError: '_ugoira_frame_index'` ([#6154](https://github.com/mikf/gallery-dl/issues/6154))
-### Formatter
-- add `L` conversion - returns the length of a value
-- allow accessing `util.NONE` via global `_nul`
+- [ugoira] fix `BadZipFile` exceptions ([#6285](https://github.com/mikf/gallery-dl/issues/6285))
+- [ugoira] catch all exceptions when extracting ZIP archives ([#6285](https://github.com/mikf/gallery-dl/issues/6285))
+- [ugoira] forward frame data as `_ugoira_frame_data` ([#6154](https://github.com/mikf/gallery-dl/issues/6154), [#6285](https://github.com/mikf/gallery-dl/issues/6285))
 ### Miscellaneous
-- [cookies] add `cookies-select` option
-- [cookies:firefox] support using domain & container filters together
-- [docker] prevent errors in Dockerfile build
-- [tests] make `#category` result entries optional
-- allow filtering `--list-extractors` results
-- implement alternatives for deprecated `utc` datetime functions
+- [build] remove setuptools and requests version restrictions
+- [docker] build from `python:3.12-alpine`
+- [text] improve `parse_query()` performance
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.27.5
+Version: 1.27.6
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -114,9 +114,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/README.rst b/README.rst
@@ -74,9 +74,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 06effd6..3fedff4 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2024-09-28" "1.27.5" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2024-10-11" "1.27.6" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index a36e108..ba4bb3e 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2024-09-28" "1.27.5" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2024-10-11" "1.27.6" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -772,7 +772,7 @@ Setting this value to \f[I]"browser"\f[] will try to automatically
 detect and use the User-Agent used by the system's default browser.
 
 Note: This option has no effect on
-pixiv, e621, and mangadex
+pixiv, e621, mangadex, and patreon
 extractors, as these need specific values to function correctly.
 
 
@@ -782,7 +782,7 @@
 .IP "Default:" 9
 .br
-* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]mangasee\f[], \f[I]patreon\f[], \f[I]pixiv:series\f[], \f[I]twitter\f[]
+* \f[I]"firefox"\f[]: \f[I]artstation\f[], \f[I]mangasee\f[], \f[I]twitter\f[]
 .br
 * \f[I]null\f[]: otherwise
 
@@ -868,7 +868,7 @@ to be passed to
 .IP "Default:" 9
 .br
-* \f[I]false\f[]: \f[I]artstation\f[], \f[I]patreon\f[], \f[I]pixiv:series\f[]
+* \f[I]false\f[]: \f[I]artstation\f[]
 .br
 * \f[I]true\f[]: otherwise
 
@@ -1761,6 +1761,63 @@ Process reposts.
 
 Download videos.
 
+.SS extractor.boosty.allowed
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Request only available posts.
+
+
+.SS extractor.boosty.bought
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Request only purchased posts for \f[I]feed\f[] results.
```
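A minimal sketch, not part of the diff: the two new `boosty` options documented above could be enabled from Python via gallery-dl's `config.set(path, key, value)` helper (the usual route is equivalent keys under `extractor.boosty` in a `config.json`; the values here are illustrative):

```python
from gallery_dl import config

# Request only available posts (this is already the default).
config.set(("extractor", "boosty"), "allowed", True)
# For feed results, request only purchased posts (default is false).
config.set(("extractor", "boosty"), "bought", True)
```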
```diff
+
+
+.SS extractor.boosty.metadata
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Provide detailed \f[I]user\f[] metadata.
+
+
+.SS extractor.boosty.videos
+.IP "Type:" 6
+.br
+* \f[I]bool\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Example:" 4
+["full_hd", "high", "medium"]
+
+.IP "Description:" 4
+Download videos.
+
+If this is a \f[I]list\f[], it selects which format to try to download.
+.br
+Possibly available formats are
+.br
+\f[I]"quad_hd"\f[], \f[I]"ultra_hd"\f[], \f[I]"full_hd"\f[],
+\f[I]"high"\f[], \f[I]"medium"\f[], \f[I]"low"\f[]
+
+
 .SS extractor.bunkr.tlds
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -1799,7 +1856,7 @@ Available types are
 \f[I]string\f[]
 
 .IP "Default:" 9
-\f[I]"rest"\f[]
+\f[I]"trpc"\f[]
 
 .IP "Description:" 4
 Selects which API endpoints to use.
@@ -1807,7 +1864,7 @@ Selects which API endpoints to use.
 .br
 * \f[I]"rest"\f[]: \f[I]Public REST API\f[]
 .br
-* \f[I]"trpc"\f[]: Internal TRPC API
+* \f[I]"trpc"\f[]: Internal tRPC API
 
 
 .SS extractor.civitai.api-key
@@ -1839,6 +1896,28 @@ Available types are
 \f[I]gallery\f[].
 
 
+.SS extractor.civitai.include
+.IP "Type:" 6
+.br
+* \f[I]string\f[]
+.br
+* \f[I]list\f[] of \f[I]strings\f[]
+
+.IP "Default:" 9
+\f[I]["user-models", "user-posts"]\f[]
+
+.IP "Description:" 4
+A (comma-separated) list of subcategories to include
+when processing a user profile.
+
+Possible values are
+\f[I]"user-models"\f[],
+\f[I]"user-posts"\f[],
+\f[I]"user-images"\f[].
+
+It is possible to use \f[I]"all"\f[] instead of listing all values separately.
+
+
 .SS extractor.civitai.nsfw
 .IP "Type:" 6
 .br
@@ -2620,6 +2699,20 @@ Selects an alternative source to download files from.
 * \f[I]"hitomi"\f[]: Download the corresponding gallery from \f[I]hitomi.la\f[]
 
 
+.SS extractor.fanbox.comments
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Extract \f[I]comments\f[] metadata.
+
+Note: This requires 1 or more additional API requests per post,
+depending on the number of comments.
+
+
 .SS extractor.fanbox.embeds
 .IP "Type:" 6
 .br
@@ -2657,13 +2750,25 @@ extraction and download for YouTube, Vimeo, and SoundCloud embeds.
 
 .IP "Example:" 4
 .br
-* user,plan
+* user,plan,comments
 .br
-* ["user", "plan"]
+* ["user", "plan", "comments"]
 
 .IP "Description:" 4
 Extract \f[I]plan\f[] and extended \f[I]user\f[] metadata.
 
+Supported fields when selecting which data to extract are
+
+.br
+* \f[I]comments\f[]
+.br
+* \f[I]plan\f[]
+.br
+* \f[I]user\f[]
+
+Note: \f[I]comments\f[] can also be enabled via
+\f[I]fanbox.comments\f[]
+
 
 .SS extractor.flickr.access-token & .access-token-secret
 .IP "Type:" 6
@@ -3987,7 +4092,21 @@ For works bookmarked by \f[I]your own account\f[],
 fetch bookmark tags as \f[I]tags_bookmark\f[] metadata.
 
-Note: This requires 1 additional API call per bookmarked post.
+Note: This requires 1 additional API request per bookmarked post.
+
+
+.SS extractor.pixiv.comments
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]false\f[]
+
+.IP "Description:" 4
+Fetch \f[I]comments\f[] metadata.
+
+Note: This requires 1 or more additional API requests per post,
+depending on the number of comments.
 
 
 .SS extractor.pixiv.work.related
@@ -4054,6 +4173,17 @@ When downloading galleries, this sets
 the maximum number of posts to get.
 A value of \f[I]0\f[] means no limit.
 
+.SS extractor.pixiv.sanity
+.IP "Type:" 6
+\f[I]bool\f[]
+
+.IP "Default:" 9
+\f[I]true\f[]
+
+.IP "Description:" 4
+Try to fetch \f[I]limit_sanity_level\f[] works via web API.
```
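A sketch, not part of the diff, of how a list-valued `extractor.boosty.videos` setting (documented above) is resolved against a post's available formats; it mirrors the preference-list logic in the new boosty extractor further down, with made-up data:

```python
# A post's video variants; an empty URL means that format is not available.
player_urls = [
    {"type": "ultra_hd", "url": ""},
    {"type": "full_hd",  "url": "https://example.org/video.mp4?q=fhd"},
    {"type": "low",      "url": "https://example.org/video.mp4?q=low"},
]
preference = ["quad_hd", "ultra_hd", "full_hd", "high", "medium", "low"]

fmts = {f["type"]: f["url"] for f in player_urls if f["url"]}
formats = [fmts[f] for f in preference if f in fmts]
# The first match is downloaded; any remaining URLs serve as fallbacks.
print(formats)
# ['https://example.org/video.mp4?q=fhd', 'https://example.org/video.mp4?q=low']
```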
```diff
+
+
 .SS extractor.plurk.comments
 .IP "Type:" 6
 \f[I]bool\f[]
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 32ff8bc..27d0dd4 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.27.5
+Version: 1.27.6
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -114,9 +114,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.5/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__
 
 
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index 8ae8026..df9217a 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -67,6 +67,7 @@ gallery_dl/extractor/behance.py
 gallery_dl/extractor/blogger.py
 gallery_dl/extractor/bluesky.py
 gallery_dl/extractor/booru.py
+gallery_dl/extractor/boosty.py
 gallery_dl/extractor/bunkr.py
 gallery_dl/extractor/catbox.py
 gallery_dl/extractor/chevereto.py
@@ -179,7 +180,6 @@ gallery_dl/extractor/poringa.py
 gallery_dl/extractor/pornhub.py
 gallery_dl/extractor/pornpics.py
 gallery_dl/extractor/postmill.py
-gallery_dl/extractor/pururin.py
 gallery_dl/extractor/reactor.py
 gallery_dl/extractor/readcomiconline.py
 gallery_dl/extractor/recursive.py
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index afa3a69..f81d2a1 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -29,8 +29,10 @@ class _8chanExtractor(Extractor):
     def _init(self):
         now = util.datetime_utcnow()
         domain = self.root.rpartition("/")[2]
-        self.cookies.set("TOS20240928", "1", domain=domain)
-        self.cookies.set(now.strftime("TOS%Y%m%d"), "1", domain=domain)
+        self.cookies.set(
+            now.strftime("TOS%Y%m%d"), "1", domain=domain)
+        self.cookies.set(
+            (now - timedelta(1)).strftime("TOS%Y%m%d"), "1", domain=domain)
 
     @memcache()
     def cookies_prepare(self):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 826771c..9885195 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -32,6 +32,7 @@ modules = [
     "behance",
     "blogger",
     "bluesky",
+    "boosty",
     "bunkr",
     "catbox",
     "chevereto",
@@ -133,7 +134,6 @@ modules = [
     "pornhub",
     "pornpics",
     "postmill",
-    "pururin",
     "reactor",
     "readcomiconline",
     "reddit",
diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
index 1f570e8..d3ab846 100644
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -29,11 +29,31 @@ class Ao3Extractor(Extractor):
         self.login()
 
         base = self.root + "/works/"
-        data = {"_extractor": Ao3WorkExtractor}
+        data = {"_extractor": Ao3WorkExtractor, "type": "work"}
 
         for work_id in self.works():
             yield Message.Queue, base + work_id, data
 
+    def items_list(self, type, needle, part=True):
+        self.login()
+
+        base = self.root + "/"
+        data_work = {"_extractor": Ao3WorkExtractor, "type": "work"}
+        data_series = {"_extractor": Ao3SeriesExtractor, "type": "series"}
+        data_user = {"_extractor": Ao3UserExtractor, "type": "user"}
+
+        for item in self._pagination(self.groups[0], needle):
+            path = item.rpartition("/")[0] if part else item
+            url = base + path
+            if item.startswith("works/"):
+                yield Message.Queue, url, data_work
+            elif item.startswith("series/"):
+                yield Message.Queue, url, data_series
+            elif item.startswith("users/"):
+                yield Message.Queue, url, data_user
+            else:
+                self.log.warning("Unsupported %s type '%s'", type, path)
+
     def works(self):
         return self._pagination(self.groups[0])
 
@@ -284,19 +304,14 @@ class Ao3UserBookmarkExtractor(Ao3Extractor):
     example = "https://archiveofourown.org/users/USER/bookmarks"
 
     def items(self):
-        self.login()
+        return self.items_list("bookmark", '<span class="count"><a href="/')
 
-        base = self.root + "/"
-        data_work = {"_extractor": Ao3WorkExtractor}
-        data_series = {"_extractor": Ao3SeriesExtractor}
 
-        for item in self._pagination(
-                self.groups[0], '<span class="count"><a href="/'):
-            path = item.rpartition("/")[0]
-            url = base + path
-            if item.startswith("works/"):
-                yield Message.Queue, url, data_work
-            elif item.startswith("series/"):
-                yield Message.Queue, url, data_series
-            else:
-                self.log.warning("Unsupported bookmark type '%s'", path)
+class Ao3SubscriptionsExtractor(Ao3Extractor):
+    """Extractor for your AO3 account's subscriptions"""
+    subcategory = "subscriptions"
+    pattern = BASE_PATTERN + r"(/users/([^/?#]+)/subscriptions(?:/?\?.+)?)"
+    example = "https://archiveofourown.org/users/USER/subscriptions"
+
+    def items(self):
+        return self.items_list("subscription", '<dt>\n<a href="/', False)
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 402408e..37075ea 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -89,10 +89,6 @@ BASE_PATTERN = BloggerExtractor.update({
         "root": None,
         "pattern": r"[\w-]+\.blogspot\.com",
     },
-    "micmicidol": {
-        "root": "https://www.micmicidol.club",
-        "pattern": r"(?:www\.)?micmicidol\.club",
-    },
 })
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
new file mode 100644
index 0000000..997de4a
--- /dev/null
+++ b/gallery_dl/extractor/boosty.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.boosty.to/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+BASE_PATTERN = r"(?:https?://)?boosty\.to"
+
+
+class BoostyExtractor(Extractor):
+    """Base class for boosty extractors"""
+    category = "boosty"
+    root = "https://www.boosty.to"
+    directory_fmt = ("{category}", "{user[blogUrl]} ({user[id]})",
+                     "{post[date]:%Y-%m-%d} {post[int_id]}")
+    filename_fmt = "{num:>02} {file[id]}.{extension}"
+    archive_fmt = "{file[id]}"
+    cookies_domain = ".boosty.to"
+    cookies_names = ("auth",)
+
+    def _init(self):
+        self.api = BoostyAPI(self)
+
+        self._user = None if self.config("metadata") else False
+        self.only_allowed = self.config("allowed", True)
+        self.only_bought = self.config("bought")
+
+        videos = self.config("videos")
+        if videos is None or videos:
+            if isinstance(videos, str):
+                videos = videos.split(",")
+            elif not isinstance(videos, (list, tuple)):
+                videos = ("quad_hd", "ultra_hd", "full_hd",
+                          "high", "medium", "low")
+        self.videos = videos
+
+    def items(self):
+        for post in self.posts():
+            if not post.get("hasAccess"):
+                self.log.warning("Not allowed to access post %s", post["id"])
+                continue
+
+            files = self._process_post(post)
+            data = {
+                "post" : post,
+                "user" : post.pop("user", None),
+                "count": len(files),
+            }
+
+            yield Message.Directory, data
+            for data["num"], file in enumerate(files, 1):
+                data["file"] = file
+                url = file["url"]
+                yield Message.Url, url, text.nameext_from_url(url, data)
+
+    def posts(self):
+        """Yield JSON content of all relevant posts"""
+
+    def _process_post(self, post):
+        files = []
+        post["content"] = content = []
+        post["links"] = links = []
+
+        if "createdAt" in post:
+            post["date"] = text.parse_timestamp(post["createdAt"])
+        if self._user:
+            post["user"] = self._user
+
+        for block in post["data"]:
+            try:
+                type = block["type"]
+                if type == "text":
+                    if block["modificator"] == "BLOCK_END":
+                        continue
+                    c = util.json_loads(block["content"])
+                    content.append(c[0])
+
+                elif type == "image":
+                    files.append(self._update_url(post, block))
+
+                elif type == "ok_video":
+                    if not self.videos:
+                        self.log.debug("%s: Skipping video %s",
+                                       post["int_id"], block["id"])
+                        continue
+                    fmts = {
+                        fmt["type"]: fmt["url"]
+                        for fmt in block["playerUrls"]
+                        if fmt["url"]
+                    }
+                    formats = [
+                        fmts[fmt]
+                        for fmt in self.videos
+                        if fmt in fmts
+                    ]
+                    if formats:
+                        formats = iter(formats)
+                        block["url"] = next(formats)
+                        block["_fallback"] = formats
+                        files.append(block)
+                    else:
+                        self.log.warning(
+                            "%s: Found no suitable video format for %s",
+                            post["int_id"], block["id"])
+
+                elif type == "link":
+                    url = block["url"]
+                    links.append(url)
+                    content.append(url)
+
+                elif type == "audio_file":
+                    files.append(self._update_url(post, block))
+
+                else:
+                    self.log.debug("%s: Unsupported data type '%s'",
+                                   post["int_id"], type)
+            except Exception as exc:
+                self.log.debug("%s: %s", exc.__class__.__name__, exc)
+
+        del post["data"]
+        return files
+
+    def _update_url(self, post, block):
+        url = block["url"]
+        sep = "&" if "?" in url else "?"
+
+        signed_query = post.get("signedQuery")
+        if signed_query:
+            url += sep + signed_query[1:]
+            sep = "&"
+
+        migrated = post.get("isMigrated")
+        if migrated is not None:
+            url += sep + "is_migrated=" + str(migrated).lower()
+
+        block["url"] = url
+        return block
+
+
+class BoostyUserExtractor(BoostyExtractor):
+    """Extractor for boosty.to user profiles"""
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"/([^/?#]+)(?:\?([^#]+))?$"
+    example = "https://boosty.to/USER"
+
+    def posts(self):
+        user, query = self.groups
+        params = text.parse_query(query)
+        if self._user is None:
+            self._user = self.api.user(user)
+        return self.api.blog_posts(user, params)
+
+
+class BoostyMediaExtractor(BoostyExtractor):
+    """Extractor for boosty.to user media"""
+    subcategory = "media"
+    directory_fmt = "{category}", "{user[blogUrl]} ({user[id]})", "media"
+    filename_fmt = "{post[id]}_{num}.{extension}"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/media/([^/?#]+)(?:\?([^#]+))?"
+    example = "https://boosty.to/USER/media/all"
+
+    def posts(self):
+        user, media, query = self.groups
+        params = text.parse_query(query)
+        self._user = self.api.user(user)
+        return self.api.blog_media_album(user, media, params)
+
+
+class BoostyFeedExtractor(BoostyExtractor):
+    """Extractor for your boosty.to subscription feed"""
+    subcategory = "feed"
+    pattern = BASE_PATTERN + r"/(?:\?([^#]+))?(?:$|#)"
+    example = "https://boosty.to/"
+
+    def posts(self):
+        params = text.parse_query(self.groups[0])
+        return self.api.feed_posts(params)
+
+
+class BoostyPostExtractor(BoostyExtractor):
+    """Extractor for boosty.to posts"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/posts/([0-9a-f-]+)"
+    example = "https://boosty.to/USER/posts/01234567-89ab-cdef-0123-456789abcd"
+
+    def posts(self):
+        user, post_id = self.groups
+        if self._user is None:
+            self._user = self.api.user(user)
+        return (self.api.post(user, post_id),)
+
+
+class BoostyFollowingExtractor(BoostyExtractor):
+    """Extractor for your boosty.to subscribed users"""
+    subcategory = "following"
+    pattern = BASE_PATTERN + r"/app/settings/subscriptions"
+    example = "https://boosty.to/app/settings/subscriptions"
+
+    def items(self):
+        for user in self.api.user_subscriptions():
+            url = "{}/{}".format(self.root, user["blog"]["blogUrl"])
+            user["_extractor"] = BoostyUserExtractor
+            yield Message.Queue, url, user
+
+
+class BoostyAPI():
+    """Interface for the Boosty API"""
+    root = "https://api.boosty.to"
+
+    def __init__(self, extractor, access_token=None):
+        self.extractor = extractor
+        self.headers = {
+            "Accept": "application/json, text/plain, */*",
+            "Origin": extractor.root,
+        }
+
+        if not access_token:
+            auth = self.extractor.cookies.get("auth", domain=".boosty.to")
+            if auth:
+                access_token = text.extr(
+                    auth, "%22accessToken%22%3A%22", "%22")
+        if access_token:
+            self.headers["Authorization"] = "Bearer " + access_token
+
+    def blog_posts(self, username, params):
+        endpoint = "/v1/blog/{}/post/".format(username)
+        params = self._merge_params(params, {
+            "limit"         : "5",
+            "offset"        : None,
+            "comments_limit": "2",
+            "reply_limit"   : "1",
+        })
+        return self._pagination(endpoint, params)
+
+    def blog_media_album(self, username, type="all", params=()):
+        endpoint = "/v1/blog/{}/media_album/".format(username)
+        params = self._merge_params(params, {
+            "type"    : type.rstrip("s"),
+            "limit"   : "15",
+            "limit_by": "media",
+            "offset"  : None,
+        })
+        return self._pagination(endpoint, params, self._transform_media_posts)
+
+    def _transform_media_posts(self, data):
+        posts = []
+
+        for obj in data["mediaPosts"]:
+            post = obj["post"]
+            post["data"] = obj["media"]
+            posts.append(post)
+
+        return posts
+
+    def post(self, username, post_id):
+        endpoint = "/v1/blog/{}/post/{}".format(username, post_id)
+        return self._call(endpoint)
+
+    def feed_posts(self, params=None):
+        endpoint = "/v1/feed/post/"
+        params = self._merge_params(params, {
+            "limit"         : "5",
+            "offset"        : None,
+            "comments_limit": "2",
+        })
+        if "only_allowed" not in params and self.extractor.only_allowed:
+            params["only_allowed"] = "true"
+        if "only_bought" not in params and self.extractor.only_bought:
+            params["only_bought"] = "true"
+        return self._pagination(endpoint, params, key="posts")
+
+    def user(self, username):
+        endpoint = "/v1/blog/" + username
+        user = self._call(endpoint)
+        user["id"] = user["owner"]["id"]
+        return user
+
+    def user_subscriptions(self, params=None):
+        endpoint = "/v1/user/subscriptions"
+        params = self._merge_params(params, {
+            "limit"      : "30",
+            "with_follow": "true",
+            "offset"     : None,
+        })
+        return self._pagination_users(endpoint, params)
+
+    def _merge_params(self, params_web, params_api):
+        if params_web:
+            web_to_api = {
+                "isOnlyAllowedPosts": "is_only_allowed",
+                "postsTagsIds"      : "tags_ids",
+                "postsFrom"         : "from_ts",
+                "postsTo"           : "to_ts",
+            }
+            for name, value in params_web.items():
+                name = web_to_api.get(name, name)
+                params_api[name] = value
+        return params_api
+
+    def _call(self, endpoint, params=None):
+        url = self.root + endpoint
+
+        while True:
+            response = self.extractor.request(
+                url, params=params, headers=self.headers,
+                fatal=None, allow_redirects=False)
+
+            if response.status_code < 300:
+                return response.json()
+
+            elif response.status_code < 400:
+                raise exception.AuthenticationError("Invalid API access token")
+
+            elif response.status_code == 429:
+                self.extractor.wait(seconds=600)
+
+            else:
+                self.extractor.log.debug(response.text)
+                raise exception.StopExtraction("API request failed")
+
+    def _pagination(self, endpoint, params, transform=None, key=None):
+        if "is_only_allowed" not in params and self.extractor.only_allowed:
+            params["is_only_allowed"] = "true"
+
+        while True:
+            data = self._call(endpoint, params)
+
+            if transform:
+                yield from transform(data["data"])
+            elif key:
+                yield from data["data"][key]
+            else:
+                yield from data["data"]
+
+            extra = data["extra"]
+            if extra.get("isLast"):
+                return
+            offset = extra.get("offset")
+            if not offset:
+                return
+            params["offset"] = offset
+
+    def _pagination_users(self, endpoint, params):
+        while True:
+            data = self._call(endpoint, params)
+
+            yield from data["data"]
+
+            offset = data["offset"] + data["limit"]
+            if offset > data["total"]:
+                return
+            params["offset"] = offset
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 780bdf1..9022ffc 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -21,7 +21,7 @@ else:
 BASE_PATTERN = (
     r"(?:bunkr:(?:https?://)?([^/?#]+)|"
     r"(?:https?://)?(?:app\.)?(bunkr+"
-    r"\.(?:s[kiu]|[cf]i|ru|la|is|to|a[cx]"
+    r"\.(?:s[kiu]|[cf]i|pk|ru|la|is|to|a[cx]"
     r"|black|cat|media|red|site|ws|org)))"
 )
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 102945b..aedcea4 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -42,10 +42,6 @@ BASE_PATTERN = CheveretoExtractor.update({
         "root": "https://img.kiwi",
         "pattern": r"img\.kiwi",
     },
-    "deltaporno": {
-        "root": "https://gallery.deltaporno.com",
-        "pattern": r"gallery\.deltaporno\.com",
-    },
 })
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index 3e657d6..725af3a 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -22,17 +22,17 @@ class CivitaiExtractor(Extractor):
     category = "civitai"
     root = "https://civitai.com"
     directory_fmt = ("{category}", "{username|user[username]}", "images")
-    filename_fmt = "{id}.{extension}"
-    archive_fmt = "{hash}"
+    filename_fmt = "{file[id]|id|filename}.{extension}"
+    archive_fmt = "{file[hash]|hash}"
     request_interval = (0.5, 1.5)
 
     def _init(self):
-        if self.config("api") == "trpc":
-            self.log.debug("Using tRPC API")
-            self.api = CivitaiTrpcAPI(self)
-        else:
+        if self.config("api") == "rest":
             self.log.debug("Using REST API")
             self.api = CivitaiRestAPI(self)
+        else:
+            self.log.debug("Using tRPC API")
+            self.api = CivitaiTrpcAPI(self)
 
         quality = self.config("quality")
         if quality:
@@ -53,6 +53,30 @@ class CivitaiExtractor(Extractor):
                 yield Message.Queue, url, data
             return
 
+        posts = self.posts()
+        if posts:
+            for post in posts:
+
+                if "images" in post:
+                    images = post["images"]
+                else:
+                    images = self.api.images_post(post["id"])
+
+                post = self.api.post(post["id"])
+                post["date"] = text.parse_datetime(
+                    post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+                data = {
+                    "post": post,
+                    "user": post["user"],
+                }
+                del post["user"]
+
+                yield Message.Directory, data
+                for file in self._image_results(images):
+                    file.update(data)
+                    yield Message.Url, file["url"], file
+            return
+
         images = self.images()
         if images:
             for image in images:
@@ -68,6 +92,9 @@ class CivitaiExtractor(Extractor):
     def models(self):
         return ()
 
+    def posts(self):
+        return ()
+
     def images(self):
         return ()
 
@@ -87,13 +114,26 @@ class CivitaiExtractor(Extractor):
                 url, self._image_quality, name)
         )
 
+    def _image_results(self, images):
+        for num, file in enumerate(images, 1):
+            data = text.nameext_from_url(file["url"], {
+                "num" : num,
+                "file": file,
+                "url" : self._url(file),
+            })
+            if not data["extension"]:
+                data["extension"] = self._image_ext
+            if "id" not in file and data["filename"].isdecimal():
+                file["id"] = text.parse_int(data["filename"])
+            yield data
+
 
 class CivitaiModelExtractor(CivitaiExtractor):
     subcategory = "model"
     directory_fmt = ("{category}", "{user[username]}",
                      "{model[id]}{model[name]:? //}",
                      "{version[id]}{version[name]:? //}")
-    filename_fmt = "{filename}.{extension}"
+    filename_fmt = "{file[id]}.{extension}"
     archive_fmt = "{file[hash]}"
     pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?"
example = "https://civitai.com/models/12345/TITLE" @@ -183,23 +223,11 @@ class CivitaiModelExtractor(CivitaiExtractor): } images = self.api.images(params, defaults=False) - return [ - text.nameext_from_url(file["url"], { - "num" : num, - "file": file, - "url" : self._url(file), - }) - for num, file in enumerate(images, 1) - ] + return self._image_results(images) def _extract_files_gallery(self, model, version, user): images = self.api.images_gallery(model, version, user) - for num, file in enumerate(images, 1): - yield text.nameext_from_url(file["url"], { - "num" : num, - "file": file, - "url" : self._url(file), - }) + return self._image_results(images) def _validate_file_model(self, response): if response.headers.get("Content-Type", "").startswith("text/html"): @@ -224,6 +252,17 @@ class CivitaiImageExtractor(CivitaiExtractor): return self.api.image(self.groups[0]) +class CivitaiPostExtractor(CivitaiExtractor): + subcategory = "post" + directory_fmt = ("{category}", "{username|user[username]}", "posts", + "{post[id]}{post[title]:? //}") + pattern = BASE_PATTERN + r"/posts/(\d+)" + example = "https://civitai.com/posts/12345" + + def posts(self): + return ({"id": int(self.groups[0])},) + + class CivitaiTagModelsExtractor(CivitaiExtractor): subcategory = "tag-models" pattern = BASE_PATTERN + r"/(?:tag/|models\?tag=)([^/?&#]+)" @@ -266,8 +305,9 @@ class CivitaiUserExtractor(CivitaiExtractor): base = "{}/user/{}/".format(self.root, self.groups[0]) return self._dispatch_extractors(( (CivitaiUserModelsExtractor, base + "models"), + (CivitaiUserPostsExtractor , base + "posts"), (CivitaiUserImagesExtractor, base + "images"), - ), ("user-models", "user-images")) + ), ("user-models", "user-posts")) class CivitaiUserModelsExtractor(CivitaiExtractor): @@ -281,6 +321,19 @@ class CivitaiUserModelsExtractor(CivitaiExtractor): return self.api.models(params) +class CivitaiUserPostsExtractor(CivitaiExtractor): + subcategory = "user-posts" + directory_fmt = ("{category}", "{username|user[username]}", "posts", + "{post[id]}{post[title]:? //}") + pattern = USER_PATTERN + r"/posts/?(?:\?([^#]+))?" + example = "https://civitai.com/user/USER/posts" + + def posts(self): + params = text.parse_query(self.groups[1]) + params["username"] = text.unquote(self.groups[0]) + return self.api.posts(params) + + class CivitaiUserImagesExtractor(CivitaiExtractor): subcategory = "user-images" pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?" 
@@ -373,7 +426,7 @@ class CivitaiTrpcAPI(): self.root = extractor.root + "/api/trpc/" self.headers = { "content-type" : "application/json", - "x-client-version": "5.0.94", + "x-client-version": "5.0.146", "x-client-date" : "", "x-client" : "web", "x-fingerprint" : "undefined", @@ -399,7 +452,7 @@ class CivitaiTrpcAPI(): endpoint = "image.getInfinite" if defaults: - params_ = { + params = self._merge_params(params, { "useIndex" : True, "period" : "AllTime", "sort" : "Newest", @@ -408,12 +461,9 @@ class CivitaiTrpcAPI(): "fromPlatform" : False, # Made On-Site "browsingLevel": self.nsfw, "include" : ["cosmetics"], - } - params_.update(params) - else: - params_ = params + }) - return self._pagination(endpoint, params_) + return self._pagination(endpoint, params) def images_gallery(self, model, version, user): endpoint = "image.getImagesAsPostsInfinite" @@ -430,6 +480,13 @@ class CivitaiTrpcAPI(): for post in self._pagination(endpoint, params): yield from post["images"] + def images_post(self, post_id): + params = { + "postId" : int(post_id), + "pending": True, + } + return self.images(params) + def model(self, model_id): endpoint = "model.getById" params = {"id": int(model_id)} @@ -444,7 +501,7 @@ class CivitaiTrpcAPI(): endpoint = "model.getAll" if defaults: - params_ = { + params = self._merge_params(params, { "period" : "AllTime", "periodMode" : "published", "sort" : "Newest", @@ -455,36 +512,71 @@ class CivitaiTrpcAPI(): "fromPlatform" : False, "supportsGeneration": False, "browsingLevel": self.nsfw, - } - params_.update(params) - else: - params_ = params + }) + + return self._pagination(endpoint, params) + + def post(self, post_id): + endpoint = "post.get" + params = {"id": int(post_id)} + return self._call(endpoint, params) - return self._pagination(endpoint, params_) + def posts(self, params, defaults=True): + endpoint = "post.getInfinite" + meta = {"cursor": ("Date",)} + + if defaults: + params = self._merge_params(params, { + "browsingLevel": self.nsfw, + "period" : "AllTime", + "periodMode" : "published", + "sort" : "Newest", + "followed" : False, + "draftOnly" : False, + "pending" : True, + "include" : ["cosmetics"], + }) + + return self._pagination(endpoint, params, meta) def user(self, username): endpoint = "user.getCreator" params = {"username": username} return (self._call(endpoint, params),) - def _call(self, endpoint, params): + def _call(self, endpoint, params, meta=None): url = self.root + endpoint headers = self.headers - params = {"input": util.json_dumps({"json": params})} + if meta: + input = {"json": params, "meta": {"values": meta}} + else: + input = {"json": params} + + params = {"input": util.json_dumps(input)} headers["x-client-date"] = str(int(time.time() * 1000)) - response = self.extractor.request(url, headers=headers, params=params) + response = self.extractor.request(url, params=params, headers=headers) return response.json()["result"]["data"]["json"] - def _pagination(self, endpoint, params): + def _pagination(self, endpoint, params, meta=None): + if "cursor" not in params: + params["cursor"] = None + meta_ = {"cursor": ("undefined",)} + while True: - data = self._call(endpoint, params) + data = self._call(endpoint, params, meta_) yield from data["items"] try: if not data["nextCursor"]: return - params["cursor"] = data["nextCursor"] except KeyError: return + + params["cursor"] = data["nextCursor"] + meta_ = meta + + def _merge_params(self, params_user, params_default): + params_default.update(params_user) + return params_default diff --git 
a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py index e1f6040..4722a4f 100644 --- a/gallery_dl/extractor/cohost.py +++ b/gallery_dl/extractor/cohost.py @@ -19,8 +19,7 @@ class CohostExtractor(Extractor): category = "cohost" root = "https://cohost.org" directory_fmt = ("{category}", "{postingProject[handle]}") - filename_fmt = ("{postId}_{headline|plainTextBody:?/_/[:100]}" - "{num}.{extension}") + filename_fmt = ("{postId}_{headline:?/_/[b:200]}{num}.{extension}") archive_fmt = "{postId}_{num}" def _init(self): diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 09beb5f..1746647 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -93,7 +93,9 @@ class DanbooruExtractor(BaseExtractor): if post["extension"] == "zip": if self.ugoira: - post["frames"] = self._ugoira_frames(post) + post["_ugoira_original"] = False + post["_ugoira_frame_data"] = post["frames"] = \ + self._ugoira_frames(post) post["_http_adjust_extension"] = False else: url = post["large_file_url"] diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 3686e1b..836fae7 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -177,24 +177,7 @@ class DeviantartExtractor(Extractor): yield self.commit(deviation, deviation["flash"]) if self.commit_journal: - if "excerpt" in deviation: - # journal = self.api.deviation_content( - # deviation["deviationid"]) - if not self.eclipse_api: - self.eclipse_api = DeviantartEclipseAPI(self) - content = self.eclipse_api.deviation_extended_fetch( - deviation["index"], - deviation["author"]["username"], - "journal", - )["deviation"]["textContent"] - html = content["html"]["markup"] - if html.startswith("{"): - html = content["excerpt"].replace("\n", "<br />") - journal = {"html": html} - elif "body" in deviation: - journal = {"html": deviation.pop("body")} - else: - journal = None + journal = self._extract_journal(deviation) if journal: if self.extra: deviation["_journal"] = journal["html"] @@ -375,6 +358,204 @@ class DeviantartExtractor(Extractor): deviation["extension"] = "txt" return Message.Url, txt, deviation + def _extract_journal(self, deviation): + if "excerpt" in deviation: + # # empty 'html' + # return self.api.deviation_content(deviation["deviationid"]) + + if "_page" in deviation: + page = deviation["_page"] + del deviation["_page"] + else: + page = self._limited_request(deviation["url"]).text + + # extract journal html from webpage + html = text.extr( + page, + "<h2>Literature Text</h2></span><div>", + "</div></section></div></div>") + if html: + return {"html": html} + + self.log.debug("%s: Failed to extract journal HTML from webpage. 
" + "Falling back to __INITIAL_STATE__ markup.", + deviation["index"]) + + # parse __INITIAL_STATE__ as fallback + state = util.json_loads(text.extr( + page, 'window.__INITIAL_STATE__ = JSON.parse("', '");') + .replace("\\\\", "\\").replace("\\'", "'").replace('\\"', '"')) + deviations = state["@@entities"]["deviation"] + content = deviations.popitem()[1]["textContent"] + + html = self._textcontent_to_html(deviation, content) + if html: + return {"html": html} + return {"html": content["excerpt"].replace("\n", "<br />")} + + if "body" in deviation: + return {"html": deviation.pop("body")} + return None + + def _textcontent_to_html(self, deviation, content): + html = content["html"] + markup = html["markup"] + + if not markup.startswith("{"): + return markup + + if html["type"] == "tiptap": + try: + return self._tiptap_to_html(markup) + except Exception as exc: + self.log.debug("", exc_info=exc) + self.log.error("%s: '%s: %s'", deviation["index"], + exc.__class__.__name__, exc) + + self.log.warning("%s: Unsupported '%s' markup.", + deviation["index"], html["type"]) + + def _tiptap_to_html(self, markup): + html = [] + + html.append('<div data-editor-viewer="1" ' + 'class="_83r8m _2CKTq _3NjDa mDnFl">') + data = util.json_loads(markup) + for block in data["document"]["content"]: + self._tiptap_process_content(html, block) + html.append("</div>") + + return "".join(html) + + def _tiptap_process_content(self, html, content): + type = content["type"] + + if type == "paragraph": + children = content.get("content") + if children: + html.append('<p style="') + + attrs = content["attrs"] + if "textAlign" in attrs: + html.append("text-align:") + html.append(attrs["textAlign"]) + html.append(";") + html.append('margin-inline-start:0px">') + + for block in children: + self._tiptap_process_content(html, block) + html.append("</p>") + else: + html.append('<p class="empty-p"><br/></p>') + + elif type == "text": + self._tiptap_process_text(html, content) + + elif type == "hardBreak": + html.append("<br/><br/>") + + elif type == "horizontalRule": + html.append("<hr/>") + + elif type == "da-deviation": + self._tiptap_process_deviation(html, content) + + elif type == "da-mention": + user = content["attrs"]["user"]["username"] + html.append('<a href="https://www.deviantart.com/') + html.append(user.lower()) + html.append('" data-da-type="da-mention" data-user="">@<!-- -->') + html.append(user) + html.append('</a>') + + else: + self.log.warning("Unsupported content type '%s'", type) + + def _tiptap_process_text(self, html, content): + marks = content.get("marks") + if marks: + close = [] + for mark in marks: + type = mark["type"] + if type == "link": + html.append('<a href="') + html.append(text.escape(mark["attrs"]["href"])) + html.append('" rel="noopener noreferrer nofollow ugc">') + close.append("</a>") + elif type == "bold": + html.append("<strong>") + close.append("</strong>") + elif type == "italic": + html.append("<em>") + close.append("</em>") + elif type == "underline": + html.append("<u>") + close.append("</u>") + elif type == "textStyle" and len(mark) <= 1: + pass + else: + self.log.warning("Unsupported text marker '%s'", type) + close.reverse() + html.append(text.escape(content["text"])) + html.extend(close) + else: + html.append(text.escape(content["text"])) + + def _tiptap_process_deviation(self, html, content): + dev = content["attrs"]["deviation"] + media = dev.get("media") or () + + html.append('<div class="jjNX2">') + html.append('<figure class="Qf-HY" data-da-type="da-deviation" ' + 
'data-deviation="" ' + 'data-width="" data-link="" data-alignment="center">') + + if "baseUri" in media: + url, formats = self._eclipse_media(media) + full = formats["fullview"] + + html.append('<a href="') + html.append(text.escape(dev["url"])) + html.append('" class="_3ouD5" style="margin:0 auto;display:flex;' + 'align-items:center;justify-content:center;' + 'overflow:hidden;width:780px;height:') + html.append(str(780 * full["h"] / full["w"])) + html.append('px">') + + html.append('<img src="') + html.append(text.escape(url)) + html.append('" alt="') + html.append(text.escape(dev["title"])) + html.append('" style="width:100%;max-width:100%;display:block"/>') + html.append("</a>") + + elif "textContent" in dev: + html.append('<div class="_32Hs4" style="width:350px">') + + html.append('<a href="') + html.append(text.escape(dev["url"])) + html.append('" class="_3ouD5">') + + html.append('''\ +<section class="Q91qI aG7Yi" style="width:350px;height:313px">\ +<div class="_16ECM _1xMkk" aria-hidden="true">\ +<svg height="100%" viewBox="0 0 15 12" preserveAspectRatio="xMidYMin slice" \ +fill-rule="evenodd">\ +<linearGradient x1="87.8481761%" y1="16.3690766%" \ +x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ +<stop stop-color="#00FF62" offset="0%"></stop>\ +<stop stop-color="#3197EF" stop-opacity="0" offset="100%"></stop>\ +</linearGradient>\ +<text class="_2uqbc" fill="url(#app-root-3)" text-anchor="end" x="15" y="11">J\ +</text></svg></div><div class="_1xz9u">Literature</div><h3 class="_2WvKD">\ +''') + html.append(text.escape(dev["title"])) + html.append('</h3><div class="_2CPLm">') + html.append(text.escape(dev["textContent"]["excerpt"])) + html.append('</div></section></a></div>') + + html.append('</figure></div>') + def _extract_content(self, deviation): content = deviation["content"] @@ -552,6 +733,23 @@ class DeviantartExtractor(Extractor): self.log.info("Unwatching %s", username) self.api.user_friends_unwatch(username) + def _eclipse_media(self, media, format="preview"): + url = [media["baseUri"], ] + + formats = { + fmt["t"]: fmt + for fmt in media["types"] + } + + tokens = media["token"] + if len(tokens) == 1: + fmt = formats[format] + url.append(fmt["c"].replace("<prettyName>", media["prettyName"])) + url.append("?token=") + url.append(tokens[-1]) + + return "".join(url), formats + def _eclipse_to_oauth(self, eclipse_api, deviations): for obj in deviations: deviation = obj["deviation"] if "deviation" in obj else obj @@ -709,43 +907,35 @@ class DeviantartStashExtractor(DeviantartExtractor): archive_fmt = "{index}.{extension}" pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)" r"/([a-z0-9]+)") - example = "https://sta.sh/abcde" + example = "https://www.deviantart.com/stash/abcde" skip = Extractor.skip def __init__(self, match): DeviantartExtractor.__init__(self, match) self.user = None - self.stash_id = match.group(1) def deviations(self, stash_id=None): if stash_id is None: - stash_id = self.stash_id - url = "https://sta.sh/" + stash_id + stash_id = self.groups[0] + url = "https://www.deviantart.com/stash/" + stash_id page = self._limited_request(url).text if stash_id[0] == "0": uuid = text.extr(page, '//deviation/', '"') if uuid: deviation = self.api.deviation(uuid) + deviation["_page"] = page deviation["index"] = text.parse_int(text.extr( page, '\\"deviationId\\":', ',')) yield deviation return - for item in text.extract_iter( - page, 'class="stash-thumb-container', '</div>'): - url = text.extr(item, '<a href="', '"') - - if url: - stash_id = 
url.rpartition("/")[2] - else: - stash_id = text.extr(item, 'gmi-stashid="', '"') - stash_id = "2" + util.bencode(text.parse_int( - stash_id), "0123456789abcdefghijklmnopqrstuvwxyz") - - if len(stash_id) > 2: - yield from self.deviations(stash_id) + for sid in text.extract_iter( + page, 'href="https://www.deviantart.com/stash/', '"'): + if sid == stash_id or sid.endswith("#comments"): + continue + yield from self.deviations(sid) class DeviantartFavoriteExtractor(DeviantartExtractor): @@ -939,11 +1129,14 @@ class DeviantartDeviationExtractor(DeviantartExtractor): else: url = "{}/view/{}/".format(self.root, self.deviation_id) - uuid = text.extr(self._limited_request(url).text, - '"deviationUuid\\":\\"', '\\') + page = self._limited_request(url, notfound="deviation").text + uuid = text.extr(page, '"deviationUuid\\":\\"', '\\') if not uuid: raise exception.NotFoundError("deviation") - return (self.api.deviation(uuid),) + + deviation = self.api.deviation(uuid) + deviation["_page"] = page + return (deviation,) class DeviantartScrapsExtractor(DeviantartExtractor): @@ -1816,25 +2009,28 @@ JOURNAL_TEMPLATE_HTML = """text:<!DOCTYPE html> <head> <meta charset="utf-8"> <title>{title}</title> - <link rel="stylesheet" href="https://st.deviantart.net/\ -css/deviantart-network_lc.css?3843780832"> - <link rel="stylesheet" href="https://st.deviantart.net/\ -css/group_secrets_lc.css?3250492874"> - <link rel="stylesheet" href="https://st.deviantart.net/\ -css/v6core_lc.css?4246581581"> - <link rel="stylesheet" href="https://st.deviantart.net/\ -css/sidebar_lc.css?1490570941"> - <link rel="stylesheet" href="https://st.deviantart.net/\ -css/writer_lc.css?3090682151"> - <link rel="stylesheet" href="https://st.deviantart.net/\ -css/v6loggedin_lc.css?3001430805"> + <link rel="stylesheet" href="https://st.deviantart.net\ +/css/deviantart-network_lc.css?3843780832"/> + <link rel="stylesheet" href="https://st.deviantart.net\ +/css/group_secrets_lc.css?3250492874"/> + <link rel="stylesheet" href="https://st.deviantart.net\ +/css/v6core_lc.css?4246581581"/> + <link rel="stylesheet" href="https://st.deviantart.net\ +/css/sidebar_lc.css?1490570941"/> + <link rel="stylesheet" href="https://st.deviantart.net\ +/css/writer_lc.css?3090682151"/> + <link rel="stylesheet" href="https://st.deviantart.net\ +/css/v6loggedin_lc.css?3001430805"/> <style>{css}</style> - <link rel="stylesheet" href="https://st.deviantart.net/\ -roses/cssmin/core.css?1488405371919" > - <link rel="stylesheet" href="https://st.deviantart.net/\ -roses/cssmin/peeky.css?1487067424177" > - <link rel="stylesheet" href="https://st.deviantart.net/\ -roses/cssmin/desktop.css?1491362542749" > + <link rel="stylesheet" href="https://st.deviantart.net\ +/roses/cssmin/core.css?1488405371919"/> + <link rel="stylesheet" href="https://st.deviantart.net\ +/roses/cssmin/peeky.css?1487067424177"/> + <link rel="stylesheet" href="https://st.deviantart.net\ +/roses/cssmin/desktop.css?1491362542749"/> + <link rel="stylesheet" href="https://static.parastorage.com/services\ +/da-deviation/2bfd1ff7a9d6bf10d27b98dd8504c0399c3f9974a015785114b7dc6b\ +/app.min.css"/> </head> <body id="deviantART-v7" class="bubble no-apps loggedout w960 deviantart"> <div id="output"> diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index d8337b6..9bbfb43 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -29,7 +29,10 @@ class FanboxExtractor(Extractor): _warning = True def _init(self): - self.headers = {"Origin": self.root} + self.headers = 
{ + "Accept": "application/json, text/plain, */*", + "Origin": self.root, + } self.embeds = self.config("embeds", True) includes = self.config("metadata") @@ -40,8 +43,12 @@ class FanboxExtractor(Extractor): includes = ("user", "plan") self._meta_user = ("user" in includes) self._meta_plan = ("plan" in includes) + self._meta_comments = ("comments" in includes) else: - self._meta_user = self._meta_plan = False + self._meta_user = self._meta_plan = self._meta_comments = False + + if self.config("comments"): + self._meta_comments = True if self._warning: if not self.cookies_check(("FANBOXSESSID",)): @@ -124,6 +131,11 @@ class FanboxExtractor(Extractor): plan = plans[0].copy() plan["fee"] = fee post["plan"] = plans[fee] = plan + if self._meta_comments: + if post["commentCount"]: + post["comments"] = list(self._get_comment_data(post_id)) + else: + post["commentd"] = () return content_body, post @@ -160,6 +172,18 @@ class FanboxExtractor(Extractor): return plans + def _get_comment_data(self, post_id): + url = ("https://api.fanbox.cc/post.listComments" + "?limit=10&postId=" + post_id) + + comments = [] + while url: + url = text.ensure_http_scheme(url) + body = self.request(url, headers=self.headers).json()["body"] + comments.extend(body["items"]) + url = body["nextUrl"] + return comments + def _get_urls_from_post(self, content_body, post): num = 0 cover_image = post.get("coverImageUrl") diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index 6aefa11..df252ee 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -22,7 +22,7 @@ class FlickrExtractor(Extractor): archive_fmt = "{id}" cookies_domain = None request_interval = (1.0, 2.0) - request_interval_min = 0.2 + request_interval_min = 0.5 def __init__(self, match): Extractor.__init__(self, match) @@ -37,7 +37,6 @@ class FlickrExtractor(Extractor): extract = self.api._extract_format for photo in self.photos(): try: - 1/0 photo = extract(photo) except Exception as exc: self.log.warning( @@ -236,8 +235,8 @@ class FlickrAPI(oauth.OAuth1API): """ API_URL = "https://api.flickr.com/services/rest/" - API_KEY = "f8f78d1a40debf471f0b22fa2d00525f" - API_SECRET = "4f9dae1113e45556" + API_KEY = "90c368449018a0cb880ea4889cbb8681" + API_SECRET = "e4b83e319c11e9e1" FORMATS = [ ("o" , "Original" , None), ("6k", "X-Large 6K" , 6144), diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index fbbd26c..0baad2f 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -97,6 +97,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): @staticmethod def _prepare(post): + post["tags"] = post["tags"].strip() post["date"] = text.parse_datetime( post["created_at"], "%a %b %d %H:%M:%S %z %Y") @@ -114,7 +115,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): pattern = re.compile( r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S) for tag_type, tag_name in pattern.findall(tag_container): - tags[tag_type].append(text.unquote(tag_name)) + tags[tag_type].append(text.unescape(text.unquote(tag_name))) for key, value in tags.items(): post["tags_" + key] = " ".join(value) @@ -178,7 +179,7 @@ class GelbooruV02Extractor(booru.BooruExtractor): pattern = re.compile( r'<a class="(?:tag-type-)?([^"]+).*?;tags=([^"&]+)') for tag_type, tag_name in pattern.findall(tag_container): - tags[tag_type].append(text.unquote(tag_name)) + tags[tag_type].append(text.unescape(text.unquote(tag_name))) for key, value in tags.items(): post["tags_" + key] = " ".join(value) 
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index a6c1d5a..370cd43 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -89,30 +89,33 @@ class GenericExtractor(Extractor): def metadata(self, page): """Extract generic webpage metadata, return them in a dict.""" - data = {} - data['path'] = self.path.replace("/", "") - data['pageurl'] = self.url - data['title'] = text.extr(page, '<title>', "</title>") - data['description'] = text.extr( - page, '<meta name="description" content="', '"') - data['keywords'] = text.extr( - page, '<meta name="keywords" content="', '"') - data['language'] = text.extr( - page, '<meta name="language" content="', '"') - data['name'] = text.extr( - page, '<meta itemprop="name" content="', '"') - data['copyright'] = text.extr( - page, '<meta name="copyright" content="', '"') - data['og_site'] = text.extr( - page, '<meta property="og:site" content="', '"') - data['og_site_name'] = text.extr( - page, '<meta property="og:site_name" content="', '"') - data['og_title'] = text.extr( - page, '<meta property="og:title" content="', '"') - data['og_description'] = text.extr( - page, '<meta property="og:description" content="', '"') - - data = {k: text.unescape(data[k]) for k in data if data[k] != ""} + data = { + "title" : text.extr( + page, "<title>", "</title>"), + "description" : text.extr( + page, '<meta name="description" content="', '"'), + "keywords" : text.extr( + page, '<meta name="keywords" content="', '"'), + "language" : text.extr( + page, '<meta name="language" content="', '"'), + "name" : text.extr( + page, '<meta itemprop="name" content="', '"'), + "copyright" : text.extr( + page, '<meta name="copyright" content="', '"'), + "og_site" : text.extr( + page, '<meta property="og:site" content="', '"'), + "og_site_name" : text.extr( + page, '<meta property="og:site_name" content="', '"'), + "og_title" : text.extr( + page, '<meta property="og:title" content="', '"'), + "og_description": text.extr( + page, '<meta property="og:description" content="', '"'), + + } + + data = {k: text.unescape(v) for k, v in data.items() if v} + data["path"] = self.path.replace("/", "") + data["pageurl"] = self.url return data diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 345f51d..28590fc 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -19,7 +19,8 @@ class ImagefapExtractor(Extractor): category = "imagefap" root = "https://www.imagefap.com" directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{num:04}_{filename}.{extension}" + filename_fmt = ("{category}_{gallery_id}_{num:?/_/>04}" + "{filename}.{extension}") archive_fmt = "{gallery_id}_{image_id}" request_interval = (2.0, 4.0) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 422c865..dd1272f 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -14,7 +14,6 @@ from .. 
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 422c865..dd1272f 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -14,7 +14,6 @@ from .. import text, util, exception
 from ..cache import cache, memcache
 import itertools
 import binascii
-import json
 import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?instagram\.com"
@@ -913,7 +912,7 @@ class InstagramGraphqlAPI():
         self.user_collection = self.user_saved = self.reels_media = \
             self.highlights_media = self.guide = self.guide_media = \
             self._unsupported
-        self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
+        self._json_dumps = util.json_dumps
 
         api = InstagramRestAPI(extractor)
         self.user_by_name = api.user_by_name
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 7a19be5..e39e272 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -6,19 +6,19 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://komikcast.lol/"""
+"""Extractors for https://komikcast.cz/"""
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
 import re
 
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:cz|lol|site|mo?e|com)"
 
 
 class KomikcastBase():
     """Base class for komikcast extractors"""
     category = "komikcast"
-    root = "https://komikcast.lol"
+    root = "https://komikcast.cz"
 
     @staticmethod
     def parse_chapter_string(chapter_string, data=None):
@@ -46,9 +46,9 @@ class KomikcastBase():
 
 
 class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
-    """Extractor for manga-chapters from komikcast.lol"""
+    """Extractor for komikcast manga chapters"""
     pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
-    example = "https://komikcast.lol/chapter/TITLE/"
+    example = "https://komikcast.cz/chapter/TITLE/"
 
     def metadata(self, page):
         info = text.extr(page, "<title>", " - Komikcast<")
@@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
 
 
 class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
-    """Extractor for manga from komikcast.lol"""
+    """Extractor for komikcast manga"""
     chapterclass = KomikcastChapterExtractor
     pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
-    example = "https://komikcast.lol/komik/TITLE"
+    example = "https://komikcast.cz/komik/TITLE"
 
     def chapters(self, page):
         results = []
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 117b88b..6fc0689 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -20,10 +20,6 @@ class LolisafeExtractor(BaseExtractor):
 
 
 BASE_PATTERN = LolisafeExtractor.update({
-    "xbunkr": {
-        "root": "https://xbunkr.com",
-        "pattern": r"xbunkr\.com",
-    },
 })
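The widened komikcast `BASE_PATTERN` above now accepts the `.cz` domain and, via `mo?e`, both the old `.me` and the `.moe` TLDs. A quick sanity check with plain `re` (hosts chosen for illustration):

    import re

    BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:cz|lol|site|mo?e|com)"

    for host in ("komikcast.cz", "komikcast.me", "komikcast.moe",
                 "komikcast.lol", "komikcast.net"):
        print(host, bool(re.match(BASE_PATTERN, "https://" + host)))
    # True for every host except komikcast.net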
diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py
index 5e8179e..f09507c 100644
--- a/gallery_dl/extractor/myhentaigallery.py
+++ b/gallery_dl/extractor/myhentaigallery.py
@@ -33,7 +33,7 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
         title = extr('<div class="comic-description">\n', '</h1>').lstrip()
         if title.startswith("<h1>"):
-            title = title[len("<h1>"):]
+            title = title[4:]
         if not title:
             raise exception.NotFoundError("gallery")
 
@@ -41,10 +41,10 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor):
         return {
             "title"     : text.unescape(title),
             "gallery_id": text.parse_int(self.gallery_id),
-            "tags"      : split(extr('<div>\nCategories:', '</div>')),
-            "artist"    : split(extr('<div>\nArtists:'   , '</div>')),
-            "group"     : split(extr('<div>\nGroups:'    , '</div>')),
-            "parodies"  : split(extr('<div>\nParodies:'  , '</div>')),
+            "tags"      : split(extr(" Categories:", "</div>")),
+            "artist"    : split(extr(" Artists:"   , "</div>")),
+            "group"     : split(extr(" Groups:"    , "</div>")),
+            "parodies"  : split(extr(" Parodies:"  , "</div>")),
         }
 
     def images(self, page):
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index dfa1f6e..2928573 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -32,6 +32,8 @@ class NewgroundsExtractor(Extractor):
             self.user_root = "https://{}.newgrounds.com".format(self.user)
 
     def _init(self):
+        self._extract_comment_urls = re.compile(
+            r'(?:<img |data-smartload-)src="([^"]+)').findall
         self.flash = self.config("flash", True)
 
         fmt = self.config("format")
@@ -78,8 +80,7 @@ class NewgroundsExtractor(Extractor):
                 if "_fallback" in post:
                     del post["_fallback"]
 
-                for url in text.extract_iter(
-                        post["_comment"], 'data-smartload-src="', '"'):
+                for url in self._extract_comment_urls(post["_comment"]):
                     post["num"] += 1
                     post["_index"] = "{}_{:>02}".format(
                         post["index"], post["num"])
@@ -243,9 +244,12 @@ class NewgroundsExtractor(Extractor):
             url = text.ensure_http_scheme(url)
             url = url.replace("/medium_views/", "/images/", 1)
             if text.ext_from_url(url) == "webp":
+                fallback = [url.replace(".webp", "." + e)
+                            for e in ("jpg", "png", "gif") if e != ext]
+                fallback.append(url)
                 yield {
                     "image"    : url.replace(".webp", "." + ext),
-                    "_fallback": (url,),
+                    "_fallback": fallback,
                 }
             else:
                 yield {"image": url}
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 9d025d5..e7540f8 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -83,7 +83,7 @@ class OAuthBase(Extractor):
             browser = None
 
         if browser and browser.open(url):
-            name = getattr(browser, "name", "Browser")
+            name = getattr(browser, "name", None) or "Browser"
            self.log.info("Opening URL in %s:", name.capitalize())
         else:
             self.log.info("Please open this URL in your browser:")
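The newgrounds.py change above no longer falls back only to the original `.webp` URL: it queues every other plausible extension first and keeps the `.webp` itself as a last resort. A small sketch of that candidate-list construction (function name and sample URL are made up):

    def image_candidates(url, ext):
        """Primary URL guess plus ordered fallbacks for a .webp source URL."""
        primary = url.replace(".webp", "." + ext)
        fallback = [url.replace(".webp", "." + e)
                    for e in ("jpg", "png", "gif") if e != ext]
        fallback.append(url)  # the .webp itself, tried last
        return primary, fallback

    primary, fallback = image_candidates(
        "https://art.ngfiles.com/images/0/demo.webp", "png")
    # primary  == ".../demo.png"
    # fallback == [".../demo.jpg", ".../demo.gif", ".../demo.webp"]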
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index eb6d677..d47ffa2 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -23,16 +23,17 @@ class PatreonExtractor(Extractor):
     directory_fmt = ("{category}", "{creator[full_name]}")
     filename_fmt = "{id}_{title}_{num:>02}.{extension}"
     archive_fmt = "{id}_{num}"
-    browser = "firefox"
-    tls12 = False
     _warning = True
 
-    def items(self):
+    def _init(self):
+        self.session.headers["User-Agent"] = \
+            "Patreon/72.2.28 (Android; Android 14; Scale/2.10)"
         if self._warning:
             if not self.cookies_check(("session_id",)):
                 self.log.warning("no 'session_id' cookie set")
             PatreonExtractor._warning = False
 
+    def items(self):
         generators = self._build_file_generators(self.config("files"))
 
         for post in self.posts():
@@ -99,6 +100,11 @@ class PatreonExtractor(Extractor):
             if url:
                 yield "attachment", url, attachment["name"]
 
+        for attachment in post.get("attachments_media") or ():
+            url = attachment.get("download_url")
+            if url:
+                yield "attachment", url, attachment["file_name"]
+
     def _content(self, post):
         content = post.get("content")
         if content:
@@ -137,8 +143,12 @@ class PatreonExtractor(Extractor):
             if attr.get("current_user_can_view", True):
                 relationships = post["relationships"]
-                attr["images"] = self._files(post, included, "images")
-                attr["attachments"] = self._files(post, included, "attachments")
+                attr["images"] = self._files(
+                    post, included, "images")
+                attr["attachments"] = self._files(
+                    post, included, "attachments")
+                attr["attachments_media"] = self._files(
+                    post, included, "attachments_media")
                 attr["date"] = text.parse_datetime(
                     attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
@@ -210,8 +220,8 @@ class PatreonExtractor(Extractor):
         return (
             "https://www.patreon.com/api/" + endpoint +
-            "?include=campaign,access_rules,attachments,audio,images,media,"
-            "native_video_insights,poll.choices,"
+            "?include=campaign,access_rules,attachments,attachments_media,"
+            "audio,images,media,native_video_insights,poll.choices,"
             "poll.current_user_responses.user,"
             "poll.current_user_responses.choice,"
             "poll.current_user_responses.poll,"
@@ -303,13 +313,11 @@ class PatreonCreatorExtractor(PatreonExtractor):
                r"([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?")
     example = "https://www.patreon.com/USER"
 
-    def __init__(self, match):
-        PatreonExtractor.__init__(self, match)
-        self.creator, self.query = match.groups()
-
     def posts(self):
-        query = text.parse_query(self.query)
-        campaign_id = self._get_campaign_id(query)
+        creator, query = self.groups
+
+        query = text.parse_query(query)
+        campaign_id = self._get_campaign_id(creator, query)
         filters = self._get_filters(query)
 
         self.log.debug("campaign_id: %s", campaign_id)
@@ -322,9 +330,9 @@ class PatreonCreatorExtractor(PatreonExtractor):
         ))
         return self._pagination(url)
 
-    def _get_campaign_id(self, query):
-        if self.creator.startswith("id:"):
-            return self.creator[3:]
+    def _get_campaign_id(self, creator, query):
+        if creator.startswith("id:"):
+            return creator[3:]
 
         campaign_id = query.get("c") or query.get("campaign_id")
         if campaign_id:
@@ -334,7 +342,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
         if user_id:
             url = "{}/user/posts?u={}".format(self.root, user_id)
         else:
-            url = "{}/{}/posts".format(self.root, self.creator)
+            url = "{}/{}/posts".format(self.root, creator)
         page = self.request(url, notfound="creator").text
 
         try:
@@ -377,14 +385,18 @@ class PatreonPostExtractor(PatreonExtractor):
     pattern = r"(?:https?://)?(?:www\.)?patreon\.com/posts/([^/?#]+)"
     example = "https://www.patreon.com/posts/TITLE-12345"
 
-    def __init__(self, match):
-        PatreonExtractor.__init__(self, match)
-        self.slug = match.group(1)
-
     def posts(self):
-        url = "{}/posts/{}".format(self.root, self.slug)
+        url = "{}/posts/{}".format(self.root, self.groups[0])
         page = self.request(url, notfound="post").text
-        post = self._extract_bootstrap(page)["post"]
+        bootstrap = self._extract_bootstrap(page)
+
+        try:
+            post = bootstrap["post"]
+        except KeyError:
+            self.log.debug(bootstrap)
+            if bootstrap.get("campaignDisciplinaryStatus") == "suspended":
+                self.log.warning("Account suspended")
+            return ()
 
         included = self._transform(post["included"])
         return (self._process(post["data"], included),)
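patreon.py above both asks the API for the new `attachments_media` relationship (note the extended `?include=` list) and yields a download for each entry that carries a `download_url`. A sketch of that consumption step, with a hand-built post dict standing in for real API data:

    post = {
        "attachments_media": [
            {"file_name": "bonus.zip",
             "download_url": "https://www.patreon.com/file?h=1&i=2"},
            {"file_name": "preview.psd"},  # no download_url -> skipped
        ],
    }

    for attachment in post.get("attachments_media") or ():
        url = attachment.get("download_url")
        if url:
            print("attachment", url, attachment["file_name"])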
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index c908e44..c2d1243 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -27,11 +27,17 @@ class PixivExtractor(Extractor):
     filename_fmt = "{id}_p{num}.{extension}"
     archive_fmt = "{id}{suffix}.{extension}"
     cookies_domain = None
+    sanity_url = "https://s.pximg.net/common/images/limit_sanity_level_360.png"
+    mypixiv_url = "https://s.pximg.net/common/images/limit_mypixiv_360.png"
 
     def _init(self):
         self.api = PixivAppAPI(self)
         self.load_ugoira = self.config("ugoira", True)
         self.max_posts = self.config("max-posts", 0)
+        self.sanity_workaround = self.config("sanity", True)
+        self.meta_user = self.config("metadata")
+        self.meta_bookmark = self.config("metadata-bookmark")
+        self.meta_comments = self.config("comments")
 
     def items(self):
         tags = self.config("tags", "japanese")
@@ -46,11 +52,7 @@ class PixivExtractor(Extractor):
 
         def transform_tags(work):
             work["tags"] = [tag["name"] for tag in work["tags"]]
 
-        url_sanity = ("https://s.pximg.net/common/images"
-                      "/limit_sanity_level_360.png")
         ratings = {0: "General", 1: "R-18", 2: "R-18G"}
-        meta_user = self.config("metadata")
-        meta_bookmark = self.config("metadata-bookmark")
         metadata = self.metadata()
 
         works = self.works()
@@ -60,18 +62,26 @@ class PixivExtractor(Extractor):
             if not work["user"]["id"]:
                 continue
 
-            meta_single_page = work["meta_single_page"]
-            meta_pages = work["meta_pages"]
-            del work["meta_single_page"]
-            del work["image_urls"]
-            del work["meta_pages"]
+            files = self._extract_files(work)
 
-            if meta_user:
+            if self.meta_user:
                 work.update(self.api.user_detail(work["user"]["id"]))
-            if meta_bookmark and work["is_bookmarked"]:
+            if self.meta_comments:
+                if work["total_comments"]:
+                    work["comments"] = list(
+                        self.api.illust_comments(work["id"]))
+                else:
+                    work["comments"] = ()
+            if self.meta_bookmark and work["is_bookmarked"]:
                 detail = self.api.illust_bookmark_detail(work["id"])
                 work["tags_bookmark"] = [tag["name"] for tag in detail["tags"]
                                          if tag["is_registered"]]
+            if self.sanity_workaround and not work.get("caption") and \
+                    not work.get("_mypixiv"):
+                body = self._request_ajax("/illust/" + str(work["id"]))
+                if body:
+                    work["caption"] = text.unescape(body["illustComment"])
+
             if transform_tags:
                 transform_tags(work)
             work["num"] = 0
@@ -81,69 +91,177 @@ class PixivExtractor(Extractor):
                 work.update(metadata)
 
             yield Message.Directory, work
+            for work["num"], file in enumerate(files):
+                url = file["url"]
+                work.update(file)
+                work["date_url"] = self._date_from_url(url)
+                yield Message.Url, url, text.nameext_from_url(url, work)
 
-            if work["type"] == "ugoira":
-                if not self.load_ugoira:
-                    continue
+    def _extract_files(self, work):
+        meta_single_page = work["meta_single_page"]
+        meta_pages = work["meta_pages"]
+        del work["meta_single_page"]
+        del work["image_urls"]
+        del work["meta_pages"]
 
+        if work["type"] == "ugoira":
+            if self.load_ugoira:
                 try:
-                    ugoira = self.api.ugoira_metadata(work["id"])
+                    return self._extract_ugoira(work)
                 except exception.StopExtraction as exc:
                     self.log.warning(
                         "Unable to retrieve Ugoira metadata (%s - %s)",
-                        work.get("id"), exc.message)
-                    continue
-
-                url = ugoira["zip_urls"]["medium"]
-                work["frames"] = frames = ugoira["frames"]
-                work["date_url"] = self._date_from_url(url)
-                work["_http_adjust_extension"] = False
-
-                if self.load_ugoira == "original":
-                    base, sep, _ = url.rpartition("_ugoira")
-                    base = base.replace(
-                        "/img-zip-ugoira/", "/img-original/", 1) + sep
-
-                    for ext in ("jpg", "png", "gif"):
-                        try:
-                            url = ("{}0.{}".format(base, ext))
-                            self.request(url, method="HEAD")
-                            break
-                        except exception.HttpError:
-                            pass
-                    else:
-                        self.log.warning(
-                            "Unable to find Ugoira frame URLs (%s)",
-                            work.get("id"))
-                        continue
-
-                    for num, frame in enumerate(frames):
-                        url = ("{}{}.{}".format(base, num, ext))
-                        work["num"] = work["_ugoira_frame_index"] = num
-                        work["suffix"] = "_p{:02}".format(num)
-                        text.nameext_from_url(url, work)
-                        yield Message.Url, url, work
-
+                        work["id"], exc.message)
+
+        elif work["page_count"] == 1:
+            url = meta_single_page["original_image_url"]
+            if url == self.sanity_url:
+                if self.sanity_workaround:
+                    self.log.warning("%s: 'sanity_level' warning", work["id"])
+                    body = self._request_ajax("/illust/" + str(work["id"]))
+                    return self._extract_ajax(work, body)
                 else:
-                    url = url.replace("_ugoira600x600", "_ugoira1920x1080")
-                    yield Message.Url, url, text.nameext_from_url(url, work)
-
-            elif work["page_count"] == 1:
-                url = meta_single_page["original_image_url"]
-                if url == url_sanity:
                     self.log.warning(
-                        "Unable to download work %s ('sanity_level' warning)",
+                        "%s: Unable to download work ('sanity_level' warning)",
                         work["id"])
-                    continue
-                work["date_url"] = self._date_from_url(url)
-                yield Message.Url, url, text.nameext_from_url(url, work)
+            elif url == self.mypixiv_url:
+                work["_mypixiv"] = True
+                self.log.warning("%s: 'My pixiv' locked", work["id"])
+                return ()
+            else:
+                return ({"url": url},)
+        else:
+            return [
+                {
+                    "url"   : img["image_urls"]["original"],
+                    "suffix": "_p{:02}".format(num),
+                }
+                for num, img in enumerate(meta_pages)
+            ]
+
+        return ()
+
+    def _extract_ugoira(self, work):
+        ugoira = self.api.ugoira_metadata(work["id"])
+        url = ugoira["zip_urls"]["medium"]
+        work["_ugoira_frame_data"] = work["frames"] = frames = ugoira["frames"]
+        work["date_url"] = self._date_from_url(url)
+        work["_http_adjust_extension"] = False
+
+        if self.load_ugoira == "original":
+            work["_ugoira_original"] = True
+            base, sep, _ = url.rpartition("_ugoira")
+            base = base.replace("/img-zip-ugoira/", "/img-original/", 1) + sep
+
+            for ext in ("jpg", "png", "gif"):
+                try:
+                    url = "{}0.{}".format(base, ext)
+                    self.request(url, method="HEAD")
+                    break
+                except exception.HttpError:
+                    pass
             else:
-                for work["num"], img in enumerate(meta_pages):
-                    url = img["image_urls"]["original"]
-                    work["date_url"] = self._date_from_url(url)
-                    work["suffix"] = "_p{:02}".format(work["num"])
-                    yield Message.Url, url, text.nameext_from_url(url, work)
+                self.log.warning(
+                    "Unable to find Ugoira frame URLs (%s)", work["id"])
+
+            return [
+                {
+                    "url": "{}{}.{}".format(base, num, ext),
+                    "suffix": "_p{:02}".format(num),
+                    "_ugoira_frame_index": num,
+                }
+                for num in range(len(frames))
+            ]
+        else:
+            work["_ugoira_original"] = False
+            url = url.replace("_ugoira600x600", "_ugoira1920x1080", 1)
+            return ({"url": url},)
+
+    def _request_ajax(self, endpoint):
+        url = "{}/ajax{}".format(self.root, endpoint)
+        try:
+            return self.request(url, headers=self.headers_web).json()["body"]
+        except Exception:
+            return None
+
+    def _extract_ajax(self, work, body):
+        url = self._extract_ajax_url(body)
+        if not url:
+            return ()
+
+        for key_app, key_ajax in (
+                ("title"            , "illustTitle"),
+                ("image_urls"       , "urls"),
+                ("create_date"      , "createDate"),
+                ("width"            , "width"),
+                ("height"           , "height"),
+                ("sanity_level"     , "sl"),
+                ("total_view"       , "viewCount"),
+                ("total_comments"   , "commentCount"),
+                ("total_bookmarks"  , "bookmarkCount"),
+                ("restrict"         , "restrict"),
+                ("x_restrict"       , "xRestrict"),
+                ("illust_ai_type"   , "aiType"),
+                ("illust_book_style", "bookStyle"),
+        ):
+            work[key_app] = body[key_ajax]
+
+        work["user"] = {
+            "account"    : body["userAccount"],
+            "id"         : int(body["userId"]),
+            "is_followed": False,
+            "name"       : body["userName"],
+            "profile_image_urls": {},
+        }
+
+        work["tags"] = tags = []
+        for tag in body["tags"]["tags"]:
+            name = tag["tag"]
+            try:
+                translated_name = tag["translation"]["en"]
+            except Exception:
+                translated_name = None
+            tags.append({"name": name, "translated_name": translated_name})
+
+        work["caption"] = text.unescape(body["illustComment"])
+        work["page_count"] = count = body["pageCount"]
+        if count == 1:
+            return ({"url": url},)
+
+        base, _, ext = url.rpartition("_p0.")
+        return [
+            {
+                "url"   : "{}_p{}.{}".format(base, num, ext),
+                "suffix": "_p{:02}".format(num),
+            }
+            for num in range(count)
+        ]
+
+    def _extract_ajax_url(self, body):
+        try:
+            original = body["urls"]["original"]
+            if original:
+                return original
+        except KeyError:
+            pass
+
+        try:
+            square1200 = body["userIllusts"][body["id"]]["url"]
+        except KeyError:
+            return
+        parts = square1200.rpartition("_p0")[0].split("/")
+        del parts[3:5]
+        parts[3] = "img-original"
+        base = "/".join(parts)
+
+        for ext in ("jpg", "png", "gif"):
+            try:
+                url = "{}_p0.{}".format(base, ext)
+                self.request(url, method="HEAD")
+                return url
+            except exception.HttpError:
+                pass
 
     @staticmethod
     def _date_from_url(url, offset=timedelta(hours=9)):
@@ -175,6 +293,9 @@ class PixivExtractor(Extractor):
             "x_restrict"    : 0,
         }
 
+    def _web_to_mobile(self, work):
+        return work
+
     def works(self):
         """Return an iterable containing all relevant 'work' objects"""
 
@@ -255,12 +376,12 @@ class PixivAvatarExtractor(PixivExtractor):
     pattern = USER_PATTERN + r"/avatar"
     example = "https://www.pixiv.net/en/users/12345/avatar"
 
-    def __init__(self, match):
-        PixivExtractor.__init__(self, match)
-        self.user_id = match.group(1)
+    def _init(self):
+        PixivExtractor._init(self)
+        self.sanity_workaround = self.meta_comments = False
 
     def works(self):
-        user = self.api.user_detail(self.user_id)["user"]
+        user = self.api.user_detail(self.groups[0])["user"]
         url = user["profile_image_urls"]["medium"].replace("_170.", ".")
         return (self._make_work("avatar", url, user),)
 
@@ -273,12 +394,12 @@ class PixivBackgroundExtractor(PixivExtractor):
     pattern = USER_PATTERN + "/background"
     example = "https://www.pixiv.net/en/users/12345/background"
 
-    def __init__(self, match):
-        PixivExtractor.__init__(self, match)
-        self.user_id = match.group(1)
+    def _init(self):
+        PixivExtractor._init(self)
+        self.sanity_workaround = self.meta_comments = False
 
     def works(self):
-        detail = self.api.user_detail(self.user_id)
+        detail = self.api.user_detail(self.groups[0])
         url = detail["profile"]["background_image_url"]
         if not url:
             return ()
@@ -335,6 +456,22 @@ class PixivWorkExtractor(PixivExtractor):
         return works
 
 
+class PixivUnlistedExtractor(PixivExtractor):
+    """Extractor for an unlisted pixiv illustration"""
+    subcategory = "unlisted"
+    pattern = BASE_PATTERN + r"/(?:en/)?artworks/unlisted/(\w+)"
+    example = "https://www.pixiv.net/en/artworks/unlisted/a1b2c3d4e5f6g7h8i9j0"
+
+    def _extract_files(self, work):
+        body = self._request_ajax("/illust/unlisted/" + work["id"])
+        work["id_unlisted"] = work["id"]
+        work["id"] = text.parse_int(body["illustId"])
+        return self._extract_ajax(work, body)
+
+    def works(self):
+        return ({"id": self.groups[0], "user": {"id": 1}},)
+
+
 class PixivFavoriteExtractor(PixivExtractor):
     """Extractor for all favorites/bookmarks of a pixiv user"""
     subcategory = "favorite"
[tag["name"] for tag in detail["tags"] if tag["is_registered"]] @@ -848,6 +989,7 @@ class PixivAppAPI(): self.username = extractor._get_auth_info()[0] self.user = None + extractor.headers_web = extractor.session.headers.copy() extractor.session.headers.update({ "App-OS" : "ios", "App-OS-Version": "16.7.2", @@ -913,6 +1055,10 @@ class PixivAppAPI(): return self._call( "/v2/illust/bookmark/detail", params)["bookmark_detail"] + def illust_comments(self, illust_id): + params = {"illust_id": illust_id} + return self._pagination("/v3/illust/comments", params, "comments") + def illust_follow(self, restrict="all"): params = {"restrict": restrict} return self._pagination("/v2/illust/follow", params) @@ -935,6 +1081,10 @@ class PixivAppAPI(): return self._call( "/v2/novel/bookmark/detail", params)["bookmark_detail"] + def novel_comments(self, novel_id): + params = {"novel_id": novel_id} + return self._pagination("/v1/novel/comments", params, "comments") + def novel_detail(self, novel_id): params = {"novel_id": novel_id} return self._call("/v2/novel/detail", params)["novel"] diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index c7283fc..9800eb2 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -66,10 +66,19 @@ class PornhubGalleryExtractor(PornhubExtractor): def items(self): data = self.metadata() yield Message.Directory, data - for num, image in enumerate(self.images(), 1): + for num, img in enumerate(self.images(), 1): + + image = { + "url" : img["img_large"], + "caption": img["caption"], + "id" : text.parse_int(img["id"]), + "views" : text.parse_int(img["times_viewed"]), + "score" : text.parse_int(img["vote_percent"]), + "num" : num, + } + url = image["url"] image.update(data) - image["num"] = num yield Message.Url, url, text.nameext_from_url(url, image) def metadata(self): @@ -105,18 +114,20 @@ class PornhubGalleryExtractor(PornhubExtractor): images = response.json() key = end = self._first - while True: - img = images[key] - yield { - "url" : img["img_large"], - "caption": img["caption"], - "id" : text.parse_int(img["id"]), - "views" : text.parse_int(img["times_viewed"]), - "score" : text.parse_int(img["vote_percent"]), - } - key = str(img["next"]) - if key == end: - return + results = [] + try: + while True: + img = images[key] + results.append(img) + key = str(img["next"]) + if key == end: + break + except KeyError: + self.log.warning("%s: Unable to ensure correct file order", + self.gallery_id) + return images.values() + + return results class PornhubGifExtractor(PornhubExtractor): diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py deleted file mode 100644 index 3a4c614..0000000 --- a/gallery_dl/extractor/pururin.py +++ /dev/null @@ -1,72 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2019-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://pururin.to/""" - -from .common import GalleryExtractor -from .. 
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
deleted file mode 100644
index 3a4c614..0000000
--- a/gallery_dl/extractor/pururin.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://pururin.to/"""
-
-from .common import GalleryExtractor
-from .. import text, util
-
-
-class PururinGalleryExtractor(GalleryExtractor):
-    """Extractor for image galleries on pururin.io"""
-    category = "pururin"
-    root = "https://pururin.to"
-    pattern = r"(?:https?://)?(?:www\.)?pururin\.[ti]o/(?:gallery|read)/(\d+)"
-    example = "https://pururin.to/gallery/12345/TITLE"
-
-    def __init__(self, match):
-        self.gallery_id = match.group(1)
-        url = "{}/gallery/{}/x".format(self.root, self.gallery_id)
-        GalleryExtractor.__init__(self, match, url)
-
-    def metadata(self, page):
-        extr = text.extract_from(page)
-
-        def _lst(e=extr):
-            v = text.unescape(e('value="', '"'))
-            return [item["name"] for item in util.json_loads(v)] if v else ()
-
-        def _str(key, e=extr):
-            return text.unescape(text.extr(
-                e(key, "</td>"), 'title="', '"')).partition(" / ")[0]
-
-        title = text.unescape(extr('<h1><span itemprop="name">', '<'))
-        title_en, _, title_ja = title.partition(" / ")
-
-        data = {
-            "gallery_id": text.parse_int(self.gallery_id),
-            "title"     : title_en or title_ja,
-            "title_en"  : title_en,
-            "title_ja"  : title_ja,
-            "language"  : _str("<td>Language</td>"),
-            "type"      : _str("<td>Category</td>"),
-            "uploader"  : text.remove_html(extr("<td>Uploader</td>", "</td>")),
-            "rating"    : text.parse_float(extr(
-                'itemprop="ratingValue" content="', '"')),
-            "artist"    : extr('name="artist_tags"', '') or _lst(),
-            "group"     : _lst(),
-            "parody"    : _lst(),
-            "tags"      : _lst(),
-            "characters": _lst(),
-            "scanlator" : _lst(),
-            "convention": _lst(),
-            "collection": _lst(),
-        }
-        data["lang"] = util.language_to_code(data["language"])
-        return data
-
-    def images(self, _):
-        url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
-        page = self.request(url).text
-
-        svr, pos = text.extract(page, 'data-svr="', '"')
-        img, pos = text.extract(page, 'data-img="', '"', pos)
-        data = util.json_loads(text.unescape(img))
-
-        base = "{}/{}/".format(svr, data["directory"])
-        return [(base + i["filename"], None) for i in data["images"]]
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
index cf70ccc..60c1c35 100644
--- a/gallery_dl/extractor/rule34us.py
+++ b/gallery_dl/extractor/rule34us.py
@@ -36,7 +36,7 @@ class Rule34usExtractor(BooruExtractor):
             "score"   : text.extract(extr('Score: ', '> - <'), ">", "<")[0],
             "width"   : extr('Size: ', 'w'),
             "height"  : extr(' x ', 'h'),
-            "file_url": extr(' src="', '"'),
+            "file_url": extr('<source src="', '"') or extr('<img src="', '"'),
         }
 
         url = post["file_url"]
+ example = "https://vidya.pics/post/list/TAG/1" def metadata(self): + self.tags = text.unquote(self.groups[-2]) return {"search_tags": self.tags} def posts(self): - pnum = text.parse_int(self.page, 1) + pnum = text.parse_int(self.groups[-1], 1) file_url_fmt = self.file_url_fmt.format init = True @@ -171,7 +161,7 @@ class Shimmie2TagExtractor(Shimmie2Extractor): return def _posts_giantessbooru(self): - pnum = text.parse_int(self.page, 1) + pnum = text.parse_int(self.groups[-1], 1) file_url_fmt = (self.root + "/index.php?q=/image/{}.jpg").format while True: @@ -206,20 +196,17 @@ class Shimmie2PostExtractor(Shimmie2Extractor): """Extractor for single shimmie2 posts""" subcategory = "post" pattern = BASE_PATTERN + r"post/view/(\d+)" - example = "https://loudbooru.com/post/view/12345" - - def __init__(self, match): - Shimmie2Extractor.__init__(self, match) - self.post_id = match.group(match.lastindex) + example = "https://vidya.pics/post/view/12345" def posts(self): - url = "{}/post/view/{}".format(self.root, self.post_id) + post_id = self.groups[-1] + url = "{}/post/view/{}".format(self.root, post_id) page = self.request(url).text extr = text.extract_from(page) quote = self._quote_type(page) post = { - "id" : self.post_id, + "id" : post_id, "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), "md5" : extr("/_thumbs/", "/"), "file_url": self.root + ( @@ -237,12 +224,12 @@ class Shimmie2PostExtractor(Shimmie2Extractor): return (post,) def _posts_giantessbooru(self): - url = "{}/index.php?q=/post/view/{}".format( - self.root, self.post_id) + post_id = self.groups[-1] + url = "{}/index.php?q=/post/view/{}".format(self.root, post_id) extr = text.extract_from(self.request(url).text) return ({ - "id" : self.post_id, + "id" : post_id, "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), "md5" : "", "file_url": self.root + extr("id='main_image' src='.", "'"), diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index d4ec343..9c9d505 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -13,7 +13,6 @@ from .. import text, util, exception from ..cache import cache, memcache import itertools import random -import json import re BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?" 
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index d4ec343..9c9d505 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -13,7 +13,6 @@ from .. import text, util, exception
 from ..cache import cache, memcache
 import itertools
 import random
-import json
 import re
 
 BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?"
@@ -1034,7 +1033,7 @@ class TwitterAPI():
         self.root = "https://x.com/i/api"
         self._nsfw_warning = True
-        self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
+        self._json_dumps = util.json_dumps
 
         cookies = extractor.cookies
         cookies_domain = extractor.cookies_domain
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 87a0ba6..fec4ab0 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -105,11 +105,12 @@ class UgoiraPP(PostProcessor):
         }, options)
 
     def prepare(self, pathfmt):
-        if "frames" not in pathfmt.kwdict:
+        self._convert_zip = self._convert_files = False
+        if "_ugoira_frame_data" not in pathfmt.kwdict:
             self._frames = None
             return
 
-        self._frames = pathfmt.kwdict["frames"]
+        self._frames = pathfmt.kwdict["_ugoira_frame_data"]
 
         if pathfmt.extension == "zip":
             self._convert_zip = True
             if self.delete:
@@ -136,7 +137,6 @@ class UgoiraPP(PostProcessor):
     def convert_from_zip(self, pathfmt):
         if not self._convert_zip:
             return
-        self._convert_zip = False
         self._zip_source = True
 
         with self._tempdir() as tempdir:
@@ -147,6 +147,13 @@ class UgoiraPP(PostProcessor):
             except FileNotFoundError:
                 pathfmt.realpath = pathfmt.temppath
                 return
+            except Exception as exc:
+                pathfmt.realpath = pathfmt.temppath
+                self.log.error(
+                    "%s: Unable to extract frames from %s (%s: %s)",
+                    pathfmt.kwdict.get("id"), pathfmt.filename,
+                    exc.__class__.__name__, exc)
+                return self.log.debug("", exc_info=exc)
 
             if self.convert(pathfmt, tempdir):
                 if self.delete:
@@ -159,7 +166,6 @@ class UgoiraPP(PostProcessor):
     def convert_from_files(self, pathfmt):
         if not self._convert_files:
             return
-        self._convert_files = False
         self._zip_source = False
 
         with tempfile.TemporaryDirectory() as tempdir:
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 8517cdf..5fd5a40 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -238,12 +238,49 @@ def parse_float(value, default=0.0):
 
 def parse_query(qs):
-    """Parse a query string into key-value pairs"""
+    """Parse a query string into name-value pairs
+
+    Ignore values whose name has been seen before
+    """
+    if not qs:
+        return {}
+
+    result = {}
+    try:
+        for name_value in qs.split("&"):
+            name, eq, value = name_value.partition("=")
+            if eq:
+                name = unquote(name.replace("+", " "))
+                if name not in result:
+                    result[name] = unquote(value.replace("+", " "))
+    except Exception:
+        pass
+    return result
+
+
+def parse_query_list(qs):
+    """Parse a query string into name-value pairs
+
+    Combine values of duplicate names into lists
+    """
+    if not qs:
+        return {}
+
     result = {}
     try:
-        for key, value in urllib.parse.parse_qsl(qs):
-            if key not in result:
-                result[key] = value
+        for name_value in qs.split("&"):
+            name, eq, value = name_value.partition("=")
+            if eq:
+                name = unquote(name.replace("+", " "))
+                value = unquote(value.replace("+", " "))
+                if name in result:
+                    rvalue = result[name]
+                    if isinstance(rvalue, list):
+                        rvalue.append(value)
+                    else:
+                        result[name] = [rvalue, value]
+                else:
+                    result[name] = value
     except Exception:
         pass
     return result
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 128f48b..d5bc171 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -253,7 +253,11 @@ def json_default(obj):
 
 json_loads = json._default_decoder.decode
-json_dumps = json.JSONEncoder(default=json_default).encode
+json_dumps = json.JSONEncoder(
+    check_circular=False,
+    separators=(",", ":"),
+    default=json_default,
+).encode
 
 def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4):
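text.py now ships two query-string parsers built on the same tolerant tokenizer: `parse_query` keeps the first value per name, while the new `parse_query_list` folds duplicates into lists. The difference in one example (this mirrors the new tests below):

    from gallery_dl import text

    print(text.parse_query("foo=1&foo=2&bar=3"))
    # {'foo': '1', 'bar': '3'}          -- first value wins
    print(text.parse_query_list("foo=1&foo=2&bar=3"))
    # {'foo': ['1', '2'], 'bar': '3'}   -- duplicates become a list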
a/gallery_dl/version.py b/gallery_dl/version.py
index 513da41..dd96a9a 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.27.5"
+__version__ = "1.27.6"
 __variant__ = None
diff --git a/test/test_formatter.py b/test/test_formatter.py
index f1d752d..c0b504d 100644
--- a/test/test_formatter.py
+++ b/test/test_formatter.py
@@ -64,7 +64,7 @@ class TestFormatter(unittest.TestCase):
         self._run_test("{t!d}", datetime.datetime(2010, 1, 1))
         self._run_test("{t!d:%Y-%m-%d}", "2010-01-01")
         self._run_test("{dt!T}", "1262304000")
-        self._run_test("{l!j}", '["a", "b", "c"]')
+        self._run_test("{l!j}", '["a","b","c"]')
         self._run_test("{dt!j}", '"2010-01-01 00:00:00"')
         self._run_test("{a!g}", "hello-world")
         self._run_test("{a!L}", 11)
diff --git a/test/test_results.py b/test/test_results.py
index aa09f2f..ed9c9a9 100644
--- a/test/test_results.py
+++ b/test/test_results.py
@@ -210,6 +210,7 @@ class TestExtractorResults(unittest.TestCase):
         if "#urls" in result:
             expected = result["#urls"]
             if isinstance(expected, str):
+                self.assertTrue(tjob.url_list, msg="#urls")
                 self.assertEqual(tjob.url_list[0], expected, msg="#urls")
             else:
                 self.assertSequenceEqual(tjob.url_list, expected, msg="#urls")
@@ -235,6 +236,8 @@ class TestExtractorResults(unittest.TestCase):
                 self.assertIsInstance(value, test, msg=path)
             elif isinstance(test, range):
                 self.assertRange(value, test, msg=path)
+            elif isinstance(test, set):
+                self.assertIn(value, test, msg=path)
             elif isinstance(test, list):
                 subtest = False
                 for idx, item in enumerate(test):
@@ -286,6 +289,8 @@ class ResultJob(job.DownloadJob):
             "".join(self.extractor.directory_fmt)).format_map
         self.format_filename = TestFormatter(
             self.extractor.filename_fmt).format_map
+        self.format_archive = TestFormatter(
+            self.extractor.archive_fmt).format_map
 
     def run(self):
         self._init()
@@ -323,7 +328,7 @@ class ResultJob(job.DownloadJob):
             json.dumps(kwdict, sort_keys=True, default=str).encode())
 
     def _update_archive(self, kwdict):
-        archive_id = self.extractor.archive_fmt.format_map(kwdict)
+        archive_id = self.format_archive(kwdict)
         self.archive_list.append(archive_id)
         self.archive_hash.update(archive_id.encode())
 
diff --git a/test/test_text.py b/test/test_text.py
index 084436b..1b19c47 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -413,6 +413,28 @@ class TestText(unittest.TestCase):
         for value in INVALID:
             self.assertEqual(f(value), {})
 
+    def test_parse_query_list(self, f=text.parse_query_list):
+        # standard usage
+        self.assertEqual(f(""), {})
+        self.assertEqual(f("foo=1"), {"foo": "1"})
+        self.assertEqual(f("foo=1&bar=2"), {"foo": "1", "bar": "2"})
+
+        # missing value
+        self.assertEqual(f("bar"), {})
+        self.assertEqual(f("foo=1&bar"), {"foo": "1"})
+        self.assertEqual(f("foo=1&bar&baz=3"), {"foo": "1", "baz": "3"})
+
+        # keys with identical names
+        self.assertEqual(f("foo=1&foo=2"), {"foo": ["1", "2"]})
+        self.assertEqual(
+            f("foo=1&bar=2&foo=3&bar=4&foo=5"),
+            {"foo": ["1", "3", "5"], "bar": ["2", "4"]},
+        )
+
+        # invalid arguments
+        for value in INVALID:
+            self.assertEqual(f(value), {})
+
     def test_parse_timestamp(self, f=text.parse_timestamp):
         null = util.datetime_utcfromtimestamp(0)
         value = util.datetime_utcfromtimestamp(1555816235)
