| author | 2024-10-25 17:27:36 -0400 |
|---|---|
| committer | 2024-10-25 17:27:36 -0400 |
| commit | a46d8cec37ef1e7370a3127dd5bf3a47e7dc40de (patch) |
| tree | 27382aedd6d14d1add2b1a37e6df2f3e52f0ac4e |
| parent | e4f39ad7148b104ab522ee13e4af3d3003b65e0f (diff) |
| parent | fc004701f923bb954a22c7fec2ae8d607e78cb2b (diff) |
Update upstream source from tag 'upstream/1.27.7'
Update to upstream version '1.27.7'
with Debian dir f4e7d47b82b8fc4fb17fad4aa54873015dcc81c1
50 files changed, 1306 insertions, 505 deletions
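Among the user-facing changes in the diff below, 1.27.7 adds the `-N/--print` and `--print-to-file` command-line options (written during an extractor event, `prepare` by default). A minimal sketch of how they might be invoked, based on the option descriptions in the completion files and man page; the URL, output file name, and format strings other than the documented examples are placeholders:

```sh
# print the 'id' field of each file, plus the first 8 characters of 'md5'
# for each post, and append each resulting filename to a text file
gallery-dl \
    -N 'id' \
    -N 'post:{md5[:8]}' \
    --print-to-file '{filename}.{extension}' downloaded.txt \
    'https://example.org/gallery/12345'
```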
diff --git a/CHANGELOG.md b/CHANGELOG.md index bc6a301..f4bb546 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,57 +1,39 @@ -## 1.27.6 - 2024-10-11 +## 1.27.7 - 2024-10-25 ### Extractors #### Additions -- [ao3] add `subscriptions` extractor ([#6247](https://github.com/mikf/gallery-dl/issues/6247)) -- [boosty] add support ([#2387](https://github.com/mikf/gallery-dl/issues/2387)) -- [civitai] add `post` extractors ([#6279](https://github.com/mikf/gallery-dl/issues/6279)) -- [pixiv] support unlisted artworks ([#5162](https://github.com/mikf/gallery-dl/issues/5162)) +- [civitai] add extractors for global `models` and `images` ([#6310](https://github.com/mikf/gallery-dl/issues/6310)) +- [mangadex] add `author` extractor ([#6372](https://github.com/mikf/gallery-dl/issues/6372)) +- [scrolller] add support ([#295](https://github.com/mikf/gallery-dl/issues/295), [#3418](https://github.com/mikf/gallery-dl/issues/3418), [#5051](https://github.com/mikf/gallery-dl/issues/5051)) #### Fixes -- [cohost] sanitize default filenames ([#6262](https://github.com/mikf/gallery-dl/issues/6262)) - - limit `headline` length - - remove `plainTextBody` -- [deviantart] fix & improve journal/literature extraction ([#6254](https://github.com/mikf/gallery-dl/issues/6254), [#6207](https://github.com/mikf/gallery-dl/issues/6207), [#6196](https://github.com/mikf/gallery-dl/issues/6196)) - - extract journal HTML from webpage if possible - - support converting `tiptap` markup to HTML -- [deviantart] fix `stash` folder extraction -- [flickr] update default API credentials ([#6300](https://github.com/mikf/gallery-dl/issues/6300)) -- [flickr] fix `ZeroDivisionError` ([#6252](https://github.com/mikf/gallery-dl/issues/6252)) -- [imagefap] fix `{num}` in single image default filenames -- [myhentaigallery] fix `tags` extraction -- [patreon] extract `attachments_media` files ([#6241](https://github.com/mikf/gallery-dl/issues/6241), [#6268](https://github.com/mikf/gallery-dl/issues/6268)) -- [pixiv] implement workaround for `limit_sanity_level` works ([#4327](https://github.com/mikf/gallery-dl/issues/4327), [#4747](https://github.com/mikf/gallery-dl/issues/4747), [#5054](https://github.com/mikf/gallery-dl/issues/5054), [#5435](https://github.com/mikf/gallery-dl/issues/5435), [#5651](https://github.com/mikf/gallery-dl/issues/5651), [#5655](https://github.com/mikf/gallery-dl/issues/5655)) -- [pornhub] fix `KeyError` when album images are missing ([#6299](https://github.com/mikf/gallery-dl/issues/6299)) -- [rule34us] fix extraction ([#6289](https://github.com/mikf/gallery-dl/issues/6289)) -- [8chan] set TOS cookie for current and previous day +- [8chan] automatically detect `TOS` cookie name ([#6318](https://github.com/mikf/gallery-dl/issues/6318)) +- [bunkr] update to new site layout ([#6344](https://github.com/mikf/gallery-dl/issues/6344), [#6352](https://github.com/mikf/gallery-dl/issues/6352), [#6368](https://github.com/mikf/gallery-dl/issues/6368)) +- [bunkr] send proper `Referer` headers for file downloads ([#6319](https://github.com/mikf/gallery-dl/issues/6319)) +- [civitai] add `uuid` metadata field & use it as default archive format ([#6326](https://github.com/mikf/gallery-dl/issues/6326)) +- [civitai] fix "My Reactions" results ([#6263](https://github.com/mikf/gallery-dl/issues/6263)) +- [civitai] fix `model` file download URLs for tRPC API +- [lensdump] fix extraction ([#6313](https://github.com/mikf/gallery-dl/issues/6313)) +- [pixiv] make retrieving ugoira metadata non-fatal 
([#6297](https://github.com/mikf/gallery-dl/issues/6297)) +- [pixiv] fix exception when processing deleted `sanity_level` works ([#6339](https://github.com/mikf/gallery-dl/issues/6339)) +- [urlgalleries] fix extraction +- [wikimedia] fix non-English Fandom/wiki.gg articles ([#6370](https://github.com/mikf/gallery-dl/issues/6370)) #### Improvements -- [bunkr] support `bunkr.pk` URLs ([#6272](https://github.com/mikf/gallery-dl/issues/6272)) -- [civitai] use tRPC API by default ([#6279](https://github.com/mikf/gallery-dl/issues/6279)) -- [civitai] improve default archive format ([#6302](https://github.com/mikf/gallery-dl/issues/6302)) -- [komikcast] update domain to `komikcast.cz` -- [newgrounds] detect more comment embeds ([#6253](https://github.com/mikf/gallery-dl/issues/6253)) -- [newgrounds] add more fallback URL formats for `art-images` files -- [oauth] prevent empty browser names -- [patreon] use mobile UA ([#6241](https://github.com/mikf/gallery-dl/issues/6241), [#6239](https://github.com/mikf/gallery-dl/issues/6239), [#6140](https://github.com/mikf/gallery-dl/issues/6140)) -- [patreon] handle suspended accounts -- [pixiv] detect works requiring `My pixiv` access -#### Metadata -- [civitai] ensure image files have an `id` ([#6251](https://github.com/mikf/gallery-dl/issues/6251)) -- [gelbooru_v02] unescape HTML entities in categorized tags -- [generic] ensure `path` metadata is always defined -- [pixiv] retrieve `caption` from AJAX API when empty ([#4327](https://github.com/mikf/gallery-dl/issues/4327), [#5191](https://github.com/mikf/gallery-dl/issues/5191)) +- [8chan] support `/last/` thread URLs ([#6318](https://github.com/mikf/gallery-dl/issues/6318)) +- [bunkr] support `bunkr.ph` and `bunkr.ps` URLs +- [newgrounds] support page numbers in URLs ([#6320](https://github.com/mikf/gallery-dl/issues/6320)) +- [patreon] support `/c/` prefix in creator URLs ([#6348](https://github.com/mikf/gallery-dl/issues/6348)) +- [pinterest] support `story` pins ([#6188](https://github.com/mikf/gallery-dl/issues/6188), [#6078](https://github.com/mikf/gallery-dl/issues/6078), [#4229](https://github.com/mikf/gallery-dl/issues/4229)) +- [pixiv] implement `sanity_level` workaround for user artworks results ([#4327](https://github.com/mikf/gallery-dl/issues/4327), [#5435](https://github.com/mikf/gallery-dl/issues/5435), [#6339](https://github.com/mikf/gallery-dl/issues/6339)) #### Options -- [fanbox] add `comments` option, extend `metadata` option ([#6287](https://github.com/mikf/gallery-dl/issues/6287)) -- [pixiv] add `comments` option ([#6287](https://github.com/mikf/gallery-dl/issues/6287)) -#### Removals -- [blogger] remove `micmicidol.club` -- [chevereto] remove `deltaporno.com` -- [lolisafe] remove `xbunkr.com` -- [pururin] remove module -- [shimmie2] remove `loudbooru.com` +- [bluesky] add `quoted` option ([#6323](https://github.com/mikf/gallery-dl/issues/6323)) +- [pixiv] add `captions` option ([#4327](https://github.com/mikf/gallery-dl/issues/4327)) +- [reddit] add `embeds` option ([#6357](https://github.com/mikf/gallery-dl/issues/6357)) +- [vk] add `offset` option ([#6328](https://github.com/mikf/gallery-dl/issues/6328)) +### Downloaders +- [ytdl] implement explicit HLS/DASH handling ### Post Processors -- [ugoira] fix `BadZipFile` exceptions ([#6285](https://github.com/mikf/gallery-dl/issues/6285)) -- [ugoira] catch all exceptions when extracting ZIP archives ([#6285](https://github.com/mikf/gallery-dl/issues/6285)) -- [ugoira] forward frame data as `_ugoira_frame_data` 
([#6154](https://github.com/mikf/gallery-dl/issues/6154), [#6285](https://github.com/mikf/gallery-dl/issues/6285)) +- add `error` event ### Miscellaneous -- [build] remove setuptools and requests version restrictions -- [docker] build from `python:3.12-alpine` -- [text] improve `parse_query()` performance +- [cookies] convert Chromium `expires_utc` values to Unix timestamps +- [util] add `std` object to global eval namespace ([#6330](https://github.com/mikf/gallery-dl/issues/6330)) +- add `--print` and `--print-to-file` command-line options ([#6343](https://github.com/mikf/gallery-dl/issues/6343)) +- use child extractor fallbacks only when a non-user error occurs ([#6329](https://github.com/mikf/gallery-dl/issues/6329)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.27.6 +Version: 1.27.7 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -27,6 +27,7 @@ Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Internet :: WWW/HTTP @@ -114,9 +115,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.7/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.7/gallery-dl.bin>`__ Nightly Builds @@ -74,9 +74,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.7/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.7/gallery-dl.bin>`__ Nightly Builds diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl index 5e1b1e0..743808c 100644 --- a/data/completion/_gallery-dl +++ b/data/completion/_gallery-dl @@ -29,6 +29,8 @@ _arguments -s -S \ {-E,--extractor-info}'[Print extractor defaults and settings]' \ {-K,--list-keywords}'[Print a list of available keywords and example values for the given URLs]' \ {-e,--error-file}'[Add input URLs which returned an error to FILE]':'<file>':_files \ +{-N,--print}'[Write FORMAT during EVENT (default '\''prepare'\'') to standard output. 
Examples: '\''id'\'' or '\''post:{md5\[:8\]}'\'']':'<[event:]format>' \ +--print-to-file'[Append FORMAT during EVENT to FILE]':'<[event:]format file>' \ --list-modules'[Print a list of available extractor modules]' \ --list-extractors'[Print a list of extractor classes with description, (sub)category and example URL]':'<categories>' \ --write-log'[Write logging output to FILE]':'<file>':_files \ diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl index 32d9705..fd5268f 100644 --- a/data/completion/gallery-dl +++ b/data/completion/gallery-dl @@ -10,7 +10,7 @@ _gallery_dl() elif [[ "${prev}" =~ ^()$ ]]; then COMPREPLY=( $(compgen -d -- "${cur}") ) else - COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --resolve-json --simulate --extractor-info --list-keywords --error-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --print-traffic --no-colors --retries --http-timeout --proxy --source-address --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config --config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor --no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --rename --rename-to --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") ) + COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --resolve-json --simulate --extractor-info --list-keywords --error-file --print --print-to-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --print-traffic --no-colors --retries --http-timeout --proxy --source-address --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config --config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor --no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --rename --rename-to --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") ) fi } diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish index 7734f40..a239c50 100644 --- a/data/completion/gallery-dl.fish +++ b/data/completion/gallery-dl.fish @@ -23,6 +23,8 @@ complete -c gallery-dl -s 's' -l 'simulate' -d 'Simulate data extraction; do not complete -c gallery-dl -s 'E' -l 'extractor-info' -d 'Print extractor defaults and 
settings' complete -c gallery-dl -s 'K' -l 'list-keywords' -d 'Print a list of available keywords and example values for the given URLs' complete -c gallery-dl -r -F -s 'e' -l 'error-file' -d 'Add input URLs which returned an error to FILE' +complete -c gallery-dl -x -s 'N' -l 'print' -d 'Write FORMAT during EVENT (default "prepare") to standard output. Examples: "id" or "post:{md5[:8]}"' +complete -c gallery-dl -x -l 'print-to-file' -d 'Append FORMAT during EVENT to FILE' complete -c gallery-dl -l 'list-modules' -d 'Print a list of available extractor modules' complete -c gallery-dl -x -l 'list-extractors' -d 'Print a list of extractor classes with description, (sub)category and example URL' complete -c gallery-dl -r -F -l 'write-log' -d 'Write logging output to FILE' diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 3fedff4..a56dbcd 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2024-10-11" "1.27.6" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2024-10-25" "1.27.7" "gallery-dl Manual" .\" disable hyphenation .nh @@ -89,6 +89,12 @@ Print a list of available keywords and example values for the given URLs .B "\-e, \-\-error\-file" \f[I]FILE\f[] Add input URLs which returned an error to FILE .TP +.B "\-N, \-\-print" \f[I][EVENT:]FORMAT\f[] +Write FORMAT during EVENT (default 'prepare') to standard output. Examples: 'id' or 'post:{md5[:8]}' +.TP +.B "\-\-print\-to\-file" \f[I][EVENT:]FORMAT FILE\f[] +Append FORMAT during EVENT to FILE +.TP .B "\-\-list\-modules" Print a list of available extractor modules .TP diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index ba4bb3e..0ae8c38 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2024-10-11" "1.27.6" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2024-10-25" "1.27.7" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -548,6 +548,8 @@ This is supported for .br * \f[I]sankaku\f[] .br +* \f[I]scrolller\f[] +.br * \f[I]seiga\f[] .br * \f[I]subscribestar\f[] @@ -1739,6 +1741,17 @@ Sets the maximum depth of returned reply posts. (See depth parameter of \f[I]app.bsky.feed.getPostThread\f[]) +.SS extractor.bluesky.quoted +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Fetch media from quoted posts. + + .SS extractor.bluesky.reposts .IP "Type:" 6 \f[I]bool\f[] @@ -3974,6 +3987,17 @@ uses the same domain as a given input URL. Include pins from board sections. +.SS extractor.pinterest.stories +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Extract files from story pins. + + .SS extractor.pinterest.videos .IP "Type:" 6 \f[I]bool\f[] @@ -4095,6 +4119,18 @@ fetch bookmark tags as \f[I]tags_bookmark\f[] metadata. Note: This requires 1 additional API request per bookmarked post. +.SS extractor.pixiv.captions +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +For works with seemingly empty \f[I]caption\f[] metadata, +try to grab the actual \f[I]caption\f[] value using the AJAX API. + + .SS extractor.pixiv.comments .IP "Type:" 6 \f[I]bool\f[] @@ -4284,6 +4320,17 @@ stubs in the base comment tree. Note: This requires 1 additional API call for every 100 extra comments. +.SS extractor.reddit.embeds +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download embedded comments media. 
+ + .SS extractor.reddit.date-min & .date-max .IP "Type:" 6 \f[I]Date\f[] @@ -5200,7 +5247,21 @@ Any entries after the first one will be used for potential \f[I]fallback\f[] URLs. Known available sizes are -\f[I]4096x4096\f[], \f[I]orig\f[], \f[I]large\f[], \f[I]medium\f[], and \f[I]small\f[]. + +.br +* \f[I]orig\f[] +.br +* \f[I]large\f[] +.br +* \f[I]medium\f[] +.br +* \f[I]small\f[] +.br +* \f[I]4096x4096\f[] +.br +* \f[I]900x900\f[] +.br +* \f[I]360x360\f[] .SS extractor.twitter.logout @@ -5487,6 +5548,17 @@ Note: Requires \f[I]login\f[] or \f[I]cookies\f[] +.SS extractor.vk.offset +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]0\f[] + +.IP "Description:" 4 +Custom \f[I]offset\f[] starting value when paginating over image results. + + .SS extractor.vsco.include .IP "Type:" 6 .br @@ -7080,6 +7152,8 @@ but before it gets moved to its target location After a file got moved to its target location \f[I]skip\f[] When skipping a file download +\f[I]error\f[] +After a file download failed \f[I]post\f[] When starting to download all files of a post, e.g. a Tweet on Twitter or a post on Patreon. diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 27d0dd4..abc0001 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.27.6 +Version: 1.27.7 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -27,6 +27,7 @@ Classifier: Programming Language :: Python :: 3.9 Classifier: Programming Language :: Python :: 3.10 Classifier: Programming Language :: Python :: 3.11 Classifier: Programming Language :: Python :: 3.12 +Classifier: Programming Language :: Python :: 3.13 Classifier: Programming Language :: Python :: Implementation :: CPython Classifier: Programming Language :: Python :: Implementation :: PyPy Classifier: Topic :: Internet :: WWW/HTTP @@ -114,9 +115,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.7/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.6/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.7/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index df9217a..42a5df1 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -188,6 +188,7 @@ gallery_dl/extractor/redgifs.py gallery_dl/extractor/rule34us.py gallery_dl/extractor/sankaku.py gallery_dl/extractor/sankakucomplex.py +gallery_dl/extractor/scrolller.py gallery_dl/extractor/seiga.py gallery_dl/extractor/senmanga.py gallery_dl/extractor/sexcom.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 7a9e0be..62e96ae 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -63,7 +63,7 @@ def main(): browser, _, profile = args.cookies_from_browser.partition(":") browser, _, keyring = browser.partition("+") browser, _, domain = browser.partition("/") - if profile.startswith(":"): + if profile and 
profile[0] == ":": container = profile[1:] profile = None else: diff --git a/gallery_dl/config.py b/gallery_dl/config.py index 0a187c1..855fb4f 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -315,7 +315,7 @@ class apply(): self.original.append((path, key, get(path, key, util.SENTINEL))) set(path, key, value) - def __exit__(self, etype, value, traceback): + def __exit__(self, exc_type, exc_value, traceback): for path, key, value in self.original: if value is util.SENTINEL: unset(path, key) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 0ffd29a..cec2ea0 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -31,59 +31,63 @@ SUPPORTED_BROWSERS = SUPPORTED_BROWSERS_CHROMIUM | {"firefox", "safari"} logger = logging.getLogger("cookies") -def load_cookies(cookiejar, browser_specification): +def load_cookies(browser_specification): browser_name, profile, keyring, container, domain = \ _parse_browser_specification(*browser_specification) if browser_name == "firefox": - load_cookies_firefox(cookiejar, profile, container, domain) + return load_cookies_firefox(profile, container, domain) elif browser_name == "safari": - load_cookies_safari(cookiejar, profile, domain) + return load_cookies_safari(profile, domain) elif browser_name in SUPPORTED_BROWSERS_CHROMIUM: - load_cookies_chrome(cookiejar, browser_name, profile, keyring, domain) + return load_cookies_chromium(browser_name, profile, keyring, domain) else: raise ValueError("unknown browser '{}'".format(browser_name)) -def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None): +def load_cookies_firefox(profile=None, container=None, domain=None): path, container_id = _firefox_cookies_database(profile, container) - with DatabaseConnection(path) as db: - sql = ("SELECT name, value, host, path, isSecure, expiry " - "FROM moz_cookies") - conditions = [] - parameters = [] - - if container_id is False: - conditions.append("NOT INSTR(originAttributes,'userContextId=')") - elif container_id: - conditions.append( - "originAttributes LIKE ? OR originAttributes LIKE ?") - uid = "%userContextId={}".format(container_id) - parameters += (uid, uid + "&%") - - if domain: - if domain[0] == ".": - conditions.append("host == ? OR host LIKE ?") - parameters += (domain[1:], "%" + domain) - else: - conditions.append("host == ? OR host == ?") - parameters += (domain, "." + domain) + sql = ("SELECT name, value, host, path, isSecure, expiry " + "FROM moz_cookies") + conditions = [] + parameters = [] + + if container_id is False: + conditions.append("NOT INSTR(originAttributes,'userContextId=')") + elif container_id: + uid = "%userContextId={}".format(container_id) + conditions.append("originAttributes LIKE ? OR originAttributes LIKE ?") + parameters += (uid, uid + "&%") + + if domain: + if domain[0] == ".": + conditions.append("host == ? OR host LIKE ?") + parameters += (domain[1:], "%" + domain) + else: + conditions.append("host == ? OR host == ?") + parameters += (domain, "." 
+ domain) - if conditions: - sql = "{} WHERE ( {} )".format(sql, " ) AND ( ".join(conditions)) + if conditions: + sql = "{} WHERE ( {} )".format(sql, " ) AND ( ".join(conditions)) - set_cookie = cookiejar.set_cookie - for name, value, domain, path, secure, expires in db.execute( - sql, parameters): - set_cookie(Cookie( + with DatabaseConnection(path) as db: + cookies = [ + Cookie( 0, name, value, None, False, - domain, bool(domain), domain.startswith("."), - path, bool(path), secure, expires, False, None, None, {}, - )) - _log_info("Extracted %s cookies from Firefox", len(cookiejar)) + domain, True if domain else False, + domain[0] == "." if domain else False, + path, True if path else False, secure, expires, + False, None, None, {}, + ) + for name, value, domain, path, secure, expires in db.execute( + sql, parameters) + ] + + _log_info("Extracted %s cookies from Firefox", len(cookies)) + return cookies -def load_cookies_safari(cookiejar, profile=None, domain=None): +def load_cookies_safari(profile=None, domain=None): """Ref.: https://github.com/libyal/dtformats/blob /main/documentation/Safari%20Cookies.asciidoc - This data appears to be out of date @@ -95,31 +99,33 @@ def load_cookies_safari(cookiejar, profile=None, domain=None): data = fp.read() page_sizes, body_start = _safari_parse_cookies_header(data) p = DataParser(data[body_start:]) + + cookies = [] for page_size in page_sizes: - _safari_parse_cookies_page(p.read_bytes(page_size), cookiejar) + _safari_parse_cookies_page(p.read_bytes(page_size), cookies) + _log_info("Extracted %s cookies from Safari", len(cookies)) + return cookies -def load_cookies_chrome(cookiejar, browser_name, profile=None, - keyring=None, domain=None): - config = _get_chromium_based_browser_settings(browser_name) - path = _chrome_cookies_database(profile, config) +def load_cookies_chromium(browser_name, profile=None, + keyring=None, domain=None): + config = _chromium_browser_settings(browser_name) + path = _chromium_cookies_database(profile, config) _log_debug("Extracting cookies from %s", path) + if domain: + if domain[0] == ".": + condition = " WHERE host_key == ? OR host_key LIKE ?" + parameters = (domain[1:], "%" + domain) + else: + condition = " WHERE host_key == ? OR host_key == ?" + parameters = (domain, "." + domain) + else: + condition = "" + parameters = () + with DatabaseConnection(path) as db: db.text_factory = bytes - decryptor = get_cookie_decryptor( - config["directory"], config["keyring"], keyring) - - if domain: - if domain[0] == ".": - condition = " WHERE host_key == ? OR host_key LIKE ?" - parameters = (domain[1:], "%" + domain) - else: - condition = " WHERE host_key == ? OR host_key == ?" - parameters = (domain, "." 
+ domain) - else: - condition = "" - parameters = () try: rows = db.execute( @@ -130,10 +136,12 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, "SELECT host_key, name, value, encrypted_value, path, " "expires_utc, secure FROM cookies" + condition, parameters) - set_cookie = cookiejar.set_cookie failed_cookies = 0 unencrypted_cookies = 0 + decryptor = _chromium_cookie_decryptor( + config["directory"], config["keyring"], keyring) + cookies = [] for domain, name, value, enc_value, path, expires, secure in rows: if not value and enc_value: # encrypted @@ -145,15 +153,22 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, value = value.decode() unencrypted_cookies += 1 + if expires: + # https://stackoverflow.com/a/43520042 + expires = int(expires) // 1000000 - 11644473600 + else: + expires = None + domain = domain.decode() path = path.decode() name = name.decode() - set_cookie(Cookie( + cookies.append(Cookie( 0, name, value, None, False, - domain, bool(domain), domain.startswith("."), - path, bool(path), secure, expires or None, False, - None, None, {}, + domain, True if domain else False, + domain[0] == "." if domain else False, + path, True if path else False, secure, expires, + False, None, None, {}, )) if failed_cookies > 0: @@ -162,10 +177,11 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, failed_message = "" _log_info("Extracted %s cookies from %s%s", - len(cookiejar), browser_name.capitalize(), failed_message) + len(cookies), browser_name.capitalize(), failed_message) counts = decryptor.cookie_counts counts["unencrypted"] = unencrypted_cookies - _log_debug("Cookie version breakdown: %s", counts) + _log_debug("version breakdown: %s", counts) + return cookies # -------------------------------------------------------------------- @@ -253,7 +269,7 @@ def _safari_parse_cookies_header(data): return page_sizes, p.cursor -def _safari_parse_cookies_page(data, cookiejar, domain=None): +def _safari_parse_cookies_page(data, cookies, domain=None): p = DataParser(data) p.expect_bytes(b"\x00\x00\x01\x00", "page signature") number_of_cookies = p.read_uint() @@ -267,17 +283,17 @@ def _safari_parse_cookies_page(data, cookiejar, domain=None): for i, record_offset in enumerate(record_offsets): p.skip_to(record_offset, "space between records") record_length = _safari_parse_cookies_record( - data[record_offset:], cookiejar, domain) + data[record_offset:], cookies, domain) p.read_bytes(record_length) p.skip_to_end("space in between pages") -def _safari_parse_cookies_record(data, cookiejar, host=None): +def _safari_parse_cookies_record(data, cookies, host=None): p = DataParser(data) record_size = p.read_uint() p.skip(4, "unknown record field 1") flags = p.read_uint() - is_secure = bool(flags & 0x0001) + is_secure = True if (flags & 0x0001) else False p.skip(4, "unknown record field 2") domain_offset = p.read_uint() name_offset = p.read_uint() @@ -313,20 +329,21 @@ def _safari_parse_cookies_record(data, cookiejar, host=None): p.skip_to(record_size, "space at the end of the record") - cookiejar.set_cookie(Cookie( + cookies.append(Cookie( 0, name, value, None, False, - domain, bool(domain), domain.startswith("."), - path, bool(path), is_secure, expiration_date, False, - None, None, {}, + domain, True if domain else False, + domain[0] == "." 
if domain else False, + path, True if path else False, is_secure, expiration_date, + False, None, None, {}, )) return record_size # -------------------------------------------------------------------- -# chrome +# chromium -def _chrome_cookies_database(profile, config): +def _chromium_cookies_database(profile, config): if profile is None: search_root = config["directory"] elif _is_path(profile): @@ -346,7 +363,7 @@ def _chrome_cookies_database(profile, config): return path -def _get_chromium_based_browser_settings(browser_name): +def _chromium_browser_settings(browser_name): # https://chromium.googlesource.com/chromium # /src/+/HEAD/docs/user_data_dir.md join = os.path.join @@ -414,7 +431,17 @@ def _get_chromium_based_browser_settings(browser_name): } -class ChromeCookieDecryptor: +def _chromium_cookie_decryptor( + browser_root, browser_keyring_name, keyring=None): + if sys.platform in ("win32", "cygwin"): + return WindowsChromiumCookieDecryptor(browser_root) + elif sys.platform == "darwin": + return MacChromiumCookieDecryptor(browser_keyring_name) + else: + return LinuxChromiumCookieDecryptor(browser_keyring_name, keyring) + + +class ChromiumCookieDecryptor: """ Overview: @@ -452,16 +479,7 @@ class ChromeCookieDecryptor: raise NotImplementedError("Must be implemented by sub classes") -def get_cookie_decryptor(browser_root, browser_keyring_name, keyring=None): - if sys.platform in ("win32", "cygwin"): - return WindowsChromeCookieDecryptor(browser_root) - elif sys.platform == "darwin": - return MacChromeCookieDecryptor(browser_keyring_name) - else: - return LinuxChromeCookieDecryptor(browser_keyring_name, keyring) - - -class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): +class LinuxChromiumCookieDecryptor(ChromiumCookieDecryptor): def __init__(self, browser_keyring_name, keyring=None): self._v10_key = self.derive_key(b"peanuts") password = _get_linux_keyring_password(browser_keyring_name, keyring) @@ -500,7 +518,7 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): return None -class MacChromeCookieDecryptor(ChromeCookieDecryptor): +class MacChromiumCookieDecryptor(ChromiumCookieDecryptor): def __init__(self, browser_keyring_name): password = _get_mac_keyring_password(browser_keyring_name) self._v10_key = None if password is None else self.derive_key(password) @@ -539,7 +557,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): return encrypted_value -class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): +class WindowsChromiumCookieDecryptor(ChromiumCookieDecryptor): def __init__(self, browser_root): self._v10_key = _get_windows_v10_key(browser_root) self._cookie_counts = {"v10": 0, "other": 0} @@ -864,7 +882,7 @@ class DatabaseConnection(): self.directory.cleanup() raise - def __exit__(self, exc, value, tb): + def __exit__(self, exc_type, exc_value, traceback): self.database.close() if self.directory: self.directory.cleanup() diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 950a72f..26f328d 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -62,10 +62,18 @@ class YoutubeDLDownloader(DownloaderBase): info_dict = kwdict.pop("_ytdl_info_dict", None) if not info_dict: + url = url[5:] try: - info_dict = ytdl_instance.extract_info(url[5:], download=False) + manifest = kwdict.pop("_ytdl_manifest", None) + if manifest: + info_dict = self._extract_manifest( + ytdl_instance, url, manifest) + else: + info_dict = self._extract_info(ytdl_instance, url) except Exception as exc: self.log.debug("", exc_info=exc) + 
self.log.warning("%s: %s", exc.__class__.__name__, exc) + if not info_dict: return False @@ -134,6 +142,42 @@ class YoutubeDLDownloader(DownloaderBase): ytdl_instance.process_info(entry) return True + def _extract_info(self, ytdl, url): + return ytdl.extract_info(url, download=False) + + def _extract_manifest(self, ytdl, url, manifest): + extr = ytdl.get_info_extractor("Generic") + video_id = extr._generic_id(url) + + if manifest == "hls": + try: + formats, subtitles = extr._extract_m3u8_formats_and_subtitles( + url, video_id, "mp4") + except AttributeError: + formats = extr._extract_m3u8_formats(url, video_id, "mp4") + subtitles = None + + elif manifest == "dash": + try: + formats, subtitles = extr._extract_mpd_formats_and_subtitles( + url, video_id) + except AttributeError: + formats = extr._extract_mpd_formats(url, video_id) + subtitles = None + + else: + self.log.error("Unsupported manifest type '%s'", manifest) + return None + + info_dict = { + "id" : video_id, + "title" : video_id, + "formats" : formats, + "subtitles": subtitles, + } + # extr._extra_manifest_info(info_dict, url) + return ytdl.process_ie_result(info_dict, download=False) + def _progress_hook(self, info): if info["status"] == "downloading" and \ info["elapsed"] >= self.progress: diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py index 08dcfdc..6b2ce3a 100644 --- a/gallery_dl/exception.py +++ b/gallery_dl/exception.py @@ -54,10 +54,16 @@ class HttpError(ExtractionError): default = "HTTP request failed" code = 4 - def __init__(self, message, response=None): - ExtractionError.__init__(self, message) + def __init__(self, message="", response=None): self.response = response - self.status = 0 if response is None else response.status_code + if response is None: + self.status = 0 + else: + self.status = response.status_code + if not message: + message = "'{} {}' for '{}'".format( + response.status_code, response.reason, response.url) + ExtractionError.__init__(self, message) class NotFoundError(ExtractionError): diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index f81d2a1..ce1c52a 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -27,12 +27,22 @@ class _8chanExtractor(Extractor): Extractor.__init__(self, match) def _init(self): - now = util.datetime_utcnow() - domain = self.root.rpartition("/")[2] - self.cookies.set( - now.strftime("TOS%Y%m%d"), "1", domain=domain) - self.cookies.set( - (now - timedelta(1)).strftime("TOS%Y%m%d"), "1", domain=domain) + tos = self.cookies_tos_name() + self.cookies.set(tos, "1", domain=self.root[8:]) + + @memcache() + def cookies_tos_name(self): + url = self.root + "/.static/pages/confirmed.html" + headers = {"Referer": self.root + "/.static/pages/disclaimer.html"} + response = self.request(url, headers=headers, allow_redirects=False) + + for cookie in response.cookies: + if cookie.name.lower().startswith("tos"): + self.log.debug("TOS cookie name: %s", cookie.name) + return cookie.name + + self.log.error("Unable to determin TOS cookie name") + return "TOS20241009" @memcache() def cookies_prepare(self): @@ -64,16 +74,14 @@ class _8chanThreadExtractor(_8chanExtractor): "{threadId} {subject[:50]}") filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}" archive_fmt = "{boardUri}_{postId}_{num}" - pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)" + pattern = BASE_PATTERN + r"/([^/?#]+)/(?:res|last)/(\d+)" example = "https://8chan.moe/a/res/12345.html" - def __init__(self, match): - _8chanExtractor.__init__(self, match) - 
_, self.board, self.thread = match.groups() - def items(self): + _, board, thread = self.groups + # fetch thread data - url = "{}/{}/res/{}.".format(self.root, self.board, self.thread) + url = "{}/{}/res/{}.".format(self.root, board, thread) self.session.headers["Referer"] = url + "html" thread = self.request(url + "json").json() thread["postId"] = thread["threadId"] @@ -106,25 +114,22 @@ class _8chanBoardExtractor(_8chanExtractor): pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$" example = "https://8chan.moe/a/" - def __init__(self, match): - _8chanExtractor.__init__(self, match) - _, self.board, self.page = match.groups() - def items(self): - page = text.parse_int(self.page, 1) - url = "{}/{}/{}.json".format(self.root, self.board, page) - board = self.request(url).json() - threads = board["threads"] + _, board, pnum = self.groups + pnum = text.parse_int(pnum, 1) + url = "{}/{}/{}.json".format(self.root, board, pnum) + data = self.request(url).json() + threads = data["threads"] while True: for thread in threads: thread["_extractor"] = _8chanThreadExtractor url = "{}/{}/res/{}.html".format( - self.root, self.board, thread["threadId"]) + self.root, board, thread["threadId"]) yield Message.Queue, url, thread - page += 1 - if page > board["pageCount"]: + pnum += 1 + if pnum > data["pageCount"]: return - url = "{}/{}/{}.json".format(self.root, self.board, page) + url = "{}/{}/{}.json".format(self.root, board, pnum) threads = self.request(url).json()["threads"] diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 9885195..4e9fa50 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -141,6 +141,7 @@ modules = [ "rule34us", "sankaku", "sankakucomplex", + "scrolller", "seiga", "senmanga", "sexcom", diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 72f9195..14598b7 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -171,6 +171,7 @@ class BehanceGalleryExtractor(BehanceExtractor): url = text.extr(page, '<source src="', '"') if text.ext_from_url(url) == "m3u8": url = "ytdl:" + url + module["_ytdl_manifest"] = "hls" module["extension"] = "mp4" append((url, module)) continue diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index 39c5635..a1a488e 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -42,62 +42,76 @@ class BlueskyExtractor(Extractor): self._user = self._user_did = None self.instance = self.root.partition("://")[2] self.videos = self.config("videos", True) + self.quoted = self.config("quoted", False) def items(self): for post in self.posts(): if "post" in post: post = post["post"] - - pid = post["uri"].rpartition("/")[2] if self._user_did and post["author"]["did"] != self._user_did: - self.log.debug("Skipping %s (repost)", pid) - continue - - post.update(post["record"]) - del post["record"] - - if self._metadata_facets: - if "facets" in post: - post["hashtags"] = tags = [] - post["mentions"] = dids = [] - post["uris"] = uris = [] - for facet in post["facets"]: - features = facet["features"][0] - if "tag" in features: - tags.append(features["tag"]) - elif "did" in features: - dids.append(features["did"]) - elif "uri" in features: - uris.append(features["uri"]) - else: - post["hashtags"] = post["mentions"] = post["uris"] = () - - if self._metadata_user: - post["user"] = self._user or post["author"] - - files = self._extract_files(post) - post["instance"] = self.instance - 
post["post_id"] = pid - post["count"] = len(files) - post["date"] = text.parse_datetime( - post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S") - - yield Message.Directory, post - - if not files: + self.log.debug("Skipping %s (repost)", self._pid(post)) continue - - base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob" - "?did={}&cid=".format(post["author"]["did"])) - for post["num"], file in enumerate(files, 1): - post.update(file) - yield Message.Url, base + file["filename"], post + embed = post.get("embed") + post.update(post.pop("record")) + + while True: + self._prepare(post) + files = self._extract_files(post) + + yield Message.Directory, post + if files: + base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob" + "?did={}&cid=".format(post["author"]["did"])) + for post["num"], file in enumerate(files, 1): + post.update(file) + yield Message.Url, base + file["filename"], post + + if not self.quoted or not embed or "record" not in embed: + break + + quote = embed["record"] + if "record" in quote: + quote = quote["record"] + quote["quote_id"] = self._pid(post) + quote["quote_by"] = post["author"] + embed = quote.get("embed") + quote.update(quote.pop("value")) + post = quote def posts(self): return () + def _pid(self, post): + return post["uri"].rpartition("/")[2] + + def _prepare(self, post): + if self._metadata_facets: + if "facets" in post: + post["hashtags"] = tags = [] + post["mentions"] = dids = [] + post["uris"] = uris = [] + for facet in post["facets"]: + features = facet["features"][0] + if "tag" in features: + tags.append(features["tag"]) + elif "did" in features: + dids.append(features["did"]) + elif "uri" in features: + uris.append(features["uri"]) + else: + post["hashtags"] = post["mentions"] = post["uris"] = () + + if self._metadata_user: + post["user"] = self._user or post["author"] + + post["instance"] = self.instance + post["post_id"] = self._pid(post) + post["date"] = text.parse_datetime( + post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S") + def _extract_files(self, post): if "embed" not in post: + post["count"] = 0 return () files = [] @@ -111,6 +125,7 @@ class BlueskyExtractor(Extractor): if "video" in media and self.videos: files.append(self._extract_media(media, "video")) + post["count"] = len(files) return files def _extract_media(self, media, key): diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 9022ffc..6c79d0a 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -8,9 +8,10 @@ """Extractors for https://bunkr.si/""" +from .common import Extractor from .lolisafe import LolisafeAlbumExtractor -from .. import text, config - +from .. 
import text, config, exception +import random if config.get(("extractor", "bunkr"), "tlds"): BASE_PATTERN = ( @@ -21,11 +22,28 @@ else: BASE_PATTERN = ( r"(?:bunkr:(?:https?://)?([^/?#]+)|" r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|[cf]i|pk|ru|la|is|to|a[cx]" + r"\.(?:s[kiu]|[cf]i|p[hks]|ru|la|is|to|a[cx]" r"|black|cat|media|red|site|ws|org)))" ) +DOMAINS = [ + "bunkr.ac", + "bunkr.ci", + "bunkr.fi", + "bunkr.ph", + "bunkr.pk", + "bunkr.ps", + "bunkr.si", + "bunkr.sk", + "bunkr.ws", + "bunkr.black", + "bunkr.red", + "bunkr.media", + "bunkr.site", +] LEGACY_DOMAINS = { + "bunkr.ax", + "bunkr.cat", "bunkr.ru", "bunkrr.ru", "bunkr.su", @@ -34,6 +52,7 @@ LEGACY_DOMAINS = { "bunkr.is", "bunkr.to", } +CF_DOMAINS = set() class BunkrAlbumExtractor(LolisafeAlbumExtractor): @@ -49,45 +68,96 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): if domain not in LEGACY_DOMAINS: self.root = "https://" + domain + def request(self, url, **kwargs): + kwargs["allow_redirects"] = False + + while True: + try: + response = Extractor.request(self, url, **kwargs) + if response.status_code < 300: + return response + + # redirect + url = response.headers["Location"] + root, path = self._split(url) + if root not in CF_DOMAINS: + continue + self.log.debug("Redirect to known CF challenge domain '%s'", + root) + + except exception.HttpError as exc: + if exc.status != 403: + raise + + # CF challenge + root, path = self._split(url) + CF_DOMAINS.add(root) + self.log.debug("Added '%s' to CF challenge domains", root) + + try: + DOMAINS.remove(root.rpartition("/")[2]) + except ValueError: + pass + else: + if not DOMAINS: + raise exception.StopExtraction( + "All Bunkr domains require solving a CF challenge") + + # select alternative domain + root = "https://" + random.choice(DOMAINS) + self.log.debug("Trying '%s' as fallback", root) + url = root + path + def fetch_album(self, album_id): # album metadata page = self.request(self.root + "/a/" + self.album_id).text - info = text.split_html(text.extr( - page, "<h1", "</div>").partition(">")[2]) - count, _, size = info[1].split(None, 2) + title, size = text.split_html(text.extr( + page, "<h1", "</span>").partition(">")[2]) - pos = page.index('class="grid-images') - urls = list(text.extract_iter(page, '<a href="', '"', pos)) - - return self._extract_files(urls), { + items = list(text.extract_iter(page, "<!-- item -->", "<!-- -->")) + return self._extract_files(items), { "album_id" : self.album_id, - "album_name" : text.unescape(info[0]), - "album_size" : size[1:-1], - "count" : len(urls), - "_http_validate": self._validate, + "album_name" : title, + "album_size" : text.extr(size, "(", ")"), + "count" : len(items), } - def _extract_files(self, urls): - for url in urls: + def _extract_files(self, items): + for item in items: try: - url = self._extract_file(text.unescape(url)) + url = text.extr(item, ' href="', '"') + file = self._extract_file(text.unescape(url)) + + info = text.split_html(item) + file["name"] = info[0] + file["size"] = info[2] + file["date"] = text.parse_datetime( + info[-1], "%H:%M:%S %d/%m/%Y") + + yield file + except exception.StopExtraction: + raise except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) - continue - yield {"file": text.unescape(url)} - - def _extract_file(self, url): - page = self.request(url).text - url = (text.extr(page, '<source src="', '"') or - text.extr(page, '<img src="', '"')) - - if not url: - url_download = text.rextract( - page, ' href="', '"', page.rindex("Download"))[0] - page = 
self.request(text.unescape(url_download)).text - url = text.unescape(text.rextract(page, ' href="', '"')[0]) - - return url + self.log.debug("", exc_info=exc) + + def _extract_file(self, webpage_url): + response = self.request(webpage_url) + page = response.text + file_url = (text.extr(page, '<source src="', '"') or + text.extr(page, '<img src="', '"')) + + if not file_url: + webpage_url = text.unescape(text.rextract( + page, ' href="', '"', page.rindex("Download"))[0]) + response = self.request(webpage_url) + file_url = text.rextract(response.text, ' href="', '"')[0] + + return { + "file" : text.unescape(file_url), + "_http_headers" : {"Referer": response.url}, + "_http_validate": self._validate, + } def _validate(self, response): if response.history and response.url.endswith("/maintenance-vid.mp4"): @@ -95,6 +165,10 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): return False return True + def _split(self, url): + pos = url.index("/", 8) + return url[:pos], url[pos:] + class BunkrMediaExtractor(BunkrAlbumExtractor): """Extractor for bunkr.si media links""" @@ -105,16 +179,15 @@ class BunkrMediaExtractor(BunkrAlbumExtractor): def fetch_album(self, album_id): try: - url = self._extract_file(self.root + self.album_id) + file = self._extract_file(self.root + album_id) except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) return (), {} - return ({"file": text.unescape(url)},), { + return (file,), { "album_id" : "", "album_name" : "", "album_size" : -1, "description": "", "count" : 1, - "_http_validate": self._validate, } diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py index 725af3a..0b1e44a 100644 --- a/gallery_dl/extractor/civitai.py +++ b/gallery_dl/extractor/civitai.py @@ -9,7 +9,7 @@ """Extractors for https://www.civitai.com/""" from .common import Extractor, Message -from .. import text, util +from .. import text, util, exception import itertools import time @@ -23,7 +23,7 @@ class CivitaiExtractor(Extractor): root = "https://civitai.com" directory_fmt = ("{category}", "{username|user[username]}", "images") filename_fmt = "{file[id]|id|filename}.{extension}" - archive_fmt = "{file[hash]|hash}" + archive_fmt = "{file[uuid]|uuid}" request_interval = (0.5, 1.5) def _init(self): @@ -101,9 +101,11 @@ class CivitaiExtractor(Extractor): def _url(self, image): url = image["url"] if "/" in url: - parts = url.rsplit("/", 2) - parts[1] = self._image_quality + parts = url.rsplit("/", 3) + image["uuid"] = parts[1] + parts[2] = self._image_quality return "/".join(parts) + image["uuid"] = url name = image.get("name") if not name: @@ -133,8 +135,6 @@ class CivitaiModelExtractor(CivitaiExtractor): directory_fmt = ("{category}", "{user[username]}", "{model[id]}{model[name]:? //}", "{version[id]}{version[name]:? //}") - filename_fmt = "{file[id]}.{extension}" - archive_fmt = "{file[hash]}" pattern = BASE_PATTERN + r"/models/(\d+)(?:/?\?modelVersionId=(\d+))?" 
example = "https://civitai.com/models/12345/TITLE" @@ -195,19 +195,25 @@ class CivitaiModelExtractor(CivitaiExtractor): ) def _extract_files_model(self, model, version, user): - return [ - { + files = [] + + for num, file in enumerate(version["files"], 1): + file["uuid"] = "model-{}-{}-{}".format( + model["id"], version["id"], file["id"]) + files.append({ "num" : num, "file" : file, "filename" : file["name"], "extension": "bin", - "url" : file["downloadUrl"], + "url" : file.get("downloadUrl") or + "{}/api/download/models/{}".format( + self.root, version["id"]), "_http_headers" : { "Authorization": self.api.headers.get("Authorization")}, "_http_validate": self._validate_file_model, - } - for num, file in enumerate(version["files"], 1) - ] + }) + + return files def _extract_files_image(self, model, version, user): if "images" in version: @@ -263,24 +269,14 @@ class CivitaiPostExtractor(CivitaiExtractor): return ({"id": int(self.groups[0])},) -class CivitaiTagModelsExtractor(CivitaiExtractor): - subcategory = "tag-models" - pattern = BASE_PATTERN + r"/(?:tag/|models\?tag=)([^/?&#]+)" +class CivitaiTagExtractor(CivitaiExtractor): + subcategory = "tag" + pattern = BASE_PATTERN + r"/tag/([^/?&#]+)" example = "https://civitai.com/tag/TAG" def models(self): tag = text.unquote(self.groups[0]) - return self.api.models({"tag": tag}) - - -class CivitaiTagImagesExtractor(CivitaiExtractor): - subcategory = "tag-images" - pattern = BASE_PATTERN + r"/images\?tags=([^&#]+)" - example = "https://civitai.com/images?tags=12345" - - def images(self): - tag = text.unquote(self.groups[0]) - return self.api.images({"tag": tag}) + return self.api.models_tag(tag) class CivitaiSearchExtractor(CivitaiExtractor): @@ -293,6 +289,26 @@ class CivitaiSearchExtractor(CivitaiExtractor): return self.api.models(params) +class CivitaiModelsExtractor(CivitaiExtractor): + subcategory = "models" + pattern = BASE_PATTERN + r"/models(?:/?\?([^#]+))?(?:$|#)" + example = "https://civitai.com/models" + + def models(self): + params = text.parse_query(self.groups[0]) + return self.api.models(params) + + +class CivitaiImagesExtractor(CivitaiExtractor): + subcategory = "images" + pattern = BASE_PATTERN + r"/images(?:/?\?([^#]+))?(?:$|#)" + example = "https://civitai.com/images" + + def images(self): + params = text.parse_query(self.groups[0]) + return self.api.images(params) + + class CivitaiUserExtractor(CivitaiExtractor): subcategory = "user" pattern = USER_PATTERN + r"/?(?:$|\?|#)" @@ -339,11 +355,35 @@ class CivitaiUserImagesExtractor(CivitaiExtractor): pattern = USER_PATTERN + r"/images/?(?:\?([^#]+))?" 
example = "https://civitai.com/user/USER/images" + def __init__(self, match): + self.params = text.parse_query_list(match.group(2)) + if self.params.get("section") == "reactions": + self.subcategory = "reactions" + self.images = self.images_reactions + CivitaiExtractor.__init__(self, match) + def images(self): - params = text.parse_query(self.groups[1]) + params = self.params params["username"] = text.unquote(self.groups[0]) return self.api.images(params) + def images_reactions(self): + if "Authorization" not in self.api.headers and \ + not self.cookies.get( + "__Secure-civitai-token", domain=".civitai.com"): + raise exception.AuthorizationError("api-key or cookies required") + + params = self.params + params["authed"] = True + params["useIndex"] = False + if "reactions" in params: + if isinstance(params["reactions"], str): + params["reactions"] = (params["reactions"],) + else: + params["reactions"] = ( + "Like", "Dislike", "Heart", "Laugh", "Cry") + return self.api.images(params) + class CivitaiRestAPI(): """Interface for the Civitai Public REST API @@ -396,6 +436,9 @@ class CivitaiRestAPI(): def models(self, params): return self._pagination("/v1/models", params) + def models_tag(self, tag): + return self.models({"tag": tag}) + def _call(self, endpoint, params=None): if endpoint[0] == "/": url = self.root + endpoint @@ -419,14 +462,14 @@ class CivitaiRestAPI(): class CivitaiTrpcAPI(): - """Interface for the Civitai TRPC API""" + """Interface for the Civitai tRPC API""" def __init__(self, extractor): self.extractor = extractor self.root = extractor.root + "/api/trpc/" self.headers = { "content-type" : "application/json", - "x-client-version": "5.0.146", + "x-client-version": "5.0.185", "x-client-date" : "", "x-client" : "web", "x-fingerprint" : "undefined", @@ -463,6 +506,7 @@ class CivitaiTrpcAPI(): "include" : ["cosmetics"], }) + params = self._type_params(params) return self._pagination(endpoint, params) def images_gallery(self, model, version, user): @@ -516,6 +560,9 @@ class CivitaiTrpcAPI(): return self._pagination(endpoint, params) + def models_tag(self, tag): + return self.models({"tagname": tag}) + def post(self, post_id): endpoint = "post.get" params = {"id": int(post_id)} @@ -580,3 +627,13 @@ class CivitaiTrpcAPI(): def _merge_params(self, params_user, params_default): params_default.update(params_user) return params_default + + def _type_params(self, params): + for key, type in ( + ("tags" , int), + ("modelId" , int), + ("modelVersionId", int), + ): + if key in params: + params[key] = type(params[key]) + return params diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py index 4722a4f..0524239 100644 --- a/gallery_dl/extractor/cohost.py +++ b/gallery_dl/extractor/cohost.py @@ -109,7 +109,7 @@ class CohostUserExtractor(CohostExtractor): "projectHandle": self.groups[0], "page": 0, "options": { - "pinnedPostsAtTop" : bool(self.pinned), + "pinnedPostsAtTop" : True if self.pinned else False, "hideReplies" : not self.replies, "hideShares" : not self.shares, "hideAsks" : not self.asks, diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 32c8e67..2146fa6 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -185,7 +185,9 @@ class Extractor(): self._dump_response(response) if ( code < 400 or - code < 500 and (not fatal and code != 429 or fatal is None) + code < 500 and ( + not fatal and code != 429 or fatal is None) or + fatal is ... 
): if encoding: response.encoding = encoding @@ -454,46 +456,49 @@ class Extractor(): cookies = random.choice(cookies) self.cookies_load(cookies) - def cookies_load(self, cookies): - if isinstance(cookies, dict): - self.cookies_update_dict(cookies, self.cookies_domain) + def cookies_load(self, cookies_source): + if isinstance(cookies_source, dict): + self.cookies_update_dict(cookies_source, self.cookies_domain) - elif isinstance(cookies, str): - path = util.expand_path(cookies) + elif isinstance(cookies_source, str): + path = util.expand_path(cookies_source) try: with open(path) as fp: - util.cookiestxt_load(fp, self.cookies) + cookies = util.cookiestxt_load(fp) except Exception as exc: self.log.warning("cookies: %s", exc) else: - self.log.debug("Loading cookies from '%s'", cookies) + self.log.debug("Loading cookies from '%s'", cookies_source) + set_cookie = self.cookies.set_cookie + for cookie in cookies: + set_cookie(cookie) self.cookies_file = path - elif isinstance(cookies, (list, tuple)): - key = tuple(cookies) - cookiejar = _browser_cookies.get(key) + elif isinstance(cookies_source, (list, tuple)): + key = tuple(cookies_source) + cookies = _browser_cookies.get(key) - if cookiejar is None: + if cookies is None: from ..cookies import load_cookies - cookiejar = self.cookies.__class__() try: - load_cookies(cookiejar, cookies) + cookies = load_cookies(cookies_source) except Exception as exc: self.log.warning("cookies: %s", exc) + cookies = () else: - _browser_cookies[key] = cookiejar + _browser_cookies[key] = cookies else: self.log.debug("Using cached cookies from %s", key) set_cookie = self.cookies.set_cookie - for cookie in cookiejar: + for cookie in cookies: set_cookie(cookie) else: self.log.warning( "Expected 'dict', 'list', or 'str' value for 'cookies' " "option, got '%s' (%s)", - cookies.__class__.__name__, cookies) + cookies_source.__class__.__name__, cookies_source) def cookies_store(self): """Store the session's cookies in a cookies.txt file""" diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 836fae7..693def9 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -401,7 +401,7 @@ class DeviantartExtractor(Extractor): html = content["html"] markup = html["markup"] - if not markup.startswith("{"): + if not markup or markup[0] != "{": return markup if html["type"] == "tiptap": @@ -1301,7 +1301,7 @@ class DeviantartOAuthAPI(): metadata = extractor.config("metadata", False) if not metadata: - metadata = bool(extractor.extra) + metadata = True if extractor.extra else False if metadata: self.metadata = True diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 01af7a4..3e6d537 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -260,9 +260,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "torrentcount" : extr('>Torrent Download (', ')'), } - if data["uploader"].startswith("<"): - data["uploader"] = text.unescape(text.extr( - data["uploader"], ">", "<")) + uploader = data["uploader"] + if uploader and uploader[0] == "<": + data["uploader"] = text.unescape(text.extr(uploader, ">", "<")) f = data["favorites"][0] if f == "N": diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 85dd896..44c4542 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -37,7 +37,7 @@ class FoolfuukaExtractor(BaseExtractor): if not url and "remote_media_link" in media: url = 
self.remote(media) - if url.startswith("/"): + if url and url[0] == "/": url = self.root + url post["filename"], _, post["extension"] = \ diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index 12e8860..72a6453 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -17,42 +17,30 @@ class LensdumpBase(): category = "lensdump" root = "https://lensdump.com" - def nodes(self, page=None): - if page is None: - page = self.request(self.url).text - - # go through all pages starting from the oldest - page_url = text.urljoin(self.root, text.extr( - text.extr(page, ' id="list-most-oldest-link"', '>'), - 'href="', '"')) - while page_url is not None: - if page_url == self.url: - current_page = page - else: - current_page = self.request(page_url).text - - for node in text.extract_iter( - current_page, ' class="list-item ', '>'): - yield node - - # find url of next page - page_url = text.extr( - text.extr(current_page, ' data-pagination="next"', '>'), - 'href="', '"') - if page_url is not None and len(page_url) > 0: - page_url = text.urljoin(self.root, page_url) - else: - page_url = None + def _pagination(self, page, begin, end): + while True: + yield from text.extract_iter(page, begin, end) + + next = text.extr(page, ' data-pagination="next"', '>') + if not next: + return + + url = text.urljoin(self.root, text.extr(next, 'href="', '"')) + page = self.request(url).text class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): subcategory = "album" - pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))" + pattern = BASE_PATTERN + r"/a/(\w+)(?:/?\?([^#]+))?" example = "https://lensdump.com/a/ID" def __init__(self, match): - GalleryExtractor.__init__(self, match, match.string) - self.gallery_id = match.group(1) or match.group(2) + self.gallery_id, query = match.groups() + if query: + url = "{}/a/{}/?{}".format(self.root, self.gallery_id, query) + else: + url = "{}/a/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) def metadata(self, page): return { @@ -62,40 +50,48 @@ class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): } def images(self, page): - for node in self.nodes(page): - # get urls and filenames of images in current page - json_data = util.json_loads(text.unquote( - text.extr(node, "data-object='", "'") or - text.extr(node, 'data-object="', '"'))) - image_id = json_data.get('name') - image_url = json_data.get('url') - image_title = json_data.get('title') + for image in self._pagination(page, ' class="list-item ', '>'): + + data = util.json_loads(text.unquote( + text.extr(image, "data-object='", "'") or + text.extr(image, 'data-object="', '"'))) + image_id = data.get("name") + image_url = data.get("url") + image_title = data.get("title") if image_title is not None: image_title = text.unescape(image_title) + yield (image_url, { - 'id': image_id, - 'url': image_url, - 'title': image_title, - 'name': json_data.get('filename'), - 'filename': image_id, - 'extension': json_data.get('extension'), - 'height': text.parse_int(json_data.get('height')), - 'width': text.parse_int(json_data.get('width')), + "id" : image_id, + "url" : image_url, + "title" : image_title, + "name" : data.get("filename"), + "filename" : image_id, + "extension": data.get("extension"), + "width" : text.parse_int(data.get("width")), + "height" : text.parse_int(data.get("height")), }) class LensdumpAlbumsExtractor(LensdumpBase, Extractor): """Extractor for album list from lensdump.com""" subcategory = "albums" - 
pattern = BASE_PATTERN + r"/\w+/albums" - example = "https://lensdump.com/USER/albums" + pattern = BASE_PATTERN + r"/(?![ai]/)([^/?#]+)(?:/?\?([^#]+))?" + example = "https://lensdump.com/USER" def items(self): - for node in self.nodes(): - album_url = text.urljoin(self.root, text.extr( - node, 'data-url-short="', '"')) - yield Message.Queue, album_url, { - "_extractor": LensdumpAlbumExtractor} + user, query = self.groups + url = "{}/{}/".format(self.root, user) + if query: + params = text.parse_query(query) + else: + params = {"sort": "date_asc", "page": "1"} + page = self.request(url, params=params).text + + data = {"_extractor": LensdumpAlbumExtractor} + for album_path in self._pagination(page, 'data-url-short="', '"'): + album_url = text.urljoin(self.root, album_path) + yield Message.Queue, album_url, data class LensdumpImageExtractor(LensdumpBase, Extractor): @@ -107,16 +103,13 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)" example = "https://lensdump.com/i/ID" - def __init__(self, match): - Extractor.__init__(self, match) - self.key = match.group(1) - def items(self): - url = "{}/i/{}".format(self.root, self.key) + key = self.groups[0] + url = "{}/i/{}".format(self.root, key) extr = text.extract_from(self.request(url).text) data = { - "id" : self.key, + "id" : key, "title" : text.unescape(extr( 'property="og:title" content="', '"')), "url" : extr( diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 6fc0689..044f4f5 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -47,7 +47,15 @@ class LolisafeAlbumExtractor(LolisafeExtractor): url = file["file"] file.update(data) text.nameext_from_url(url, file) - file["name"], sep, file["id"] = file["filename"].rpartition("-") + + if "name" in file: + name = file["name"] + file["name"] = name.rpartition(".")[0] or name + file["id"] = file["filename"].rpartition("-")[2] + else: + file["name"], sep, file["id"] = \ + file["filename"].rpartition("-") + yield Message.Url, url, file def fetch_album(self, album_id): diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index bca7e4d..1f24593 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -174,6 +174,20 @@ class MangadexListExtractor(MangadexExtractor): yield Message.Queue, url, data +class MangadexAuthorExtractor(MangadexExtractor): + """Extractor for mangadex authors""" + subcategory = "author" + pattern = BASE_PATTERN + r"/author/([0-9a-f-]+)" + example = ("https://mangadex.org/author" + "/01234567-89ab-cdef-0123-456789abcdef/NAME") + + def items(self): + for manga in self.api.manga_author(self.uuid): + manga["_extractor"] = MangadexMangaExtractor + url = "{}/title/{}".format(self.root, manga["id"]) + yield Message.Queue, url, manga + + class MangadexAPI(): """Interface for the MangaDex API v5 @@ -195,6 +209,10 @@ class MangadexAPI(): def athome_server(self, uuid): return self._call("/at-home/server/" + uuid) + def author(self, uuid, manga=False): + params = {"includes[]": ("manga",)} if manga else None + return self._call("/author/" + uuid, params)["data"] + def chapter(self, uuid): params = {"includes[]": ("scanlation_group",)} return self._call("/chapter/" + uuid, params)["data"] @@ -210,6 +228,10 @@ class MangadexAPI(): params = {"includes[]": ("artist", "author")} return self._call("/manga/" + uuid, params)["data"] + def manga_author(self, uuid_author): + params = 
{"authorOrArtist": uuid_author} + return self._pagination("/manga", params) + def manga_feed(self, uuid): order = "desc" if self.extractor.config("chapter-reverse") else "asc" params = { diff --git a/gallery_dl/extractor/mangakakalot.py b/gallery_dl/extractor/mangakakalot.py index 0183b25..9fc8681 100644 --- a/gallery_dl/extractor/mangakakalot.py +++ b/gallery_dl/extractor/mangakakalot.py @@ -19,7 +19,7 @@ BASE_PATTERN = r"(?:https?://)?(?:ww[\dw]?\.)?mangakakalot\.tv" class MangakakalotBase(): """Base class for mangakakalot extractors""" category = "mangakakalot" - root = "https://ww6.mangakakalot.tv" + root = "https://ww8.mangakakalot.tv" class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): @@ -40,7 +40,7 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): match = re.match( r"(?:[Vv]ol\. *(\d+) )?" r"[Cc]hapter *([^:]*)" - r"(?:: *(.+))?", info) + r"(?:: *(.+))?", info or "") volume, chapter, title = match.groups() if match else ("", "", info) chapter, sep, minor = chapter.partition(".") @@ -86,7 +86,7 @@ class MangakakalotMangaExtractor(MangakakalotBase, MangaExtractor): data["chapter"] = text.parse_int(chapter) data["chapter_minor"] = sep + minor - if url.startswith("/"): + if url[0] == "/": url = self.root + url results.append((url, data.copy())) return results diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 2928573..61ffdee 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -14,6 +14,9 @@ from ..cache import cache import itertools import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?newgrounds\.com" +USER_PATTERN = r"(?:https?://)?([\w-]+)\.newgrounds\.com" + class NewgroundsExtractor(Extractor): """Base class for newgrounds extractors""" @@ -93,7 +96,7 @@ class NewgroundsExtractor(Extractor): def posts(self): """Return URLs of all relevant post pages""" - return self._pagination(self._path) + return self._pagination(self._path, self.groups[1]) def metadata(self): """Return general metadata""" @@ -334,10 +337,10 @@ class NewgroundsExtractor(Extractor): for fmt in formats: yield fmt[1][0]["src"] - def _pagination(self, kind): + def _pagination(self, kind, pnum=1): url = "{}/{}".format(self.user_root, kind) params = { - "page": 1, + "page": text.parse_int(pnum, 1), "isAjaxRequest": "1", } headers = { @@ -400,8 +403,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): class NewgroundsMediaExtractor(NewgroundsExtractor): """Extractor for a media file from newgrounds.com""" subcategory = "media" - pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" - r"(/(?:portal/view|audio/listen)/\d+)") + pattern = BASE_PATTERN + r"(/(?:portal/view|audio/listen)/\d+)" example = "https://www.newgrounds.com/portal/view/12345" def __init__(self, match): @@ -416,35 +418,35 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): class NewgroundsArtExtractor(NewgroundsExtractor): """Extractor for all images of a newgrounds user""" subcategory = _path = "art" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/art/?$" + pattern = USER_PATTERN + r"/art(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/art" class NewgroundsAudioExtractor(NewgroundsExtractor): """Extractor for all audio submissions of a newgrounds user""" subcategory = _path = "audio" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/audio/?$" + pattern = USER_PATTERN + r"/audio(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/audio" class 
NewgroundsMoviesExtractor(NewgroundsExtractor): """Extractor for all movies of a newgrounds user""" subcategory = _path = "movies" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/movies/?$" + pattern = USER_PATTERN + r"/movies(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/movies" class NewgroundsGamesExtractor(NewgroundsExtractor): """Extractor for a newgrounds user's games""" subcategory = _path = "games" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/games/?$" + pattern = USER_PATTERN + r"/games(?:(?:/page/|/?\?page=)(\d+))?/?$" example = "https://USER.newgrounds.com/games" class NewgroundsUserExtractor(NewgroundsExtractor): """Extractor for a newgrounds user profile""" subcategory = "user" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/?$" + pattern = USER_PATTERN + r"/?$" example = "https://USER.newgrounds.com" def initialize(self): @@ -464,25 +466,22 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): """Extractor for posts favorited by a newgrounds user""" subcategory = "favorite" directory_fmt = ("{category}", "{user}", "Favorites") - pattern = (r"(?:https?://)?([\w-]+)\.newgrounds\.com" - r"/favorites(?!/following)(?:/(art|audio|movies))?/?") + pattern = (USER_PATTERN + r"/favorites(?!/following)(?:/(art|audio|movies)" + r"(?:(?:/page/|/?\?page=)(\d+))?)?") example = "https://USER.newgrounds.com/favorites" - def __init__(self, match): - NewgroundsExtractor.__init__(self, match) - self.kind = match.group(2) - def posts(self): - if self.kind: - return self._pagination(self.kind) + _, kind, pnum = self.groups + if kind: + return self._pagination_favorites(kind, pnum) return itertools.chain.from_iterable( - self._pagination(k) for k in ("art", "audio", "movies") + self._pagination_favorites(k) for k in ("art", "audio", "movies") ) - def _pagination(self, kind): + def _pagination_favorites(self, kind, pnum=1): url = "{}/favorites/{}".format(self.user_root, kind) params = { - "page": 1, + "page": text.parse_int(pnum, 1), "isAjaxRequest": "1", } headers = { @@ -514,12 +513,13 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor): """Extractor for a newgrounds user's favorited users""" subcategory = "following" - pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/favorites/(following)" + pattern = USER_PATTERN + r"/favorites/(following)" example = "https://USER.newgrounds.com/favorites/following" def items(self): + _, kind, pnum = self.groups data = {"_extractor": NewgroundsUserExtractor} - for url in self._pagination(self.kind): + for url in self._pagination_favorites(kind, pnum): yield Message.Queue, url, data @staticmethod @@ -534,13 +534,12 @@ class NewgroundsSearchExtractor(NewgroundsExtractor): """Extractor for newgrounds.com search reesults""" subcategory = "search" directory_fmt = ("{category}", "search", "{search_tags}") - pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" - r"/search/conduct/([^/?#]+)/?\?([^#]+)") + pattern = BASE_PATTERN + r"/search/conduct/([^/?#]+)/?\?([^#]+)" example = "https://www.newgrounds.com/search/conduct/art?terms=QUERY" def __init__(self, match): NewgroundsExtractor.__init__(self, match) - self._path, query = match.groups() + self._path, query = self.groups self.query = text.parse_query(query) def posts(self): @@ -550,19 +549,20 @@ class NewgroundsSearchExtractor(NewgroundsExtractor): for s in suitabilities.split(",")} self.request(self.root + "/suitabilities", method="POST", data=data) - return 
self._pagination("/search/conduct/" + self._path, self.query) + return self._pagination_search( + "/search/conduct/" + self._path, self.query) def metadata(self): return {"search_tags": self.query.get("terms", "")} - def _pagination(self, path, params): + def _pagination_search(self, path, params): url = self.root + path + params["inner"] = "1" + params["page"] = text.parse_int(params.get("page"), 1) headers = { "Accept": "application/json, text/javascript, */*; q=0.01", "X-Requested-With": "XMLHttpRequest", } - params["inner"] = "1" - params["page"] = 1 while True: data = self.request(url, params=params, headers=headers).json() diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 8c7ffe5..851f663 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -63,7 +63,8 @@ class NozomiExtractor(Extractor): yield Message.Directory, post for post["num"], image in enumerate(images, 1): post["filename"] = post["dataid"] = did = image["dataid"] - post["is_video"] = video = bool(image.get("is_video")) + post["is_video"] = video = \ + True if image.get("is_video") else False ext = image["type"] if video: diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index d47ffa2..0b64ea3 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -56,6 +56,7 @@ class PatreonExtractor(Extractor): text.nameext_from_url(name, post) if text.ext_from_url(url) == "m3u8": url = "ytdl:" + url + post["_ytdl_manifest"] = "hls" post["extension"] = "mp4" yield Message.Url, url, post else: @@ -310,7 +311,7 @@ class PatreonCreatorExtractor(PatreonExtractor): subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" - r"([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") + r"(?:c/)?([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?") example = "https://www.patreon.com/USER" def posts(self): @@ -340,9 +341,9 @@ class PatreonCreatorExtractor(PatreonExtractor): user_id = query.get("u") if user_id: - url = "{}/user/posts?u={}".format(self.root, user_id) + url = "{}/user?u={}".format(self.root, user_id) else: - url = "{}/{}/posts".format(self.root, creator) + url = "{}/{}".format(self.root, creator) page = self.request(url, notfound="creator").text try: diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 8c04ed5..499c579 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -18,8 +18,8 @@ BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+" class PinterestExtractor(Extractor): """Base class for pinterest extractors""" category = "pinterest" - filename_fmt = "{category}_{id}{media_id:?_//}.{extension}" - archive_fmt = "{id}{media_id}" + filename_fmt = "{category}_{id}{media_id|page_id:?_//}.{extension}" + archive_fmt = "{id}{media_id|page_id}" root = "https://www.pinterest.com" def _init(self): @@ -30,12 +30,12 @@ class PinterestExtractor(Extractor): self.root = text.ensure_http_scheme(domain) self.api = PinterestAPI(self) + self.stories = self.config("stories", True) + self.videos = self.config("videos", True) def items(self): data = self.metadata() - videos = self.config("videos", True) - yield Message.Directory, data for pin in self.pins(): if isinstance(pin, tuple): @@ -43,40 +43,35 @@ class PinterestExtractor(Extractor): yield Message.Queue, url, data continue + try: + files = self._extract_files(pin) + except Exception as exc: + self.log.debug("", exc_info=exc) + self.log.warning( + "%s: Error when 
extracting download URLs (%s: %s)", + pin.get("id"), exc.__class__.__name__, exc) + continue + pin.update(data) + pin["count"] = len(files) - carousel_data = pin.get("carousel_data") - if carousel_data: - pin["count"] = len(carousel_data["carousel_slots"]) - for num, slot in enumerate(carousel_data["carousel_slots"], 1): - slot["media_id"] = slot.pop("id") - pin.update(slot) - pin["num"] = num - size, image = next(iter(slot["images"].items())) - url = image["url"].replace("/" + size + "/", "/originals/") - yield Message.Url, url, text.nameext_from_url(url, pin) - - else: - try: - media = self._media_from_pin(pin) - except Exception: - self.log.debug("Unable to fetch download URL for pin %s", - pin.get("id")) - continue + yield Message.Directory, pin + for pin["num"], file in enumerate(files, 1): + url = file["url"] + text.nameext_from_url(url, pin) + pin.update(file) - if videos or media.get("duration") is None: - pin.update(media) - pin["num"] = pin["count"] = 1 + if "media_id" not in file: pin["media_id"] = "" + if "page_id" not in file: + pin["page_id"] = "" - url = media["url"] - text.nameext_from_url(url, pin) + if pin["extension"] == "m3u8": + url = "ytdl:" + url + pin["_ytdl_manifest"] = "hls" + pin["extension"] = "mp4" - if pin["extension"] == "m3u8": - url = "ytdl:" + url - pin["extension"] = "mp4" - - yield Message.Url, url, pin + yield Message.Url, url, pin def metadata(self): """Return general metadata""" @@ -84,26 +79,108 @@ class PinterestExtractor(Extractor): def pins(self): """Return all relevant pin objects""" - @staticmethod - def _media_from_pin(pin): + def _extract_files(self, pin): + story_pin_data = pin.get("story_pin_data") + if story_pin_data and self.stories: + return self._extract_story(pin, story_pin_data) + + carousel_data = pin.get("carousel_data") + if carousel_data: + return self._extract_carousel(pin, carousel_data) + videos = pin.get("videos") - if videos: - video_formats = videos["video_list"] + if videos and self.videos: + return (self._extract_video(videos),) - for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"): - if fmt in video_formats: - media = video_formats[fmt] - break - else: - media = max(video_formats.values(), - key=lambda x: x.get("width", 0)) + try: + return (pin["images"]["orig"],) + except Exception: + self.log.debug("%s: No files found", pin.get("id")) + return () + + def _extract_story(self, pin, story): + files = [] + story_id = story.get("id") + + for page in story["pages"]: + page_id = page.get("id") + + for block in page["blocks"]: + type = block.get("type") + + if type == "story_pin_image_block": + if 1 == len(page["blocks"]) == len(story["pages"]): + try: + media = pin["images"]["orig"] + except Exception: + media = self._extract_image(page, block) + else: + media = self._extract_image(page, block) + + elif type == "story_pin_video_block": + video = block["video"] + media = self._extract_video(video) + media["media_id"] = video.get("id") or "" + + elif type == "story_pin_paragraph_block": + media = {"url": "text:" + block["text"], + "extension": "txt", + "media_id": block.get("id")} + + else: + self.log.warning("%s: Unsupported story block '%s'", + pin.get("id"), type) + continue - if "V_720P" in video_formats: - media["_fallback"] = (video_formats["V_720P"]["url"],) + media["story_id"] = story_id + media["page_id"] = page_id + files.append(media) + + return files + + def _extract_carousel(self, pin, carousel_data): + files = [] + for slot in carousel_data["carousel_slots"]: + size, image = next(iter(slot["images"].items())) + 
slot["media_id"] = slot.pop("id") + slot["url"] = image["url"].replace( + "/" + size + "/", "/originals/", 1) + files.append(slot) + return files + + def _extract_image(self, page, block): + sig = block.get("image_signature") or page["image_signature"] + url_base = "https://i.pinimg.com/originals/{}/{}/{}/{}.".format( + sig[0:2], sig[2:4], sig[4:6], sig) + url_jpg = url_base + "jpg" + url_png = url_base + "png" + url_webp = url_base + "webp" - return media + try: + media = block["image"]["images"]["originals"] + except Exception: + media = {"url": url_jpg, "_fallback": (url_png, url_webp,)} - return pin["images"]["orig"] + if media["url"] == url_jpg: + media["_fallback"] = (url_png, url_webp,) + else: + media["_fallback"] = (url_jpg, url_png, url_webp,) + media["media_id"] = sig + + return media + + def _extract_video(self, video): + video_formats = video["video_list"] + for fmt in ("V_HLSV4", "V_HLSV3_WEB", "V_HLSV3_MOBILE"): + if fmt in video_formats: + media = video_formats[fmt] + break + else: + media = max(video_formats.values(), + key=lambda x: x.get("width", 0)) + if "V_720P" in video_formats: + media["_fallback"] = (video_formats["V_720P"]["url"],) + return media class PinterestPinExtractor(PinterestExtractor): diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index c2d1243..8c6e6d8 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -38,6 +38,7 @@ class PixivExtractor(Extractor): self.meta_user = self.config("metadata") self.meta_bookmark = self.config("metadata-bookmark") self.meta_comments = self.config("comments") + self.meta_captions = self.config("captions") def items(self): tags = self.config("tags", "japanese") @@ -76,8 +77,8 @@ class PixivExtractor(Extractor): detail = self.api.illust_bookmark_detail(work["id"]) work["tags_bookmark"] = [tag["name"] for tag in detail["tags"] if tag["is_registered"]] - if self.sanity_workaround and not work.get("caption") and \ - not work.get("_mypixiv"): + if self.meta_captions and not work.get("caption") and \ + not work.get("_mypixiv") and not work.get("_ajax"): body = self._request_ajax("/illust/" + str(work["id"])) if body: work["caption"] = text.unescape(body["illustComment"]) @@ -108,10 +109,10 @@ class PixivExtractor(Extractor): if self.load_ugoira: try: return self._extract_ugoira(work) - except exception.StopExtraction as exc: + except Exception as exc: self.log.warning( - "Unable to retrieve Ugoira metatdata (%s - %s)", - work["id"], exc.message) + "%s: Unable to retrieve Ugoira metatdata (%s - %s)", + work["id"], exc.__class__.__name__, exc) elif work["page_count"] == 1: url = meta_single_page["original_image_url"] @@ -186,6 +187,7 @@ class PixivExtractor(Extractor): return None def _extract_ajax(self, work, body): + work["_ajax"] = True url = self._extract_ajax_url(body) if not url: return () @@ -243,12 +245,12 @@ class PixivExtractor(Extractor): original = body["urls"]["original"] if original: return original - except KeyError: + except Exception: pass try: square1200 = body["userIllusts"][body["id"]]["url"] - except KeyError: + except Exception: return parts = square1200.rpartition("_p0")[0].split("/") del parts[3:5] @@ -293,9 +295,6 @@ class PixivExtractor(Extractor): "x_restrict" : 0, } - def _web_to_mobile(self, work): - return work - def works(self): """Return an iterable containing all relevant 'work' objects""" @@ -334,15 +333,17 @@ class PixivUserExtractor(PixivExtractor): class PixivArtworksExtractor(PixivExtractor): """Extractor for artworks of a pixiv user""" 
subcategory = "artworks" + _warning = True pattern = (BASE_PATTERN + r"/(?:" r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" r"(?:/([^/?#]+))?/?(?:$|[?#])" r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") example = "https://www.pixiv.net/en/users/12345/artworks" - def __init__(self, match): - PixivExtractor.__init__(self, match) - u1, t1, u2, t2 = match.groups() + def _init(self): + PixivExtractor._init(self) + + u1, t1, u2, t2 = self.groups if t1: t1 = text.unquote(t1) elif t2: @@ -350,6 +351,14 @@ class PixivArtworksExtractor(PixivExtractor): self.user_id = u1 or u2 self.tag = t1 or t2 + if self.sanity_workaround: + self.cookies_domain = d = ".pixiv.net" + self._init_cookies() + if self._warning and not self.cookies.get("PHPSESSID", domain=d): + PixivArtworksExtractor._warning = False + self.log.warning("No 'PHPSESSID' cookie set. Can detect only " + "non R-18 'sanity_level' works.") + def metadata(self): if self.config("metadata"): self.api.user_detail(self.user_id) @@ -358,6 +367,19 @@ class PixivArtworksExtractor(PixivExtractor): def works(self): works = self.api.user_illusts(self.user_id) + if self.sanity_workaround: + body = self._request_ajax( + "/user/{}/profile/all".format(self.user_id)) + try: + ajax_ids = list(map(int, body["illusts"])) + ajax_ids.extend(map(int, body["manga"])) + ajax_ids.sort() + except Exception as exc: + self.log.warning("Unable to collect artwork IDs using AJAX " + "API (%s: %s)", exc.__class__.__name__, exc) + else: + works = self._extend_sanity(works, ajax_ids) + if self.tag: tag = self.tag.lower() works = ( @@ -367,6 +389,35 @@ class PixivArtworksExtractor(PixivExtractor): return works + def _extend_sanity(self, works, ajax_ids): + user = {"id": 1} + index = len(ajax_ids) - 1 + + for work in works: + while index >= 0: + work_id = work["id"] + ajax_id = ajax_ids[index] + + if ajax_id == work_id: + index -= 1 + break + + elif ajax_id > work_id: + index -= 1 + self.log.debug("Inserting work %s", ajax_id) + yield self._make_work(ajax_id, self.sanity_url, user) + + else: # ajax_id < work_id + break + + yield work + + while index >= 0: + ajax_id = ajax_ids[index] + self.log.debug("Inserting work %s", ajax_id) + yield self._make_work(ajax_id, self.sanity_url, user) + index -= 1 + class PixivAvatarExtractor(PixivExtractor): """Extractor for pixiv avatars""" diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py index 29b351b..8877175 100644 --- a/gallery_dl/extractor/postmill.py +++ b/gallery_dl/extractor/postmill.py @@ -50,7 +50,7 @@ class PostmillExtractor(BaseExtractor): forum = match.group(1) id = int(match.group(2)) - is_text_post = url.startswith("/") + is_text_post = (url[0] == "/") is_image_post = self._search_image_tag(page) is not None data = { "title": title, diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index ce602f6..8577e74 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -31,6 +31,7 @@ class RedditExtractor(Extractor): parentdir = self.config("parent-directory") max_depth = self.config("recursion", 0) previews = self.config("previews", True) + embeds = self.config("embeds", True) videos = self.config("videos", True) if videos: @@ -100,7 +101,7 @@ class RedditExtractor(Extractor): for comment in comments: html = comment["body_html"] or "" href = (' href="' in html) - media = ("media_metadata" in comment) + media = (embeds and "media_metadata" in comment) if media or href: comment["date"] = text.parse_timestamp( @@ -211,8 +212,9 @@ class 
RedditExtractor(Extractor): def _extract_video_dash(self, submission): submission["_ytdl_extra"] = {"title": submission["title"]} try: - return (submission["secure_media"]["reddit_video"]["dash_url"] + - "#__youtubedl_smuggle=%7B%22to_generic%22%3A+1%7D") + url = submission["secure_media"]["reddit_video"]["dash_url"] + submission["_ytdl_manifest"] = "dash" + return url except Exception: return submission["url"] diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py new file mode 100644 index 0000000..9f9f0c4 --- /dev/null +++ b/gallery_dl/extractor/scrolller.py @@ -0,0 +1,227 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://scrolller.com/""" + +from .common import Extractor, Message +from .. import text, util, exception +from ..cache import cache + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?scrolller\.com" + + +class ScrolllerExtractor(Extractor): + """Base class for scrolller extractors""" + category = "scrolller" + root = "https://scrolller.com" + directory_fmt = ("{category}", "{subredditTitle}") + filename_fmt = "{id}{title:? //}.{extension}" + archive_fmt = "{id}" + request_interval = (0.5, 1.5) + + def _init(self): + self.auth_token = None + + def items(self): + self.login() + + for post in self.posts(): + + src = max(post["mediaSources"], key=self._sort_key) + post.update(src) + url = src["url"] + text.nameext_from_url(url, post) + + yield Message.Directory, post + yield Message.Url, url, post + + def posts(self): + return () + + def login(self): + username, password = self._get_auth_info() + if username: + self.auth_token = self._login_impl(username, password) + + @cache(maxage=28*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + variables = { + "username": username, + "password": password, + } + + try: + data = self._request_graphql("LoginQuery", variables) + except exception.HttpError as exc: + if exc.status == 403: + raise exception.AuthenticationError() + raise + + return data["login"]["token"] + + def _request_graphql(self, opname, variables): + url = "https://api.scrolller.com/api/v2/graphql" + headers = { + "Content-Type" : "text/plain;charset=UTF-8", + "Origin" : self.root, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", + } + data = { + "query" : QUERIES[opname], + "variables" : variables, + "authorization": self.auth_token, + } + return self.request( + url, method="POST", headers=headers, data=util.json_dumps(data), + ).json()["data"] + + def _pagination(self, opname, variables): + while True: + data = self._request_graphql(opname, variables) + + while "items" not in data: + data = data.popitem()[1] + yield from data["items"] + + if not data["iterator"]: + return + variables["iterator"] = data["iterator"] + + def _sort_key(self, src): + return src["width"], not src["isOptimized"] + + +class ScrolllerSubredditExtractor(ScrolllerExtractor): + """Extractor for media from a scrolller subreddit""" + subcategory = "subreddit" + pattern = BASE_PATTERN + r"(/r/[^/?#]+)(?:/?\?([^#]+))?" 
+ example = "https://scrolller.com/r/SUBREDDIT" + + def posts(self): + url, query = self.groups + filter = None + + if query: + params = text.parse_query(query) + if "filter" in params: + filter = params["filter"].upper().rstrip("S") + + variables = { + "url" : url, + "iterator" : None, + "filter" : filter, + "hostsDown": None, + } + return self._pagination("SubredditQuery", variables) + + +class ScrolllerFollowingExtractor(ScrolllerExtractor): + """Extractor for followed scrolller subreddits""" + subcategory = "following" + pattern = BASE_PATTERN + r"/following" + example = "https://scrolller.com/following" + + def items(self): + self.login() + + if not self.auth_token: + raise exception.AuthorizationError("Login required") + + variables = { + "iterator" : None, + "hostsDown": None, + } + + for subreddit in self._pagination("FollowingQuery", variables): + url = self.root + subreddit["url"] + subreddit["_extractor"] = ScrolllerSubredditExtractor + yield Message.Queue, url, subreddit + + +class ScrolllerPostExtractor(ScrolllerExtractor): + """Extractor for media from a single scrolller post""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(?!r/|following$)([^/?#]+)" + example = "https://scrolller.com/title-slug-a1b2c3d4f5" + + def posts(self): + url = "{}/{}".format(self.root, self.groups[0]) + page = self.request(url).text + data = util.json_loads(text.extr( + page, '<script>window.scrolllerConfig="', '"</script>') + .replace('\\"', '"')) + return (data["item"],) + + +QUERIES = { + + "SubredditQuery": """\ +query SubredditQuery( + $url: String! + $filter: SubredditPostFilter + $iterator: String +) { + getSubreddit( + url: $url + ) { + children( + limit: 50 + iterator: $iterator + filter: $filter + disabledHosts: null + ) { + iterator items { + __typename id url title subredditId subredditTitle + subredditUrl redditPath isNsfw albumUrl hasAudio + fullLengthSource gfycatSource redgifsSource ownerAvatar + username displayName isPaid tags isFavorite + mediaSources { url width height isOptimized } + blurredMediaSources { url width height isOptimized } + } + } + } +} +""", + + "FollowingQuery": """\ +query FollowingQuery( + $iterator: String +) { + getFollowing( + limit: 10 + iterator: $iterator + ) { + iterator items { + __typename id url title secondaryTitle description createdAt isNsfw + subscribers isComplete itemCount videoCount pictureCount albumCount + isPaid username tags isFollowing + banner { url width height isOptimized } + } + } +} +""", + + "LoginQuery": """\ +query LoginQuery( + $username: String!, + $password: String! 
+) { + login( + username: $username, + password: $password + ) { + username token expiresAt isAdmin status isPremium + } +} +""", + +} diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py index dd5988f..468840b 100644 --- a/gallery_dl/extractor/telegraph.py +++ b/gallery_dl/extractor/telegraph.py @@ -49,7 +49,7 @@ class TelegraphGalleryExtractor(GalleryExtractor): url, pos = text.extract(figure, 'src="', '"') if url.startswith("/embed/"): continue - elif url.startswith("/"): + elif url[0] == "/": url = self.root + url caption, pos = text.extract(figure, "<figcaption>", "<", pos) num += 1 diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index bce661a..b196aeb 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -148,8 +148,10 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor): data["PageNumber"] += 1 def _parse(self, query): + if not query: + return {} try: - if query.startswith("?"): + if query[0] == "?": return self._parse_simple(query) return self._parse_jsurl(query) except Exception as exc: @@ -187,8 +189,6 @@ class TsuminoSearchExtractor(TsuminoBase, Extractor): Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill)) Ref: https://github.com/Sage/jsurl """ - if not data: - return {} i = 0 imax = len(data) diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py index b21709a..f7ce44b 100644 --- a/gallery_dl/extractor/urlgalleries.py +++ b/gallery_dl/extractor/urlgalleries.py @@ -7,7 +7,7 @@ """Extractors for https://urlgalleries.net/""" from .common import GalleryExtractor, Message -from .. import text +from .. import text, exception class UrlgalleriesGalleryExtractor(GalleryExtractor): @@ -16,27 +16,31 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor): root = "urlgalleries.net" request_interval = (0.5, 1.0) pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)" - example = "https://blog.urlgalleries.net/gallery-12345/TITLE" + example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE" - def __init__(self, match): - self.blog, self.gallery_id = match.groups() + def items(self): + blog, self.gallery_id = self.groups url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format( - self.blog, self.gallery_id) - GalleryExtractor.__init__(self, match, url) + blog, self.gallery_id) + + with self.request(url, allow_redirects=False, fatal=...) 
as response: + if 300 <= response.status_code < 500: + if response.headers.get("location", "").endswith( + "/not_found_adult.php"): + raise exception.NotFoundError("gallery") + raise exception.HttpError(None, response) + page = response.text - def items(self): - page = self.request(self.gallery_url).text imgs = self.images(page) data = self.metadata(page) data["count"] = len(imgs) - del page - root = "https://{}.urlgalleries.net".format(self.blog) + root = "https://{}.urlgalleries.net".format(blog) yield Message.Directory, data for data["num"], img in enumerate(imgs, 1): - response = self.request( - root + img, method="HEAD", allow_redirects=False) - yield Message.Queue, response.headers["Location"], data + page = self.request(root + img).text + url = text.extr(page, "window.location.href = '", "'") + yield Message.Queue, url, data def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 95eeafe..ea034a7 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -24,6 +24,13 @@ class VkExtractor(Extractor): root = "https://vk.com" request_interval = (0.5, 1.5) + def _init(self): + self.offset = text.parse_int(self.config("offset")) + + def skip(self, num): + self.offset += num + return num + def items(self): sub = re.compile(r"/imp[fg]/").sub sizes = "wzyxrqpo" @@ -75,7 +82,7 @@ class VkExtractor(Extractor): "al" : "1", "direction": "1", "list" : photos_id, - "offset" : 0, + "offset" : self.offset, } while True: diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index 116f557..4eae537 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -193,7 +193,10 @@ class WikimediaArticleExtractor(WikimediaExtractor): def __init__(self, match): WikimediaExtractor.__init__(self, match) - path = match.group(match.lastindex) + path = self.groups[-1] + if path[2] == "/": + self.root = self.root + "/" + path[:2] + path = path[3:] if path.startswith("wiki/"): path = path[5:] diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 4affd55..30801ee 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -347,6 +347,9 @@ class DownloadJob(Job): self.status |= 4 self.log.error("Failed to download %s", pathfmt.filename or url) + if "error" in hooks: + for callback in hooks["error"]: + callback(pathfmt) return if not pathfmt.temppath: @@ -433,7 +436,8 @@ class DownloadJob(Job): if status: self.status |= status - if "_fallback" in kwdict and self.fallback: + if (status & 95 and # not FormatError or OSError + "_fallback" in kwdict and self.fallback): fallback = kwdict["_fallback"] = \ iter(kwdict["_fallback"]) try: diff --git a/gallery_dl/option.py b/gallery_dl/option.py index c4f5b94..b38ad74 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -10,6 +10,7 @@ import argparse import logging +import os.path import sys from . 
import job, util, version @@ -152,6 +153,49 @@ class UgoiraAction(argparse.Action): namespace.postprocessors.append(pp) +class PrintAction(argparse.Action): + def __call__(self, parser, namespace, value, option_string=None): + if self.const: + filename = self.const + base = None + mode = "w" + else: + value, path = value + base, filename = os.path.split(path) + mode = "a" + + event, sep, format_string = value.partition(":") + if not sep: + format_string = event + event = ("prepare",) + else: + event = event.strip().lower() + if event not in {"init", "file", "after", "skip", "error", + "prepare", "prepare-after", "post", "post-after", + "finalize", "finalize-success", "finalize-error"}: + format_string = value + event = ("prepare",) + + if not format_string: + return + + if "{" not in format_string and \ + " " not in format_string and \ + format_string[0] != "\f": + format_string = "{" + format_string + "}" + if format_string[-1] != "\n": + format_string += "\n" + + namespace.postprocessors.append({ + "name" : "metadata", + "event" : event, + "filename" : filename, + "base-directory": base or ".", + "content-format": format_string, + "open" : mode, + }) + + class Formatter(argparse.HelpFormatter): """Custom HelpFormatter class to customize help output""" def __init__(self, prog): @@ -343,6 +387,19 @@ def build_parser(): help="Add input URLs which returned an error to FILE", ) output.add_argument( + "-N", "--print", + dest="postprocessors", metavar="[EVENT:]FORMAT", + action=PrintAction, const="-", default=[], + help=("Write FORMAT during EVENT (default 'prepare') to standard " + "output. Examples: 'id' or 'post:{md5[:8]}'"), + ) + output.add_argument( + "--print-to-file", + dest="postprocessors", metavar="[EVENT:]FORMAT FILE", + action=PrintAction, nargs=2, + help="Append FORMAT during EVENT to FILE", + ) + output.add_argument( "--list-modules", dest="list_modules", action="store_true", help="Print a list of available extractor modules", @@ -616,7 +673,7 @@ def build_parser(): postprocessor = parser.add_argument_group("Post-processing Options") postprocessor.add_argument( "-P", "--postprocessor", - dest="postprocessors", metavar="NAME", action="append", default=[], + dest="postprocessors", metavar="NAME", action="append", help="Activate the specified post processor", ) postprocessor.add_argument( diff --git a/gallery_dl/util.py b/gallery_dl/util.py index d5bc171..6cdd994 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -403,9 +403,9 @@ def set_mtime(path, mtime): pass -def cookiestxt_load(fp, cookiejar): - """Parse a Netscape cookies.txt file and add its Cookies to 'cookiejar'""" - set_cookie = cookiejar.set_cookie +def cookiestxt_load(fp): + """Parse a Netscape cookies.txt file and add return its Cookies""" + cookies = [] for line in fp: @@ -427,18 +427,20 @@ def cookiestxt_load(fp, cookiejar): name = value value = None - set_cookie(Cookie( + cookies.append(Cookie( 0, name, value, None, False, domain, domain_specified == "TRUE", - domain.startswith("."), + domain[0] == "." if domain else False, path, False, secure == "TRUE", None if expires == "0" or not expires else expires, False, None, None, {}, )) + return cookies + def cookiestxt_store(fp, cookies): """Write 'cookies' in Netscape cookies.txt format to 'fp'""" @@ -456,9 +458,10 @@ def cookiestxt_store(fp, cookies): name = cookie.name value = cookie.value + domain = cookie.domain write("\t".join(( - cookie.domain, - "TRUE" if cookie.domain.startswith(".") else "FALSE", + domain, + "TRUE" if domain and domain[0] == "." 
else "FALSE", cookie.path, "TRUE" if cookie.secure else "FALSE", "0" if cookie.expires is None else str(cookie.expires), @@ -529,6 +532,24 @@ class HTTPBasicAuth(): return request +class ModuleProxy(): + __slots__ = () + + def __getitem__(self, key, modules=sys.modules): + try: + return modules[key] + except KeyError: + pass + try: + __import__(key) + except ImportError: + modules[key] = NONE + return NONE + return modules[key] + + __getattr__ = __getitem__ + + class LazyPrompt(): __slots__ = () @@ -537,6 +558,7 @@ class LazyPrompt(): class NullContext(): + __slots__ = () def __enter__(self): return None @@ -643,6 +665,7 @@ GLOBALS = { "restart" : raises(exception.RestartExtraction), "hash_sha1": sha1, "hash_md5" : md5, + "std" : ModuleProxy(), "re" : re, } diff --git a/gallery_dl/version.py b/gallery_dl/version.py index dd96a9a..6c2a32e 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.27.6" +__version__ = "1.27.7" __variant__ = None @@ -136,6 +136,7 @@ def build_setuptools(): "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Topic :: Internet :: WWW/HTTP", diff --git a/test/test_results.py b/test/test_results.py index ed9c9a9..f36f798 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -358,7 +358,7 @@ class TestPathfmt(): def __enter__(self): return self - def __exit__(self, *args): + def __exit__(self, exc_type, exc_value, traceback): pass def open(self, mode): diff --git a/test/test_util.py b/test/test_util.py index e2db29b..888a70a 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -205,9 +205,8 @@ class TestCookiesTxt(unittest.TestCase): def test_cookiestxt_load(self): def _assert(content, expected): - jar = http.cookiejar.CookieJar() - util.cookiestxt_load(io.StringIO(content, None), jar) - for c, e in zip(jar, expected): + cookies = util.cookiestxt_load(io.StringIO(content, None)) + for c, e in zip(cookies, expected): self.assertEqual(c.__dict__, e.__dict__) _assert("", []) @@ -253,8 +252,7 @@ class TestCookiesTxt(unittest.TestCase): ) with self.assertRaises(ValueError): - util.cookiestxt_load("example.org\tTRUE\t/\tTRUE\t0\tname", - http.cookiejar.CookieJar()) + util.cookiestxt_load("example.org\tTRUE\t/\tTRUE\t0\tname") def test_cookiestxt_store(self): @@ -832,6 +830,34 @@ def hash(value): i += 1 self.assertEqual(i, 0) + def test_module_proxy(self): + proxy = util.ModuleProxy() + + self.assertIs(proxy.os, os) + self.assertIs(proxy.os.path, os.path) + self.assertIs(proxy["os"], os) + self.assertIs(proxy["os.path"], os.path) + self.assertIs(proxy["os"].path, os.path) + + self.assertIs(proxy.abcdefghi, util.NONE) + self.assertIs(proxy["abcdefghi"], util.NONE) + self.assertIs(proxy["abc.def.ghi"], util.NONE) + self.assertIs(proxy["os.path2"], util.NONE) + + def test_null_context(self): + with util.NullContext(): + pass + + with util.NullContext() as ctx: + self.assertIs(ctx, None) + + try: + with util.NullContext() as ctx: + exc_orig = ValueError() + raise exc_orig + except ValueError as exc: + self.assertIs(exc, exc_orig) + class TestExtractor(): category = "test_category" |

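For the new -N/--print and --print-to-file options added in option.py above, PrintAction assembles an ordinary "metadata" post processor. A hedged sketch of the entry that a command line such as gallery-dl --print "post:{md5[:8]}" URL would append, reconstructed from that hunk rather than taken from documentation:

# equivalent post-processor entry assembled by PrintAction for
#   gallery-dl --print "post:{md5[:8]}" URL
postprocessor = {
    "name"          : "metadata",
    "event"         : "post",          # the recognized event name before ':'
    "filename"      : "-",             # "-" means standard output for --print
    "base-directory": ".",
    "content-format": "{md5[:8]}\n",   # a trailing newline is appended
    "open"          : "w",             # --print-to-file opens its FILE with "a"
}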