| author | 2024-08-03 20:27:44 -0400 |
|---|---|
| committer | 2024-08-03 20:27:44 -0400 |
| commit | 032e5bed275a253e122ed9ac86dac7b8c4204172 |
| tree | b4eda52ebfe00c4d22e9d633b1ab2d158a9f0573 |
| parent | 80e39a8fc7de105510cbbdca8507f2a4b8c9e01d |
New upstream version 1.27.2 (tag: upstream/1.27.2)
65 files changed, 1664 insertions, 492 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index ebede9f..1ca8647 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,40 +1,83 @@ -## 1.27.1 - 2024-06-22 +## 1.27.2 - 2024-08-03 ### Extractors #### Additions -- [hentainexus] restore module ([#5275](https://github.com/mikf/gallery-dl/issues/5275), [#5712](https://github.com/mikf/gallery-dl/issues/5712)) -- [shimmie2] support `vidya.pics` ([#5632](https://github.com/mikf/gallery-dl/issues/5632)) -- [tcbscans] support other domains ([#5774](https://github.com/mikf/gallery-dl/issues/5774)) +- [agnph] add `tag` and `post` extractors ([#5284](https://github.com/mikf/gallery-dl/issues/5284), [#5890](https://github.com/mikf/gallery-dl/issues/5890)) +- [aryion] add `favorite` extractor ([#4511](https://github.com/mikf/gallery-dl/issues/4511), [#5870](https://github.com/mikf/gallery-dl/issues/5870)) +- [cien] add support ([#2885](https://github.com/mikf/gallery-dl/issues/2885), [#4103](https://github.com/mikf/gallery-dl/issues/4103), [#5240](https://github.com/mikf/gallery-dl/issues/5240)) +- [instagram] add `info` extractor ([#5262](https://github.com/mikf/gallery-dl/issues/5262)) +- [koharu] add `gallery`, `search`, and `favorite` extractors ([#5893](https://github.com/mikf/gallery-dl/issues/5893), [#4707](https://github.com/mikf/gallery-dl/issues/4707)) +- [twitter] add `info` extractor ([#3623](https://github.com/mikf/gallery-dl/issues/3623)) #### Fixes -- [deviantart] fix watching module ID extraction ([#5696](https://github.com/mikf/gallery-dl/issues/5696), [#5772](https://github.com/mikf/gallery-dl/issues/5772)) -- [fanbox] handle KeyError for no longer existing plans ([#5759](https://github.com/mikf/gallery-dl/issues/5759)) -- [kemonoparty:favorite] fix exception when sorting `null` objects ([#5692](https://github.com/mikf/gallery-dl/issues/5692). 
[#5721](https://github.com/mikf/gallery-dl/issues/5721)) -- [skeb] fix `429 Too Many Requests` errors ([#5766](https://github.com/mikf/gallery-dl/issues/5766)) -- [speakerdeck] fix extraction ([#5730](https://github.com/mikf/gallery-dl/issues/5730)) -- [twitter] fix duplicate `ArkoseLogin` check +- [8chan] update `TOS` cookie name ([#5868](https://github.com/mikf/gallery-dl/issues/5868)) +- [behance] fix image extraction ([#5873](https://github.com/mikf/gallery-dl/issues/5873), [#5926](https://github.com/mikf/gallery-dl/issues/5926)) +- [booru] prevent crash when file URL is empty ([#5859](https://github.com/mikf/gallery-dl/issues/5859)) +- [deviantart] try to work around journal/status API changes ([#5916](https://github.com/mikf/gallery-dl/issues/5916)) +- [hentainexus] fix error with spread pages ([#5827](https://github.com/mikf/gallery-dl/issues/5827)) +- [hotleak] fix faulty image URLs ([#5915](https://github.com/mikf/gallery-dl/issues/5915)) +- [inkbunny:following] fix potentially infinite loop +- [nijie] fix image URLs of single image posts ([#5842](https://github.com/mikf/gallery-dl/issues/5842)) +- [readcomiconline] fix extraction ([#5866](https://github.com/mikf/gallery-dl/issues/5866)) +- [toyhouse] fix Content Warning bypass ([#5820](https://github.com/mikf/gallery-dl/issues/5820)) +- [tumblr] revert to `offset` pagination, implement `pagination` option ([#5880](https://github.com/mikf/gallery-dl/issues/5880)) +- [twitter] fix `username-alt` option name ([#5715](https://github.com/mikf/gallery-dl/issues/5715)) +- [warosu] fix extraction +- [zerochan] handle `KeyError - 'items'` ([#5826](https://github.com/mikf/gallery-dl/issues/5826)) +- [zerochan] fix error on tag redirections ([#5891](https://github.com/mikf/gallery-dl/issues/5891)) +- [zerochan] fix `Invalid control character` errors ([#5892](https://github.com/mikf/gallery-dl/issues/5892)) #### Improvements -- [nijie] support downloading videos ([#5707](https://github.com/mikf/gallery-dl/issues/5707), [#5617](https://github.com/mikf/gallery-dl/issues/5617)) -- [philomena] support downloading `.svg` files ([#5643](https://github.com/mikf/gallery-dl/issues/5643)) -- [szurubooru] support empty tag searches ([#5711](https://github.com/mikf/gallery-dl/issues/5711)) -- [twitter] ignore `Unavailable` media ([#5736](https://github.com/mikf/gallery-dl/issues/5736)) +- [bunkr] support `bunkr.fi` domain ([#5872](https://github.com/mikf/gallery-dl/issues/5872)) +- [deviantart:following] use OAuth API endpoint ([#2511](https://github.com/mikf/gallery-dl/issues/2511)) +- [directlink] extend recognized file extensions ([#5924](https://github.com/mikf/gallery-dl/issues/5924)) +- [exhentai] improve error message when temporarily banned ([#5845](https://github.com/mikf/gallery-dl/issues/5845)) +- [gelbooru_v02] use total number of posts as pagination end marker ([#5830](https://github.com/mikf/gallery-dl/issues/5830)) +- [imagefap] add enumeration index to default filenames ([#1746](https://github.com/mikf/gallery-dl/issues/1746), [#5887](https://github.com/mikf/gallery-dl/issues/5887)) +- [paheal] implement fast `--range` support ([#5905](https://github.com/mikf/gallery-dl/issues/5905)) +- [redgifs] support URLs with numeric IDs ([#5898](https://github.com/mikf/gallery-dl/issues/5898), [#5899](https://github.com/mikf/gallery-dl/issues/5899)) +- [sankaku] match URLs with `www` subdomain ([#5907](https://github.com/mikf/gallery-dl/issues/5907)) +- [sankakucomplex] update domain to `news.sankakucomplex.com` +- [twitter] implement `cursor` 
support ([#5753](https://github.com/mikf/gallery-dl/issues/5753)) +- [vipergirls] improve `thread` URL pattern +- [wallpapercave] support `album` listings ([#5925](https://github.com/mikf/gallery-dl/issues/5925)) #### Metadata -- [hitomi] extract `title_jpn` metadata ([#5706](https://github.com/mikf/gallery-dl/issues/5706)) -- [instagram] extract `liked` metadata ([#5609](https://github.com/mikf/gallery-dl/issues/5609)) +- [dynastyscans] extract chapter `tags` ([#5904](https://github.com/mikf/gallery-dl/issues/5904)) +- [erome] extract `date` metadata ([#5796](https://github.com/mikf/gallery-dl/issues/5796)) +- [furaffinity] extract `folders` and `thumbnail` metadata ([#1284](https://github.com/mikf/gallery-dl/issues/1284), [#5824](https://github.com/mikf/gallery-dl/issues/5824)) +- [sankaku] implement `notes` extraction ([#5865](https://github.com/mikf/gallery-dl/issues/5865)) +- [subscribestar] fix `date` parsing in updated posts ([#5783](https://github.com/mikf/gallery-dl/issues/5783)) +- [twitter] extract `bookmark_count` and `view_count` metadata ([#5802](https://github.com/mikf/gallery-dl/issues/5802)) +- [zerochan] fix `source` metadata +- [zerochan] fix tag category extraction ([#5874](https://github.com/mikf/gallery-dl/issues/5874)) +- [zerochan] delay fetching extended metadata ([#5869](https://github.com/mikf/gallery-dl/issues/5869)) #### Options -- [newgrounds] extend `format` option ([#5709](https://github.com/mikf/gallery-dl/issues/5709)) -- [twitter] extend `ratelimit` option ([#5532](https://github.com/mikf/gallery-dl/issues/5532)) -- [twitter] add `username-alt` option ([#5715](https://github.com/mikf/gallery-dl/issues/5715)) +- [agnph] implement `tags` option ([#5284](https://github.com/mikf/gallery-dl/issues/5284)) +- [booru] allow multiple `url` keys ([#5859](https://github.com/mikf/gallery-dl/issues/5859)) +- [cien] add `files` option ([#2885](https://github.com/mikf/gallery-dl/issues/2885)) +- [koharu] add `cbz` and `format` options ([#5893](https://github.com/mikf/gallery-dl/issues/5893)) +- [vsco] add `include` option ([#5911](https://github.com/mikf/gallery-dl/issues/5911)) +- [zerochan] implement `tags` option ([#5874](https://github.com/mikf/gallery-dl/issues/5874)) #### Removals -- [photobucket] remove module -- [nitter] remove instances -- [vichan] remove `wikieat.club` -### Downloaders -- [ytdl] fix exception due to missing `ext` in unavailable videos ([#5675](https://github.com/mikf/gallery-dl/issues/5675)) -### Formatter -- implement `C` format specifier ([#5647](https://github.com/mikf/gallery-dl/issues/5647)) -- implement `X` format specifier ([#5770](https://github.com/mikf/gallery-dl/issues/5770)) +- [fallenangels] remove module +### Post Processors +- [metadata] allow using format strings for `directory` ([#5728](https://github.com/mikf/gallery-dl/issues/5728)) ### Options -- add `--no-input` command-line and `input` config option ([#5733](https://github.com/mikf/gallery-dl/issues/5733)) -- add `--config-open` command-line option ([#5713](https://github.com/mikf/gallery-dl/issues/5713)) -- add `--config-status` command-line option ([#5713](https://github.com/mikf/gallery-dl/issues/5713)) +- add `--print-traffic` command-line option +- add `-J/--resolve-json` command-line option ([#5864](https://github.com/mikf/gallery-dl/issues/5864)) +- add `filters-environment` option +- implement `archive-event` option ([#5784](https://github.com/mikf/gallery-dl/issues/5784)) +### Actions +- [actions] support multiple actions per pattern +- [actions] add `exec` 
action ([#5619](https://github.com/mikf/gallery-dl/issues/5619)) +- [actions] add `abort` and `terminate` actions ([#5778](https://github.com/mikf/gallery-dl/issues/5778)) +- [actions] allow setting a duration for `wait` +- [actions] emit logging messages before waiting/exiting/etc +### Tests +- [tests] enable test results for external extractors ([#5262](https://github.com/mikf/gallery-dl/issues/5262)) +- [tests] load results from `${GDL_TEST_RESULTS}` ([#5262](https://github.com/mikf/gallery-dl/issues/5262)) ### Miscellaneous -- [actions] fix exception when `msg` is not a string ([#5683](https://github.com/mikf/gallery-dl/issues/5683)) +- [cookies] add `thorium` support ([#5781](https://github.com/mikf/gallery-dl/issues/5781)) +- [job] add `resolve` argument to DataJob ([#5864](https://github.com/mikf/gallery-dl/issues/5864)) +- [path] fix moving temporary files across drives on Windows ([#5807](https://github.com/mikf/gallery-dl/issues/5807)) +- [ytdl] fix `--cookies-from-browser` option parsing ([#5885](https://github.com/mikf/gallery-dl/issues/5885)) +- make exceptions in filters/conditionals non-fatal +- update default User-Agent header to Firefox 128 ESR +- include `zstd` in Accept-Encoding header when supported @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.27.1 +Version: 1.27.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -65,10 +65,12 @@ Dependencies Optional -------- +- yt-dlp_ or youtube-dl_: HLS/DASH video downloads, ``ytdl`` integration - FFmpeg_: Pixiv Ugoira conversion -- yt-dlp_ or youtube-dl_: Video downloads +- mkvmerge_: Accurate Ugoira frame timecodes - PySocks_: SOCKS proxy support - brotli_ or brotlicffi_: Brotli compression support +- zstandard_: Zstandard compression support - PyYAML_: YAML configuration file support - toml_: TOML configuration file support for Python<3.11 - SecretStorage_: GNOME keyring passwords for ``--cookies-from-browser`` @@ -112,9 +114,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.1/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.2/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.1/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.2/gallery-dl.bin>`__ Nightly Builds @@ -457,11 +459,13 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _pip: https://pip.pypa.io/en/stable/ .. _Requests: https://requests.readthedocs.io/en/master/ .. _FFmpeg: https://www.ffmpeg.org/ +.. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html .. _yt-dlp: https://github.com/yt-dlp/yt-dlp .. _youtube-dl: https://ytdl-org.github.io/youtube-dl/ .. _PySocks: https://pypi.org/project/PySocks/ .. _brotli: https://github.com/google/brotli .. _brotlicffi: https://github.com/python-hyper/brotlicffi +.. _zstandard: https://github.com/indygreg/python-zstandard .. _PyYAML: https://pyyaml.org/ .. _toml: https://pypi.org/project/toml/ .. 
_SecretStorage: https://pypi.org/project/SecretStorage/ @@ -25,10 +25,12 @@ Dependencies Optional -------- +- yt-dlp_ or youtube-dl_: HLS/DASH video downloads, ``ytdl`` integration - FFmpeg_: Pixiv Ugoira conversion -- yt-dlp_ or youtube-dl_: Video downloads +- mkvmerge_: Accurate Ugoira frame timecodes - PySocks_: SOCKS proxy support - brotli_ or brotlicffi_: Brotli compression support +- zstandard_: Zstandard compression support - PyYAML_: YAML configuration file support - toml_: TOML configuration file support for Python<3.11 - SecretStorage_: GNOME keyring passwords for ``--cookies-from-browser`` @@ -72,9 +74,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.1/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.2/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.1/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.2/gallery-dl.bin>`__ Nightly Builds @@ -417,11 +419,13 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _pip: https://pip.pypa.io/en/stable/ .. _Requests: https://requests.readthedocs.io/en/master/ .. _FFmpeg: https://www.ffmpeg.org/ +.. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html .. _yt-dlp: https://github.com/yt-dlp/yt-dlp .. _youtube-dl: https://ytdl-org.github.io/youtube-dl/ .. _PySocks: https://pypi.org/project/PySocks/ .. _brotli: https://github.com/google/brotli .. _brotlicffi: https://github.com/python-hyper/brotlicffi +.. _zstandard: https://github.com/indygreg/python-zstandard .. _PyYAML: https://pyyaml.org/ .. _toml: https://pypi.org/project/toml/ .. 
_SecretStorage: https://pypi.org/project/SecretStorage/ diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl index 14b7321..3308e98 100644 --- a/data/completion/_gallery-dl +++ b/data/completion/_gallery-dl @@ -26,6 +26,7 @@ _arguments -s -S \ {-g,--get-urls}'[Print URLs instead of downloading]' \ {-G,--resolve-urls}'[Print URLs instead of downloading; resolve intermediary URLs]' \ {-j,--dump-json}'[Print JSON information]' \ +{-J,--resolve-json}'[Print JSON information; resolve intermediary URLs]' \ {-s,--simulate}'[Simulate data extraction; do not download anything]' \ {-E,--extractor-info}'[Print extractor defaults and settings]' \ {-K,--list-keywords}'[Print a list of available keywords and example values for the given URLs]' \ @@ -35,6 +36,7 @@ _arguments -s -S \ --write-log'[Write logging output to FILE]':'<file>':_files \ --write-unsupported'[Write URLs, which get emitted by other extractors but cannot be handled, to FILE]':'<file>':_files \ --write-pages'[Write downloaded intermediary pages to files in the current directory to debug problems]' \ +--print-traffic'[Display sent and read HTTP traffic]' \ --no-colors'[Do not emit ANSI color codes in output]' \ {-R,--retries}'[Maximum number of retries for failed HTTP requests or -1 for infinite retries (default: 4)]':'<n>' \ --http-timeout'[Timeout for HTTP connections (default: 30.0)]':'<seconds>' \ diff --git a/data/completion/gallery-dl b/data/completion/gallery-dl index 625ecd6..0d933fa 100644 --- a/data/completion/gallery-dl +++ b/data/completion/gallery-dl @@ -10,7 +10,7 @@ _gallery_dl() elif [[ "${prev}" =~ ^()$ ]]; then COMPREPLY=( $(compgen -d -- "${cur}") ) else - COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --update --update-to --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --simulate --extractor-info --list-keywords --error-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --no-colors --retries --http-timeout --proxy --source-address --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config --config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor --no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") ) + COMPREPLY=( $(compgen -W "--help --version --filename --destination --directory --extractors --user-agent --clear-cache --update --update-to --update-check --input-file --input-file-comment --input-file-delete --no-input --quiet --warning --verbose --get-urls --resolve-urls --dump-json --resolve-json --simulate --extractor-info --list-keywords --error-file --list-modules --list-extractors --write-log --write-unsupported --write-pages --print-traffic --no-colors --retries --http-timeout --proxy --source-address --no-check-certificate --limit-rate --chunk-size --sleep --sleep-request --sleep-extractor --no-part --no-skip --no-mtime --no-download --option --config 
--config-yaml --config-toml --config-create --config-status --config-open --config-ignore --ignore-config --username --password --netrc --cookies --cookies-export --cookies-from-browser --abort --terminate --filesize-min --filesize-max --download-archive --range --chapter-range --filter --chapter-filter --postprocessor --no-postprocessors --postprocessor-option --write-metadata --write-info-json --write-infojson --write-tags --zip --cbz --mtime --mtime-from-date --ugoira --ugoira-conv --ugoira-conv-lossless --ugoira-conv-copy --exec --exec-after" -- "${cur}") ) fi } diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish index a67cd63..7243998 100644 --- a/data/completion/gallery-dl.fish +++ b/data/completion/gallery-dl.fish @@ -20,6 +20,7 @@ complete -c gallery-dl -s 'v' -l 'verbose' -d 'Print various debugging informati complete -c gallery-dl -s 'g' -l 'get-urls' -d 'Print URLs instead of downloading' complete -c gallery-dl -s 'G' -l 'resolve-urls' -d 'Print URLs instead of downloading; resolve intermediary URLs' complete -c gallery-dl -s 'j' -l 'dump-json' -d 'Print JSON information' +complete -c gallery-dl -s 'J' -l 'resolve-json' -d 'Print JSON information; resolve intermediary URLs' complete -c gallery-dl -s 's' -l 'simulate' -d 'Simulate data extraction; do not download anything' complete -c gallery-dl -s 'E' -l 'extractor-info' -d 'Print extractor defaults and settings' complete -c gallery-dl -s 'K' -l 'list-keywords' -d 'Print a list of available keywords and example values for the given URLs' @@ -29,6 +30,7 @@ complete -c gallery-dl -l 'list-extractors' -d 'Print a list of extractor classe complete -c gallery-dl -r -F -l 'write-log' -d 'Write logging output to FILE' complete -c gallery-dl -r -F -l 'write-unsupported' -d 'Write URLs, which get emitted by other extractors but cannot be handled, to FILE' complete -c gallery-dl -l 'write-pages' -d 'Write downloaded intermediary pages to files in the current directory to debug problems' +complete -c gallery-dl -l 'print-traffic' -d 'Display sent and read HTTP traffic' complete -c gallery-dl -l 'no-colors' -d 'Do not emit ANSI color codes in output' complete -c gallery-dl -x -s 'R' -l 'retries' -d 'Maximum number of retries for failed HTTP requests or -1 for infinite retries (default: 4)' complete -c gallery-dl -x -l 'http-timeout' -d 'Timeout for HTTP connections (default: 30.0)' diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 37529bf..d1eddd6 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2024-06-22" "1.27.1" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2024-08-03" "1.27.2" "gallery-dl Manual" .\" disable hyphenation .nh @@ -80,6 +80,9 @@ Print URLs instead of downloading; resolve intermediary URLs .B "\-j, \-\-dump\-json" Print JSON information .TP +.B "\-J, \-\-resolve\-json" +Print JSON information; resolve intermediary URLs +.TP .B "\-s, \-\-simulate" Simulate data extraction; do not download anything .TP @@ -107,6 +110,9 @@ Write URLs, which get emitted by other extractors but cannot be handled, to FILE .B "\-\-write\-pages" Write downloaded intermediary pages to files in the current directory to debug problems .TP +.B "\-\-print\-traffic" +Display sent and read HTTP traffic +.TP .B "\-\-no\-colors" Do not emit ANSI color codes in output .TP diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index e3ed58a..8f75284 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH 
"GALLERY-DL.CONF" "5" "2024-06-22" "1.27.1" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2024-08-03" "1.27.2" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -456,6 +456,7 @@ response before \f[I]retrying\f[] the request. .br * \f[I]"0.5-1.5"\f[] \f[I][Danbooru]\f[], \f[I][E621]\f[], \f[I][foolfuuka]:search\f[], \f[I]itaku\f[], +\f[I]koharu\f[], \f[I]newgrounds\f[], \f[I][philomena]\f[], \f[I]pixiv:novel\f[], \f[I]plurk\f[], \f[I]poipiku\f[] , \f[I]pornpics\f[], \f[I]soundgasm\f[], \f[I]urlgalleries\f[], \f[I]vk\f[], \f[I]zerochan\f[] @@ -536,6 +537,8 @@ and optional for .br * \f[I]kemonoparty\f[] .br +* \f[I]koharu\f[] +.br * \f[I]mangadex\f[] .br * \f[I]mangoxo\f[] @@ -728,7 +731,7 @@ or a \f[I]list\f[] with IP and explicit port number as elements. \f[I]string\f[] .IP "Default:" 9 -\f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0"\f[] +\f[I]"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0"\f[] .IP "Description:" 4 User-Agent header value to be used for HTTP requests. @@ -1017,6 +1020,29 @@ but be aware that using external inputs for building local paths may pose a security risk. +.SS extractor.*.archive-event +.IP "Type:" 6 ++ \f[I]string\f[] ++ \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]"file"\f[] + +.IP "Example:" 4 +.br +* "file,skip" +.br +* ["file", "skip"] + +.IP "Description:" 4 +\f[I]Event(s)\f[] +for which IDs get written to an +\f[I]archive\f[]. + +Available events are: +\f[I]file\f[], \f[I]skip\f[] + + .SS extractor.*.archive-format .IP "Type:" 6 \f[I]string\f[] @@ -1075,25 +1101,33 @@ for available \f[I]PRAGMA\f[] statements and further details. .SS extractor.*.actions .IP "Type:" 6 .br -* \f[I]object\f[] (pattern -> action) +* \f[I]object\f[] (pattern -> action(s)) .br -* \f[I]list\f[] of \f[I]lists\f[] with 2 \f[I]strings\f[] as elements +* \f[I]list\f[] of \f[I]lists\f[] with pattern -> action(s) pairs as elements .IP "Example:" 4 .. code:: json { -"error" : "status \f[I]= 1", +"info:Logging in as .+" : "level = debug", "warning:(?i)unable to .+": "exit 127", -"info:Logging in as .+" : "level = debug" +"error" : [ +"status \f[I]= 1", +"exec notify.sh 'gdl error'", +"abort" +] } .. code:: json [ -["error" , "status \f[]= 1" ], +["info:Logging in as .+" , "level = debug"], ["warning:(?i)unable to .+", "exit 127" ], -["info:Logging in as .+" , "level = debug"] +["error" , [ +"status \f[]= 1", +"exec notify.sh 'gdl error'", +"abort" +]] ] @@ -1110,6 +1144,9 @@ matches logging messages of all levels \f[I]action\f[] is parsed as action type followed by (optional) arguments. +It is possible to specify more than one \f[I]action\f[] per \f[I]pattern\f[] +by providing them as a \f[I]list\f[]: \f[I]["<action1>", "<action2>", …]\f[] + Supported Action Types: \f[I]status\f[]: @@ -1128,12 +1165,21 @@ Modify severity level of the current logging message. .br Can be one of \f[I]debug\f[], \f[I]info\f[], \f[I]warning\f[], \f[I]error\f[] or an integer value. .br -\f[I]print\f[] +\f[I]print\f[]: Write argument to stdout. +\f[I]exec\f[]: +Run a shell command. +\f[I]abort\f[]: +Stop the current extractor run. +\f[I]terminate\f[]: +Stop the current extractor run, including parent extractors. \f[I]restart\f[]: Restart the current extractor run. \f[I]wait\f[]: -Stop execution until Enter is pressed. +Sleep for a given \f[I]Duration\f[] or +.br +wait until Enter is pressed when no argument was given. 
+.br \f[I]exit\f[]: Exit the program with the given argument as exit status. @@ -1642,6 +1688,23 @@ Sets the maximum depth of returned reply posts. Process reposts. +.SS extractor.cien.files +.IP "Type:" 6 +\f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]["image", "video", "download", "gallery"]\f[] + +.IP "Description:" 4 +Determines the type and order of files to be downloaded. + +Available types are +\f[I]image\f[], +\f[I]video\f[], +\f[I]download\f[], +\f[I]gallery\f[]. + + .SS extractor.cyberdrop.domain .IP "Type:" 6 \f[I]string\f[] @@ -3004,6 +3067,36 @@ If the selected format is not available, the first in the list gets chosen (usually mp3). +.SS extractor.koharu.cbz +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download each gallery as a single \f[I].cbz\f[] file. + +Disabling this option causes a gallery +to be downloaded as individual image files. + + +.SS extractor.koharu.format +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"original"\f[] + +.IP "Description:" 4 +Name of the image format to download. + +Available formats are +.br +\f[I]"780"\f[], \f[I]"980"\f[], \f[I]"1280"\f[], \f[I]"1600"\f[], \f[I]"0"\f[]/\f[I]"original"\f[] +.br + + .SS extractor.lolisafe.domain .IP "Type:" 6 \f[I]string\f[] @@ -4310,6 +4403,27 @@ or each inline image, use an extra HTTP request to find the URL to its full-resolution version. +.SS extractor.tumblr.pagination +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"offset"\f[] + +.IP "Description:" 4 +Controls how to paginate over blog posts. + +.br +* \f[I]"api"\f[]: \f[I]next\f[] parameter provided by the API +(potentially misses posts due to a +\f[I]bug\f[] +in Tumblr's API) +.br +* \f[I]"before"\f[]: timestamp of last post +.br +* \f[I]"offset"\f[]: post offset number + + .SS extractor.tumblr.ratelimit .IP "Type:" 6 \f[I]string\f[] @@ -4919,6 +5033,35 @@ Note: Requires \f[I]login\f[] or \f[I]cookies\f[] +.SS extractor.vsco.include +.IP "Type:" 6 +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] + +.IP "Default:" 9 +\f[I]"gallery"\f[] + +.IP "Example:" 4 +.br +* "avatar,collection" +.br +* ["avatar", "collection"] + +.IP "Description:" 4 +A (comma-separated) list of subcategories to include +when processing a user profile. + +Possible values are +\f[I]"avatar"\f[], +\f[I]"gallery"\f[], +\f[I]"spaces"\f[], +\f[I]"collection"\f[], + +It is possible to use \f[I]"all"\f[] instead of listing all values separately. + + .SS extractor.vsco.videos .IP "Type:" 6 \f[I]bool\f[] @@ -5282,17 +5425,25 @@ Note: This requires 1 additional HTTP request per post. .SS extractor.[booru].url .IP "Type:" 6 -\f[I]string\f[] +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 \f[I]"file_url"\f[] .IP "Example:" 4 -"preview_url" +.br +* "preview_url" +.br +* ["sample_url", "preview_url", "file_url"} .IP "Description:" 4 Alternate field name to retrieve download URLs from. +When multiple names are given, download the first available one. 
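A minimal sketch of how the new list form of `extractor.[booru].url` could look in a user configuration, assuming the usual gallery-dl.conf layout (the `danbooru` scope is only a stand-in for any booru-type extractor):

.. code:: json

    {
        "extractor": {
            "danbooru": {
                "url": ["sample_url", "preview_url", "file_url"]
            }
        }
    }

With a value like this, the extractor downloads from the first of these fields that is present and non-empty in a post's metadata and keeps the remaining ones as fallback URLs.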
+ .SS extractor.[manga-extractor].chapter-reverse .IP "Type:" 6 @@ -6249,13 +6400,19 @@ If this option is set, \f[I]metadata.extension\f[] and .SS metadata.directory .IP "Type:" 6 -\f[I]string\f[] +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 \f[I]"."\f[] .IP "Example:" 4 -"metadata" +.br +* "metadata" +.br +* ["..", "metadata", "\\fF {id // 500 * 500}"] .IP "Description:" 4 Directory where metadata files are stored in relative to the @@ -6965,6 +7122,19 @@ Set this option to \f[I]null\f[] or an invalid path to disable this cache. +.SS filters-environment +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Evaluate filter expressions raising an exception as \f[I]false\f[] +instead of aborting the current extractor run +by wrapping them in a try/except block. + + .SS format-separator .IP "Type:" 6 \f[I]string\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 9f12652..2a7f8f2 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -10,7 +10,7 @@ "proxy": null, "skip": true, - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/115.0", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0", "retries": 4, "timeout": 30.0, "verify": true, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index a06aa55..eec2e32 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.27.1 +Version: 1.27.2 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -65,10 +65,12 @@ Dependencies Optional -------- +- yt-dlp_ or youtube-dl_: HLS/DASH video downloads, ``ytdl`` integration - FFmpeg_: Pixiv Ugoira conversion -- yt-dlp_ or youtube-dl_: Video downloads +- mkvmerge_: Accurate Ugoira frame timecodes - PySocks_: SOCKS proxy support - brotli_ or brotlicffi_: Brotli compression support +- zstandard_: Zstandard compression support - PyYAML_: YAML configuration file support - toml_: TOML configuration file support for Python<3.11 - SecretStorage_: GNOME keyring passwords for ``--cookies-from-browser`` @@ -112,9 +114,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.1/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.27.2/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.1/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.27.2/gallery-dl.bin>`__ Nightly Builds @@ -457,11 +459,13 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. _pip: https://pip.pypa.io/en/stable/ .. _Requests: https://requests.readthedocs.io/en/master/ .. _FFmpeg: https://www.ffmpeg.org/ +.. _mkvmerge: https://www.matroska.org/downloads/mkvtoolnix.html .. _yt-dlp: https://github.com/yt-dlp/yt-dlp .. _youtube-dl: https://ytdl-org.github.io/youtube-dl/ .. _PySocks: https://pypi.org/project/PySocks/ .. _brotli: https://github.com/google/brotli .. 
_brotlicffi: https://github.com/python-hyper/brotlicffi +.. _zstandard: https://github.com/indygreg/python-zstandard .. _PyYAML: https://pyyaml.org/ .. _toml: https://pypi.org/project/toml/ .. _SecretStorage: https://pypi.org/project/SecretStorage/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index a892544..de5738a 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -56,6 +56,7 @@ gallery_dl/extractor/8chan.py gallery_dl/extractor/8muses.py gallery_dl/extractor/__init__.py gallery_dl/extractor/adultempire.py +gallery_dl/extractor/agnph.py gallery_dl/extractor/architizer.py gallery_dl/extractor/artstation.py gallery_dl/extractor/aryion.py @@ -68,6 +69,7 @@ gallery_dl/extractor/booru.py gallery_dl/extractor/bunkr.py gallery_dl/extractor/catbox.py gallery_dl/extractor/chevereto.py +gallery_dl/extractor/cien.py gallery_dl/extractor/comicvine.py gallery_dl/extractor/common.py gallery_dl/extractor/cyberdrop.py @@ -79,7 +81,6 @@ gallery_dl/extractor/dynastyscans.py gallery_dl/extractor/e621.py gallery_dl/extractor/erome.py gallery_dl/extractor/exhentai.py -gallery_dl/extractor/fallenangels.py gallery_dl/extractor/fanbox.py gallery_dl/extractor/fanleaks.py gallery_dl/extractor/fantia.py @@ -125,6 +126,7 @@ gallery_dl/extractor/kabeuchi.py gallery_dl/extractor/keenspot.py gallery_dl/extractor/kemonoparty.py gallery_dl/extractor/khinsider.py +gallery_dl/extractor/koharu.py gallery_dl/extractor/komikcast.py gallery_dl/extractor/lensdump.py gallery_dl/extractor/lexica.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 1d4215e..4b39c15 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -105,6 +105,11 @@ def main(): output.ANSI = True + # filter environment + filterenv = config.get((), "filters-environment", True) + if not filterenv: + util.compile_expression = util.compile_expression_raw + # format string separator separator = config.get((), "format-separator") if separator: @@ -145,6 +150,10 @@ def main(): log.debug("Configuration Files %s", config._files) + if args.print_traffic: + import requests + requests.packages.urllib3.connection.HTTPConnection.debuglevel = 1 + # extractor modules modules = config.get(("extractor",), "modules") if modules is not None: @@ -240,6 +249,9 @@ def main(): if config.get(("output",), "fallback", True): jobtype.handle_url = \ staticmethod(jobtype.handle_url_fallback) + elif args.dump_json: + jobtype = job.DataJob + jobtype.resolve = args.dump_json - 1 else: jobtype = args.jobtype or job.DownloadJob @@ -299,6 +311,8 @@ def main(): else: input_manager.success() + except exception.StopExtraction: + pass except exception.TerminateExtraction: pass except exception.RestartExtraction: diff --git a/gallery_dl/actions.py b/gallery_dl/actions.py index 883e38b..668032d 100644 --- a/gallery_dl/actions.py +++ b/gallery_dl/actions.py @@ -9,8 +9,10 @@ """ """ import re +import time import logging import operator +import functools from . 
import util, exception @@ -19,29 +21,100 @@ def parse(actionspec): actionspec = actionspec.items() actions = {} - actions[logging.DEBUG] = actions_d = [] - actions[logging.INFO] = actions_i = [] - actions[logging.WARNING] = actions_w = [] - actions[logging.ERROR] = actions_e = [] + actions[-logging.DEBUG] = actions_bd = [] + actions[-logging.INFO] = actions_bi = [] + actions[-logging.WARNING] = actions_bw = [] + actions[-logging.ERROR] = actions_be = [] + actions[logging.DEBUG] = actions_ad = [] + actions[logging.INFO] = actions_ai = [] + actions[logging.WARNING] = actions_aw = [] + actions[logging.ERROR] = actions_ae = [] for event, spec in actionspec: level, _, pattern = event.partition(":") - type, _, args = spec.partition(" ") - action = (re.compile(pattern).search, ACTIONS[type](args)) + search = re.compile(pattern).search if pattern else util.true + + if isinstance(spec, str): + type, _, args = spec.partition(" ") + before, after = ACTIONS[type](args) + else: + actions_before = [] + actions_after = [] + for s in spec: + type, _, args = s.partition(" ") + before, after = ACTIONS[type](args) + if before: + actions_before.append(before) + if after: + actions_after.append(after) + before = _chain_actions(actions_before) + after = _chain_actions(actions_after) level = level.strip() if not level or level == "*": - actions_d.append(action) - actions_i.append(action) - actions_w.append(action) - actions_e.append(action) + if before: + action = (search, before) + actions_bd.append(action) + actions_bi.append(action) + actions_bw.append(action) + actions_be.append(action) + if after: + action = (search, after) + actions_ad.append(action) + actions_ai.append(action) + actions_aw.append(action) + actions_ae.append(action) else: - - actions[_level_to_int(level)].append(action) + level = _level_to_int(level) + if before: + actions[-level].append((search, before)) + if after: + actions[level].append((search, after)) return actions +class LoggerAdapter(): + + def __init__(self, logger, job): + self.logger = logger + self.extra = job._logger_extra + self.actions = job._logger_actions + + self.debug = functools.partial(self.log, logging.DEBUG) + self.info = functools.partial(self.log, logging.INFO) + self.warning = functools.partial(self.log, logging.WARNING) + self.error = functools.partial(self.log, logging.ERROR) + + def log(self, level, msg, *args, **kwargs): + msg = str(msg) + if args: + msg = msg % args + + before = self.actions[-level] + after = self.actions[level] + + if before: + args = self.extra.copy() + args["level"] = level + + for cond, action in before: + if cond(msg): + action(args) + + level = args["level"] + + if self.logger.isEnabledFor(level): + kwargs["extra"] = self.extra + self.logger._log(level, msg, (), **kwargs) + + if after: + args = self.extra.copy() + for cond, action in after: + if cond(msg): + action(args) + + def _level_to_int(level): try: return logging._nameToLevel[level] @@ -49,10 +122,19 @@ def _level_to_int(level): return int(level) +def _chain_actions(actions): + def _chain(args): + for action in actions: + action(args) + return _chain + + +# -------------------------------------------------------------------- + def action_print(opts): def _print(_): print(opts) - return _print + return None, _print def action_status(opts): @@ -69,7 +151,7 @@ def action_status(opts): def _status(args): args["job"].status = op(args["job"].status, value) - return _status + return _status, None def action_level(opts): @@ -77,17 +159,38 @@ def action_level(opts): def _level(args): 
args["level"] = level - return _level + return _level, None + + +def action_exec(opts): + def _exec(_): + util.Popen(opts, shell=True).wait() + return None, _exec def action_wait(opts): - def _wait(args): - input("Press Enter to continue") - return _wait + if opts: + seconds = util.build_duration_func(opts) + + def _wait(args): + time.sleep(seconds()) + else: + def _wait(args): + input("Press Enter to continue") + + return None, _wait + + +def action_abort(opts): + return None, util.raises(exception.StopExtraction) + + +def action_terminate(opts): + return None, util.raises(exception.TerminateExtraction) def action_restart(opts): - return util.raises(exception.RestartExtraction) + return None, util.raises(exception.RestartExtraction) def action_exit(opts): @@ -98,14 +201,17 @@ def action_exit(opts): def _exit(args): raise SystemExit(opts) - return _exit + return None, _exit ACTIONS = { - "print" : action_print, - "status" : action_status, - "level" : action_level, - "restart": action_restart, - "wait" : action_wait, - "exit" : action_exit, + "abort" : action_abort, + "exec" : action_exec, + "exit" : action_exit, + "level" : action_level, + "print" : action_print, + "restart" : action_restart, + "status" : action_status, + "terminate": action_terminate, + "wait" : action_wait, } diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 47f78a7..f017929 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -25,7 +25,7 @@ from . import aes, text, util SUPPORTED_BROWSERS_CHROMIUM = { - "brave", "chrome", "chromium", "edge", "opera", "vivaldi"} + "brave", "chrome", "chromium", "edge", "opera", "thorium", "vivaldi"} SUPPORTED_BROWSERS = SUPPORTED_BROWSERS_CHROMIUM | {"firefox", "safari"} logger = logging.getLogger("cookies") @@ -354,6 +354,7 @@ def _get_chromium_based_browser_settings(browser_name): "chromium": join(appdata_local, R"Chromium\User Data"), "edge" : join(appdata_local, R"Microsoft\Edge\User Data"), "opera" : join(appdata_roaming, R"Opera Software\Opera Stable"), + "thorium" : join(appdata_local, R"Thorium\User Data"), "vivaldi" : join(appdata_local, R"Vivaldi\User Data"), }[browser_name] @@ -365,6 +366,7 @@ def _get_chromium_based_browser_settings(browser_name): "chromium": join(appdata, "Chromium"), "edge" : join(appdata, "Microsoft Edge"), "opera" : join(appdata, "com.operasoftware.Opera"), + "thorium" : join(appdata, "Thorium"), "vivaldi" : join(appdata, "Vivaldi"), }[browser_name] @@ -377,6 +379,7 @@ def _get_chromium_based_browser_settings(browser_name): "chromium": join(config, "chromium"), "edge" : join(config, "microsoft-edge"), "opera" : join(config, "opera"), + "thorium" : join(config, "Thorium"), "vivaldi" : join(config, "vivaldi"), }[browser_name] @@ -390,6 +393,7 @@ def _get_chromium_based_browser_settings(browser_name): "edge" : "Microsoft Edge" if sys.platform == "darwin" else "Chromium", "opera" : "Opera" if sys.platform == "darwin" else "Chromium", + "thorium" : "Thorium", "vivaldi" : "Vivaldi" if sys.platform == "darwin" else "Chrome", }[browser_name] diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index a4b0997..a5e8b27 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -27,7 +27,8 @@ class _8chanExtractor(Extractor): Extractor.__init__(self, match) def _init(self): - self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2]) + self.cookies.set( + "TOS20240718", "1", domain=self.root.rpartition("/")[2]) @memcache() def cookies_prepare(self): diff --git 
a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 6aff1f3..e103cb1 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -22,6 +22,7 @@ modules = [ "8chan", "8muses", "adultempire", + "agnph", "architizer", "artstation", "aryion", @@ -33,6 +34,7 @@ modules = [ "bunkr", "catbox", "chevereto", + "cien", "comicvine", "cyberdrop", "danbooru", @@ -42,7 +44,6 @@ modules = [ "e621", "erome", "exhentai", - "fallenangels", "fanbox", "fanleaks", "fantia", @@ -84,6 +85,7 @@ modules = [ "keenspot", "kemonoparty", "khinsider", + "koharu", "komikcast", "lensdump", "lexica", diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py new file mode 100644 index 0000000..653b73f --- /dev/null +++ b/gallery_dl/extractor/agnph.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://agn.ph/""" + +from . import booru +from .. import text + +from xml.etree import ElementTree +import collections +import re + +BASE_PATTERN = r"(?:https?://)?agn\.ph" + + +class AgnphExtractor(booru.BooruExtractor): + category = "agnph" + root = "https://agn.ph" + page_start = 1 + per_page = 45 + + TAG_TYPES = { + "a": "artist", + "b": "copyright", + "c": "character", + "d": "species", + "m": "general", + } + + def _init(self): + self.cookies.set("confirmed_age", "true", domain="agn.ph") + + def _prepare(self, post): + post["date"] = text.parse_timestamp(post["created_at"]) + post["status"] = post["status"].strip() + post["has_children"] = ("true" in post["has_children"]) + + def _xml_to_dict(self, xml): + return {element.tag: element.text for element in xml} + + def _pagination(self, url, params): + params["api"] = "xml" + if "page" in params: + params["page"] = \ + self.page_start + text.parse_int(params["page"]) - 1 + else: + params["page"] = self.page_start + + while True: + data = self.request(url, params=params).text + root = ElementTree.fromstring(data) + + yield from map(self._xml_to_dict, root) + + attrib = root.attrib + if int(attrib["offset"]) + len(root) >= int(attrib["count"]): + return + + params["page"] += 1 + + def _html(self, post): + url = "{}/gallery/post/show/{}/".format(self.root, post["id"]) + return self.request(url).text + + def _tags(self, post, page): + tag_container = text.extr( + page, '<ul class="taglist">', '<h3>Statistics</h3>') + if not tag_container: + return + + tags = collections.defaultdict(list) + pattern = re.compile(r'class="(.)typetag">([^<]+)') + for tag_type, tag_name in pattern.findall(tag_container): + tags[tag_type].append(text.unquote(tag_name).replace(" ", "_")) + for key, value in tags.items(): + post["tags_" + self.TAG_TYPES[key]] = " ".join(value) + + +class AgnphTagExtractor(AgnphExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/gallery/post/(?:\?([^#]+))?$" + example = "https://agn.ph/gallery/post/?search=TAG" + + def __init__(self, match): + AgnphExtractor.__init__(self, match) + self.params = text.parse_query(self.groups[0]) + + def metadata(self): + return {"search_tags": self.params.get("search") or ""} + + def posts(self): + url = self.root + "/gallery/post/" + return self._pagination(url, self.params.copy()) + + +class AgnphPostExtractor(AgnphExtractor): + 
subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/gallery/post/show/(\d+)" + example = "https://agn.ph/gallery/post/show/12345/" + + def posts(self): + url = "{}/gallery/post/show/{}/?api=xml".format( + self.root, self.groups[0]) + post = ElementTree.fromstring(self.request(url).text) + return (self._xml_to_dict(post),) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index ec86263..17b780e 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -79,18 +79,20 @@ class AryionExtractor(Extractor): def metadata(self): """Return general metadata""" - def _pagination_params(self, url, params=None): + def _pagination_params(self, url, params=None, needle=None): if params is None: params = {"p": 1} else: params["p"] = text.parse_int(params.get("p"), 1) + if needle is None: + needle = "class='gallery-item' id='" + while True: page = self.request(url, params=params).text cnt = 0 - for post_id in text.extract_iter( - page, "class='gallery-item' id='", "'"): + for post_id in text.extract_iter(page, needle, "'"): cnt += 1 yield post_id @@ -200,6 +202,21 @@ class AryionGalleryExtractor(AryionExtractor): return util.advance(self._pagination_next(url), self.offset) +class AryionFavoriteExtractor(AryionExtractor): + """Extractor for a user's favorites gallery""" + subcategory = "favorite" + directory_fmt = ("{category}", "{user!l}", "favorites") + archive_fmt = "f_{user}_{id}" + categorytransfer = True + pattern = BASE_PATTERN + r"/favorites/([^/?#]+)" + example = "https://aryion.com/g4/favorites/USER" + + def posts(self): + url = "{}/g4/favorites/{}".format(self.root, self.user) + return self._pagination_params( + url, None, "class='gallery-item favorite' id='") + + class AryionTagExtractor(AryionExtractor): """Extractor for tag searches on eka's portal""" subcategory = "tag" diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index ad0caf9..f24059f 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -152,8 +152,16 @@ class BehanceGalleryExtractor(BehanceExtractor): continue if mtype == "image": - url = module["imageSizes"]["size_original"]["url"] - append((url, module)) + sizes = { + size["url"].rsplit("/", 2)[1]: size + for size in module["imageSizes"]["allAvailable"] + } + size = (sizes.get("source") or + sizes.get("max_3840") or + sizes.get("fs") or + sizes.get("hd") or + sizes.get("disp")) + append((size["url"], module)) elif mtype == "video": try: diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index cbd0e07..7e26f38 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -29,16 +29,21 @@ class BooruExtractor(BaseExtractor): url_key = self.config("url") if url_key: - self._file_url = operator.itemgetter(url_key) + if isinstance(url_key, (list, tuple)): + self._file_url = self._file_url_list + self._file_url_keys = url_key + else: + self._file_url = operator.itemgetter(url_key) for post in self.posts(): try: url = self._file_url(post) if url[0] == "/": url = self.root + url - except (KeyError, TypeError): - self.log.debug("Unable to fetch download URL for post %s " - "(md5: %s)", post.get("id"), post.get("md5")) + except Exception as exc: + self.log.debug("%s: %s", exc.__class__.__name__, exc) + self.log.warning("Unable to fetch download URL for post %s " + "(md5: %s)", post.get("id"), post.get("md5")) continue if fetch_html: @@ -73,6 +78,11 @@ class BooruExtractor(BaseExtractor): _file_url = 
operator.itemgetter("file_url") + def _file_url_list(self, post): + urls = (post[key] for key in self._file_url_keys if post.get(key)) + post["_fallback"] = it = iter(urls) + return next(it) + def _prepare(self, post): """Prepare a 'post's metadata""" diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index a093347..77f0de6 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -13,7 +13,7 @@ from .. import text BASE_PATTERN = ( r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|ru|la|is|to|ac|black|cat|media|red|site|ws))" + r"\.(?:s[kiu]|fi|ru|la|is|to|ac|black|cat|media|red|site|ws))" ) LEGACY_DOMAINS = { diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py new file mode 100644 index 0000000..bae86d0 --- /dev/null +++ b/gallery_dl/extractor/cien.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://ci-en.net/""" + +from .common import Extractor, Message +from .. import text, util + +BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)" + + +class CienExtractor(Extractor): + category = "cien" + root = "https://ci-en.net" + request_interval = (1.0, 2.0) + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + Extractor.__init__(self, match) + + def _init(self): + self.cookies.set("accepted_rating", "r18g", domain="ci-en.dlsite.com") + + def _pagination_articles(self, url, params): + data = {"_extractor": CienArticleExtractor} + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + page = self.request(url, params=params).text + + for card in text.extract_iter( + page, ' class="c-cardCase-item', '</div>'): + article_url = text.extr(card, ' href="', '"') + yield Message.Queue, article_url, data + + if ' rel="next"' not in page: + return + params["page"] += 1 + + +class CienArticleExtractor(CienExtractor): + subcategory = "article" + filename_fmt = "{num:>02} {filename}.{extension}" + directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}") + archive_fmt = "{post_id}_{num}" + pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)" + example = "https://ci-en.net/creator/123/article/12345" + + def items(self): + url = "{}/creator/{}/article/{}".format( + self.root, self.groups[0], self.groups[1]) + page = self.request(url, notfound="article").text + + post = util.json_loads(text.extr( + page, '<script type="application/ld+json">', '</script>'))[0] + + files = self._extract_files(post.get("articleBody") or page) + + post["post_url"] = url + post["post_id"] = text.parse_int(self.groups[1]) + post["count"] = len(files) + post["date"] = text.parse_datetime(post["datePublished"]) + + try: + del post["publisher"] + del post["sameAs"] + except Exception: + pass + + yield Message.Directory, post + for post["num"], file in enumerate(files, 1): + post.update(file) + if "extension" not in file: + text.nameext_from_url(file["url"], post) + yield Message.Url, file["url"], post + + def _extract_files(self, page): + files = [] + + filetypes = self.config("files") + if filetypes is None: + self._extract_files_image(page, files) + self._extract_files_video(page, files) + self._extract_files_download(page, files) + self._extract_files_gallery(page, files) + else: + generators = { + "image" : self._extract_files_image, + "video" : 
self._extract_files_video, + "download": self._extract_files_download, + "gallery" : self._extract_files_gallery, + "gallerie": self._extract_files_gallery, + } + if isinstance(filetypes, str): + filetypes = filetypes.split(",") + for ft in filetypes: + generators[ft.rstrip("s")](page, files) + + return files + + def _extract_files_image(self, page, files): + for image in text.extract_iter( + page, 'class="file-player-image"', "</figure>"): + size = text.extr(image, ' data-size="', '"') + w, _, h = size.partition("x") + + files.append({ + "url" : text.extr(image, ' data-raw="', '"'), + "width" : text.parse_int(w), + "height": text.parse_int(h), + "type" : "image", + }) + + def _extract_files_video(self, page, files): + for video in text.extract_iter( + page, "<vue-file-player", "</vue-file-player>"): + path = text.extr(video, ' base-path="', '"') + name = text.extr(video, ' file-name="', '"') + auth = text.extr(video, ' auth-key="', '"') + + file = text.nameext_from_url(name) + file["url"] = "{}video-web.mp4?{}".format(path, auth) + file["type"] = "video" + files.append(file) + + def _extract_files_download(self, page, files): + for download in text.extract_iter( + page, 'class="downloadBlock', "</div>"): + name = text.extr(download, "<p>", "<") + + file = text.nameext_from_url(name.rpartition(" ")[0]) + file["url"] = text.extr(download, ' href="', '"') + file["type"] = "download" + files.append(file) + + def _extract_files_gallery(self, page, files): + for gallery in text.extract_iter( + page, "<vue-image-gallery", "</vue-image-gallery>"): + + url = self.root + "/api/creator/gallery/images" + params = { + "hash" : text.extr(gallery, ' hash="', '"'), + "gallery_id": text.extr(gallery, ' gallery-id="', '"'), + "time" : text.extr(gallery, ' time="', '"'), + } + data = self.request(url, params=params).json() + url = self.root + "/api/creator/gallery/imagePath" + + for params["page"], params["file_id"] in enumerate( + data["imgList"]): + path = self.request(url, params=params).json()["path"] + + file = params.copy() + file["url"] = path + files.append(file) + + +class CienCreatorExtractor(CienExtractor): + subcategory = "creator" + pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$" + example = "https://ci-en.net/creator/123" + + def items(self): + url = "{}/creator/{}/article".format(self.root, self.groups[0]) + params = text.parse_query(self.groups[1]) + params["mode"] = "list" + return self._pagination_articles(url, params) + + +class CienRecentExtractor(CienExtractor): + subcategory = "recent" + pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?" + example = "https://ci-en.net/mypage/recent" + + def items(self): + url = self.root + "/mypage/recent" + params = text.parse_query(self.groups[0]) + return self._pagination_articles(url, params) + + +class CienFollowingExtractor(CienExtractor): + subcategory = "following" + pattern = BASE_PATTERN + r"/mypage/subscription(/following)?" 
+ example = "https://ci-en.net/mypage/subscription" + + def items(self): + url = self.root + "/mypage/subscription" + (self.groups[0] or "") + page = self.request(url).text + data = {"_extractor": CienCreatorExtractor} + + for subscription in text.extract_iter( + page, 'class="c-grid-subscriptionInfo', '</figure>'): + url = text.extr(subscription, ' href="', '"') + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index d7a41bc..df70571 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -378,7 +378,7 @@ class Extractor(): useragent = self.config("user-agent") if useragent is None: useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:109.0) Gecko/20100101 Firefox/115.0") + "rv:128.0) Gecko/20100101 Firefox/128.0") elif useragent == "browser": useragent = _browser_useragent() headers["User-Agent"] = useragent @@ -390,6 +390,8 @@ class Extractor(): headers["Accept-Encoding"] = "gzip, deflate, br" else: headers["Accept-Encoding"] = "gzip, deflate" + if ZSTD: + headers["Accept-Encoding"] += ", zstd" referer = self.config("referer", self.referer) if referer: @@ -789,10 +791,11 @@ class BaseExtractor(Extractor): instances = () def __init__(self, match): - Extractor.__init__(self, match) if not self.category: + self.groups = match.groups() + self.match = match self._init_category() - self._cfgpath = ("extractor", self.category, self.subcategory) + Extractor.__init__(self, match) def _init_category(self): for index, group in enumerate(self.groups): @@ -911,13 +914,12 @@ _browser_cookies = {} HTTP_HEADERS = { "firefox": ( ("User-Agent", "Mozilla/5.0 ({}; " - "rv:109.0) Gecko/20100101 Firefox/115.0"), + "rv:128.0) Gecko/20100101 Firefox/128.0"), ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9," - "image/avif,image/webp,*/*;q=0.8"), + "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"), ("Accept-Language", "en-US,en;q=0.5"), ("Accept-Encoding", None), ("Referer", None), - ("DNT", "1"), ("Connection", "keep-alive"), ("Upgrade-Insecure-Requests", "1"), ("Cookie", None), @@ -991,6 +993,12 @@ try: except AttributeError: BROTLI = False +# detect zstandard support +try: + ZSTD = urllib3.response.HAS_ZSTD +except AttributeError: + ZSTD = False + # set (urllib3) warnings filter action = config.get((), "warnings", "default") if action: diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2199cc8..a70710c 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -846,55 +846,6 @@ class DeviantartStatusExtractor(DeviantartExtractor): ) -class DeviantartPopularExtractor(DeviantartExtractor): - """Extractor for popular deviations""" - subcategory = "popular" - directory_fmt = ("{category}", "Popular", - "{popular[range]}", "{popular[search]}") - archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" - pattern = (r"(?:https?://)?www\.deviantart\.com/(?:" - r"(?:deviations/?)?\?order=(popular-[^/?#]+)" - r"|((?:[\w-]+/)*)(popular-[^/?#]+)" - r")/?(?:\?([^#]*))?") - example = "https://www.deviantart.com/popular-24-hours/" - - def __init__(self, match): - DeviantartExtractor.__init__(self, match) - self.user = "" - - trange1, path, trange2, query = match.groups() - query = text.parse_query(query) - self.search_term = query.get("q") - - trange = trange1 or trange2 or query.get("order", "") - if trange.startswith("popular-"): - trange = trange[8:] - self.time_range = { - "newest" : "now", - "most-recent" : 
"now", - "this-week" : "1week", - "this-month" : "1month", - "this-century": "alltime", - "all-time" : "alltime", - }.get(trange, "alltime") - - self.popular = { - "search": self.search_term or "", - "range" : trange or "all-time", - "path" : path.strip("/") if path else "", - } - - def deviations(self): - if self.time_range == "now": - return self.api.browse_newest(self.search_term, self.offset) - return self.api.browse_popular( - self.search_term, self.time_range, self.offset) - - def prepare(self, deviation): - DeviantartExtractor.prepare(self, deviation) - deviation["popular"] = self.popular - - class DeviantartTagExtractor(DeviantartExtractor): """Extractor for deviations from tag searches""" subcategory = "tag" @@ -1077,14 +1028,14 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): class DeviantartFollowingExtractor(DeviantartExtractor): """Extractor for user's watched users""" subcategory = "following" - pattern = BASE_PATTERN + "/about#watching$" + pattern = BASE_PATTERN + "/(?:about#)?watching" example = "https://www.deviantart.com/USER/about#watching" def items(self): - eclipse_api = DeviantartEclipseAPI(self) + api = DeviantartOAuthAPI(self) - for user in eclipse_api.user_watching(self.user, self.offset): - url = "{}/{}".format(self.root, user["username"]) + for user in api.user_friends(self.user): + url = "{}/{}".format(self.root, user["user"]["username"]) user["_extractor"] = DeviantartUserExtractor yield Message.Queue, url, user @@ -1095,7 +1046,7 @@ class DeviantartFollowingExtractor(DeviantartExtractor): class DeviantartOAuthAPI(): """Interface for the DeviantArt OAuth API - Ref: https://www.deviantart.com/developers/http/v1/20160316 + https://www.deviantart.com/developers/http/v1/20160316 """ CLIENT_ID = "5388" CLIENT_SECRET = "76b08c69cfb27f26d6161f9ab6d061a1" @@ -1188,29 +1139,6 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination(endpoint, params, public=False, unpack=True) - def browse_newest(self, query=None, offset=0): - """Browse newest deviations""" - endpoint = "/browse/newest" - params = { - "q" : query, - "limit" : 120, - "offset" : offset, - "mature_content": self.mature, - } - return self._pagination(endpoint, params) - - def browse_popular(self, query=None, timerange=None, offset=0): - """Yield popular deviations""" - endpoint = "/browse/popular" - params = { - "q" : query, - "limit" : 120, - "timerange" : timerange, - "offset" : offset, - "mature_content": self.mature, - } - return self._pagination(endpoint, params) - def browse_tags(self, tag, offset=0): """ Browse a tag """ endpoint = "/browse/tags" @@ -1223,11 +1151,12 @@ class DeviantartOAuthAPI(): return self._pagination(endpoint, params) def browse_user_journals(self, username, offset=0): - """Yield all journal entries of a specific user""" - endpoint = "/browse/user/journals" - params = {"username": username, "offset": offset, "limit": 50, - "mature_content": self.mature, "featured": "false"} - return self._pagination(endpoint, params) + journals = filter( + lambda post: "/journal/" in post["url"], + self.user_profile_posts(username)) + if offset: + journals = util.advance(journals, offset) + return journals def collections(self, username, folder_id, offset=0): """Yield all Deviation-objects contained in a collection folder""" @@ -1339,16 +1268,10 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination_list(endpoint, params) - @memcache(keyarg=1) - def user_profile(self, username): - """Get user profile information""" - 
endpoint = "/user/profile/" + username - return self._call(endpoint, fatal=False) - - def user_statuses(self, username, offset=0): - """Yield status updates of a specific user""" - endpoint = "/user/statuses/" - params = {"username": username, "offset": offset, "limit": 50} + def user_friends(self, username, offset=0): + """Get the users list of friends""" + endpoint = "/user/friends/" + username + params = {"limit": 50, "offset": offset, "mature_content": self.mature} return self._pagination(endpoint, params) def user_friends_watch(self, username): @@ -1376,6 +1299,27 @@ class DeviantartOAuthAPI(): endpoint, method="POST", public=False, fatal=False, ).get("success") + @memcache(keyarg=1) + def user_profile(self, username): + """Get user profile information""" + endpoint = "/user/profile/" + username + return self._call(endpoint, fatal=False) + + def user_profile_posts(self, username): + endpoint = "/user/profile/posts" + params = {"username": username, "limit": 50, + "mature_content": self.mature} + return self._pagination(endpoint, params) + + def user_statuses(self, username, offset=0): + """Yield status updates of a specific user""" + statuses = filter( + lambda post: "/status-update/" in post["url"], + self.user_profile_posts(username)) + if offset: + statuses = util.advance(statuses, offset) + return statuses + def authenticate(self, refresh_token_key): """Authenticate the application by requesting an access token""" self.headers["Authorization"] = \ @@ -1464,7 +1408,7 @@ class DeviantartOAuthAPI(): self.log.error(msg) return data - def _switch_tokens(self, results, params): + def _should_switch_tokens(self, results, params): if len(results) < params["limit"]: return True @@ -1496,7 +1440,7 @@ class DeviantartOAuthAPI(): results = [item["journal"] for item in results if "journal" in item] if extend: - if public and self._switch_tokens(results, params): + if public and self._should_switch_tokens(results, params): if self.refresh_token_key: self.log.debug("Switching to private access token") public = False @@ -1540,6 +1484,11 @@ class DeviantartOAuthAPI(): return params["offset"] = int(params["offset"]) + len(results) + def _pagination_list(self, endpoint, params, key="results"): + result = [] + result.extend(self._pagination(endpoint, params, False, key=key)) + return result + @staticmethod def _shared_content(results): """Return an iterable of shared deviations in 'results'""" @@ -1548,11 +1497,6 @@ class DeviantartOAuthAPI(): if "deviation" in item: yield item["deviation"] - def _pagination_list(self, endpoint, params, key="results"): - result = [] - result.extend(self._pagination(endpoint, params, False, key=key)) - return result - def _metadata(self, deviations): """Add extended metadata to each deviation object""" if len(deviations) <= self.limit: diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py index 26f2184..2f0230a 100644 --- a/gallery_dl/extractor/directlink.py +++ b/gallery_dl/extractor/directlink.py @@ -18,7 +18,8 @@ class DirectlinkExtractor(Extractor): filename_fmt = "{domain}/{path}/{filename}.{extension}" archive_fmt = filename_fmt pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\." 
- r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" + r"(?:jpe?g|jpe|png|gif|bmp|svg|web[mp]|avif|heic|psd" + r"|mp4|m4v|mov|mkv|og[gmv]|wav|mp3|opus|zip|rar|7z|pdf|swf))" r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$") example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png" diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py index 733d0d8..583869f 100644 --- a/gallery_dl/extractor/dynastyscans.py +++ b/gallery_dl/extractor/dynastyscans.py @@ -66,6 +66,8 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): text.extr(group, ' alt="', '"')), "date" : text.parse_datetime(extr( '"icon-calendar"></i> ', '<'), "%b %d, %Y"), + "tags" : text.split_html(extr( + "class='tags'>", "<div id='chapter-actions'")), "lang" : "en", "language": "English", } diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 8c9da2f..e6d136f 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -46,18 +46,24 @@ class EromeExtractor(Extractor): page, 'href="https://www.erome.com/', '"', pos) urls = [] + date = None groups = page.split('<div class="media-group"') for group in util.advance(groups, 1): url = (text.extr(group, '<source src="', '"') or text.extr(group, 'data-src="', '"')) if url: urls.append(url) + if not date: + ts = text.extr(group, '?v=', '"') + if len(ts) > 1: + date = text.parse_timestamp(ts) data = { "album_id" : album_id, "title" : text.unescape(title), "user" : text.unquote(user), "count" : len(urls), + "date" : date, "_http_headers": {"Referer": url}, } diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 1805403..1b4f995 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -394,6 +394,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.original = False return self.data["_url_1280"] + if " temporarily banned " in page: + raise exception.AuthorizationError("Temporarily Banned") + self._report_limits() return True diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py deleted file mode 100644 index 650a707..0000000 --- a/gallery_dl/extractor/fallenangels.py +++ /dev/null @@ -1,84 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2017-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://www.fascans.com/""" - -from .common import ChapterExtractor, MangaExtractor -from .. 
import text, util - - -class FallenangelsChapterExtractor(ChapterExtractor): - """Extractor for manga chapters from fascans.com""" - category = "fallenangels" - pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com" - r"/manga/([^/?#]+)/([^/?#]+)") - example = "https://manga.fascans.com/manga/NAME/CHAPTER/" - - def __init__(self, match): - self.version, self.manga, self.chapter = match.groups() - url = "https://{}.fascans.com/manga/{}/{}/1".format( - self.version, self.manga, self.chapter) - ChapterExtractor.__init__(self, match, url) - - def metadata(self, page): - extr = text.extract_from(page) - lang = "vi" if self.version == "truyen" else "en" - chapter, sep, minor = self.chapter.partition(".") - return { - "manga" : extr('name="description" content="', ' Chapter '), - "title" : extr(': ', ' - Page 1'), - "chapter" : chapter, - "chapter_minor": sep + minor, - "lang" : lang, - "language": util.code_to_language(lang), - } - - @staticmethod - def images(page): - return [ - (img["page_image"], None) - for img in util.json_loads( - text.extr(page, "var pages = ", ";") - ) - ] - - -class FallenangelsMangaExtractor(MangaExtractor): - """Extractor for manga from fascans.com""" - chapterclass = FallenangelsChapterExtractor - category = "fallenangels" - pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$" - example = "https://manga.fascans.com/manga/NAME" - - def __init__(self, match): - url = "https://" + match.group(1) - self.lang = "vi" if match.group(2) == "truyen" else "en" - MangaExtractor.__init__(self, match, url) - - def chapters(self, page): - extr = text.extract_from(page) - results = [] - language = util.code_to_language(self.lang) - while extr('<li style="', '"'): - vol = extr('class="volume-', '"') - url = extr('href="', '"') - cha = extr('>', '<') - title = extr('<em>', '</em>') - - manga, _, chapter = cha.rpartition(" ") - chapter, dot, minor = chapter.partition(".") - results.append((url, { - "manga" : manga, - "title" : text.unescape(title), - "volume" : text.parse_int(vol), - "chapter" : text.parse_int(chapter), - "chapter_minor": dot + minor, - "lang" : self.lang, - "language": language, - })) - return results diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 6040187..f48a984 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -113,6 +113,12 @@ class FuraffinityExtractor(Extractor): data["gender"] = rh(extr('>Gender</strong>', '</div>')) data["width"] = pi(extr("<span>", "x")) data["height"] = pi(extr("", "p")) + data["folders"] = folders = [] + for folder in extr( + "<h3>Listed in Folders</h3>", "</section>").split("</a>"): + folder = rh(folder) + if folder: + folders.append(folder) else: # old site layout data["title"] = text.unescape(extr("<h2>", "</h2>")) @@ -132,11 +138,14 @@ class FuraffinityExtractor(Extractor): data["_description"] = extr( '<td valign="top" align="left" width="70%" class="alt1" ' 'style="padding:8px">', ' </td>') + data["folders"] = () # folders not present in old layout data["artist_url"] = data["artist"].replace("_", "").lower() data["user"] = self.user or data["artist_url"] data["date"] = text.parse_timestamp(data["filename"].partition(".")[0]) data["description"] = self._process_description(data["_description"]) + data["thumbnail"] = "https://t.furaffinity.net/{}@600-{}.jpg".format( + post_id, path.rsplit("/", 2)[1]) return data diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 8d8b8ad..fbbd26c 
100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -36,7 +36,9 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = self.page_start params["limit"] = self.per_page - post = None + post = total = None + count = 0 + while True: try: root = self._api_request(params) @@ -50,12 +52,29 @@ class GelbooruV02Extractor(booru.BooruExtractor): params["pid"] = 0 continue + if total is None: + try: + total = int(root.attrib["count"]) + self.log.debug("%s posts in total", total) + except Exception as exc: + total = 0 + self.log.debug( + "Failed to get total number of posts (%s: %s)", + exc.__class__.__name__, exc) + post = None for post in root: yield post.attrib - if len(root) < self.per_page: - return + num = len(root) + count += num + if num < self.per_page: + if not total or count >= total: + return + if not num: + self.log.debug("Empty response - Retrying") + continue + params["pid"] += 1 def _pagination_html(self, params): diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 97b7844..286ee38 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -70,10 +70,13 @@ class HentainexusGalleryExtractor(GalleryExtractor): for img in imgs: img["_http_headers"] = headers - return [ - (img["image"], img) - for img in imgs - ] + results = [] + for img in imgs: + try: + results.append((img["image"], img)) + except KeyError: + pass + return results @staticmethod def _decode(data): diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index a2b51be..34fbabd 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -23,6 +23,12 @@ class HotleakExtractor(Extractor): def items(self): for post in self.posts(): + if self.type == "photo": + post["url"] = ( + post["url"] + .replace("/storage/storage/", "/storage/") + .replace("_thumb.", ".") + ) post["_http_expected_status"] = (404,) yield Message.Directory, post yield Message.Url, post["url"], post diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 85446c0..345f51d 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -19,7 +19,7 @@ class ImagefapExtractor(Extractor): category = "imagefap" root = "https://www.imagefap.com" directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{filename}.{extension}" + filename_fmt = "{category}_{gallery_id}_{num:04}_{filename}.{extension}" archive_fmt = "{gallery_id}_{image_id}" request_interval = (2.0, 4.0) diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 2ae8cbe..f3098f1 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -246,14 +246,12 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor): data = {"_extractor": InkbunnyUserExtractor} while True: - cnt = 0 for user in text.extract_iter( page, '<a class="widget_userNameSmall" href="', '"', page.index('id="changethumboriginal_form"')): - cnt += 1 yield Message.Queue, self.root + user, data - if cnt < 20: + if "<a title='next page' " not in page: return params["page"] += 1 page = self.request(url, params=params).text diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index f7a5cc7..dbe2df3 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -596,6 +596,22 @@ class InstagramTagExtractor(InstagramExtractor): return self.api.tags_media(self.item) 
+class InstagramInfoExtractor(InstagramExtractor): + """Extractor for an Instagram user's profile data""" + subcategory = "info" + pattern = USER_PATTERN + r"/info" + example = "https://www.instagram.com/USER/info/" + + def items(self): + screen_name = self.item + if screen_name.startswith("id:"): + user = self.api.user_by_id(screen_name[3:]) + else: + user = self.api.user_by_name(screen_name) + + return iter(((Message.Directory, user),)) + + class InstagramAvatarExtractor(InstagramExtractor): """Extractor for an Instagram user's avatar""" subcategory = "avatar" @@ -975,9 +991,9 @@ class InstagramGraphqlAPI(): if not info["has_next_page"]: return extr._update_cursor(None) elif not data["edges"]: - s = "" if self.item.endswith("s") else "s" + s = "" if self.extractor.item.endswith("s") else "s" raise exception.StopExtraction( - "%s'%s posts are private", self.item, s) + "%s'%s posts are private", self.extractor.item, s) variables["after"] = extr._update_cursor(info["end_cursor"]) diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py new file mode 100644 index 0000000..979b1a2 --- /dev/null +++ b/gallery_dl/extractor/koharu.py @@ -0,0 +1,221 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://koharu.to/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, exception +from ..cache import cache + +BASE_PATTERN = r"(?i)(?:https?://)?(?:koharu|anchira)\.to" + + +class KoharuExtractor(Extractor): + """Base class for koharu extractors""" + category = "koharu" + root = "https://koharu.to" + root_api = "https://api.koharu.to" + request_interval = (0.5, 1.5) + + def _init(self): + self.headers = { + "Accept" : "*/*", + "Referer": self.root + "/", + "Origin" : self.root, + } + + def _pagination(self, endpoint, params): + url_api = self.root_api + endpoint + + while True: + data = self.request( + url_api, params=params, headers=self.headers).json() + + try: + entries = data["entries"] + except KeyError: + return + + for entry in entries: + url = "{}/g/{}/{}".format( + self.root, entry["id"], entry["public_key"]) + entry["_extractor"] = KoharuGalleryExtractor + yield Message.Queue, url, entry + + try: + if data["limit"] * data["page"] >= data["total"]: + return + except Exception: + pass + params["page"] += 1 + + +class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor): + """Extractor for koharu galleries""" + filename_fmt = "{num:>03}.{extension}" + directory_fmt = ("{category}", "{id} {title}") + archive_fmt = "{id}_{num}" + request_interval = 0.0 + pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)" + example = "https://koharu.to/g/12345/67890abcde/" + + TAG_TYPES = { + 0 : "general", + 1 : "artist", + 2 : "circle", + 3 : "parody", + 4 : "magazine", + 5 : "character", + 6 : "", + 7 : "uploader", + 8 : "male", + 9 : "female", + 10: "mixed", + 11: "language", + 12: "other", + } + + def __init__(self, match): + GalleryExtractor.__init__(self, match) + self.gallery_url = None + + def _init(self): + self.headers = { + "Accept" : "*/*", + "Referer": self.root + "/", + "Origin" : self.root, + } + + self.fmt = self.config("format") + self.cbz = self.config("cbz", True) + + if self.cbz: + self.filename_fmt = "{id} {title}.{extension}" + self.directory_fmt = ("{category}",) + + def metadata(self, _): + url = 
"{}/books/detail/{}/{}".format( + self.root_api, self.groups[0], self.groups[1]) + self.data = data = self.request(url, headers=self.headers).json() + + tags = [] + for tag in data["tags"]: + name = tag["name"] + namespace = tag.get("namespace", 0) + tags.append(self.TAG_TYPES[namespace] + ":" + name) + data["tags"] = tags + data["date"] = text.parse_timestamp(data["created_at"] // 1000) + + try: + if self.cbz: + data["count"] = len(data["thumbnails"]["entries"]) + del data["thumbnails"] + del data["rels"] + except Exception: + pass + + return data + + def images(self, _): + data = self.data + fmt = self._select_format(data["data"]) + + url = "{}/books/data/{}/{}/{}/{}".format( + self.root_api, + data["id"], data["public_key"], + fmt["id"], fmt["public_key"], + ) + params = { + "v": data["updated_at"], + "w": fmt["w"], + } + + if self.cbz: + params["action"] = "dl" + base = self.request( + url, method="POST", params=params, headers=self.headers, + ).json()["base"] + url = "{}?v={}&w={}".format(base, data["updated_at"], fmt["w"]) + info = text.nameext_from_url(base) + if not info["extension"]: + info["extension"] = "cbz" + return ((url, info),) + + data = self.request(url, params=params, headers=self.headers).json() + base = data["base"] + + results = [] + for entry in data["entries"]: + dimensions = entry["dimensions"] + info = { + "w": dimensions[0], + "h": dimensions[1], + "_http_headers": self.headers, + } + results.append((base + entry["path"], info)) + return results + + def _select_format(self, formats): + if not self.fmt or self.fmt == "original": + fmtid = "0" + else: + fmtid = str(self.fmt) + + try: + fmt = formats[fmtid] + except KeyError: + raise exception.NotFoundError("format") + + fmt["w"] = fmtid + return fmt + + +class KoharuSearchExtractor(KoharuExtractor): + """Extractor for koharu search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/\?([^#]*)" + example = "https://koharu.to/?s=QUERY" + + def items(self): + params = text.parse_query(self.groups[0]) + params["page"] = text.parse_int(params.get("page"), 1) + return self._pagination("/books", params) + + +class KoharuFavoriteExtractor(KoharuExtractor): + """Extractor for koharu favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?" 
+ example = "https://koharu.to/favorites" + + def items(self): + self.login() + + params = text.parse_query(self.groups[0]) + params["page"] = text.parse_int(params.get("page"), 1) + return self._pagination("/favorites", params) + + def login(self): + username, password = self._get_auth_info() + if username: + self.headers["Authorization"] = \ + "Bearer " + self._login_impl(username, password) + return + + raise exception.AuthenticationError("Username and password required") + + @cache(maxage=86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = "https://auth.koharu.to/login" + data = {"uname": username, "passwd": password} + response = self.request( + url, method="POST", headers=self.headers, data=data) + + return response.json()["session"] diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 60cca22..b01c591 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -120,7 +120,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): ] else: pos = page.find('id="view-center"') + 1 - return (text.extr(page, 'itemprop="image" src="', '"', pos),) + # do NOT use text.extr() here, as it doesn't support a pos argument + return (text.extract(page, 'itemprop="image" src="', '"', pos)[0],) @staticmethod def _extract_user_name(page): diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py index b21e1eb..2330b08 100644 --- a/gallery_dl/extractor/paheal.py +++ b/gallery_dl/extractor/paheal.py @@ -77,6 +77,7 @@ class PahealTagExtractor(PahealExtractor): pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?#]+)") example = "https://rule34.paheal.net/post/list/TAG/1" + page_start = 1 per_page = 70 def __init__(self, match): @@ -87,11 +88,16 @@ class PahealTagExtractor(PahealExtractor): if self.config("metadata"): self._extract_data = self._extract_data_ex + def skip(self, num): + pages = num // self.per_page + self.page_start += pages + return pages * self.per_page + def get_metadata(self): return {"search_tags": self.tags} def get_posts(self): - pnum = 1 + pnum = self.page_start while True: url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) page = self.request(url).text diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 115de9a..271fa50 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -78,12 +78,16 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): } def images(self, page): - return [ - (beau(url), None) - for url in text.extract_iter( - page, "lstImages.push('", "'", - ) - ] + results = [] + + for block in page.split(" pth = '")[1:]: + pth = text.extr(block, "", "'") + for needle, repl in re.findall( + r"pth = pth\.replace\(/([^/]+)/g, [\"']([^\"']*)", block): + pth = pth.replace(needle, repl) + results.append((beau(pth), None)) + + return results class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): @@ -116,9 +120,9 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): def beau(url): - """https://readcomiconline.li/Scripts/rguard.min.js""" - url = url.replace("_x236", "d") - url = url.replace("_x945", "g") + """https://readcomiconline.li/Scripts/rguard.min.js?v=1.5.1""" + url = url.replace("pw_.g28x", "b") + url = url.replace("d2pr.x_27", "h") if url.startswith("https"): return url @@ -126,8 +130,8 @@ def beau(url): url, sep, rest = 
url.partition("?") containsS0 = "=s0" in url url = url[:-3 if containsS0 else -6] - url = url[4:22] + url[25:] - url = url[0:-6] + url[-2:] + url = url[15:33] + url[50:] + url = url[0:-11] + url[-2:] url = binascii.a2b_base64(url).decode() url = url[0:13] + url[17:] url = url[0:-2] + ("=s0" if containsS0 else "=s1600") diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 327bcd1..506f6ac 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -190,7 +190,7 @@ class RedgifsImageExtractor(RedgifsExtractor): r"(?:\w+\.)?redgifs\.com/(?:watch|ifr)|" r"(?:\w+\.)?gfycat\.com(?:/gifs/detail|/\w+)?|" r"(?:www\.)?gifdeliverynetwork\.com|" - r"i\.redgifs\.com/i)/([A-Za-z]+)") + r"i\.redgifs\.com/i)/([A-Za-z0-9]+)") example = "https://redgifs.com/watch/ID" def gifs(self): diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index caf3e16..ad3efa7 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -16,7 +16,7 @@ import collections import re BASE_PATTERN = r"(?:https?://)?" \ - r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ + r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \ r"(?:/[a-z]{2})?" @@ -45,6 +45,9 @@ class SankakuExtractor(BooruExtractor): def skip(self, num): return 0 + def _init(self): + self.api = SankakuAPI(self) + def _file_url(self, post): url = post["file_url"] if not url: @@ -81,6 +84,15 @@ class SankakuExtractor(BooruExtractor): post["tags_" + key] = value post["tag_string_" + key] = " ".join(value) + def _notes(self, post, page): + if post.get("has_notes"): + post["notes"] = self.api.notes(post["id"]) + for note in post["notes"]: + note["created_at"] = note["created_at"]["s"] + note["updated_at"] = note["updated_at"]["s"] + else: + post["notes"] = () + class SankakuTagExtractor(SankakuExtractor): """Extractor for images from sankaku.app by search-tags""" @@ -109,7 +121,7 @@ class SankakuTagExtractor(SankakuExtractor): def posts(self): params = {"tags": self.tags} - return SankakuAPI(self).posts_keyset(params) + return self.api.posts_keyset(params) class SankakuPoolExtractor(SankakuExtractor): @@ -125,7 +137,7 @@ class SankakuPoolExtractor(SankakuExtractor): self.pool_id = match.group(1) def metadata(self): - pool = SankakuAPI(self).pools(self.pool_id) + pool = self.api.pools(self.pool_id) pool["tags"] = [tag["name"] for tag in pool["tags"]] pool["artist_tags"] = [tag["name"] for tag in pool["artist_tags"]] @@ -151,7 +163,7 @@ class SankakuPostExtractor(SankakuExtractor): self.post_id = match.group(1) def posts(self): - return SankakuAPI(self).posts(self.post_id) + return self.api.posts(self.post_id) class SankakuBooksExtractor(SankakuExtractor): @@ -167,7 +179,7 @@ class SankakuBooksExtractor(SankakuExtractor): def items(self): params = {"tags": self.tags, "pool_type": "0"} - for pool in SankakuAPI(self).pools_keyset(params): + for pool in self.api.pools_keyset(params): pool["_extractor"] = SankakuPoolExtractor url = "https://sankaku.app/books/{}".format(pool["id"]) yield Message.Queue, url, pool @@ -192,6 +204,10 @@ class SankakuAPI(): if not self.username: self.authenticate = util.noop + def notes(self, post_id): + params = {"lang": "en"} + return self._call("/posts/{}/notes".format(post_id), params) + def pools(self, pool_id): params = {"lang": "en"} return self._call("/pools/" + pool_id, params) diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index e1d4153..50c21e3 
100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://www.sankakucomplex.com/""" +"""Extractors for https://news.sankakucomplex.com/""" from .common import Extractor, Message from .. import text, util @@ -16,7 +16,7 @@ import re class SankakucomplexExtractor(Extractor): """Base class for sankakucomplex extractors""" category = "sankakucomplex" - root = "https://www.sankakucomplex.com" + root = "https://news.sankakucomplex.com" def __init__(self, match): Extractor.__init__(self, match) @@ -24,14 +24,14 @@ class SankakucomplexExtractor(Extractor): class SankakucomplexArticleExtractor(SankakucomplexExtractor): - """Extractor for articles on www.sankakucomplex.com""" + """Extractor for articles on news.sankakucomplex.com""" subcategory = "article" directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}") filename_fmt = "{filename}.{extension}" archive_fmt = "{date:%Y%m%d}_{filename}" - pattern = (r"(?:https?://)?www\.sankakucomplex\.com" + pattern = (r"(?:https?://)?(?:news|www)\.sankakucomplex\.com" r"/(\d\d\d\d/\d\d/\d\d/[^/?#]+)") - example = "https://www.sankakucomplex.com/1970/01/01/TITLE" + example = "https://news.sankakucomplex.com/1970/01/01/TITLE" def items(self): url = "{}/{}/?pg=X".format(self.root, self.path) @@ -87,9 +87,9 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor): class SankakucomplexTagExtractor(SankakucomplexExtractor): """Extractor for sankakucomplex blog articles by tag or author""" subcategory = "tag" - pattern = (r"(?:https?://)?www\.sankakucomplex\.com" + pattern = (r"(?:https?://)?(?:news|www)\.sankakucomplex\.com" r"/((?:tag|category|author)/[^/?#]+)") - example = "https://www.sankakucomplex.com/tag/TAG/" + example = "https://news.sankakucomplex.com/tag/TAG/" def items(self): pnum = 1 diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 0abb3ab..7c760ac 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -127,6 +127,8 @@ class SubscribestarExtractor(Extractor): } def _parse_datetime(self, dt): + if dt.startswith("Updated on "): + dt = dt[11:] date = text.parse_datetime(dt, "%b %d, %Y %I:%M %p") if date is dt: date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p") diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py index 78ff265..64fa951 100644 --- a/gallery_dl/extractor/toyhouse.py +++ b/gallery_dl/extractor/toyhouse.py @@ -77,23 +77,27 @@ class ToyhouseExtractor(Extractor): cnt += 1 yield self._parse_post(post) - if cnt == 0 and params["page"] == 1: - token, pos = text.extract( - page, '<input name="_token" type="hidden" value="', '"') - if not token: - return - data = { - "_token": token, - "user" : text.extract(page, 'value="', '"', pos)[0], - } - self.request(self.root + "/~account/warnings/accept", - method="POST", data=data, allow_redirects=False) - continue + if not cnt and params["page"] == 1: + if self._accept_content_warning(page): + continue + return if cnt < 18: return params["page"] += 1 + def _accept_content_warning(self, page): + pos = page.find(' name="_token"') + 1 + token, pos = text.extract(page, ' value="', '"', pos) + user , pos = text.extract(page, ' value="', '"', pos) + if not token or not user: + return False + + data = {"_token": token, "user": user} + self.request(self.root + "/~account/warnings/accept", + 
method="POST", data=data, allow_redirects=False) + return True + class ToyhouseArtExtractor(ToyhouseExtractor): """Extractor for artworks of a toyhouse user""" diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index c34910f..ff29c04 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -386,7 +386,7 @@ class TumblrAPI(oauth.OAuth1API): def posts(self, blog, params): """Retrieve published posts""" params["offset"] = self.extractor.config("offset") - params["limit"] = "50" + params["limit"] = 50 params["reblog_info"] = "true" params["type"] = self.posts_type params["before"] = self.before @@ -398,8 +398,14 @@ class TumblrAPI(oauth.OAuth1API): def likes(self, blog): """Retrieve liked posts""" + endpoint = "/v2/blog/{}/likes".format(blog) params = {"limit": "50", "before": self.before} - return self._pagination(blog, "/likes", params, key="liked_posts") + while True: + posts = self._call(endpoint, params)["liked_posts"] + if not posts: + return + yield from posts + params["before"] = posts[-1]["liked_timestamp"] def _call(self, endpoint, params, **kwargs): url = self.ROOT + endpoint @@ -474,6 +480,7 @@ class TumblrAPI(oauth.OAuth1API): if self.api_key: params["api_key"] = self.api_key + strategy = self.extractor.config("pagination") while True: data = self._call(endpoint, params) @@ -481,13 +488,31 @@ class TumblrAPI(oauth.OAuth1API): self.BLOG_CACHE[blog] = data["blog"] cache = False - yield from data[key] - - try: - endpoint = data["_links"]["next"]["href"] - except KeyError: - return + posts = data[key] + yield from posts - params = None - if self.api_key: - endpoint += "&api_key=" + self.api_key + if strategy == "api": + try: + endpoint = data["_links"]["next"]["href"] + except KeyError: + return + + params = None + if self.api_key: + endpoint += "&api_key=" + self.api_key + + elif strategy == "before": + if not posts: + return + timestamp = posts[-1]["timestamp"] + 1 + if params["before"] and timestamp >= params["before"]: + return + params["before"] = timestamp + params["offset"] = None + + else: # offset + params["offset"] = \ + text.parse_int(params["offset"]) + params["limit"] + params["before"] = None + if params["offset"] >= data["total_posts"]: + return diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ec098aa..9fa5b3f 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -51,6 +51,8 @@ class TwitterExtractor(Extractor): if not self.config("transform", True): self._transform_user = util.identity self._transform_tweet = util.identity + + self._cursor = None self._user = None self._user_obj = None self._user_cache = {} @@ -321,8 +323,17 @@ class TwitterExtractor(Extractor): "quote_count" : tget("quote_count"), "reply_count" : tget("reply_count"), "retweet_count" : tget("retweet_count"), + "bookmark_count": tget("bookmark_count"), } + if "views" in tweet: + try: + tdata["view_count"] = int(tweet["views"]["count"]) + except Exception: + tdata["view_count"] = 0 + else: + tdata["view_count"] = 0 + if "note_tweet" in tweet: note = tweet["note_tweet"]["note_tweet_results"]["result"] content = note["text"] @@ -492,6 +503,14 @@ class TwitterExtractor(Extractor): }, } + def _init_cursor(self): + return self.config("cursor") or None + + def _update_cursor(self, cursor): + self.log.debug("Cursor: %s", cursor) + self._cursor = cursor + return cursor + def metadata(self): """Return general metadata""" return {} @@ -499,6 +518,11 @@ class TwitterExtractor(Extractor): def 
tweets(self): """Yield all relevant tweet objects""" + def finalize(self): + if self._cursor: + self.log.info("Use '-o cursor=%s' to continue downloading " + "from the current position", self._cursor) + def login(self): if self.cookies_check(self.cookies_names): return @@ -530,6 +554,9 @@ class TwitterUserExtractor(TwitterExtractor): def initialize(self): pass + def finalize(self): + pass + def items(self): base = "{}/{}/".format(self.root, self.user) return self._dispatch_extractors(( @@ -549,30 +576,73 @@ class TwitterTimelineExtractor(TwitterExtractor): pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)" example = "https://x.com/USER/timeline" + def _init_cursor(self): + if self._cursor: + return self._cursor.partition("/")[2] or None + return None + + def _update_cursor(self, cursor): + if cursor: + self._cursor = self._cursor_prefix + cursor + self.log.debug("Cursor: %s", self._cursor) + else: + self._cursor = None + return cursor + def tweets(self): - # yield initial batch of (media) tweets - tweet = None - for tweet in self._select_tweet_source()(self.user): - yield tweet - if tweet is None: - return + self._cursor = cursor = self.config("cursor") or None + reset = False - # build search query - query = "from:{} max_id:{}".format( - self._user["name"], tweet["rest_id"]) - if self.retweets: - query += " include:retweets include:nativeretweets" + if cursor: + state = cursor.partition("/")[0] + state, _, tweet_id = state.partition("_") + state = text.parse_int(state, 1) + else: + state = 1 + + if state <= 1: + self._cursor_prefix = "1/" - if not self.textonly: - # try to search for media-only tweets + # yield initial batch of (media) tweets tweet = None - for tweet in self.api.search_timeline(query + " filter:links"): + for tweet in self._select_tweet_source()(self.user): yield tweet - if tweet is not None: + if tweet is None and not cursor: return + tweet_id = tweet["rest_id"] + + state = reset = 2 + else: + self.api._user_id_by_screen_name(self.user) + + # build search query + query = "from:{} max_id:{}".format(self._user["name"], tweet_id) + if self.retweets: + query += " include:retweets include:nativeretweets" - # yield unfiltered search results - yield from self.api.search_timeline(query) + if state <= 2: + self._cursor_prefix = "2_{}/".format(tweet_id) + if reset: + self._cursor = self._cursor_prefix + + if not self.textonly: + # try to search for media-only tweets + tweet = None + for tweet in self.api.search_timeline(query + " filter:links"): + yield tweet + if tweet is not None: + return self._update_cursor(None) + + state = reset = 3 + + if state <= 3: + # yield unfiltered search results + self._cursor_prefix = "3_{}/".format(tweet_id) + if reset: + self._cursor = self._cursor_prefix + + yield from self.api.search_timeline(query) + return self._update_cursor(None) def _select_tweet_source(self): strategy = self.config("strategy") @@ -854,6 +924,24 @@ class TwitterQuotesExtractor(TwitterExtractor): yield Message.Queue, url, data +class TwitterInfoExtractor(TwitterExtractor): + """Extractor for a user's profile data""" + subcategory = "info" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/info" + example = "https://x.com/USER/info" + + def items(self): + api = TwitterAPI(self) + + screen_name = self.user + if screen_name.startswith("id:"): + user = api.user_by_rest_id(screen_name[3:]) + else: + user = api.user_by_screen_name(screen_name) + + return iter(((Message.Directory, self._transform_user(user)),)) + + class TwitterAvatarExtractor(TwitterExtractor): 
subcategory = "avatar" filename_fmt = "avatar {date}.{extension}" @@ -1388,7 +1476,11 @@ class TwitterAPI(): "%s %s (%s)", response.status_code, response.reason, errors) def _pagination_legacy(self, endpoint, params): - original_retweets = (self.extractor.retweets == "original") + extr = self.extractor + cursor = extr._init_cursor() + if cursor: + params["cursor"] = cursor + original_retweets = (extr.retweets == "original") bottom = ("cursor-bottom-", "sq-cursor-bottom") while True: @@ -1396,7 +1488,7 @@ class TwitterAPI(): instructions = data["timeline"]["instructions"] if not instructions: - return + return extr._update_cursor(None) tweets = data["globalObjects"]["tweets"] users = data["globalObjects"]["users"] @@ -1477,8 +1569,8 @@ class TwitterAPI(): # stop on empty response if not cursor or (not tweets and not tweet_id): - return - params["cursor"] = cursor + return extr._update_cursor(None) + params["cursor"] = extr._update_cursor(cursor) def _pagination_tweets(self, endpoint, variables, path=None, stop_tweets=True, features=None): @@ -1487,6 +1579,9 @@ class TwitterAPI(): pinned_tweet = extr.pinned params = {"variables": None} + cursor = extr._init_cursor() + if cursor: + variables["cursor"] = cursor if features is None: features = self.features_pagination if features: @@ -1523,7 +1618,7 @@ class TwitterAPI(): cursor = entry["content"]["value"] if entries is None: if not cursor: - return + return extr._update_cursor(None) entries = () except LookupError: @@ -1672,12 +1767,16 @@ class TwitterAPI(): continue if stop_tweets and not tweet: - return + return extr._update_cursor(None) if not cursor or cursor == variables.get("cursor"): - return - variables["cursor"] = cursor + return extr._update_cursor(None) + variables["cursor"] = extr._update_cursor(cursor) def _pagination_users(self, endpoint, variables, path=None): + extr = self.extractor + cursor = extr._init_cursor() + if cursor: + variables["cursor"] = cursor params = { "variables": None, "features" : self._json_dumps(self.features_pagination), @@ -1697,7 +1796,7 @@ class TwitterAPI(): data = data[key] instructions = data["instructions"] except KeyError: - return + return extr._update_cursor(None) for instr in instructions: if instr["type"] == "TimelineAddEntries": @@ -1715,8 +1814,8 @@ class TwitterAPI(): cursor = entry["content"]["value"] if not cursor or cursor.startswith(("-1|", "0|")) or not entry: - return - variables["cursor"] = cursor + return extr._update_cursor(None) + variables["cursor"] = extr._update_cursor(cursor) def _handle_ratelimit(self, response): rl = self.extractor.config("ratelimit") @@ -1864,7 +1963,7 @@ def _login_impl(extr, username, password): }, } elif subtask == "LoginEnterAlternateIdentifierSubtask": - alt = extr.config("username_alt") or extr.input( + alt = extr.config("username-alt") or extr.input( "Alternate Identifier (username, email, phone number): ") data = { "enter_text": { diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 6dfb23c..5cde0d6 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -101,7 +101,8 @@ class VipergirlsExtractor(Extractor): class VipergirlsThreadExtractor(VipergirlsExtractor): """Extractor for vipergirls threads""" subcategory = "thread" - pattern = BASE_PATTERN + r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?$" + pattern = (BASE_PATTERN + + r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))") example = "https://vipergirls.to/threads/12345-TITLE" def __init__(self, match): diff --git 
a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py index c112f4a..922a591 100644 --- a/gallery_dl/extractor/vsco.py +++ b/gallery_dl/extractor/vsco.py @@ -115,9 +115,28 @@ class VscoExtractor(Extractor): class VscoUserExtractor(VscoExtractor): - """Extractor for images from a user on vsco.co""" + """Extractor for a vsco user profile""" subcategory = "user" - pattern = USER_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])" + pattern = USER_PATTERN + r"/?$" + example = "https://vsco.co/USER" + + def initialize(self): + pass + + def items(self): + base = "{}/{}/".format(self.root, self.user) + return self._dispatch_extractors(( + (VscoAvatarExtractor , base + "avatar"), + (VscoGalleryExtractor , base + "gallery"), + (VscoSpacesExtractor , base + "spaces"), + (VscoCollectionExtractor, base + "collection"), + ), ("gallery",)) + + +class VscoGalleryExtractor(VscoExtractor): + """Extractor for a vsco user's gallery""" + subcategory = "gallery" + pattern = USER_PATTERN + r"/(?:gallery|images)" example = "https://vsco.co/USER/gallery" def images(self): diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py index faf3b0d..796f3f8 100644 --- a/gallery_dl/extractor/wallpapercave.py +++ b/gallery_dl/extractor/wallpapercave.py @@ -18,7 +18,7 @@ class WallpapercaveImageExtractor(Extractor): category = "wallpapercave" subcategory = "image" root = "https://wallpapercave.com" - pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com" + pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com/" example = "https://wallpapercave.com/w/wp12345" def items(self): @@ -40,3 +40,12 @@ class WallpapercaveImageExtractor(Extractor): image = text.nameext_from_url(path) yield Message.Directory, image yield Message.Url, self.root + path, image + + if path is None: + for wp in text.extract_iter( + page, 'class="wallpaper" id="wp', '</picture>'): + path = text.rextract(wp, ' src="', '"')[0] + if path: + image = text.nameext_from_url(path) + yield Message.Directory, image + yield Message.Url, self.root + path, image diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index e91f45f..61a36d5 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -64,7 +64,7 @@ class WarosuThreadExtractor(Extractor): def parse(self, post): """Build post object by extracting data from an HTML post""" data = self._extract_post(post) - if "<span> File:" in post and self._extract_image(post, data): + if "<span class=fileinfo>" in post and self._extract_image(post, data): part = data["image"].rpartition("/")[2] data["tim"], _, data["extension"] = part.partition(".") data["ext"] = "." + data["extension"] @@ -83,7 +83,7 @@ class WarosuThreadExtractor(Extractor): def _extract_image(self, post, data): extr = text.extract_from(post) - data["fsize"] = extr("<span> File: ", ", ") + data["fsize"] = extr("<span class=fileinfo> File: ", ", ") data["w"] = extr("", "x") data["h"] = extr("", ", ") data["filename"] = text.unquote(extr( diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index fc61dff..126ef49 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -11,6 +11,8 @@ from .booru import BooruExtractor from ..cache import cache from .. 
import text, util, exception +import collections +import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" @@ -76,22 +78,29 @@ class ZerochanExtractor(BooruExtractor): 'class="breadcrumbs', '</nav>'))[2:], "uploader": extr('href="/user/', '"'), "tags" : extr('<ul id="tags"', '</ul>'), - "source" : extr('<h2>Source</h2>', '</p><h2>').rpartition( - ">")[2] or None, + "source" : text.unescape(text.extr( + extr('id="source-url"', '</a>'), 'href="', '"')), } html = data["tags"] tags = data["tags"] = [] for tag in html.split("<li class=")[1:]: - category = text.extr(tag, 'data-type="', '"') + category = text.extr(tag, '"', '"') name = text.extr(tag, 'data-tag="', '"') - tags.append(category.capitalize() + ":" + name) + tags.append(category.partition(" ")[0].capitalize() + ":" + name) return data def _parse_entry_api(self, entry_id): url = "{}/{}?json".format(self.root, entry_id) - item = self.request(url).json() + text = self.request(url).text + try: + item = util.json_loads(text) + except ValueError as exc: + if " control character " not in str(exc): + raise + text = re.sub(r"[\x00-\x1f\x7f]", "", text) + item = util.json_loads(text) data = { "id" : item["id"], @@ -109,6 +118,14 @@ class ZerochanExtractor(BooruExtractor): return data + def _tags(self, post, page): + tags = collections.defaultdict(list) + for tag in post["tags"]: + category, _, name = tag.partition(":") + tags[category].append(name) + for key, value in tags.items(): + post["tags_" + key.lower()] = value + class ZerochanTagExtractor(ZerochanExtractor): subcategory = "tag" @@ -180,10 +197,16 @@ class ZerochanTagExtractor(ZerochanExtractor): static = "https://static.zerochan.net/.full." while True: - data = self.request(url, params=params).json() + response = self.request(url, params=params, allow_redirects=False) + if response.status_code >= 300: + url = text.urljoin(self.root, response.headers["location"]) + response = self.request(url, params=params) + data = response.json() + try: posts = data["items"] - except ValueError: + except Exception: + self.log.debug("Server response: %s", data) return if metadata: @@ -191,13 +214,13 @@ class ZerochanTagExtractor(ZerochanExtractor): post_id = post["id"] post.update(self._parse_entry_html(post_id)) post.update(self._parse_entry_api(post_id)) + yield post else: for post in posts: base = static + str(post["id"]) post["file_url"] = base + ".jpg" post["_fallback"] = (base + ".png",) - - yield from posts + yield post if not data.get("next"): return diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 4562b05..0e0916d 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -33,6 +33,7 @@ stdout_write = output.stdout_write class Job(): """Base class for Job types""" ulog = None + _logger_adapter = output.LoggerAdapter def __init__(self, extr, parent=None): if isinstance(extr, str): @@ -77,9 +78,9 @@ class Job(): actions = extr.config("actions") if actions: - from .actions import parse + from .actions import LoggerAdapter, parse + self._logger_adapter = LoggerAdapter self._logger_actions = parse(actions) - self._wrap_logger = self._wrap_logger_actions path_proxy = output.PathfmtProxy(self) self._logger_extra = { @@ -267,10 +268,7 @@ class Job(): return self._wrap_logger(logging.getLogger(name)) def _wrap_logger(self, logger): - return output.LoggerAdapter(logger, self) - - def _wrap_logger_actions(self, logger): - return output.LoggerAdapterActions(logger, self) + return self._logger_adapter(logger, self) def _write_unsupported(self, url): if self.ulog: @@ -315,7 +313,7 @@ class 
DownloadJob(Job): pathfmt.build_path() if pathfmt.exists(): - if archive: + if archive and self._archive_write_skip: archive.add(kwdict) self.handle_skip() return @@ -345,7 +343,7 @@ class DownloadJob(Job): return if not pathfmt.temppath: - if archive: + if archive and self._archive_write_skip: archive.add(kwdict) self.handle_skip() return @@ -359,7 +357,7 @@ class DownloadJob(Job): pathfmt.finalize() self.out.success(pathfmt.path) self._skipcnt = 0 - if archive: + if archive and self._archive_write_file: archive.add(kwdict) if "after" in hooks: for callback in hooks["after"]: @@ -561,6 +559,16 @@ class DownloadJob(Job): else: extr.log.debug("Using download archive '%s'", archive_path) + events = cfg("archive-event") + if events is None: + self._archive_write_file = True + self._archive_write_skip = False + else: + if isinstance(events, str): + events = events.split(",") + self._archive_write_file = ("file" in events) + self._archive_write_skip = ("skip" in events) + skip = cfg("skip", True) if skip: self._skipexc = None @@ -676,7 +684,7 @@ class SimulationJob(DownloadJob): kwdict["extension"] = "jpg" if self.sleep: self.extractor.sleep(self.sleep(), "download") - if self.archive: + if self.archive and self._archive_write_skip: self.archive.add(kwdict) self.out.skip(self.pathfmt.build_filename(kwdict)) @@ -848,16 +856,22 @@ class InfoJob(Job): class DataJob(Job): """Collect extractor results and dump them""" + resolve = False - def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True): + def __init__(self, url, parent=None, file=sys.stdout, ensure_ascii=True, + resolve=False): Job.__init__(self, url, parent) self.file = file self.data = [] self.ascii = config.get(("output",), "ascii", ensure_ascii) + self.resolve = 128 if resolve is True else (resolve or self.resolve) private = config.get(("output",), "private") self.filter = dict.copy if private else util.filter_dict + if self.resolve > 0: + self.handle_queue = self.handle_queue_resolve + def run(self): self._init() @@ -883,12 +897,13 @@ class DataJob(Job): for msg in self.data: util.transform_dict(msg[-1], util.number_to_string) - # dump to 'file' - try: - util.dump_json(self.data, self.file, self.ascii, 2) - self.file.flush() - except Exception: - pass + if self.file: + # dump to 'file' + try: + util.dump_json(self.data, self.file, self.ascii, 2) + self.file.flush() + except Exception: + pass return 0 @@ -900,3 +915,17 @@ class DataJob(Job): def handle_queue(self, url, kwdict): self.data.append((Message.Queue, url, self.filter(kwdict))) + + def handle_queue_resolve(self, url, kwdict): + cls = kwdict.get("_extractor") + if cls: + extr = cls.from_url(url) + else: + extr = extractor.find(url) + + if not extr: + return self.data.append((Message.Queue, url, self.filter(kwdict))) + + job = self.__class__(extr, self, None, self.ascii, self.resolve-1) + job.data = self.data + job.run() diff --git a/gallery_dl/option.py b/gallery_dl/option.py index f31d5ac..155cbd9 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -293,10 +293,15 @@ def build_parser(): ) output.add_argument( "-j", "--dump-json", - dest="jobtype", action="store_const", const=job.DataJob, + dest="dump_json", action="count", help="Print JSON information", ) output.add_argument( + "-J", "--resolve-json", + dest="dump_json", action="store_const", const=128, + help="Print JSON information; resolve intermediary URLs", + ) + output.add_argument( "-s", "--simulate", dest="jobtype", action="store_const", const=job.SimulationJob, help="Simulate data extraction; 
do not download anything", @@ -346,6 +351,11 @@ def build_parser(): "in the current directory to debug problems"), ) output.add_argument( + "--print-traffic", + dest="print_traffic", action="store_true", + help=("Display sent and read HTTP traffic"), + ) + output.add_argument( "--no-colors", dest="colors", action="store_false", help=("Do not emit ANSI color codes in output"), diff --git a/gallery_dl/output.py b/gallery_dl/output.py index bd5d959..13b6a8a 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -10,7 +10,6 @@ import os import sys import shutil import logging -import functools import unicodedata from . import config, util, formatter @@ -92,39 +91,6 @@ class LoggerAdapter(): self.logger._log(logging.ERROR, msg, args, **kwargs) -class LoggerAdapterActions(): - - def __init__(self, logger, job): - self.logger = logger - self.extra = job._logger_extra - self.actions = job._logger_actions - - self.debug = functools.partial(self.log, logging.DEBUG) - self.info = functools.partial(self.log, logging.INFO) - self.warning = functools.partial(self.log, logging.WARNING) - self.error = functools.partial(self.log, logging.ERROR) - - def log(self, level, msg, *args, **kwargs): - msg = str(msg) - if args: - msg = msg % args - - actions = self.actions[level] - if actions: - args = self.extra.copy() - args["level"] = level - - for cond, action in actions: - if cond(msg): - action(args) - - level = args["level"] - - if self.logger.isEnabledFor(level): - kwargs["extra"] = self.extra - self.logger._log(level, msg, (), **kwargs) - - class PathfmtProxy(): __slots__ = ("job",) diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 1616bbd..7892776 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -51,6 +51,7 @@ class PathFormat(): raise exception.FilenameFormatError(exc) directory_fmt = config("directory") + self.directory_conditions = () try: if directory_fmt is None: directory_fmt = extractor.directory_fmt @@ -266,7 +267,7 @@ class PathFormat(): try: for fmt in self.directory_formatters: segment = fmt(kwdict).strip() - if strip: + if strip and segment != "..": # remove trailing dots and spaces (#647) segment = segment.rstrip(strip) if segment: @@ -288,7 +289,7 @@ class PathFormat(): formatters = self.directory_formatters for fmt in formatters: segment = fmt(kwdict).strip() - if strip: + if strip and segment != "..": segment = segment.rstrip(strip) if segment: append(self.clean_segment(segment)) @@ -344,7 +345,11 @@ class PathFormat(): continue except OSError: # move across different filesystems - shutil.copyfile(self.temppath, self.realpath) + try: + shutil.copyfile(self.temppath, self.realpath) + except FileNotFoundError: + os.makedirs(self.realdirectory) + shutil.copyfile(self.temppath, self.realpath) os.unlink(self.temppath) break diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 18d00e1..a520a34 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -56,7 +56,13 @@ class MetadataPP(PostProcessor): ext = "json" directory = options.get("directory") - if directory: + if isinstance(directory, list): + self._directory = self._directory_format + self._directory_formatters = [ + formatter.parse(dirfmt, util.NONE).format_map + for dirfmt in directory + ] + elif directory: self._directory = self._directory_custom sep = os.sep + (os.altsep or "") self._metadir = util.expand_path(directory).rstrip(sep) + os.sep @@ -147,6 +153,19 @@ class MetadataPP(PostProcessor): def _directory_custom(self, 
pathfmt): return os.path.join(pathfmt.realdirectory, self._metadir) + def _directory_format(self, pathfmt): + formatters = pathfmt.directory_formatters + conditions = pathfmt.directory_conditions + try: + pathfmt.directory_formatters = self._directory_formatters + pathfmt.directory_conditions = () + segments = pathfmt.build_directory(pathfmt.kwdict) + directory = pathfmt.clean_path(os.sep.join(segments) + os.sep) + return os.path.join(pathfmt.realdirectory, directory) + finally: + pathfmt.directory_conditions = conditions + pathfmt.directory_formatters = formatters + def _filename(self, pathfmt): return (pathfmt.filename or "metadata") + "." + self.extension diff --git a/gallery_dl/util.py b/gallery_dl/util.py index e76ddf3..5744ef3 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -540,10 +540,14 @@ class CustomNone(): def __bool__(): return False + def __eq__(self, other): + return self is other + + def __ne__(self, other): + return self is not other + __lt__ = true __le__ = true - __eq__ = false - __ne__ = true __gt__ = false __ge__ = false @@ -616,11 +620,28 @@ else: Popen = subprocess.Popen -def compile_expression(expr, name="<expr>", globals=None): +def compile_expression_raw(expr, name="<expr>", globals=None): code_object = compile(expr, name, "eval") return functools.partial(eval, code_object, globals or GLOBALS) +def compile_expression_tryexcept(expr, name="<expr>", globals=None): + code_object = compile(expr, name, "eval") + + def _eval(locals=None, globals=(globals or GLOBALS), co=code_object): + try: + return eval(co, globals, locals) + except exception.GalleryDLException: + raise + except Exception: + return False + + return _eval + + +compile_expression = compile_expression_tryexcept + + def import_file(path): """Import a Python module from a filesystem path""" path, name = os.path.split(path) diff --git a/gallery_dl/version.py b/gallery_dl/version.py index a8ff38e..f234af1 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,5 +6,5 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.27.1" +__version__ = "1.27.2" __variant__ = None diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index 0a0bf86..d4fdedc 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -249,6 +249,22 @@ def parse_command_line(module, argv): None if opts.match_filter is None else module.match_filter_func(opts.match_filter)) + cookiesfrombrowser = getattr(opts, "cookiesfrombrowser", None) + if cookiesfrombrowser: + match = re.fullmatch(r"""(?x) + (?P<name>[^+:]+) + (?:\s*\+\s*(?P<keyring>[^:]+))? + (?:\s*:\s*(?!:)(?P<profile>.+?))? + (?:\s*::\s*(?P<container>.+))? 
+ """, cookiesfrombrowser) + if match: + browser, keyring, profile, container = match.groups() + if keyring is not None: + keyring = keyring.upper() + cookiesfrombrowser = (browser.lower(), profile, keyring, container) + else: + cookiesfrombrowser = None + return { "usenetrc": opts.usenetrc, "netrc_location": getattr(opts, "netrc_location", None), @@ -364,7 +380,7 @@ def parse_command_line(module, argv): "skip_playlist_after_errors": getattr( opts, "skip_playlist_after_errors", None), "cookiefile": opts.cookiefile, - "cookiesfrombrowser": getattr(opts, "cookiesfrombrowser", None), + "cookiesfrombrowser": cookiesfrombrowser, "nocheckcertificate": opts.no_check_certificate, "prefer_insecure": opts.prefer_insecure, "proxy": opts.proxy, diff --git a/test/test_extractor.py b/test/test_extractor.py index 6af1226..abf122b 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -17,7 +17,7 @@ import string from datetime import datetime, timedelta sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from gallery_dl import extractor # noqa E402 +from gallery_dl import extractor, util # noqa E402 from gallery_dl.extractor import mastodon # noqa E402 from gallery_dl.extractor.common import Extractor, Message # noqa E402 from gallery_dl.extractor.directlink import DirectlinkExtractor # noqa E402 @@ -25,7 +25,11 @@ from gallery_dl.extractor.directlink import DirectlinkExtractor # noqa E402 _list_classes = extractor._list_classes try: - from test import results + RESULTS = os.environ.get("GDL_TEST_RESULTS") + if RESULTS: + results = util.import_file(RESULTS) + else: + from test import results except ImportError: results = None @@ -109,6 +113,7 @@ class TestExtractorModule(unittest.TestCase): print("Skipping '{}' category checks".format(cat)) continue raise + self.assertTrue(extr, url) self.assertEqual(extr.category, cat, url) self.assertEqual(extr.subcategory, sub, url) self.assertEqual(extr.basecategory, base, url) diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index d509052..3e6d1df 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -440,6 +440,18 @@ class MetadataTest(BasePostprocessorTest): path = self.pathfmt.realdirectory + "metadata/file.json" m.assert_called_once_with(path, "w", encoding="utf-8") + def test_metadata_directory_format(self): + self._create( + {"directory": ["..", "json", "\fE str(id // 500 * 500 + 500)"]}, + {"id": 12345}, + ) + + with patch("builtins.open", mock_open()) as m: + self._trigger() + + path = self.pathfmt.realdirectory + "../json/12500/file.ext.json" + m.assert_called_once_with(path, "w", encoding="utf-8") + def test_metadata_filename(self): self._create({ "filename" : "{category}_{filename}_/meta/\n\r.data", diff --git a/test/test_results.py b/test/test_results.py index ab3668e..e2c7ca2 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -20,7 +20,13 @@ import collections sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from gallery_dl import \ extractor, util, job, config, exception, formatter # noqa E402 -from test import results # noqa E402 + + +RESULTS = os.environ.get("GDL_TEST_RESULTS") +if RESULTS: + results = util.import_file(RESULTS) +else: + from test import results # temporary issues, etc. 
@@ -86,38 +92,34 @@ class TestExtractorResults(unittest.TestCase): def _run_test(self, result): result.pop("#comment", None) - only_matching = (len(result) <= 3) + auth = result.pop("#auth", None) + + extractor.find(result["#url"]) + extr = result["#class"].from_url(result["#url"]) + if not extr: + raise exception.NoExtractorError() + if len(result) <= 3: + return # only matching - auth = result.get("#auth") if auth is None: auth = (result["#category"][1] in AUTH) elif not auth: + # auth explicitly disabled for key in AUTH_CONFIG: config.set((), key, None) - if auth: - extr = result["#class"].from_url(result["#url"]) - if not any(extr.config(key) for key in AUTH_CONFIG): - self._skipped.append((result["#url"], "no auth")) - only_matching = True + if auth and not any(extr.config(key) for key in AUTH_CONFIG): + return self._skipped.append((result["#url"], "no auth")) - if only_matching: - content = False - else: - if "#options" in result: - for key, value in result["#options"].items(): - key = key.split(".") - config.set(key[:-1], key[-1], value) - if "#range" in result: - config.set((), "image-range" , result["#range"]) - config.set((), "chapter-range", result["#range"]) - content = ("#sha1_content" in result) - - tjob = ResultJob(result["#url"], content=content) - self.assertEqual(result["#class"], tjob.extractor.__class__, "#class") - - if only_matching: - return + if "#options" in result: + for key, value in result["#options"].items(): + key = key.split(".") + config.set(key[:-1], key[-1], value) + if "#range" in result: + config.set((), "image-range" , result["#range"]) + config.set((), "chapter-range", result["#range"]) + + tjob = ResultJob(extr, content=("#sha1_content" in result)) if "#exception" in result: with self.assertRaises(result["#exception"], msg="#exception"): diff --git a/test/test_util.py b/test/test_util.py index 35e7247..4622c28 100644 --- a/test/test_util.py +++ b/test/test_util.py @@ -134,19 +134,18 @@ class TestPredicate(unittest.TestCase): with self.assertRaises(SyntaxError): util.FilterPredicate("(") - with self.assertRaises(exception.FilterError): - util.FilterPredicate("a > 1")(url, {"a": None}) - - with self.assertRaises(exception.FilterError): - util.FilterPredicate("b > 1")(url, {"a": 2}) + self.assertFalse( + util.FilterPredicate("a > 1")(url, {"a": None})) + self.assertFalse( + util.FilterPredicate("b > 1")(url, {"a": 2})) pred = util.FilterPredicate(["a < 3", "b < 4", "c < 5"]) self.assertTrue(pred(url, {"a": 2, "b": 3, "c": 4})) self.assertFalse(pred(url, {"a": 3, "b": 3, "c": 4})) self.assertFalse(pred(url, {"a": 2, "b": 4, "c": 4})) self.assertFalse(pred(url, {"a": 2, "b": 3, "c": 5})) - with self.assertRaises(exception.FilterError): - pred(url, {"a": 2}) + + self.assertFalse(pred(url, {"a": 2})) def test_build_predicate(self): pred = util.build_predicate([]) @@ -445,6 +444,7 @@ class TestOther(unittest.TestCase): self.assertEqual(expr({"a": 1, "b": 2, "c": 3}), 7) self.assertEqual(expr({"a": 9, "b": 9, "c": 9}), 90) + expr = util.compile_expression_raw("a + b * c") with self.assertRaises(NameError): expr() with self.assertRaises(NameError): @@ -755,8 +755,9 @@ def hash(value): self.assertLess(obj, "foo") self.assertLessEqual(obj, None) - self.assertFalse(obj == obj) - self.assertTrue(obj != obj) + self.assertTrue(obj == obj) + self.assertFalse(obj == 0) + self.assertFalse(obj != obj) self.assertGreater(123, obj) self.assertGreaterEqual(1.23, obj) diff --git a/test/test_ytdl.py b/test/test_ytdl.py index 878ac85..fd2e40a 100644 --- a/test/test_ytdl.py +++ 
b/test/test_ytdl.py
@@ -294,6 +294,20 @@ class Test_CommandlineArguments_YtDlp(Test_CommandlineArguments):
         self._(["--geo-bypass-ip-block", "198.51.100.14/24"],
                "geo_bypass", "198.51.100.14/24")
 
+    def test_cookiesfrombrowser(self):
+        self._(["--cookies-from-browser", "firefox"],
+               "cookiesfrombrowser", ("firefox", None, None, None))
+        self._(["--cookies-from-browser", "firefox:profile"],
+               "cookiesfrombrowser", ("firefox", "profile", None, None))
+        self._(["--cookies-from-browser", "firefox+keyring"],
+               "cookiesfrombrowser", ("firefox", None, "KEYRING", None))
+        self._(["--cookies-from-browser", "firefox::container"],
+               "cookiesfrombrowser", ("firefox", None, None, "container"))
+        self._(["--cookies-from-browser",
+                "firefox+keyring:profile::container"],
+               "cookiesfrombrowser",
+               ("firefox", "profile", "KEYRING", "container"))
+
 
 if __name__ == "__main__":
     unittest.main(warnings="ignore")
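The `archive-event` handling added to `DownloadJob` above reduces to two boolean flags that decide which events write download-archive entries. A minimal sketch of that mapping, assuming a free-standing helper (`parse_archive_events` is an illustrative name, not part of gallery-dl):

```python
def parse_archive_events(events):
    """Map an 'archive-event' config value to (write_file, write_skip) flags."""
    if events is None:
        # default: only successfully downloaded files are recorded
        return True, False
    if isinstance(events, str):
        events = events.split(",")
    return ("file" in events), ("skip" in events)

# "file,skip" records both downloaded and already-existing (skipped) files;
# "skip" alone records only entries that hit the skip path.
assert parse_archive_events(None) == (True, False)
assert parse_archive_events("file,skip") == (True, True)
assert parse_archive_events(["skip"]) == (False, True)
```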
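In `option.py` above, `-j/--dump-json` becomes a counting option and the new `-J/--resolve-json` stores a resolve depth of 128 under the same destination (how that value is mapped back to a job is outside this diff). The same behaviour is reachable from Python through `DataJob`'s new `resolve` parameter; a hedged sketch, where the URL is only a placeholder for any URL with a supported extractor:

```python
import sys
from gallery_dl import job

url = "https://example.org/..."   # placeholder: any URL gallery-dl supports

# resolve=True is shorthand for a depth of 128; an integer sets the depth
# explicitly. Queued child URLs are then expanded in place by
# handle_queue_resolve() instead of being emitted as Message.Queue entries.
data_job = job.DataJob(url, file=sys.stdout, resolve=1)
data_job.run()
```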
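The metadata post processor's `directory` option now also accepts a list of format strings, which are built with the regular directory formatters; the `path.py` change above keeps literal `..` segments from being stripped, so relative targets like the one below survive path cleaning. A sketch of the option values exercised by `test_metadata_directory_format`, written as a Python dict (the same values would normally live in the JSON config file):

```python
options = {
    "name"     : "metadata",
    "directory": ["..", "json", "\fE str(id // 500 * 500 + 500)"],
}

# For a file with kwdict {"id": 12345} the metadata ends up next to the
# regular download directory:
#   <realdirectory>/../json/12500/<filename>.json
```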
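Per the `util.py` hunk above, `compile_expression()` now points at the try/except variant, so filter expressions that fail to evaluate (missing names, comparisons against `None`, and so on) yield `False` instead of raising `FilterError`; the previous behaviour remains available as `compile_expression_raw()`. Together with `util.NONE` now comparing equal only to itself, that gives roughly:

```python
from gallery_dl import util

expr = util.compile_expression("a > 1")
print(expr({"a": 2}))       # True
print(expr({"a": None}))    # False - the TypeError is swallowed
print(expr({}))             # False - the NameError is swallowed

raw = util.compile_expression_raw("a > 1")
# raw({}) still raises NameError

print(util.NONE == util.NONE)   # True  (was False before this release)
print(util.NONE == 0)           # False
```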
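Finally, the `--cookies-from-browser` value forwarded to ytdl is now parsed into a `(browser, profile, keyring, container)` tuple using the `BROWSER[+KEYRING][:PROFILE][::CONTAINER]` syntax from the `ytdl.py` hunk above. A standalone sketch of that parsing, with an illustrative helper name:

```python
import re

_PATTERN = re.compile(r"""(?x)
    (?P<name>[^+:]+)
    (?:\s*\+\s*(?P<keyring>[^:]+))?
    (?:\s*:\s*(?!:)(?P<profile>.+?))?
    (?:\s*::\s*(?P<container>.+))?
""")

def parse_cookiesfrombrowser(value):
    match = _PATTERN.fullmatch(value)
    if not match:
        return None
    browser, keyring, profile, container = match.groups()
    if keyring is not None:
        keyring = keyring.upper()
    return (browser.lower(), profile, keyring, container)

# Mirrors the expectations in test_cookiesfrombrowser:
assert parse_cookiesfrombrowser("firefox") == ("firefox", None, None, None)
assert parse_cookiesfrombrowser("firefox::container") == \
    ("firefox", None, None, "container")
assert parse_cookiesfrombrowser("firefox+keyring:profile::container") == \
    ("firefox", "profile", "KEYRING", "container")
```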
