46 files changed, 1380 insertions, 322 deletions
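The headline change in this release is the proxy refactor ([#2357](https://github.com/mikf/gallery-dl/issues/2357)): data extraction and file downloads can now use different proxies, and gallery-dl's proxy settings may overwrite environment proxies. A minimal config sketch based on the new `downloader.*.proxy` option documented in the gallery-dl.conf.5 hunk further down (the proxy URL is a placeholder; per that hunk, an explicit null disables the proxy for downloads):

    {
        "extractor": {
            "proxy": "http://10.0.0.1:8080"
        },
        "downloader": {
            "proxy": null
        }
    }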
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c64d80d..ffd11a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,44 @@
 # Changelog
 
+## 1.21.0 - 2022-03-14
+### Additions
+- [fantia] add `num` enumeration index ([#2377](https://github.com/mikf/gallery-dl/issues/2377))
+- [fantia] support "Blog Post" content ([#2381](https://github.com/mikf/gallery-dl/issues/2381))
+- [imagebam] add support for /view/ paths ([#2378](https://github.com/mikf/gallery-dl/issues/2378))
+- [kemonoparty] match beta.kemono.party URLs ([#2348](https://github.com/mikf/gallery-dl/issues/2348))
+- [kissgoddess] add `gallery` and `model` extractors ([#1052](https://github.com/mikf/gallery-dl/issues/1052), [#2304](https://github.com/mikf/gallery-dl/issues/2304))
+- [mememuseum] add `tag` and `post` extractors ([#2264](https://github.com/mikf/gallery-dl/issues/2264))
+- [newgrounds] add `post_url` metadata field ([#2328](https://github.com/mikf/gallery-dl/issues/2328))
+- [patreon] add `image_large` file type ([#2257](https://github.com/mikf/gallery-dl/issues/2257))
+- [toyhouse] support `art` listings ([#1546](https://github.com/mikf/gallery-dl/issues/1546), [#2331](https://github.com/mikf/gallery-dl/issues/2331))
+- [twibooru] add extractors for searches, galleries, and posts ([#2219](https://github.com/mikf/gallery-dl/issues/2219))
+- [postprocessor:metadata] implement `mtime` option ([#2307](https://github.com/mikf/gallery-dl/issues/2307))
+- [postprocessor:mtime] add `event` option ([#2307](https://github.com/mikf/gallery-dl/issues/2307))
+- add fish shell completion ([#2363](https://github.com/mikf/gallery-dl/issues/2363))
+- add `timedelta` class to global namespace in filter expressions
+### Changes
+- [seiga] require authentication with `user_session` cookie ([#2372](https://github.com/mikf/gallery-dl/issues/2372))
+  - remove username & password login due to 2FA
+- refactor proxy support ([#2357](https://github.com/mikf/gallery-dl/issues/2357))
+  - allow gallery-dl proxy settings to overwrite environment proxies
+  - allow specifying different proxies for data extraction and download
+### Fixes
+- [bunkr] fix mp4 downloads ([#2239](https://github.com/mikf/gallery-dl/issues/2239))
+- [fanbox] fetch data for each individual post ([#2388](https://github.com/mikf/gallery-dl/issues/2388))
+- [hentaicosplays] send `Referer` header ([#2317](https://github.com/mikf/gallery-dl/issues/2317))
+- [imagebam] set `nsfw_inter` cookie ([#2334](https://github.com/mikf/gallery-dl/issues/2334))
+- [kemonoparty] limit default filename length ([#2373](https://github.com/mikf/gallery-dl/issues/2373))
+- [mangadex] fix chapters without `translatedLanguage` ([#2352](https://github.com/mikf/gallery-dl/issues/2352))
+- [newgrounds] fix video descriptions ([#2328](https://github.com/mikf/gallery-dl/issues/2328))
+- [skeb] add `sent-requests` option ([#2322](https://github.com/mikf/gallery-dl/issues/2322), [#2330](https://github.com/mikf/gallery-dl/issues/2330))
+- [slideshare] fix extraction
+- [subscribestar] unescape attachment URLs ([#2370](https://github.com/mikf/gallery-dl/issues/2370))
+- [twitter] fix handling of 429 Too Many Requests responses ([#2339](https://github.com/mikf/gallery-dl/issues/2339))
+- [twitter] warn about age-restricted Tweets ([#2354](https://github.com/mikf/gallery-dl/issues/2354))
+- [twitter] handle Tweets with "softIntervention" entries
+- [twitter] update query hashes
+- fix another bug in `_check_cookies()` ([#2160](https://github.com/mikf/gallery-dl/issues/2160))
+
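With username & password login removed, seiga authentication now has to supply the `user_session` cookie directly, either from a cookies.txt file or as a cookies object in the configuration. A sketch of the latter, with a placeholder cookie value:

    {
        "extractor": {
            "seiga": {
                "cookies": {
                    "user_session": "<value of your user_session cookie>"
                }
            }
        }
    }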
 ## 1.20.5 - 2022-02-14
 ### Additions
 - [furaffinity] add `layout` option ([#2277](https://github.com/mikf/gallery-dl/issues/2277))
@@ -1237,7 +1276,7 @@
 - Miscellaneous fixes for `*reactor`, `simplyhentai`
 
 ## 1.10.1 - 2019-08-02
-## Fixes
+### Fixes
 - Use the correct domain for exhentai.org input URLs
 
 ## 1.10.0 - 2019-08-01
diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.20.5 +Version: 1.21.0 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -72,14 +72,14 @@ easily installed or upgraded using pip_: .. code:: bash - $ python3 -m pip install -U gallery-dl + python3 -m pip install -U gallery-dl Installing the latest dev version directly from GitHub can be done with pip_ as well: .. code:: bash - $ python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz + python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz Note: Windows users should use :code:`py -3` instead of :code:`python3`. @@ -89,7 +89,7 @@ To ensure these packages are up-to-date, run .. code:: bash - $ python3 -m pip install --upgrade pip setuptools wheel + python3 -m pip install --upgrade pip setuptools wheel Standalone Executable @@ -98,8 +98,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.20.5/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.20.5/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.21.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.21.0/gallery-dl.bin>`__ | Executables built from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -112,7 +112,7 @@ Linux users that are using a distro that is supported by Snapd_ can install *gal .. code:: bash - $ snap install gallery-dl + snap install gallery-dl Chocolatey @@ -122,7 +122,7 @@ Windows users that have Chocolatey_ installed can install *gallery-dl* from the .. code:: powershell - $ choco install gallery-dl + choco install gallery-dl Scoop @@ -132,7 +132,7 @@ Scoop .. code:: powershell - $ scoop install gallery-dl + scoop install gallery-dl Usage @@ -143,7 +143,7 @@ from: .. code:: bash - $ gallery-dl [OPTION]... URL... + gallery-dl [OPTION]... URL... See also :code:`gallery-dl --help`. @@ -155,21 +155,21 @@ Download images; in this case from danbooru via tag search for 'bonocho': .. code:: bash - $ gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" + gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" -Get the direct URL of an image from a site that requires authentication: +Get the direct URL of an image from a site supporting authentication with username & password: .. code:: bash - $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" + gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256" Filter manga chapters by language and chapter number: ..
code:: bash - $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" + gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" | Search a remote resource for URLs and download images from them: @@ -177,7 +177,7 @@ Filter manga chapters by language and chapter number: .. code:: bash - $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" + gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" If a site's address is nonstandard for its extractor, you can prefix the URL with the @@ -185,7 +185,7 @@ extractor's name to force the use of a specific extractor: .. code:: bash - $ gallery-dl "tumblr:https://sometumblrblog.example" + gallery-dl "tumblr:https://sometumblrblog.example" Configuration @@ -233,7 +233,7 @@ Username & Password Some extractors require you to provide valid login credentials in the form of a username & password pair. This is necessary for -``nijie`` and ``seiga`` +``nijie`` and optional for ``aryion``, ``danbooru``, @@ -259,7 +259,7 @@ You can set the necessary information in your configuration file { "extractor": { - "seiga": { + "twitter": { "username": "<username>", "password": "<password>" } @@ -272,8 +272,8 @@ or you can provide them directly via the .. code:: bash - $ gallery-dl -u <username> -p <password> URL - $ gallery-dl -o username=<username> -o password=<password> URL + gallery-dl -u <username> -p <password> URL + gallery-dl -o username=<username> -o password=<password> URL Cookies @@ -317,7 +317,7 @@ the :code:`--cookies` command-line option: .. code:: bash - $ gallery-dl --cookies "$HOME/path/to/cookies.txt" URL + gallery-dl --cookies "$HOME/path/to/cookies.txt" URL OAuth @@ -335,7 +335,7 @@ To link your account to *gallery-dl*, start by invoking it with .. code:: bash - $ gallery-dl oauth:flickr + gallery-dl oauth:flickr You will be sent to the site's authorization page and asked to grant read access to *gallery-dl*. Authorize it and you will be shown one or more @@ -346,8 +346,8 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. code:: bash - $ gallery-dl oauth:mastodon:pawoo.net - $ gallery-dl oauth:mastodon:https://mastodon.social/ + gallery-dl oauth:mastodon:pawoo.net + gallery-dl oauth:mastodon:https://mastodon.social/
diff --git a/README.rst b/README.rst
@@ -38,14 +38,14 @@ easily installed or upgraded using pip_: .. code:: bash - $ python3 -m pip install -U gallery-dl + python3 -m pip install -U gallery-dl Installing the latest dev version directly from GitHub can be done with pip_ as well: .. code:: bash - $ python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz + python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz Note: Windows users should use :code:`py -3` instead of :code:`python3`. @@ -55,7 +55,7 @@ To ensure these packages are up-to-date, run ..
code:: bash - $ python3 -m pip install --upgrade pip setuptools wheel + python3 -m pip install --upgrade pip setuptools wheel Standalone Executable @@ -64,8 +64,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.20.5/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.20.5/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.21.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.21.0/gallery-dl.bin>`__ | Executables built from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -78,7 +78,7 @@ Linux users that are using a distro that is supported by Snapd_ can install *gal .. code:: bash - $ snap install gallery-dl + snap install gallery-dl Chocolatey @@ -88,7 +88,7 @@ Windows users that have Chocolatey_ installed can install *gallery-dl* from the .. code:: powershell - $ choco install gallery-dl + choco install gallery-dl Scoop @@ -98,7 +98,7 @@ Scoop .. code:: powershell - $ scoop install gallery-dl + scoop install gallery-dl Usage @@ -109,7 +109,7 @@ from: .. code:: bash - $ gallery-dl [OPTION]... URL... + gallery-dl [OPTION]... URL... See also :code:`gallery-dl --help`. @@ -121,21 +121,21 @@ Download images; in this case from danbooru via tag search for 'bonocho': .. code:: bash - $ gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" + gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" -Get the direct URL of an image from a site that requires authentication: +Get the direct URL of an image from a site supporting authentication with username & password: .. code:: bash - $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" + gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256" Filter manga chapters by language and chapter number: .. code:: bash - $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" + gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" | Search a remote resource for URLs and download images from them: @@ -143,7 +143,7 @@ Filter manga chapters by language and chapter number: .. code:: bash - $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" + gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" If a site's address is nonstandard for its extractor, you can prefix the URL with the @@ -151,7 +151,7 @@ extractor's name to force the use of a specific extractor: .. code:: bash - $ gallery-dl "tumblr:https://sometumblrblog.example" + gallery-dl "tumblr:https://sometumblrblog.example" Configuration @@ -199,7 +199,7 @@ Username & Password Some extractors require you to provide valid login credentials in the form of a username & password pair. This is necessary for -``nijie`` and ``seiga`` +``nijie`` and optional for ``aryion``, ``danbooru``, @@ -225,7 +225,7 @@ You can set the necessary information in your configuration file { "extractor": { - "seiga": { + "twitter": { "username": "<username>", "password": "<password>" } @@ -238,8 +238,8 @@ or you can provide them directly via the ..
code:: bash - $ gallery-dl -u <username> -p <password> URL - $ gallery-dl -o username=<username> -o password=<password> URL + gallery-dl -u <username> -p <password> URL + gallery-dl -o username=<username> -o password=<password> URL Cookies @@ -283,7 +283,7 @@ the :code:`--cookies` command-line option: .. code:: bash - $ gallery-dl --cookies "$HOME/path/to/cookies.txt" URL + gallery-dl --cookies "$HOME/path/to/cookies.txt" URL OAuth @@ -301,7 +301,7 @@ To link your account to *gallery-dl*, start by invoking it with .. code:: bash - $ gallery-dl oauth:flickr + gallery-dl oauth:flickr You will be sent to the site's authorization page and asked to grant read access to *gallery-dl*. Authorize it and you will be shown one or more @@ -312,8 +312,8 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. code:: bash - $ gallery-dl oauth:mastodon:pawoo.net - $ gallery-dl oauth:mastodon:https://mastodon.social/ + gallery-dl oauth:mastodon:pawoo.net + gallery-dl oauth:mastodon:https://mastodon.social/ diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish new file mode 100644 index 0000000..8f915fd --- /dev/null +++ b/data/completion/gallery-dl.fish @@ -0,0 +1,62 @@ +complete -c gallery-dl -x +complete -c gallery-dl -s 'h' -l 'help' -d 'Print this help message and exit' +complete -c gallery-dl -l 'version' -d 'Print program version and exit' +complete -c gallery-dl -r -F -s 'i' -l 'input-file' -d 'Download URLs found in FILE ("-" for stdin). More than one --input-file can be specified' +complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'd' -l 'destination' -d 'Target location for file downloads' +complete -c gallery-dl -x -a '(__fish_complete_directories)' -s 'D' -l 'directory' -d 'Exact location for file downloads' +complete -c gallery-dl -x -s 'f' -l 'filename' -d 'Filename format string for downloaded files ("/O" for "original" filenames)' +complete -c gallery-dl -r -F -l 'cookies' -d 'File to load additional cookies from' +complete -c gallery-dl -x -l 'proxy' -d 'Use the specified proxy' +complete -c gallery-dl -x -l 'source-address' -d 'Client-side IP address to bind to' +complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. 
for MODULE (ALL to delete everything)' +complete -c gallery-dl -s 'q' -l 'quiet' -d 'Activate quiet mode' +complete -c gallery-dl -s 'v' -l 'verbose' -d 'Print various debugging information' +complete -c gallery-dl -s 'g' -l 'get-urls' -d 'Print URLs instead of downloading' +complete -c gallery-dl -s 'G' -l 'resolve-urls' -d 'Print URLs instead of downloading; resolve intermediary URLs' +complete -c gallery-dl -s 'j' -l 'dump-json' -d 'Print JSON information' +complete -c gallery-dl -s 's' -l 'simulate' -d 'Simulate data extraction; do not download anything' +complete -c gallery-dl -s 'E' -l 'extractor-info' -d 'Print extractor defaults and settings' +complete -c gallery-dl -s 'K' -l 'list-keywords' -d 'Print a list of available keywords and example values for the given URLs' +complete -c gallery-dl -l 'list-modules' -d 'Print a list of available extractor modules' +complete -c gallery-dl -l 'list-extractors' -d 'Print a list of extractor classes with description, (sub)category and example URL' +complete -c gallery-dl -r -F -l 'write-log' -d 'Write logging output to FILE' +complete -c gallery-dl -r -F -l 'write-unsupported' -d 'Write URLs, which get emitted by other extractors but cannot be handled, to FILE' +complete -c gallery-dl -l 'write-pages' -d 'Write downloaded intermediary pages to files in the current directory to debug problems' +complete -c gallery-dl -x -s 'r' -l 'limit-rate' -d 'Maximum download rate (e.g. 500k or 2.5M)' +complete -c gallery-dl -x -s 'R' -l 'retries' -d 'Maximum number of retries for failed HTTP requests or -1 for infinite retries (default: 4)' +complete -c gallery-dl -x -l 'http-timeout' -d 'Timeout for HTTP connections (default: 30.0)' +complete -c gallery-dl -x -l 'sleep' -d 'Number of seconds to wait before each download. This can be either a constant value or a range (e.g. 2.7 or 2.0-3.5)' +complete -c gallery-dl -x -l 'sleep-request' -d 'Number of seconds to wait between HTTP requests during data extraction' +complete -c gallery-dl -x -l 'sleep-extractor' -d 'Number of seconds to wait before starting data extraction for an input URL' +complete -c gallery-dl -x -l 'filesize-min' -d 'Do not download files smaller than SIZE (e.g. 500k or 2.5M)' +complete -c gallery-dl -x -l 'filesize-max' -d 'Do not download files larger than SIZE (e.g. 
500k or 2.5M)' +complete -c gallery-dl -l 'no-part' -d 'Do not use .part files' +complete -c gallery-dl -l 'no-skip' -d 'Do not skip downloads; overwrite existing files' +complete -c gallery-dl -l 'no-mtime' -d 'Do not set file modification times according to Last-Modified HTTP response headers' +complete -c gallery-dl -l 'no-download' -d 'Do not download any files' +complete -c gallery-dl -l 'no-check-certificate' -d 'Disable HTTPS certificate validation' +complete -c gallery-dl -r -F -s 'c' -l 'config' -d 'Additional configuration files' +complete -c gallery-dl -r -F -l 'config-yaml' -d '==SUPPRESS==' +complete -c gallery-dl -x -s 'o' -l 'option' -d 'Additional "<key>=<value>" option values' +complete -c gallery-dl -l 'ignore-config' -d 'Do not read the default configuration files' +complete -c gallery-dl -x -s 'u' -l 'username' -d 'Username to login with' +complete -c gallery-dl -x -s 'p' -l 'password' -d 'Password belonging to the given username' +complete -c gallery-dl -l 'netrc' -d 'Enable .netrc authentication data' +complete -c gallery-dl -r -F -l 'download-archive' -d 'Record all downloaded files in the archive file and skip downloading any file already in it' +complete -c gallery-dl -x -s 'A' -l 'abort' -d 'Stop current extractor run after N consecutive file downloads were skipped' +complete -c gallery-dl -x -s 'T' -l 'terminate' -d 'Stop current and parent extractor runs after N consecutive file downloads were skipped' +complete -c gallery-dl -x -l 'range' -d 'Index-range(s) specifying which images to download. For example "5-10" or "1,3-5,10-"' +complete -c gallery-dl -x -l 'chapter-range' -d 'Like "--range", but applies to manga-chapters and other delegated URLs' +complete -c gallery-dl -x -l 'filter' -d 'Python expression controlling which images to download. Files for which the expression evaluates to False are ignored. Available keys are the filename-specific ones listed by "-K". Example: --filter "image_width >= 1000 and rating in ("s", "q")"' +complete -c gallery-dl -x -l 'chapter-filter' -d 'Like "--filter", but applies to manga-chapters and other delegated URLs' +complete -c gallery-dl -l 'zip' -d 'Store downloaded files in a ZIP archive' +complete -c gallery-dl -l 'ugoira-conv' -d 'Convert Pixiv Ugoira to WebM (requires FFmpeg)' +complete -c gallery-dl -l 'ugoira-conv-lossless' -d 'Convert Pixiv Ugoira to WebM in VP9 lossless mode' +complete -c gallery-dl -l 'write-metadata' -d 'Write metadata to separate JSON files' +complete -c gallery-dl -l 'write-info-json' -d 'Write gallery metadata to a info.json file' +complete -c gallery-dl -l 'write-infojson' -d '==SUPPRESS==' +complete -c gallery-dl -l 'write-tags' -d 'Write image tags to separate text files' +complete -c gallery-dl -l 'mtime-from-date' -d 'Set file modification times according to "date" metadata' +complete -c gallery-dl -x -l 'exec' -d 'Execute CMD for each downloaded file. Example: --exec "convert {} {}.png && rm {}"' +complete -c gallery-dl -x -l 'exec-after' -d 'Execute CMD after all files were downloaded successfully. 
Example: --exec-after "cd {} && convert * ../doc.pdf"' +complete -c gallery-dl -x -s 'P' -l 'postprocessor' -d 'Activate the specified post processor' diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 1671d2d..3e373fd 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2022-02-14" "1.20.5" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2022-03-14" "1.21.0" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 51b20cd..9651d18 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2022-02-14" "1.20.5" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2022-03-14" "1.21.0" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -419,8 +419,6 @@ Specifying a username and password is required for .br * \f[I]nijie\f[] -.br -* \f[I]seiga\f[] and optional for @@ -553,6 +551,28 @@ Note: All proxy URLs should include a scheme, otherwise \f[I]http://\f[] is assumed. +.SS extractor.*.source-address +.IP "Type:" 6 +.br +* \f[I]string\f[] +.br +* \f[I]list\f[] with 1 \f[I]string\f[] and 1 \f[I]integer\f[] as elements + +.IP "Example:" 4 +.br +* "192.168.178.20" +.br +* ["192.168.178.20", 8080] + +.IP "Description:" 4 +Client-side IP address to bind to. + +Can be either a simple \f[I]string\f[] with just the local IP address +.br +or a \f[I]list\f[] with IP and explicit port number as elements. +.br + + .SS extractor.*.user-agent .IP "Type:" 6 \f[I]string\f[] @@ -1874,13 +1894,13 @@ port than the default. \f[I]list\f[] of \f[I]strings\f[] .IP "Default:" 9 -\f[I]["images", "attachments", "postfile", "content"]\f[] +\f[I]["images", "image_large", "attachments", "postfile", "content"]\f[] .IP "Description:" 4 Determines the type and order of files to be downloaded. Available types are -\f[I]postfile\f[], \f[I]images\f[], \f[I]attachments\f[], and \f[I]content\f[]. +\f[I]postfile\f[], \f[I]images\f[], \f[I]image_large\f[], \f[I]attachments\f[], and \f[I]content\f[]. .SS extractor.photobucket.subalbums @@ -2228,6 +2248,17 @@ Download video embeds from external sites. Download videos. +.SS extractor.skeb.sent-requests +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download sent requests. + + .SS extractor.skeb.thumbnails .IP "Type:" 6 \f[I]bool\f[] @@ -2331,6 +2362,34 @@ Possible types are \f[I]text\f[], \f[I]quote\f[], \f[I]link\f[], \f[I]answer\f[] You can use \f[I]"all"\f[] instead of listing all types separately. +.SS extractor.twibooru.api-key +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Your \f[I]Twibooru API Key\f[], +to use your account's browsing settings and filters. + + +.SS extractor.twibooru.filter +.IP "Type:" 6 +\f[I]integer\f[] + +.IP "Default:" 9 +\f[I]2\f[] (\f[I]Everything\f[] filter) + +.IP "Description:" 4 +The content filter ID to use. + +Setting an explicit filter ID overrides any default filters and can be used +to access 18+ content without \f[I]API Key\f[]. + +See \f[I]Filters\f[] for details. + + .SS extractor.twitter.cards .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] @@ -2511,18 +2570,6 @@ Control video download behavior. 
* \f[I]false\f[]: Skip video Tweets -.SS extractor.twitter.warnings -.IP "Type:" 6 -\f[I]bool\f[] - -.IP "Default:" 9 -\f[I]false\f[] - -.IP "Description:" 4 -Emit \f[I]logging messages\f[] -for non-fatal errors reported by Twitter's API. - - .SS extractor.unsplash.format .IP "Type:" 6 \f[I]string\f[] @@ -2912,6 +2959,20 @@ Connection timeout during file downloads. Certificate validation during file downloads. +.SS downloader.*.proxy +.IP "Type:" 6 +\f[I]string\f[] or \f[I]object\f[] + +.IP "Default:" 9 +\f[I]extractor.*.proxy\f[] + +.IP "Description:" 4 +Proxy server used for file downloads. +.br +Disable the use of a proxy by explicitly setting this option to \f[I]null\f[]. +.br + + .SS downloader.http.adjust-extensions .IP "Type:" 6 \f[I]bool\f[] @@ -3450,6 +3511,41 @@ Custom format string to build the content of metadata files with. Note: Only applies for \f[I]"mode": "custom"\f[]. +.SS metadata.mtime +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Set modification times for generated metadata files +according to the accompanying downloaded file. + +Enabling this option will only have an effect +*if* there is actual \f[I]mtime\f[] metadata available, that is + +.br +* after a file download (\f[I]"event": "file"\f[] (default), \f[I]"event": "after"\f[]) +.br +* when running *after* an \f[I]mtime\f[] post processor for the same \f[I]event\f[] + +For example, a \f[I]metadata\f[] post processor for \f[I]"event": "post"\f[] will +*not* be able to set its file's modification time unless an \f[I]mtime\f[] +post processor with \f[I]"event": "post"\f[] runs *before* it. + + +.SS mtime.event +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"file"\f[] + +.IP "Description:" 4 +See \f[I]metadata.event\f[] + + .SS mtime.key .IP "Type:" 6 \f[I]string\f[]
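The ordering rule in the metadata.mtime description above maps onto a post-processor chain such as this sketch: for a shared event, the mtime post processor is listed first so that mtime metadata exists by the time the metadata file is written.

    {
        "extractor": {
            "postprocessors": [
                {"name": "mtime", "event": "post"},
                {"name": "metadata", "event": "post", "mtime": true}
            ]
        }
    }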
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 7e3e5cf..009ede8 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.20.5 +Version: 1.21.0 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Author: Mike Fährmann @@ -72,14 +72,14 @@ easily installed or upgraded using pip_: .. code:: bash - $ python3 -m pip install -U gallery-dl + python3 -m pip install -U gallery-dl Installing the latest dev version directly from GitHub can be done with pip_ as well: .. code:: bash - $ python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz + python3 -m pip install -U -I --no-deps --no-cache-dir https://github.com/mikf/gallery-dl/archive/master.tar.gz Note: Windows users should use :code:`py -3` instead of :code:`python3`. @@ -89,7 +89,7 @@ To ensure these packages are up-to-date, run ..
code:: bash - $ python3 -m pip install --upgrade pip setuptools wheel + python3 -m pip install --upgrade pip setuptools wheel Standalone Executable @@ -98,8 +98,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.20.5/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.20.5/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.21.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.21.0/gallery-dl.bin>`__ | Executables built from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -112,7 +112,7 @@ Linux users that are using a distro that is supported by Snapd_ can install *gal .. code:: bash - $ snap install gallery-dl + snap install gallery-dl Chocolatey @@ -122,7 +122,7 @@ Windows users that have Chocolatey_ installed can install *gallery-dl* from the .. code:: powershell - $ choco install gallery-dl + choco install gallery-dl Scoop @@ -132,7 +132,7 @@ Scoop .. code:: powershell - $ scoop install gallery-dl + scoop install gallery-dl Usage @@ -143,7 +143,7 @@ from: .. code:: bash - $ gallery-dl [OPTION]... URL... + gallery-dl [OPTION]... URL... See also :code:`gallery-dl --help`. @@ -155,21 +155,21 @@ Download images; in this case from danbooru via tag search for 'bonocho': .. code:: bash - $ gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" + gallery-dl "https://danbooru.donmai.us/posts?tags=bonocho" -Get the direct URL of an image from a site that requires authentication: +Get the direct URL of an image from a site supporting authentication with username & password: .. code:: bash - $ gallery-dl -g -u "<username>" -p "<password>" "https://seiga.nicovideo.jp/seiga/im3211703" + gallery-dl -g -u "<username>" -p "<password>" "https://twitter.com/i/web/status/604341487988576256" Filter manga chapters by language and chapter number: .. code:: bash - $ gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" + gallery-dl --chapter-filter "lang == 'fr' and 10 <= chapter < 20" "https://mangadex.org/title/2354/" | Search a remote resource for URLs and download images from them: @@ -177,7 +177,7 @@ Filter manga chapters by language and chapter number: .. code:: bash - $ gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" + gallery-dl "r:https://pastebin.com/raw/FLwrCYsT" If a site's address is nonstandard for its extractor, you can prefix the URL with the @@ -185,7 +185,7 @@ extractor's name to force the use of a specific extractor: .. code:: bash - $ gallery-dl "tumblr:https://sometumblrblog.example" + gallery-dl "tumblr:https://sometumblrblog.example" Configuration @@ -233,7 +233,7 @@ Username & Password Some extractors require you to provide valid login credentials in the form of a username & password pair. This is necessary for -``nijie`` and ``seiga`` +``nijie`` and optional for ``aryion``, ``danbooru``, @@ -259,7 +259,7 @@ You can set the necessary information in your configuration file { "extractor": { - "seiga": { + "twitter": { "username": "<username>", "password": "<password>" } @@ -272,8 +272,8 @@ or you can provide them directly via the ..
code:: bash - $ gallery-dl -u <username> -p <password> URL - $ gallery-dl -o username=<username> -o password=<password> URL + gallery-dl -u <username> -p <password> URL + gallery-dl -o username=<username> -o password=<password> URL Cookies @@ -317,7 +317,7 @@ the :code:`--cookies` command-line option: .. code:: bash - $ gallery-dl --cookies "$HOME/path/to/cookies.txt" URL + gallery-dl --cookies "$HOME/path/to/cookies.txt" URL OAuth @@ -335,7 +335,7 @@ To link your account to *gallery-dl*, start by invoking it with .. code:: bash - $ gallery-dl oauth:flickr + gallery-dl oauth:flickr You will be sent to the site's authorization page and asked to grant read access to *gallery-dl*. Authorize it and you will be shown one or more @@ -346,8 +346,8 @@ To authenticate with a ``mastodon`` instance, run *gallery-dl* with .. code:: bash - $ gallery-dl oauth:mastodon:pawoo.net - $ gallery-dl oauth:mastodon:https://mastodon.social/ + gallery-dl oauth:mastodon:pawoo.net + gallery-dl oauth:mastodon:https://mastodon.social/ diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index edcc5e2..4139a4d 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -6,6 +6,7 @@ setup.cfg setup.py data/completion/_gallery-dl data/completion/gallery-dl +data/completion/gallery-dl.fish data/man/gallery-dl.1 data/man/gallery-dl.conf.5 docs/gallery-dl-example.conf @@ -102,6 +103,7 @@ gallery_dl/extractor/kabeuchi.py gallery_dl/extractor/keenspot.py gallery_dl/extractor/kemonoparty.py gallery_dl/extractor/khinsider.py +gallery_dl/extractor/kissgoddess.py gallery_dl/extractor/kohlchan.py gallery_dl/extractor/komikcast.py gallery_dl/extractor/lightroom.py @@ -118,6 +120,7 @@ gallery_dl/extractor/mangapark.py gallery_dl/extractor/mangasee.py gallery_dl/extractor/mangoxo.py gallery_dl/extractor/mastodon.py +gallery_dl/extractor/mememuseum.py gallery_dl/extractor/message.py gallery_dl/extractor/moebooru.py gallery_dl/extractor/myhentaigallery.py @@ -166,9 +169,11 @@ gallery_dl/extractor/speakerdeck.py gallery_dl/extractor/subscribestar.py gallery_dl/extractor/tapas.py gallery_dl/extractor/test.py +gallery_dl/extractor/toyhouse.py gallery_dl/extractor/tsumino.py gallery_dl/extractor/tumblr.py gallery_dl/extractor/tumblrgallery.py +gallery_dl/extractor/twibooru.py gallery_dl/extractor/twitter.py gallery_dl/extractor/unsplash.py gallery_dl/extractor/vanillarock.py diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index d858075..1168d83 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -27,6 +27,12 @@ class DownloaderBase(): self.partdir = util.expand_path(self.partdir) os.makedirs(self.partdir, exist_ok=True) + proxies = self.config("proxy", util.SENTINEL) + if proxies is util.SENTINEL: + self.proxies = job.extractor._proxies + else: + self.proxies = util.build_proxy_map(proxies, self.log) + def config(self, key, default=None): """Interpolate downloader config value for 'key'""" return config.interpolate(("downloader", self.scheme), key, default) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 91ce731..b878f5f 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -121,7 +121,8 @@ class 
HttpDownloader(DownloaderBase): try: response = self.session.request( "GET", url, stream=True, headers=headers, - timeout=self.timeout, verify=self.verify) + timeout=self.timeout, verify=self.verify, + proxies=self.proxies) except (ConnectionError, Timeout) as exc: msg = str(exc) continue diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 462bbf8..2badccf 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -25,6 +25,7 @@ class YoutubeDLDownloader(DownloaderBase): "retries": retries+1 if retries >= 0 else float("inf"), "socket_timeout": self.config("timeout", extractor._timeout), "nocheckcertificate": not self.config("verify", extractor._verify), + "proxy": self.proxies.get("http") if self.proxies else None, } self.ytdl_instance = None diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index b52561e..1bec48e 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -67,6 +67,7 @@ modules = [ "keenspot", "kemonoparty", "khinsider", + "kissgoddess", "kohlchan", "komikcast", "lightroom", @@ -81,6 +82,7 @@ modules = [ "mangapark", "mangasee", "mangoxo", + "mememuseum", "myhentaigallery", "myportfolio", "naver", @@ -123,9 +125,11 @@ modules = [ "speakerdeck", "subscribestar", "tapas", + "toyhouse", "tsumino", "tumblr", "tumblrgallery", + "twibooru", "twitter", "unsplash", "vanillarock", diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index a42ec53..12d98b1 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2021 Mike Fährmann +# Copyright 2015-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -41,9 +41,9 @@ class BooruExtractor(BaseExtractor): page_html = self._extended_tags(post) if notes: self._notes(post, page_html) - self._prepare(post) - post.update(data) text.nameext_from_url(url, post) + post.update(data) + self._prepare(post) yield Message.Directory, post yield Message.Url, url, post diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5a2d3a3..e3559f9 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -55,6 +55,7 @@ class Extractor(): self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) + self._proxies = util.build_proxy_map(self.config("proxy"), self.log) self._interval = util.build_duration_func( self.config("sleep-request", self.request_interval), self.request_interval_min, @@ -65,7 +66,6 @@ class Extractor(): self._init_session() self._init_cookies() - self._init_proxies() @classmethod def from_url(cls, url): @@ -104,10 +104,12 @@ class Extractor(): def request(self, url, *, method="GET", session=None, retries=None, encoding=None, fatal=True, notfound=None, **kwargs): - if retries is None: - retries = self._retries if session is None: session = self.session + if retries is None: + retries = self._retries + if "proxies" not in kwargs: + kwargs["proxies"] = self._proxies if "timeout" not in kwargs: kwargs["timeout"] = self._timeout if "verify" not in kwargs: @@ -289,20 +291,6 @@ class Extractor(): session.mount("https://", adapter) session.mount("http://", adapter) - def _init_proxies(self): - """Update the session's proxy map""" - proxies = self.config("proxy") - if proxies: - if isinstance(proxies, str): - 
proxies = {"http": proxies, "https": proxies} - if isinstance(proxies, dict): - for scheme, proxy in proxies.items(): - if "://" not in proxy: - proxies[scheme] = "http://" + proxy.lstrip("/") - self.session.proxies = proxies - else: - self.log.warning("invalid proxy specifier: %s", proxies) - def _init_cookies(self): """Populate the session's cookiejar""" self._cookiefile = None @@ -371,20 +359,25 @@ class Extractor(): for cookie in self._cookiejar: if cookie.name in names and ( not domain or cookie.domain == domain): + if cookie.expires: diff = int(cookie.expires - now) + if diff <= 0: self.log.warning( "Cookie '%s' has expired", cookie.name) + continue + elif diff <= 86400: hours = diff // 3600 self.log.warning( "Cookie '%s' will expire in less than %s hour%s", cookie.name, hours + 1, "s" if hours else "") - else: - names.discard(cookie.name) - if not names: - return True + continue + + names.discard(cookie.name) + if not names: + return True return False def _prepare_ddosguard_cookies(self): @@ -616,8 +609,7 @@ class BaseExtractor(Extractor): if index: self.category, self.root = self.instances[index-1] if not self.root: - url = text.ensure_http_scheme(match.group(0)) - self.root = url[:url.index("/", 8)] + self.root = text.root_from_url(match.group(0)) else: self.root = group self.category = group.partition("://")[2] diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 94fec16..fda7220 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2021 Mike Fährmann +# Copyright 2015-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -1004,6 +1004,7 @@ class DeviantartOAuthAPI(): self.extractor = extractor self.log = extractor.log self.headers = {"dA-minor-version": "20200519"} + self._warn_429 = True self.delay = extractor.config("wait-min", 0) self.delay_min = max(2, self.delay) @@ -1260,6 +1261,16 @@ class DeviantartOAuthAPI(): if self.delay < 30: self.delay += 1 self.log.warning("%s. 
Using %ds delay.", msg, self.delay) + + if self._warn_429 and self.delay >= 3: + self._warn_429 = False + if self.client_id == self.CLIENT_ID: + self.log.info( + "Register your own OAuth application and use its " + "credentials to prevent this error: " + "https://github.com/mikf/gallery-dl/blob/master/do" + "cs/configuration.rst#extractordeviantartclient-id" + "--client-secret") else: self.log.error(msg) return data diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index ef79808..11436cb 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -51,19 +51,16 @@ class FanboxExtractor(Extractor): url = text.ensure_http_scheme(url) body = self.request(url, headers=headers).json()["body"] for item in body["items"]: - yield self._process_post(item) + yield self._get_post_data(item["id"]) url = body["nextUrl"] - def _get_post_data_from_id(self, post_id): + def _get_post_data(self, post_id): """Fetch and process post data""" headers = {"Origin": self.root} url = "https://api.fanbox.cc/post.info?postId="+post_id post = self.request(url, headers=headers).json()["body"] - return self._process_post(post) - - def _process_post(self, post): content_body = post.pop("body", None) if content_body: if "html" in content_body: @@ -279,7 +276,7 @@ class FanboxPostExtractor(FanboxExtractor): self.post_id = match.group(3) def posts(self): - return (self._get_post_data_from_id(self.post_id),) + return (self._get_post_data(self.post_id),) class FanboxRedirectExtractor(Extractor): diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 89a965f..c05ec39 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -8,6 +8,7 @@ from .common import Extractor, Message from .. import text +import json class FantiaExtractor(Extractor): @@ -29,7 +30,9 @@ class FantiaExtractor(Extractor): for post_id in self.posts(): full_response, post = self._get_post_data(post_id) yield Message.Directory, post + post["num"] = 0 for url, url_data in self._get_urls_from_post(full_response, post): + post["num"] += 1 fname = url_data["content_filename"] or url text.nameext_from_url(fname, url_data) url_data["file_url"] = url @@ -90,14 +93,39 @@ class FantiaExtractor(Extractor): post["content_title"] = content["title"] post["content_filename"] = content.get("filename", "") post["content_id"] = content["id"] + + if "comment" in content: + post["content_comment"] = content["comment"] + if "post_content_photos" in content: for photo in content["post_content_photos"]: post["file_id"] = photo["id"] yield photo["url"]["original"], post + if "download_uri" in content: post["file_id"] = content["id"] yield self.root+"/"+content["download_uri"], post + if content["category"] == "blog" and "comment" in content: + comment_json = json.loads(content["comment"]) + ops = comment_json.get("ops", ()) + + # collect blogpost text first + blog_text = "" + for op in ops: + insert = op.get("insert") + if isinstance(insert, str): + blog_text += insert + post["blogpost_text"] = blog_text + + # collect images + for op in ops: + insert = op.get("insert") + if isinstance(insert, dict) and "fantiaImage" in insert: + img = insert["fantiaImage"] + post["file_id"] = img["id"] + yield "https://fantia.jp" + img["original_url"], post + class FantiaCreatorExtractor(FantiaExtractor): """Extractor for a Fantia creator's works""" diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 7dd047c..b4f433b 100644 --- 
a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -57,6 +57,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor): self.root = text.ensure_http_scheme(root) url = "{}/story/{}/".format(self.root, self.slug) GalleryExtractor.__init__(self, match, url) + self.session.headers["Referer"] = url def metadata(self, page): title = text.extract(page, "<title>", "</title>")[0] diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 9370840..7cd67d6 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2021 Mike Fährmann +# Copyright 2014-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,45 +10,40 @@ from .common import Extractor, Message from .. import text, exception +import re class ImagebamExtractor(Extractor): """Base class for imagebam extractors""" category = "imagebam" root = "https://www.imagebam.com" - cookies = None def __init__(self, match): Extractor.__init__(self, match) - self.key = match.group(1) - if self.cookies: - self.session.cookies = self.cookies - - def get_image_data(self, data): - page_url = "{}/image/{}".format(self.root, data["image_key"]) - page = self.request(page_url).text - image_url, pos = text.extract(page, '<img src="https://images', '"') - - if not image_url: - # cache cookies - ImagebamExtractor.cookies = self.session.cookies - # repeat request to get past "Continue to your image" pages - page = self.request(page_url).text - image_url, pos = text.extract( - page, '<img src="https://images', '"') + self.path = match.group(1) + self.session.cookies.set("nsfw_inter", "1", domain="www.imagebam.com") + def _parse_image_page(self, path): + page = self.request(self.root + path).text + url, pos = text.extract(page, '<img src="https://images', '"') filename = text.unescape(text.extract(page, 'alt="', '"', pos)[0]) - data["url"] = "https://images" + image_url + + data = { + "url" : "https://images" + url, + "image_key": path.rpartition("/")[2], + } data["filename"], _, data["extension"] = filename.rpartition(".") + return data class ImagebamGalleryExtractor(ImagebamExtractor): - """Extractor for image galleries from imagebam.com""" + """Extractor for imagebam galleries""" subcategory = "gallery" directory_fmt = ("{category}", "{title} {gallery_key}") filename_fmt = "{num:>03} {filename}.{extension}" archive_fmt = "{gallery_key}_{image_key}" - pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)" + pattern = (r"(?:https?://)?(?:www\.)?imagebam\.com" + r"(/(?:gallery/|view/G)[a-zA-Z0-9]+)") test = ( ("https://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { "url": "76d976788ae2757ac81694736b07b72356f5c4c8", @@ -63,50 +58,56 @@ class ImagebamGalleryExtractor(ImagebamExtractor): ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { "exception": exception.HttpError, }), + # /view/ path (#2378) + ("https://www.imagebam.com/view/GA3MT1", { + "url": "35018ce1e00a2d2825a33d3cd37857edaf804919", + "keyword": "3a9f98178f73694c527890c0d7ca9a92b46987ba", + }), ) def items(self): - url = "{}/gallery/{}".format(self.root, self.key) - page = self.request(url).text + page = self.request(self.root + self.path).text - data = self.get_metadata(page) - keys = self.get_image_keys(page) - keys.reverse() - data["count"] = len(keys) - data["gallery_key"] = self.key + images = 
self.images(page) + images.reverse() + + data = self.metadata(page) + data["count"] = len(images) + data["gallery_key"] = self.path.rpartition("/")[2] yield Message.Directory, data - for data["num"], data["image_key"] in enumerate(keys, 1): - self.get_image_data(data) - yield Message.Url, data["url"], data + for data["num"], path in enumerate(images, 1): + image = self._parse_image_page(path) + image.update(data) + yield Message.Url, image["url"], image @staticmethod - def get_metadata(page): - """Return gallery metadata""" - title = text.extract(page, 'id="gallery-name">', '<')[0] - return {"title": text.unescape(title.strip())} - - def get_image_keys(self, page): - """Return a list of all image keys""" - keys = [] + def metadata(page): + return {"title": text.unescape(text.extract( + page, 'id="gallery-name">', '<')[0].strip())} + + def images(self, page): + findall = re.compile(r'<a href="https://www\.imagebam\.com' + r'(/(?:image/|view/M)[a-zA-Z0-9]+)').findall + + paths = [] while True: - keys.extend(text.extract_iter( - page, '<a href="https://www.imagebam.com/image/', '"')) + paths += findall(page) pos = page.find('rel="next" aria-label="Next') if pos > 0: url = text.rextract(page, 'href="', '"', pos)[0] if url: page = self.request(url).text continue - return keys + return paths class ImagebamImageExtractor(ImagebamExtractor): - """Extractor for single images from imagebam.com""" + """Extractor for single imagebam images""" subcategory = "image" archive_fmt = "{image_key}" pattern = (r"(?:https?://)?(?:\w+\.)?imagebam\.com" - r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)") + r"(/(?:image/|view/M|(?:[0-9a-f]{2}/){3})[a-zA-Z0-9]+)") test = ( ("https://www.imagebam.com/image/94d56c502511890", { "url": "5e9ba3b1451f8ded0ae3a1b84402888893915d4a", @@ -118,10 +119,19 @@ class ImagebamImageExtractor(ImagebamExtractor): ("https://www.imagebam.com/image/0850951366904951", { "url": "d37297b17ed1615b4311c8ed511e50ce46e4c748", }), + # /view/ path (#2378) + ("https://www.imagebam.com/view/ME8JOQP", { + "url": "4dca72bbe61a0360185cf4ab2bed8265b49565b8", + "keyword": "15a494c02fd30846b41b42a26117aedde30e4ceb", + "content": "f81008666b17a42d8834c4749b910e1dc10a6e83", + }), ) def items(self): - data = {"image_key": self.key} - self.get_image_data(data) - yield Message.Directory, data - yield Message.Url, data["url"], data + path = self.path + if path[3] == "/": + path = ("/view/" if path[10] == "M" else "/image/") + path[10:] + + image = self._parse_image_page(path) + yield Message.Directory, image + yield Message.Url, image["url"], image diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index b898e3b..9537263 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -14,7 +14,7 @@ from ..cache import cache import itertools import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?(kemono|coomer)\.party" +BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.party" USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" @@ -23,15 +23,15 @@ class KemonopartyExtractor(Extractor): category = "kemonoparty" root = "https://kemono.party" directory_fmt = ("{category}", "{service}", "{user}") - filename_fmt = "{id}_{title}_{num:>02}_{filename}.{extension}" + filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}" archive_fmt = "{service}_{user}_{id}_{num}" cookiedomain = ".kemono.party" def __init__(self, match): if match.group(1) == "coomer": self.category = "coomerparty" - self.root = "https://coomer.party" 
self.cookiedomain = ".coomer.party" + self.root = text.root_from_url(match.group(0)) Extractor.__init__(self, match) def items(self): @@ -291,6 +291,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor): }), ("https://kemono.party/subscribestar/user/alcorart/post/184330"), ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"), + ("https://beta.kemono.party/subscribestar/user/alcorart/post/184330"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/kissgoddess.py b/gallery_dl/extractor/kissgoddess.py new file mode 100644 index 0000000..85ec806 --- /dev/null +++ b/gallery_dl/extractor/kissgoddess.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://kissgoddess.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, exception + + +class KissgoddessGalleryExtractor(GalleryExtractor): + """Extractor for image galleries on kissgoddess.com""" + category = "kissgoddess" + root = "https://kissgoddess.com" + pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/album/(\d+)" + test = ("https://kissgoddess.com/album/18285.html", { + "pattern": r"https://pic\.kissgoddess\.com" + r"/gallery/16473/18285/s/\d+\.jpg", + "count": 8, + "keyword": { + "gallery_id": 18285, + "title": "[Young Champion Extra] 2016.02 No.03 菜乃花 安枝瞳 葉月あや", + }, + }) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/album/{}.html".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + return { + "gallery_id": text.parse_int(self.gallery_id), + "title" : text.extract( + page, '<title>', "<")[0].rpartition(" | ")[0], + } + + def images(self, page): + pnum = 1 + + while page: + for url in text.extract_iter(page, "<img src='", "'"): + yield url, None + + pnum += 1 + url = "{}/album/{}_{}.html".format( + self.root, self.gallery_id, pnum) + try: + page = self.request(url).text + except exception.HttpError: + return + + +class KissgoddessModelExtractor(Extractor): + """Extractor for all galleries of a model on kissgoddess.com""" + category = "kissgoddess" + subcategory = "model" + root = "https://kissgoddess.com" + pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/people/([^./?#]+)" + test = ("https://kissgoddess.com/people/aya-hazuki.html", { + "pattern": KissgoddessGalleryExtractor.pattern, + "count": ">= 7", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.model = match.group(1) + + def items(self): + url = "{}/people/{}.html".format(self.root, self.model) + page = self.request(url).text + + data = {"_extractor": KissgoddessGalleryExtractor} + for path in text.extract_iter(page, 'thumb"><a href="/album/', '"'): + url = self.root + "/album/" + path + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index c63fa51..43377bd 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -44,7 +44,7 @@ class LolisafelbumExtractor(LolisafeExtractor): }), # mp4 (#2239) ("https://bunkr.is/a/ptRHaCn2", { - "pattern": r"https://cdn\.bunkr\.is/_-RnHoW69L\.mp4", + "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4", "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471", }), ("https://bunkr.to/a/Lktg9Keq"), @@ -73,9 +73,8 @@ class 
LolisafelbumExtractor(LolisafeExtractor): data["name"], sep, data["id"] = data["filename"].rpartition("-") if data["extension"] == "mp4": - data["_http_validate"] = self._check_rewrite - else: - data["_http_validate"] = None + url = url.replace( + "//cdn.bunkr.is/", "//media-files.bunkr.is/", 1) yield Message.Url, url, data def fetch_album(self, album_id): @@ -87,13 +86,3 @@ class LolisafelbumExtractor(LolisafeExtractor): "album_name": text.unescape(data["title"]), "count" : data["count"], } - - @staticmethod - def _check_rewrite(response): - if response.history and response.headers.get( - "Content-Type").startswith("text/html"): - # consume content to reuse connection - response.content - # rewrite to download URL - return response.url.replace("/v/", "/d/", 1) - return True diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 152da4f..7194757 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -53,7 +53,10 @@ class MangadexExtractor(Extractor): cattributes = chapter["attributes"] mattributes = manga["attributes"] - lang = cattributes["translatedLanguage"].partition("-")[0] + + lang = cattributes.get("translatedLanguage") + if lang: + lang = lang.partition("-")[0] if cattributes["chapter"]: chnum, sep, minor = cattributes["chapter"].partition(".") diff --git a/gallery_dl/extractor/mememuseum.py b/gallery_dl/extractor/mememuseum.py new file mode 100644 index 0000000..1de0d76 --- /dev/null +++ b/gallery_dl/extractor/mememuseum.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://meme.museum/""" + +from .common import Extractor, Message +from .. 
import text + + +class MememuseumExtractor(Extractor): + """Base class for meme.museum extractors""" + basecategory = "booru" + category = "mememuseum" + filename_fmt = "{category}_{id}_{md5}.{extension}" + archive_fmt = "{id}" + root = "https://meme.museum" + + def items(self): + data = self.metadata() + + for post in self.posts(): + url = post["file_url"] + for key in ("id", "width", "height"): + post[key] = text.parse_int(post[key]) + post["tags"] = text.unquote(post["tags"]) + post.update(data) + yield Message.Directory, post + yield Message.Url, url, text.nameext_from_url(url, post) + + def metadata(self): + """Return general metadata""" + return () + + def posts(self): + """Return an iterable containing data of all relevant posts""" + return () + + +class MememuseumTagExtractor(MememuseumExtractor): + """Extractor for images from meme.museum by search-tags""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + pattern = r"(?:https?://)?meme\.museum/post/list/([^/?#]+)" + test = ("https://meme.museum/post/list/animated/1", { + "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20", + "count": ">= 30" + }) + per_page = 25 + + def __init__(self, match): + MememuseumExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1)) + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + pnum = 1 + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + extr = text.extract_from(self.request(url).text) + + while True: + mime = extr("data-mime='", "'") + if not mime: + break + + pid = extr("data-post-id='", "'") + tags, dimensions, size = extr("title='", "'").split(" // ") + md5 = extr("/_thumbs/", "/") + width, _, height = dimensions.partition("x") + + yield { + "file_url": "{}/_images/{}/{}%20-%20{}.{}".format( + self.root, md5, pid, text.quote(tags), + mime.rpartition("/")[2]), + "id": pid, "md5": md5, "tags": tags, + "width": width, "height": height, + "size": text.parse_bytes(size[:-1]), + } + + if not extr(">Next<", ">"): + return + pnum += 1 + + +class MememuseumPostExtractor(MememuseumExtractor): + """Extractor for single images from meme.museum""" + subcategory = "post" + pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)" + test = ("https://meme.museum/post/view/10243", { + "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997" + r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm" + r"an%20stallman%20tagme%20text\.jpg", + "keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e", + "content": "45565f3f141fc960a8ae1168b80e718a494c52d2", + }) + + def __init__(self, match): + MememuseumExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + url = "{}/post/view/{}".format(self.root, self.post_id) + extr = text.extract_from(self.request(url).text) + + return ({ + "id" : self.post_id, + "tags" : extr(": ", "<"), + "md5" : extr("/_thumbs/", "/"), + "file_url": self.root + extr("id='main_image' src='", "'"), + "width" : extr("data-width=", " ").strip("'\""), + "height" : extr("data-height=", " ").strip("'\""), + "size" : 0, + },) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 54e2040..6d0e94b 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -103,7 +103,7 @@ class NewgroundsExtractor(Extractor): } def extract_post(self, post_url): - + url = post_url if "/art/view/" in post_url: extract_data = self._extract_image_data elif "/audio/listen/" in post_url: @@ -111,18 
+111,19 @@ class NewgroundsExtractor(Extractor): else: extract_data = self._extract_media_data if self.flash: - post_url += "/format/flash" + url += "/format/flash" - response = self.request(post_url, fatal=False) + response = self.request(url, fatal=False) if response.status_code >= 400: return {} page = response.text extr = text.extract_from(page) data = extract_data(extr, post_url) - data["_comment"] = extr('id="author_comments"', '</div>') + data["_comment"] = extr( + 'id="author_comments"', '</div>').partition(">")[2] data["comment"] = text.unescape(text.remove_html( - data["_comment"].partition(">")[2], "", "")) + data["_comment"], "", "")) data["favorites"] = text.parse_int(extr( 'id="faves_load">', '<').replace(",", "")) data["score"] = text.parse_float(extr('id="score_number">', '<')) @@ -134,6 +135,7 @@ class NewgroundsExtractor(Extractor): data["tags"].sort() data["user"] = self.user or data["artist"][0] + data["post_url"] = post_url return data @staticmethod @@ -171,6 +173,7 @@ class NewgroundsExtractor(Extractor): def _extract_media_data(self, extr, url): index = url.split("/")[5] title = extr('"og:title" content="', '"') + descr = extr('"og:description" content="', '"') src = extr('{"url":"', '"') if src: @@ -209,7 +212,7 @@ class NewgroundsExtractor(Extractor): "title" : text.unescape(title), "url" : src, "date" : date, - "description": text.unescape(extr( + "description": text.unescape(descr or extr( 'itemprop="description" content="', '"')), "rating" : extr('class="rated-', '"'), "index" : text.parse_int(index), @@ -319,6 +322,7 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): "artist" : ["kickinthehead", "danpaladin", "tomfulp"], "comment" : "re:My fan trailer for Alien Hominid HD!", "date" : "dt:2013-02-01 09:50:49", + "description": "Fan trailer for Alien Hominid HD!", "favorites" : int, "filename" : "564957_alternate_31", "index" : 595355, diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 6812f35..428f772 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2021 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -399,7 +399,7 @@ class OAuthPixiv(OAuthBase): if "error" in data: print(data) - if data["error"] == "invalid_request": + if data["error"] in ("invalid_request", "invalid_grant"): print("'code' expired, try again") return @@ -417,6 +417,10 @@ class OAuthPixiv(OAuthBase): 2) Login 3) Select the last network monitor entry ('callback?state=...') 4) Copy its 'code' query parameter, paste it below, and press Enter + +- This 'code' will expire 30 seconds after logging in. +- Copy-pasting more than just the 'code' value will work as well, + like the entire URL or several query parameters. 
""") code = input("code: ") return code.rpartition("=")[2].strip() diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 051f1ef..35a015f 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -71,6 +71,15 @@ class PatreonExtractor(Extractor): name = image.get("file_name") or self._filename(url) or url yield "image", url, name + def _image_large(self, post): + image = post.get("image") + if image: + url = image.get("large_url") + if url: + name = image.get("file_name") or self._filename(url) or url + return (("image_large", url, name),) + return () + def _attachments(self, post): for attachment in post["attachments"]: url = self.request( @@ -212,10 +221,11 @@ class PatreonExtractor(Extractor): def _build_file_generators(self, filetypes): if filetypes is None: - return (self._images, self._attachments, - self._postfile, self._content) + return (self._images, self._image_large, + self._attachments, self._postfile, self._content) genmap = { "images" : self._images, + "image_large": self._image_large, "attachments": self._attachments, "postfile" : self._postfile, "content" : self._content, diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py index bf38a77..22c9487 100644 --- a/gallery_dl/extractor/seiga.py +++ b/gallery_dl/extractor/seiga.py @@ -1,16 +1,15 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://seiga.nicovideo.jp/""" +"""Extractors for https://seiga.nicovideo.jp/""" from .common import Extractor, Message from .. 
import text, util, exception -from ..cache import cache class SeigaExtractor(Extractor): @@ -25,7 +24,9 @@ class SeigaExtractor(Extractor): self.start_image = 0 def items(self): - self.login() + if not self._check_cookies(("user_session",)): + raise exception.StopExtraction("'user_session' cookie required") + images = iter(self.get_images()) data = next(images) @@ -45,28 +46,6 @@ class SeigaExtractor(Extractor): url, method="HEAD", allow_redirects=False, notfound="image") return response.headers["Location"].replace("/o/", "/priv/", 1) - def login(self): - """Login and set necessary cookies""" - if not self._check_cookies(("user_session",)): - username, password = self._get_auth_info() - self._update_cookies(self._login_impl(username, password)) - - @cache(maxage=7*24*3600, keyarg=1) - def _login_impl(self, username, password): - if not username or not password: - raise exception.AuthenticationError( - "Username and password required") - - self.log.info("Logging in as %s", username) - url = "https://account.nicovideo.jp/api/v1/login" - data = {"mail_tel": username, "password": password} - - self.request(url, method="POST", data=data) - if "user_session" not in self.session.cookies: - raise exception.AuthenticationError() - del self.session.cookies["nicosid"] - return self.session.cookies - class SeigaUserExtractor(SeigaExtractor): """Extractor for images of a user from seiga.nicovideo.jp""" diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 2c806ad..965391c 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -22,10 +22,11 @@ class SkebExtractor(Extractor): Extractor.__init__(self, match) self.user_name = match.group(1) self.thumbnails = self.config("thumbnails", False) + self.sent_requests = self.config("sent-requests", False) def items(self): - for post_num in self.posts(): - response, post = self._get_post_data(post_num) + for user_name, post_num in self.posts(): + response, post = self._get_post_data(user_name, post_num) yield Message.Directory, post for data in self._get_urls_from_post(response, post): url = data["file_url"] @@ -38,24 +39,33 @@ class SkebExtractor(Extractor): url = "{}/api/users/{}/works".format(self.root, self.user_name) params = {"role": "creator", "sort": "date", "offset": 0} headers = {"Referer": self.root, "Authorization": "Bearer null"} + do_requests = self.sent_requests while True: posts = self.request(url, params=params, headers=headers).json() for post in posts: post_num = post["path"].rpartition("/")[2] + user_name = post["path"].split("/")[1][1:] if post["private"]: - self.log.debug("Skipping %s (private)", post_num) + self.log.debug("Skipping @%s/%s (private)", + user_name, post_num) continue - yield post_num + yield user_name, post_num if len(posts) < 30: - return + if do_requests: + params["offset"] = 0 + params['role'] = "client" + do_requests = False + continue + else: + return params["offset"] += 30 - def _get_post_data(self, post_num): + def _get_post_data(self, user_name, post_num): url = "{}/api/users/{}/works/{}".format( - self.root, self.user_name, post_num) + self.root, user_name, post_num) headers = {"Referer": self.root, "Authorization": "Bearer null"} resp = self.request(url, headers=headers).json() creator = resp["creator"] @@ -130,7 +140,7 @@ class SkebPostExtractor(SkebExtractor): self.post_num = match.group(2) def posts(self): - return (self.post_num,) + return ((self.user_name, self.post_num),) class SkebUserExtractor(SkebExtractor): diff --git a/gallery_dl/extractor/slideshare.py 
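The skeb changes above add a `sent-requests` option: once the creator's own works are exhausted, pagination restarts with `role` set to "client". A hypothetical gallery-dl.conf fragment enabling it, emitted via Python to keep the example runnable; the key name matches the `config()` lookup in `SkebExtractor`:

```python
import json

# Hypothetical configuration fragment for the new skeb option.
conf = {"extractor": {"skeb": {"sent-requests": True}}}
print(json.dumps(conf, indent=4))
```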
b/gallery_dl/extractor/slideshare.py index 91386e8..557c9fb 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2021 Mike Fährmann, Leonardo Taccari +# Copyright 2016-2022 Mike Fährmann, Leonardo Taccari # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,11 +8,12 @@ """Extractors for https://www.slideshare.net/""" -from .common import Extractor, Message +from .common import GalleryExtractor from .. import text +import json -class SlidesharePresentationExtractor(Extractor): +class SlidesharePresentationExtractor(GalleryExtractor): """Extractor for images from a presentation on slideshare.net""" category = "slideshare" subcategory = "presentation" @@ -24,13 +25,36 @@ class SlidesharePresentationExtractor(Extractor): test = ( (("https://www.slideshare.net" "/Slideshare/get-started-with-slide-share"), { - "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18", - "content": "2e90a01c6ca225579ebf8f98ab46f97a28a5e45c", + "pattern": r"https://image\.slidesharecdn\.com/getstartedwithslide" + r"share-150520173821-lva1-app6892/95/get-started-with-s" + r"lide-share-\d+-1024\.jpg\?cb=\d+", + "count": 19, + "content": "2b6a191eab60b3978fdacfecf2da302dd45bc108", + "keyword": { + "comments": "0", + "description": "Get Started with SlideShare - " + "A Beginngers Guide for Creators", + "likes": r"re:\d{3,}", + "presentation": "get-started-with-slide-share", + "published": "dt:2015-05-20 00:00:00", + "title": "Getting Started With SlideShare", + "user": "Slideshare", + "views": r"re:\d{7,}", + }, }), - # long title + # long title and description (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren" "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), { "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7", + "keyword": { + "title": "Warum Sie nicht Ihren Mitarbeitenden ändern " + "sollten, sondern Ihr Managementsystem", + "description": "Mitarbeitende verhalten sich mehrheitlich so, " + "wie das System es ihnen vorgibt. 
Welche Voraus" + "setzungen es braucht, damit Ihre Mitarbeitende" + "n ihr ganzes Herzblut einsetzen, bespricht Fre" + "di Schmidli in diesem Referat.", + }, }), # mobile URL (("https://www.slideshare.net" @@ -40,48 +64,50 @@ class SlidesharePresentationExtractor(Extractor): ) def __init__(self, match): - Extractor.__init__(self, match) self.user, self.presentation = match.groups() + url = "https://www.slideshare.net/{}/{}".format( + self.user, self.presentation) + GalleryExtractor.__init__(self, match, url) - def items(self): - page = self.request("https://www.slideshare.net/" + self.user + - "/" + self.presentation).text - data = self.get_job_metadata(page) - imgs = self.get_image_urls(page) - data["count"] = len(imgs) - yield Message.Directory, data - for data["num"], url in enumerate(imgs, 1): - yield Message.Url, url, text.nameext_from_url(url, data) + def metadata(self, page): + extr = text.extract_from(page) + descr = extr('<meta name="description" content="', '"') + title = extr('<span class="j-title-breadcrumb">', '</span>') + published = extr('<div class="metadata-item">', '</div>') + comments = extr('content="UserComments:', '"') + likes = extr('content="UserLikes:', '"') + views = extr('content="UserPageVisits:', '"') - def get_job_metadata(self, page): - """Collect metadata for extractor-job""" - descr, pos = text.extract( - page, '<meta name="description" content="', '"') - category, pos = text.extract( - page, '<div class="metadata-item">', '</div>', pos) - views, pos = text.extract( - page, '<div class="metadata-item">', '</div>', pos) - published, pos = text.extract( - page, '<div class="metadata-item">', '</div>', pos) - title, pos = text.extract( - page, '<span class="j-title-breadcrumb">', '</span>', pos) - alt_descr, pos = text.extract( - page, '<p class="slideshow-description notranslate">', '</p>', pos) - - if descr.endswith("…") and alt_descr: - descr = text.remove_html(alt_descr).strip() + if descr.endswith("…"): + alt_descr = extr( + 'id="slideshow-description-text" class="notranslate">', '</p>') + if alt_descr: + descr = text.remove_html(alt_descr).strip() return { "user": self.user, "presentation": self.presentation, "title": text.unescape(title.strip()), "description": text.unescape(descr), - "views": text.parse_int(views.rpartition( - " views")[0].replace(",", "")), - "published": published.strip(), + "views": views, + "likes": likes, + "comments": comments, + "published": text.parse_datetime( + published.strip(), "%b. 
%d, %Y"), } @staticmethod - def get_image_urls(page): - """Extract and return a list of all image-urls""" - return list(text.extract_iter(page, 'data-full="', '"')) + def images(page): + data = json.loads(text.extract( + page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0]) + + # useing 'stripped_title' here is technically wrong, but it works all + # the same, slideshare doesn't seem to care what characters go there + begin = "https://image.slidesharecdn.com/{}/95/{}-".format( + data["ppt_location"], data["stripped_title"]) + end = "-1024.jpg?cb=" + str(data["timestamp"]) + + return [ + (begin + str(n) + end, None) + for n in range(1, data["slide_count"]+1) + ] diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 69e3854..b57013a 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2021 Mike Fährmann +# Copyright 2020-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -105,7 +105,7 @@ class SubscribestarExtractor(Extractor): att, 'data-upload-id="', '"')[0]), "name": text.unescape(text.extract( att, 'doc_preview-title">', '<')[0] or ""), - "url" : text.extract(att, 'href="', '"')[0], + "url" : text.unescape(text.extract(att, 'href="', '"')[0]), "type": "attachment", }) diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py new file mode 100644 index 0000000..c6be38d --- /dev/null +++ b/gallery_dl/extractor/toyhouse.py @@ -0,0 +1,173 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://toyhou.se/""" + +from .common import Extractor, Message +from .. 
import text, util + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?toyhou\.se" + + +class ToyhouseExtractor(Extractor): + """Base class for toyhouse extractors""" + category = "toyhouse" + root = "https://toyhou.se" + directory_fmt = ("{category}", "{user|artists!S}") + archive_fmt = "{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + self.offset = 0 + + def items(self): + metadata = self.metadata() + + for post in util.advance(self.posts(), self.offset): + if metadata: + post.update(metadata) + text.nameext_from_url(post["url"], post) + post["id"], _, post["hash"] = post["filename"].partition("_") + yield Message.Directory, post + yield Message.Url, post["url"], post + + def posts(self): + return () + + def metadata(self): + return None + + def skip(self, num): + self.offset += num + return num + + def _parse_post(self, post, needle='<a href="'): + extr = text.extract_from(post) + return { + "url": extr(needle, '"'), + "date": text.parse_datetime(extr( + 'Credits\n</h2>\n<div class="mb-1">', '<'), + "%d %b %Y, %I:%M:%S %p"), + "artists": [ + text.remove_html(artist) + for artist in extr( + '<div class="artist-credit">', '</div>\n</div>').split( + '<div class="artist-credit">') + ], + "characters": text.split_html(extr( + '<div class="image-characters', '</div>\n</div>'))[2:], + } + + def _pagination(self, path): + url = self.root + path + params = {"page": 1} + + while True: + page = self.request(url, params=params).text + + cnt = 0 + for post in text.extract_iter( + page, '<li class="gallery-item">', '</li>'): + cnt += 1 + yield self._parse_post(post) + + if cnt == 0 and params["page"] == 1: + token, pos = text.extract( + page, '<input name="_token" type="hidden" value="', '"') + if not token: + return + data = { + "_token": token, + "user" : text.extract(page, 'value="', '"', pos)[0], + } + self.request(self.root + "/~account/warnings/accept", + method="POST", data=data, allow_redirects=False) + continue + + if cnt < 18: + return + params["page"] += 1 + + +class ToyhouseArtExtractor(ToyhouseExtractor): + """Extractor for artworks of a toyhouse user""" + subcategory = "art" + pattern = BASE_PATTERN + r"/([^/?#]+)/art" + + test = ( + ("https://www.toyhou.se/d-floe/art", { + "range": "1-30", + "count": 30, + "pattern": r"https://f\d+\.toyhou\.se/file/f\d+-toyhou-se" + r"/images/\d+_\w+\.\w+$", + "keyword": { + "artists": list, + "characters": list, + "date": "type:datetime", + "hash": r"re:\w+", + "id": r"re:\d+", + "url": str, + "user": "d-floe", + }, + }), + # protected by Content Warning + ("https://www.toyhou.se/kroksoc/art", { + "count": ">= 19", + }), + ) + + def posts(self): + return self._pagination("/{}/art".format(self.user)) + + def metadata(self): + return {"user": self.user} + + +class ToyhouseImageExtractor(ToyhouseExtractor): + """Extractor for individual toyhouse images""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:" + r"(?:www\.)?toyhou\.se/~images|" + r"f\d+\.toyhou\.se/file/[^/?#]+/(?:image|watermark)s" + r")/(\d+)") + test = ( + ("https://toyhou.se/~images/40587320", { + "content": "058ec8427977ab432c4cc5be5a6dd39ce18713ef", + "keyword": { + "artists": ["d-floe"], + "characters": ["Sumi"], + "date": "dt:2021-10-08 01:32:47", + "extension": "png", + "filename": "40587320_TT1NaBUr3FLkS1p", + "hash": "TT1NaBUr3FLkS1p", + "id": "40587320", + "url": "https://f2.toyhou.se/file/f2-toyhou-se/images" + "/40587320_TT1NaBUr3FLkS1p.png", + }, + }), + # direct link, multiple artists + 
(("https://f2.toyhou.se/file/f2-toyhou-se" + "/watermarks/36817425_bqhGcwcnU.png?1625561467"), { + "keyword": { + "artists": [ + "http://aminoapps.com/p/92sf3z", + "kroksoc (Color)"], + "characters": ["❀Reiichi❀"], + "date": "dt:2021-07-03 20:02:02", + "hash": "bqhGcwcnU", + "id": "36817425", + }, + }), + ("https://f2.toyhou.se/file/f2-toyhou-se" + "/images/40587320_TT1NaBUr3FLkS1p.png"), + ) + + def posts(self): + url = "{}/~images/{}".format(self.root, self.user) + return (self._parse_post(self.request(url).text, '<img src="'),) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 358bc95..fbe641d 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -400,6 +400,15 @@ class TumblrAPI(oauth.OAuth1API): t = (datetime.now() + timedelta(seconds=float(reset))).time() self.log.error("Daily API rate limit exceeded") + + api_key = self.api_key or self.session.auth.consumer_key + if api_key == self.API_KEY: + self.log.info("Register your own OAuth application and " + "use its credentials to prevent this error: " + "https://github.com/mikf/gallery-dl/blob/mas" + "ter/docs/configuration.rst#extractortumblra" + "pi-key--api-secret") + raise exception.StopExtraction( "Aborting - Rate limit will reset at %s", "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)) diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py new file mode 100644 index 0000000..ec8ab35 --- /dev/null +++ b/gallery_dl/extractor/twibooru.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://twibooru.org/""" + +from .booru import BooruExtractor +from .. 
import text, exception +import operator + +BASE_PATTERN = r"(?:https?://)?twibooru\.org" + + +class TwibooruExtractor(BooruExtractor): + """Base class for twibooru extractors""" + category = "twibooru" + basecategory = "philomena" + filename_fmt = "{id}_{filename}.{extension}" + archive_fmt = "{id}" + request_interval = 6.05 + per_page = 50 + root = "https://twibooru.org" + + def __init__(self, match): + BooruExtractor.__init__(self, match) + self.api = TwibooruAPI(self) + + _file_url = operator.itemgetter("view_url") + + @staticmethod + def _prepare(post): + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ") + + name, sep, rest = post["name"].rpartition(".") + post["filename"] = name if sep else rest + + +class TwibooruPostExtractor(TwibooruExtractor): + """Extractor for single twibooru posts""" + subcategory = "post" + request_interval = 1.0 + pattern = BASE_PATTERN + r"/(\d+)" + test = ("https://twibooru.org/1", { + "pattern": r"https://cdn.twibooru.org/img/2020/7/8/1/full.png", + "content": "aac4d1dba611883ac701aaa8f0b2b322590517ae", + "keyword": { + "animated": False, + "aspect_ratio": 1.0, + "comment_count": int, + "created_at": "2020-07-08T22:26:55.743Z", + "date": "dt:2020-07-08 22:26:55", + "description": "Why have I done this?", + "downvotes": 0, + "duration": 0.0, + "faves": int, + "first_seen_at": "2020-07-08T22:26:55.743Z", + "format": "png", + "height": 576, + "hidden_from_users": False, + "id": 1, + "intensities": dict, + "locations": [], + "media_type": "image", + "mime_type": "image/png", + "name": "1676547__safe_artist-colon-scraggleman_oc_oc-colon-" + "floor+bored_oc+only_bags+under+eyes_bust_earth+pony_" + "female_goggles_helmet_mare_meme_neet_neet+home+g.png", + "orig_sha512_hash": "re:8b4c00d2[0-9a-f]{120}", + "processed": True, + "representations": dict, + "score": int, + "sha512_hash": "8b4c00d2eff52d51ad9647e14738944ab306fd1d8e1bf6" + "34fbb181b32f44070aa588938e26c4eb072b1eb61489aa" + "f3062fb644a76c79f936b97723a2c3e0e5d3", + "size": 70910, + "source_url": "", + "tag_ids": list, + "tags": list, + "thumbnails_generated": True, + "updated_at": "2022-02-03T15:49:07.110Z", + "upvotes": int, + "view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png", + "width": 576, + "wilson_score": float, + }, + }) + + def __init__(self, match): + TwibooruExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + return (self.api.post(self.post_id),) + + +class TwibooruSearchExtractor(TwibooruExtractor): + """Extractor for twibooru search results""" + subcategory = "search" + directory_fmt = ("{category}", "{search_tags}") + pattern = BASE_PATTERN + r"/(?:search/?\?([^#]+)|tags/([^/?#]+))" + test = ( + ("https://twibooru.org/search?q=cute", { + "range": "40-60", + "count": 21, + }), + ("https://twibooru.org/tags/cute", { + "range": "1-20", + "count": 20, + }), + ) + + def __init__(self, match): + TwibooruExtractor.__init__(self, match) + query, tag = match.groups() + if tag: + q = tag.replace("+", " ") + for old, new in ( + ("-colon-" , ":"), + ("-dash-" , "-"), + ("-dot-" , "."), + ("-plus-" , "+"), + ("-fwslash-", "/"), + ("-bwslash-", "\\"), + ): + if old in q: + q = q.replace(old, new) + self.params = {"q": text.unquote(text.unquote(q))} + else: + self.params = text.parse_query(query) + + def metadata(self): + return {"search_tags": self.params.get("q", "")} + + def posts(self): + return self.api.search(self.params) + + +class TwibooruGalleryExtractor(TwibooruExtractor): + """Extractor for twibooru galleries""" + 
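The Twibooru search extractor above decodes Philomena-style tag URLs, where punctuation is spelled out in the path segment. A worked example of that replacement loop:

```python
# "/tags/artist-colon-scraggleman" must be mapped back to a usable
# search query before it is sent to the API.
q = "artist-colon-scraggleman"
for old, new in (("-colon-", ":"), ("-dash-", "-"), ("-dot-", ".")):
    if old in q:
        q = q.replace(old, new)
print(q)  # "artist:scraggleman"
```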
subcategory = "gallery" + directory_fmt = ("{category}", "galleries", + "{gallery[id]} {gallery[title]}") + pattern = BASE_PATTERN + r"/galleries/(\d+)" + test = ("https://twibooru.org/galleries/1", { + "range": "1-20", + "keyword": { + "gallery": { + "description": "Best nation pone and " + "russian related pics.", + "id": 1, + "spoiler_warning": "Russia", + "thumbnail_id": 694923, + "title": "Marussiaverse", + }, + }, + }) + + def __init__(self, match): + TwibooruExtractor.__init__(self, match) + self.gallery_id = match.group(1) + + def metadata(self): + return {"gallery": self.api.gallery(self.gallery_id)} + + def posts(self): + gallery_id = "gallery_id:" + self.gallery_id + params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id} + return self.api.search(params) + + +class TwibooruAPI(): + """Interface for the Twibooru API + + https://twibooru.org/pages/api + """ + + def __init__(self, extractor): + self.extractor = extractor + self.root = "https://twibooru.org/api" + + def gallery(self, gallery_id): + endpoint = "/v3/galleries/" + gallery_id + return self._call(endpoint)["gallery"] + + def post(self, post_id): + endpoint = "/v3/posts/" + post_id + return self._call(endpoint)["post"] + + def search(self, params): + endpoint = "/v3/search/posts" + return self._pagination(endpoint, params) + + def _call(self, endpoint, params=None): + url = self.root + endpoint + + while True: + response = self.extractor.request(url, params=params, fatal=None) + + if response.status_code < 400: + return response.json() + + if response.status_code == 429: + until = text.parse_datetime( + response.headers["X-RL-Reset"], "%Y-%m-%d %H:%M:%S %Z") + # wait an extra minute, just to be safe + self.extractor.wait(until=until, adjust=60.0) + continue + + # error + self.extractor.log.debug(response.content) + raise exception.StopExtraction( + "%s %s", response.status_code, response.reason) + + def _pagination(self, endpoint, params): + extr = self.extractor + + api_key = extr.config("api-key") + if api_key: + params["key"] = api_key + + filter_id = extr.config("filter") + if filter_id: + params["filter_id"] = filter_id + elif not api_key: + params["filter_id"] = "2" + + params["page"] = 1 + params["per_page"] = per_page = extr.per_page + + while True: + data = self._call(endpoint, params) + yield from data["posts"] + + if len(data["posts"]) < per_page: + return + params["page"] += 1 diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 46b06c2..6d51834 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -649,6 +649,10 @@ class TwitterTweetExtractor(TwitterExtractor): ("https://twitter.com/i/web/status/1460044411165888515", { "count": 0, }), + # "Misleading" content + ("https://twitter.com/i/web/status/1486373748911575046", { + "count": 4, + }), ) def __init__(self, match): @@ -765,7 +769,7 @@ class TwitterAPI(): "__fs_dont_mention_me_view_api_enabled": False, } - self._log_warnings = extractor.config("warnings") + self._nsfw_warning = True self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode self._user = None @@ -789,7 +793,7 @@ class TwitterAPI(): self.headers["x-guest-token"] = guest_token def tweet_detail(self, tweet_id): - endpoint = "/graphql/aD0-HB47XIOxiBl5kTkX5Q/TweetDetail" + endpoint = "/graphql/ItejhtHVxU7ksltgMmyaLA/TweetDetail" variables = { "focalTweetId": tweet_id, "with_rux_injections": False, @@ -801,7 +805,7 @@ class TwitterAPI(): endpoint, variables, ("threaded_conversation_with_injections",)) def user_tweets(self, 
screen_name): - endpoint = "/graphql/LNhjy8t3XpIrBYM-ms7sPQ/UserTweets" + endpoint = "/graphql/WZT7sCTrLvSOaWOXLDsWbQ/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -810,7 +814,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_tweets_and_replies(self, screen_name): - endpoint = "/graphql/Vg5aF036K40ST3FWvnvRGA/UserTweetsAndReplies" + endpoint = "/graphql/t4wEKVulW4Mbv1P0kgxTEw/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -819,7 +823,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_media(self, screen_name): - endpoint = "/graphql/Hl6C7ac051l_QBe3HjGz_A/UserMedia" + endpoint = "/graphql/nRybED9kRbN-TOWioHq1ng/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -827,7 +831,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_likes(self, screen_name): - endpoint = "/graphql/smISlRVSnz-GaU_XpU_akw/Likes" + endpoint = "/graphql/9MSTt44HoGjVFSg_u3rHDw/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -835,7 +839,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_bookmarks(self): - endpoint = "/graphql/yKNebSjZKbo2tOd-Qdc7Xg/Bookmarks" + endpoint = "/graphql/uKP9v_I31k0_VSBmlpq2Xg/Bookmarks" variables = { "count": 100, } @@ -843,7 +847,7 @@ class TwitterAPI(): endpoint, variables, ("bookmark_timeline", "timeline")) def list_latest_tweets_timeline(self, list_id): - endpoint = "/graphql/RxUL5UHi4Msxt_P9O1729w/ListLatestTweetsTimeline" + endpoint = "/graphql/z3l-EHlx-fyg8OvGO4JN8A/ListLatestTweetsTimeline" variables = { "listId": list_id, "count": 100, @@ -889,7 +893,7 @@ class TwitterAPI(): raise exception.NotFoundError("list") def list_members(self, list_id): - endpoint = "/graphql/kk9RQtSa2sc-4_9figZVBw/ListMembers" + endpoint = "/graphql/snESM0DPs3c7M1SBm4rvVw/ListMembers" variables = { "listId": list_id, "count": 100, @@ -899,7 +903,7 @@ class TwitterAPI(): endpoint, variables, ("list", "members_timeline", "timeline")) def user_following(self, screen_name): - endpoint = "/graphql/kz464_e4MAOXc3bGOA9kow/Following" + endpoint = "/graphql/mIwX8GogcobVlRwlgpHNYA/Following" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -961,20 +965,9 @@ class TwitterAPI(): if csrf_token: self.headers["x-csrf-token"] = csrf_token - data = response.json() - if "errors" in data: - try: - errors = ", ".join(e["message"] for e in data["errors"]) - except Exception: - errors = data["errors"] - else: - errors = "" - if response.status_code < 400: # success - if errors and self._log_warnings: - self.extractor.log.warning(errors) - return data + return response.json() if response.status_code == 429: # rate limit exceeded @@ -984,6 +977,14 @@ class TwitterAPI(): continue # error + try: + data = response.json() + errors = ", ".join(e["message"] for e in data["errors"]) + except ValueError: + errors = response.text + except Exception: + errors = data.get("errors", "") + raise exception.StopExtraction( "%s %s (%s)", response.status_code, response.reason, errors) @@ -1151,6 +1152,10 @@ class TwitterAPI(): tweets.extend(entry["content"]["items"]) elif esw("conversationthread-"): tweets.extend(entry["content"]["items"]) + elif esw("tombstone-"): + self._report_tombstone( + entry, + entry["content"]["itemContent"]["tombstoneInfo"]) elif esw("cursor-bottom-"): cursor = 
entry["content"] if not cursor.get("stopOnEmptyResponse", True): @@ -1162,6 +1167,11 @@ class TwitterAPI(): try: tweet = ((entry.get("content") or entry["item"]) ["itemContent"]["tweet_results"]["result"]) + if "tombstone" in tweet: + self._report_tombstone(entry, tweet["tombstone"]) + continue + if "tweet" in tweet: + tweet = tweet["tweet"] legacy = tweet["legacy"] except KeyError: extr.log.debug( @@ -1248,3 +1258,11 @@ class TwitterAPI(): if stop or not cursor or not entry: return variables["cursor"] = cursor + + def _report_tombstone(self, entry, tombstone): + text = (tombstone.get("richText") or tombstone["text"])["text"] + if text.startswith("Age-restricted") and self._nsfw_warning: + self.extractor.log.warning(text) + self._nsfw_warning = False + self.extractor.log.debug( + "Skipping %s (%s)", entry["entryId"].rpartition("-")[2], text) diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py index 8f3ef9a..b3a1652 100644 --- a/gallery_dl/extractor/ytdl.py +++ b/gallery_dl/extractor/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -64,6 +64,9 @@ class YoutubeDLExtractor(Extractor): "nocheckcertificate" : not self._verify, } + if self._proxies: + user_opts["proxy"] = self._proxies.get("http") + username, password = self._get_auth_info() if username: user_opts["username"], user_opts["password"] = username, password diff --git a/gallery_dl/path.py b/gallery_dl/path.py index 15db67f..c85bb88 100644 --- a/gallery_dl/path.py +++ b/gallery_dl/path.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,10 +10,8 @@ import os import re -import time import shutil import functools -from email.utils import mktime_tz, parsedate_tz from . 
import util, formatter, exception WINDOWS = util.WINDOWS @@ -327,10 +325,4 @@ class PathFormat(): mtime = self.kwdict.get("_mtime") if mtime: - # Set file modification time - try: - if isinstance(mtime, str): - mtime = mktime_tz(parsedate_tz(mtime)) - os.utime(self.realpath, (time.time(), mtime)) - except Exception: - pass + util.set_mtime(self.realpath, mtime) diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index fe65c88..e776888 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -59,6 +59,8 @@ class MetadataPP(PostProcessor): events = events.split(",") job.register_hooks({event: self.run for event in events}, options) + self.mtime = options.get("mtime") + def run(self, pathfmt): directory = self._directory(pathfmt) path = directory + self._filename(pathfmt) @@ -71,6 +73,11 @@ class MetadataPP(PostProcessor): with open(path, "w", encoding="utf-8") as fp: self.write(fp, pathfmt.kwdict) + if self.mtime: + mtime = pathfmt.kwdict.get("_mtime") + if mtime: + util.set_mtime(path, mtime) + def _directory(self, pathfmt): return pathfmt.realdirectory diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py index d2f1915..098984a 100644 --- a/gallery_dl/postprocessor/mtime.py +++ b/gallery_dl/postprocessor/mtime.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,7 +17,13 @@ class MtimePP(PostProcessor): def __init__(self, job, options): PostProcessor.__init__(self, job) self.key = options.get("key", "date") - job.register_hooks({"file": self.run}, options) + + events = options.get("event") + if events is None: + events = ("file",) + elif isinstance(events, str): + events = events.split(",") + job.register_hooks({event: self.run for event in events}, options) def run(self, pathfmt): mtime = pathfmt.kwdict.get(self.key) diff --git a/gallery_dl/text.py b/gallery_dl/text.py index ac4bbcb..97ef3ac 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -46,6 +46,13 @@ def ensure_http_scheme(url, scheme="https://"): return url +def root_from_url(url, scheme="https://"): + """Extract scheme and domain from a URL""" + if not url.startswith(("https://", "http://")): + return scheme + url[:url.index("/")] + return url[:url.index("/", 8)] + + def filename_from_url(url): """Extract the last part of an URL to use as a filename""" try: diff --git a/gallery_dl/util.py b/gallery_dl/util.py index bccae2d..92d1620 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2021 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,6 +12,7 @@ import re import os import sys import json +import time import random import sqlite3 import binascii @@ -20,6 +21,7 @@ import functools import itertools import urllib.parse from http.cookiejar import Cookie +from email.utils import mktime_tz, parsedate_tz from . 
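text.py gains `root_from_url()` above, which strips a URL down to scheme plus domain and supplies a default scheme when none is present. Its behavior, mirroring the test cases added near the end of this diff:

```python
from gallery_dl import text

print(text.root_from_url("https://example.org/path"))  # https://example.org
print(text.root_from_url("example.org/path/"))         # https://example.org
print(text.root_from_url("example.org/", "http://"))   # http://example.org
```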
import text, exception @@ -272,6 +274,15 @@ def remove_directory(path): pass +def set_mtime(path, mtime): + try: + if isinstance(mtime, str): + mtime = mktime_tz(parsedate_tz(mtime)) + os.utime(path, (time.time(), mtime)) + except Exception: + pass + + def load_cookiestxt(fp): """Parse a Netscape cookies.txt file and return a list of its Cookies""" cookies = [] @@ -413,6 +424,7 @@ GLOBALS = { "parse_int": text.parse_int, "urlsplit" : urllib.parse.urlsplit, "datetime" : datetime.datetime, + "timedelta": datetime.timedelta, "abort" : raises(exception.StopExtraction), "terminate": raises(exception.TerminateExtraction), "re" : re, @@ -510,6 +522,26 @@ def build_extractor_filter(categories, negate=True, special=None): return lambda extr: any(t(extr) for t in tests) +def build_proxy_map(proxies, log=None): + """Generate a proxy map""" + if not proxies: + return None + + if isinstance(proxies, str): + if "://" not in proxies: + proxies = "http://" + proxies.lstrip("/") + return {"http": proxies, "https": proxies} + + if isinstance(proxies, dict): + for scheme, proxy in proxies.items(): + if "://" not in proxy: + proxies[scheme] = "http://" + proxy.lstrip("/") + return proxies + + if log: + log.warning("invalid proxy specifier: %s", proxies) + + def build_predicate(predicates): if not predicates: return lambda url, kwdict: True diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 4bc9b57..54c81aa 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.20.5" +__version__ = "1.21.0" diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index e6953eb..45b9826 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -46,8 +46,6 @@ def construct_YoutubeDL(module, obj, user_opts, system_opts=None): if opts.get("format") is None: opts["format"] = config("format") - if opts.get("proxy") is None: - opts["proxy"] = obj.session.proxies.get("http") if opts.get("nopart") is None: opts["nopart"] = not config("part", True) if opts.get("updatetime") is None: @@ -35,6 +35,7 @@ FILES = [ for (path, files) in [ ("share/bash-completion/completions", ["data/completion/gallery-dl"]), ("share/zsh/site-functions" , ["data/completion/_gallery-dl"]), + ("share/fish/vendor_completions.d" , ["data/completion/gallery-dl.fish"]), ("share/man/man1" , ["data/man/gallery-dl.1"]), ("share/man/man5" , ["data/man/gallery-dl.conf.5"]), ] diff --git a/test/test_cookies.py b/test/test_cookies.py index d103d02..0657456 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2017-2020 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,6 +12,7 @@ import sys import unittest from unittest import mock +import time import logging import tempfile from os.path import join @@ -88,7 +89,7 @@ class TestCookiedict(unittest.TestCase): self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values())) def test_domain(self): - for category in ["exhentai", "idolcomplex", "nijie", "seiga"]: + for category in 
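`build_proxy_map()` above backs the refactored proxy support from the changelog: a plain string becomes a map for both schemes, and bare host:port entries get an `http://` prefix. The two accepted input shapes, as the code reads:

```python
from gallery_dl import util

print(util.build_proxy_map("localhost:8080"))
# {'http': 'http://localhost:8080', 'https': 'http://localhost:8080'}
print(util.build_proxy_map({"https": "example.org:3128"}))
# {'https': 'http://example.org:3128'}
```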
["exhentai", "idolcomplex", "nijie"]: extr = _get_extractor(category) cookies = extr.session.cookies for key in self.cdict: @@ -107,7 +108,6 @@ class TestCookieLogin(unittest.TestCase): "exhentai" : ("ipb_member_id", "ipb_pass_hash"), "idolcomplex": ("login", "pass_hash"), "nijie" : ("nemail", "nlogin"), - "seiga" : ("user_session",), } for category, cookienames in extr_cookies.items(): cookies = {name: "value" for name in cookienames} @@ -118,6 +118,86 @@ class TestCookieLogin(unittest.TestCase): mock_login.assert_not_called() +class TestCookieUtils(unittest.TestCase): + + def test_check_cookies(self): + extr = extractor.find("test:") + self.assertFalse(extr._cookiejar, "empty") + self.assertFalse(extr.cookiedomain, "empty") + + # always returns False when checking for empty cookie list + self.assertFalse(extr._check_cookies(())) + + self.assertFalse(extr._check_cookies(("a",))) + self.assertFalse(extr._check_cookies(("a", "b"))) + self.assertFalse(extr._check_cookies(("a", "b", "c"))) + + extr._cookiejar.set("a", "1") + self.assertTrue(extr._check_cookies(("a",))) + self.assertFalse(extr._check_cookies(("a", "b"))) + self.assertFalse(extr._check_cookies(("a", "b", "c"))) + + extr._cookiejar.set("b", "2") + self.assertTrue(extr._check_cookies(("a",))) + self.assertTrue(extr._check_cookies(("a", "b"))) + self.assertFalse(extr._check_cookies(("a", "b", "c"))) + + def test_check_cookies_domain(self): + extr = extractor.find("test:") + self.assertFalse(extr._cookiejar, "empty") + extr.cookiedomain = ".example.org" + + self.assertFalse(extr._check_cookies(("a",))) + self.assertFalse(extr._check_cookies(("a", "b"))) + + extr._cookiejar.set("a", "1") + self.assertFalse(extr._check_cookies(("a",))) + + extr._cookiejar.set("a", "1", domain=extr.cookiedomain) + self.assertTrue(extr._check_cookies(("a",))) + + extr._cookiejar.set("a", "1", domain="www" + extr.cookiedomain) + self.assertEqual(len(extr._cookiejar), 3) + self.assertTrue(extr._check_cookies(("a",))) + + extr._cookiejar.set("b", "2", domain=extr.cookiedomain) + extr._cookiejar.set("c", "3", domain=extr.cookiedomain) + self.assertTrue(extr._check_cookies(("a", "b", "c"))) + + def test_check_cookies_expires(self): + extr = extractor.find("test:") + self.assertFalse(extr._cookiejar, "empty") + self.assertFalse(extr.cookiedomain, "empty") + + now = int(time.time()) + log = logging.getLogger("test") + + extr._cookiejar.set("a", "1", expires=now-100) + with mock.patch.object(log, "warning") as mw: + self.assertFalse(extr._check_cookies(("a",))) + self.assertEqual(mw.call_count, 1) + self.assertEqual(mw.call_args[0], ("Cookie '%s' has expired", "a")) + + extr._cookiejar.set("a", "1", expires=now+100) + with mock.patch.object(log, "warning") as mw: + self.assertFalse(extr._check_cookies(("a",))) + self.assertEqual(mw.call_count, 1) + self.assertEqual(mw.call_args[0], ( + "Cookie '%s' will expire in less than %s hour%s", "a", 1, "")) + + extr._cookiejar.set("a", "1", expires=now+100+7200) + with mock.patch.object(log, "warning") as mw: + self.assertFalse(extr._check_cookies(("a",))) + self.assertEqual(mw.call_count, 1) + self.assertEqual(mw.call_args[0], ( + "Cookie '%s' will expire in less than %s hour%s", "a", 3, "s")) + + extr._cookiejar.set("a", "1", expires=now+100+24*3600) + with mock.patch.object(log, "warning") as mw: + self.assertTrue(extr._check_cookies(("a",))) + self.assertEqual(mw.call_count, 0) + + def _get_extractor(category): for extr in extractor.extractors(): if extr.category == category and hasattr(extr, "_login_impl"): diff 
--git a/test/test_text.py b/test/test_text.py index 3ab9e73..ffed726 100644 --- a/test/test_text.py +++ b/test/test_text.py @@ -102,6 +102,18 @@ class TestText(unittest.TestCase): for value in INVALID_ALT: self.assertEqual(f(value), value) + def test_root_from_url(self, f=text.root_from_url): + result = "https://example.org" + self.assertEqual(f("https://example.org/") , result) + self.assertEqual(f("https://example.org/path"), result) + self.assertEqual(f("example.org/") , result) + self.assertEqual(f("example.org/path/") , result) + + result = "http://example.org" + self.assertEqual(f("http://example.org/") , result) + self.assertEqual(f("http://example.org/path/"), result) + self.assertEqual(f("example.org/", "http://") , result) + def test_filename_from_url(self, f=text.filename_from_url): result = "filename.ext"
