From a768930761f7f20587ae40a8cacca0e55c85290a Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Mon, 29 Aug 2022 02:17:16 -0400 Subject: New upstream version 1.23.0. --- CHANGELOG.md | 54 ++++++++++++ PKG-INFO | 9 +- README.rst | 7 +- data/man/gallery-dl.1 | 2 +- data/man/gallery-dl.conf.5 | 152 ++++++++++++++++++++++++++++++++-- docs/gallery-dl.conf | 21 ++++- gallery_dl.egg-info/PKG-INFO | 9 +- gallery_dl.egg-info/SOURCES.txt | 2 + gallery_dl/__init__.py | 13 ++- gallery_dl/extractor/__init__.py | 2 + gallery_dl/extractor/artstation.py | 5 +- gallery_dl/extractor/blogger.py | 3 - gallery_dl/extractor/bunkr.py | 14 ++-- gallery_dl/extractor/catbox.py | 56 +++++++++++++ gallery_dl/extractor/common.py | 13 ++- gallery_dl/extractor/danbooru.py | 9 +- gallery_dl/extractor/deviantart.py | 11 ++- gallery_dl/extractor/fanbox.py | 2 + gallery_dl/extractor/foolfuuka.py | 28 +++++-- gallery_dl/extractor/gelbooru.py | 48 +++++++---- gallery_dl/extractor/gelbooru_v02.py | 55 +++++++----- gallery_dl/extractor/hitomi.py | 14 ++-- gallery_dl/extractor/instagram.py | 29 ++++++- gallery_dl/extractor/itaku.py | 11 +-- gallery_dl/extractor/kemonoparty.py | 48 ++++++++--- gallery_dl/extractor/luscious.py | 4 +- gallery_dl/extractor/mastodon.py | 16 ++-- gallery_dl/extractor/nijie.py | 2 +- gallery_dl/extractor/oauth.py | 3 +- gallery_dl/extractor/philomena.py | 2 +- gallery_dl/extractor/poipiku.py | 8 +- gallery_dl/extractor/skeb.py | 16 +++- gallery_dl/extractor/slideshare.py | 8 +- gallery_dl/extractor/smugmug.py | 4 +- gallery_dl/extractor/tapas.py | 2 +- gallery_dl/extractor/tumblr.py | 58 ++++++++++--- gallery_dl/extractor/twitter.py | 155 ++++++++++++++++++++-------------- gallery_dl/extractor/unsplash.py | 4 +- gallery_dl/extractor/vk.py | 7 +- gallery_dl/extractor/vsco.py | 2 +- gallery_dl/extractor/wallhaven.py | 13 ++- gallery_dl/extractor/weibo.py | 32 ++++--- gallery_dl/extractor/zerochan.py | 156 +++++++++++++++++++++++++++++++++++ gallery_dl/formatter.py | 11 ++- gallery_dl/job.py | 7 +- gallery_dl/output.py | 6 ++ gallery_dl/postprocessor/metadata.py | 39 ++++++++- gallery_dl/text.py | 10 +++ gallery_dl/util.py | 13 +-- gallery_dl/version.py | 2 +- test/test_formatter.py | 3 +- test/test_postprocessor.py | 46 +++++++++++ test/test_text.py | 19 ++++- test/test_util.py | 36 ++++++++ 54 files changed, 1063 insertions(+), 238 deletions(-) create mode 100644 gallery_dl/extractor/catbox.py create mode 100644 gallery_dl/extractor/zerochan.py diff --git a/CHANGELOG.md b/CHANGELOG.md index be9a4f7..61987d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,59 @@ # Changelog +## 1.23.0 - 2022-08-28 +### Changes +- [twitter] update `user` and `author` metdata fields + - for URLs with a single username or ID like `https://twitter.com/USER` or a search with a single `from:` statement, `user` will now always refer to the user referenced in the URL. 
+ - for all other URLs like `https://twitter.com/i/bookmarks`, `user` and `author` refer to the same user + - `author` will always refer to the original Tweet author +- [twitter] update `quote_id` and `quote_by` metadata fields + - `quote_id` is now non-zero for quoted Tweets and contains the Tweet ID of the quotng Tweet (was the other way round before) + - `quote_by` is only defined for quoted Tweets like before, but now contains the screen name of the user quoting this Tweet +- [skeb] improve archive IDs for thumbnails and article images +### Additions +- [artstation] add `num` and `count` metadata fields ([#2764](https://github.com/mikf/gallery-dl/issues/2764)) +- [catbox] add `album` extractor ([#2410](https://github.com/mikf/gallery-dl/issues/2410)) +- [blogger] emit metadata for posts without files ([#2789](https://github.com/mikf/gallery-dl/issues/2789)) +- [foolfuuka] update supported domains +- [gelbooru] add support for `api_key` and `user_id` ([#2767](https://github.com/mikf/gallery-dl/issues/2767)) +- [gelbooru] implement pagination for `pool` results ([#2853](https://github.com/mikf/gallery-dl/issues/2853)) +- [instagram] add support for a user's saved collections ([#2769](https://github.com/mikf/gallery-dl/issues/2769)) +- [instagram] provide `date` for directory format strings ([#2830](https://github.com/mikf/gallery-dl/issues/2830)) +- [kemonoparty] add `favorites` option ([#2826](https://github.com/mikf/gallery-dl/issues/2826), [#2831](https://github.com/mikf/gallery-dl/issues/2831)) +- [oauth] add `host` config option ([#2806](https://github.com/mikf/gallery-dl/issues/2806)) +- [rule34] implement pagination for `pool` results ([#2853](https://github.com/mikf/gallery-dl/issues/2853)) +- [skeb] add option to download `article` images ([#1031](https://github.com/mikf/gallery-dl/issues/1031)) +- [tumblr] download higher-quality images ([#2761](https://github.com/mikf/gallery-dl/issues/2761)) +- [tumblr] add `count` metadata field ([#2804](https://github.com/mikf/gallery-dl/issues/2804)) +- [wallhaven] implement `metadata` option ([#2803](https://github.com/mikf/gallery-dl/issues/2803)) +- [zerochan] add `tag` and `image` extractors ([#1434](https://github.com/mikf/gallery-dl/issues/1434)) +- [zerochan] implement login with username & password ([#1434](https://github.com/mikf/gallery-dl/issues/1434)) +- [postprocessor:metadata] implement `mode: modify` and `mode: delete` ([#2640](https://github.com/mikf/gallery-dl/issues/2640)) +- [formatter] add `g` conversion for slugifying a string ([#2410](https://github.com/mikf/gallery-dl/issues/2410)) +- [formatter] apply `:J` only to lists ([#2833](https://github.com/mikf/gallery-dl/issues/2833)) +- implement `path-metadata` option ([#2734](https://github.com/mikf/gallery-dl/issues/2734)) +- allow comments after input file URLs ([#2808](https://github.com/mikf/gallery-dl/issues/2808)) +- add global `warnings` option to control `urllib3` warning behavior ([#2762](https://github.com/mikf/gallery-dl/issues/2762)) +### Fixes +- [bunkr] fix extraction ([#2788](https://github.com/mikf/gallery-dl/issues/2788)) +- [deviantart] use public access token for journals ([#2702](https://github.com/mikf/gallery-dl/issues/2702)) +- [e621] fix extraction of `popular` posts +- [fanbox] download cover images in original size ([#2784](https://github.com/mikf/gallery-dl/issues/2784)) +- [mastodon] allow downloading without access token ([#2782](https://github.com/mikf/gallery-dl/issues/2782)) +- [hitomi] update cache expiry time 
([#2863](https://github.com/mikf/gallery-dl/issues/2863)) +- [hitomi] fix error when number of tag results is a multiple of 25 ([#2870](https://github.com/mikf/gallery-dl/issues/2870)) +- [mangahere] fix `page-reverse` option ([#2795](https://github.com/mikf/gallery-dl/issues/2795)) +- [poipiku] fix posts with more than one image ([#2796](https://github.com/mikf/gallery-dl/issues/2796)) +- [poipiku] update filter for static images ([#2796](https://github.com/mikf/gallery-dl/issues/2796)) +- [slideshare] fix metadata extraction +- [twitter] unescape `+` in search queries ([#2226](https://github.com/mikf/gallery-dl/issues/2226)) +- [twitter] fall back to unfiltered search ([#2766](https://github.com/mikf/gallery-dl/issues/2766)) +- [twitter] ignore invalid user entries ([#2850](https://github.com/mikf/gallery-dl/issues/2850)) +- [vk] prevent exceptions for broken/invalid photos ([#2774](https://github.com/mikf/gallery-dl/issues/2774)) +- [vsco] fix `collection` extraction +- [weibo] prevent exception for missing `playback_list` ([#2792](https://github.com/mikf/gallery-dl/issues/2792)) +- [weibo] prevent errors when paginating over album entries ([#2817](https://github.com/mikf/gallery-dl/issues/2817)) + ## 1.22.4 - 2022-07-15 ### Additions - [instagram] add `pinned` metadata field ([#2752](https://github.com/mikf/gallery-dl/issues/2752)) diff --git a/PKG-INFO b/PKG-INFO index aaf3516..60a798f 100644 --- a/PKG-INFO +++ b/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.22.4 +Version: 1.23.0 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -251,7 +251,8 @@ and optional for ``subscribestar``, ``tapas``, ``tsumino``, -and ``twitter``. +``twitter``, +and ``zerochan``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) diff --git a/README.rst b/README.rst index 1d25a83..2b45b27 100644 --- a/README.rst +++ b/README.rst @@ -66,8 +66,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -218,7 +218,8 @@ and optional for ``subscribestar``, ``tapas``, ``tsumino``, -and ``twitter``. +``twitter``, +and ``zerochan``. You can set the necessary information in your configuration file (cf. 
gallery-dl.conf_) diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 751d470..d4efeed 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2022-07-15" "1.22.4" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2022-08-28" "1.23.0" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 39550ad..642cb78 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2022-07-15" "1.22.4" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2022-08-28" "1.23.0" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -458,6 +458,8 @@ and optional for * \f[I]tsumino\f[] .br * \f[I]twitter\f[] +.br +* \f[I]zerochan\f[] These values can also be specified via the \f[I]-u/--username\f[] and \f[I]-p/--password\f[] command-line options or @@ -667,6 +669,21 @@ This can then be used in \f[I]filenames\f[], with a \f[I]metadata\f[] post processor, etc. +.SS extractor.*.path-metadata +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Insert a reference to the current \f[I]PathFormat\f[] +data structure into metadata dictionaries as the given name. + +For example, setting this option to \f[I]"gdl_path"\f[] would make it possible +to access the current file's filename as \f[I]"[gdl_path.filename}"\f[]. + + .SS extractor.*.category-transfer .IP "Type:" 6 \f[I]bool\f[] @@ -1516,6 +1533,19 @@ Selects which site layout to expect when parsing posts. * \f[I]"new"\f[]: Expect the *new* site layout +.SS extractor.gelbooru.api-key & .user-id +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Values from the API Access Credentials section found at the bottom of your +\f[I]Account Options\f[] +page. + + .SS extractor.generic.enabled .IP "Type:" 6 \f[I]bool\f[] @@ -1751,6 +1781,19 @@ Controls how to handle duplicate files in a post. Extract a user's direct messages as \f[I]dms\f[] metadata. +.SS extractor.kemonoparty.favorites +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]artist\f[] + +.IP "Description:" 4 +Determines the type of favorites to be downloaded. + +Available types are \f[I]artist\f[], and \f[I]post\f[]. + + .SS extractor.kemonoparty.files .IP "Type:" 6 \f[I]list\f[] of \f[I]strings\f[] @@ -2007,6 +2050,17 @@ Store tokens received during OAuth authorizations in \f[I]cache\f[]. +.SS extractor.oauth.host +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"localhost"\f[] + +.IP "Description:" 4 +Host name / IP address to bind to during OAuth authorization. + + .SS extractor.oauth.port .IP "Type:" 6 \f[I]integer\f[] @@ -2424,6 +2478,17 @@ Download video embeds from external sites. Download videos. +.SS extractor.skeb.article +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download article images. + + .SS extractor.skeb.sent-requests .IP "Type:" 6 \f[I]bool\f[] @@ -2502,6 +2567,21 @@ images from them. Search posts for inline images and videos. +.SS extractor.tumblr.original +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download full-resolution \f[I]photo\f[] images. + +For each photo with "maximum" resolution +(width equal to 2048 or height equal to 3072), +use an extra HTTP request to find the URL to its full-resolution version. 
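A minimal sketch of the idea behind that extra request (a hypothetical
standalone helper; the helper name, the oversized-variant trick, and the
attribute parsing are assumptions, not gallery-dl's verbatim code):

    import requests

    def original_photo_url(url):
        # assumption: Tumblr photo CDN URLs carry a "/sWIDTHxHEIGHT/" size
        # component, and requesting an oversized variant returns an HTML
        # page whose <img src="..."> holds a re-signed full-resolution URL
        if "/s2048x3072/" not in url:
            return url
        resized = url.replace("/s2048x3072/", "/s99999x99999/", 1)
        page = requests.get(resized, headers={"Accept": "text/html"}).text
        pos = page.find('" src="')
        return page[pos + 7:page.find('"', pos + 7)] if pos >= 0 else url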
+ + .SS extractor.tumblr.reblogs .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] @@ -2846,6 +2926,19 @@ to use your account's browsing settings and default filters when searching. See https://wallhaven.cc/help/api for more information. +.SS extractor.wallhaven.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract additional metadata (tags, uploader) + +Note: This requires 1 additional HTTP request for each post. + + .SS extractor.weasyl.api-key .IP "Type:" 6 \f[I]string\f[] @@ -3714,16 +3807,20 @@ See \f[I]metadata.event\f[] for a list of available events. \f[I]"json"\f[] .IP "Description:" 4 -Select how to write metadata. +Selects how to process metadata. .br -* \f[I]"json"\f[]: all metadata using \f[I]json.dump() +* \f[I]"json"\f[]: write metadata using \f[I]json.dump() \f[] .br -* \f[I]"tags"\f[]: \f[I]tags\f[] separated by newlines +* \f[I]"tags"\f[]: write \f[I]tags\f[] separated by newlines .br -* \f[I]"custom"\f[]: result of applying \f[I]metadata.content-format\f[] +* \f[I]"custom"\f[]: write the result of applying \f[I]metadata.content-format\f[] to a file's metadata dictionary +.br +* \f[I]"modify"\f[]: add or modify metadata entries +.br +* \f[I]"delete"\f[]: remove metadata entries .SS metadata.filename @@ -3821,6 +3918,39 @@ When starting to download all files of a post, e.g. a Tweet on Twitter or a post on Patreon. +.SS metadata.fields +.IP "Type:" 6 +.br +* \f[I]list\f[] of \f[I]strings\f[] +.br +* \f[I]object\f[] (field name -> \f[I]format string\f[]) + +.IP "Example:" 4 +.br +* .. code:: json + +["blocked", "watching", "status[creator][name]"] + +.br +* .. code:: json + +{ +"blocked" : "***", +"watching" : "\\fE 'yes' if watching else 'no'", +"status[username]": "{status[creator][name]!l}" +} + + +.IP "Description:" 4 +.br +* \f[I]"mode": "delete"\f[]: +A list of metadata field names to remove. +.br +* \f[I]"mode": "modify"\f[]: +An object with metadata field names mapping to a \f[I]format string\f[] +whose result is assigned to said field name. + + .SS metadata.content-format .IP "Type:" 6 \f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] @@ -4190,6 +4320,18 @@ The list of signal names to ignore, i.e. set as signal handler for. +.SS warnings +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"default"\f[] + +.IP "Description:" 4 +The \f[I]Warnings Filter action\f[] +used for (urllib3) warnings. 
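The common.py hunk later in this patch applies this option with a single
filter call; in standalone form (assuming the urllib3 bundled with requests),
it boils down to:

    import warnings
    from requests.packages import urllib3

    # equivalent of setting '"warnings": "ignore"' in gallery-dl.conf:
    # route all urllib3 HTTPWarning subclasses through the chosen action
    warnings.simplefilter("ignore", urllib3.exceptions.HTTPWarning)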
+ + .SS pyopenssl .IP "Type:" 6 \f[I]bool\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 1492653..1e485ee 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -109,6 +109,11 @@ "include": "gallery", "layout": "auto" }, + "gelbooru": + { + "api-key": null, + "user-id": null + }, "gfycat": { "format": ["mp4", "webm", "mobile", "gif"] @@ -193,6 +198,7 @@ { "browser": true, "cache": true, + "host": "localhost", "port": 6414 }, "paheal": @@ -248,6 +254,12 @@ "username": null, "password": null }, + "skeb": + { + "article": false, + "sent-requests": false, + "thumbnails": false + }, "smugmug": { "videos": true @@ -273,6 +285,7 @@ "external": false, "inline": true, "posts": "all", + "original": true, "reblogs": true }, "twitter": @@ -302,7 +315,8 @@ }, "wallhaven": { - "api-key": null + "api-key": null, + "metadata": false }, "weasyl": { @@ -324,6 +338,11 @@ "module": null, "raw-options": null }, + "zerochan": + { + "username": null, + "password": null + }, "booru": { "tags": false, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 1e1d74d..6b9d68b 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.22.4 +Version: 1.23.0 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ -- `Linux `__ +- `Windows `__ +- `Linux `__ | Executables build from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -251,7 +251,8 @@ and optional for ``subscribestar``, ``tapas``, ``tsumino``, -and ``twitter``. +``twitter``, +and ``zerochan``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index b323e38..5f5084b 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -58,6 +58,7 @@ gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py gallery_dl/extractor/booru.py gallery_dl/extractor/bunkr.py +gallery_dl/extractor/catbox.py gallery_dl/extractor/comicvine.py gallery_dl/extractor/common.py gallery_dl/extractor/cyberdrop.py @@ -197,6 +198,7 @@ gallery_dl/extractor/wikieat.py gallery_dl/extractor/xhamster.py gallery_dl/extractor/xvideos.py gallery_dl/extractor/ytdl.py +gallery_dl/extractor/zerochan.py gallery_dl/postprocessor/__init__.py gallery_dl/postprocessor/classify.py gallery_dl/postprocessor/common.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 04ea54c..329e7ab 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -38,11 +38,11 @@ def parse_inputfile(file, log): Lines starting with '#' and empty lines will be ignored. Lines starting with '-' will be interpreted as a key-value pair separated by an '='. where 'key' is a dot-separated option name and 'value' is a - JSON-parsable value for it. These config options will be applied while + JSON-parsable value. These configuration options will be applied while processing the next URL. Lines starting with '-G' are the same as above, except these options will - be valid for all following URLs, i.e. they are Global. 
- Everything else will be used as potential URL. + be applied for *all* following URLs, i.e. they are Global. + Everything else will be used as a potential URL. Example input file: @@ -57,7 +57,8 @@ def parse_inputfile(file, log): https://example.org/ # next URL uses default filename and 'skip' is false. - https://example.com/index.htm + https://example.com/index.htm # comment1 + https://example.com/404.htm # comment2 """ gconf = [] lconf = [] @@ -94,6 +95,10 @@ def parse_inputfile(file, log): else: # url + if " #" in line: + line = line.partition(" #")[0] + elif "\t#" in line: + line = line.partition("\t#")[0] if gconf or lconf: yield util.ExtendedUrl(line, gconf, lconf) gconf = [] diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 70cebb3..9e4507a 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -26,6 +26,7 @@ modules = [ "behance", "blogger", "bunkr", + "catbox", "comicvine", "cyberdrop", "danbooru", @@ -150,6 +151,7 @@ modules = [ "wikieat", "xhamster", "xvideos", + "zerochan", "booru", "moebooru", "foolfuuka", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 19b9d97..c0e8e67 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -32,9 +32,11 @@ class ArtstationExtractor(Extractor): data = self.metadata() for project in self.projects(): - for asset in self.get_project_assets(project["hash_id"]): + for num, asset in enumerate( + self.get_project_assets(project["hash_id"]), 1): asset.update(data) adict = asset["asset"] + asset["num"] = num yield Message.Directory, asset if adict["has_embedded_player"] and self.external: @@ -85,6 +87,7 @@ class ArtstationExtractor(Extractor): assets = data["assets"] del data["assets"] + data["count"] = len(assets) if len(assets) == 1: data["asset"] = assets[0] yield data diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 21ca991..e0885d2 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -67,9 +67,6 @@ class BloggerExtractor(Extractor): key=lambda x: x["format_id"], )["play_url"]) - if not files: - continue - post["author"] = post["author"]["displayName"] post["replies"] = post["replies"]["totalItems"] post["content"] = text.remove_html(content) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 9904d0a..3091f57 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -16,10 +16,10 @@ import json class BunkrAlbumExtractor(LolisafeAlbumExtractor): """Extractor for bunkr.is albums""" category = "bunkr" - root = "https://app.bunkr.is" + root = "https://bunkr.is" pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:is|to)/a/([^/?#]+)" test = ( - ("https://app.bunkr.is/a/Lktg9Keq", { + ("https://bunkr.is/a/Lktg9Keq", { "pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { @@ -33,7 +33,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): }, }), # mp4 (#2239) - ("https://bunkr.is/a/ptRHaCn2", { + ("https://app.bunkr.is/a/ptRHaCn2", { "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4", "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471", }), @@ -70,16 +70,16 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): album = props["album"] files = props["files"] except Exception as exc: - self.log.debug(exc) + self.log.debug(exc.__class__.__name__, exc) self.root = self.root.replace("bunkr", "app.bunkr", 1) 
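            # assumption: the embedded page props were missing or invalid,
            # so retry through the JSON API on the app.bunkr.is subdomain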
return self._fetch_album_api(album_id) for file in files: name = file["name"] + cdn = file["cdn"] if name.endswith(".mp4"): - file["file"] = "https://media-files.bunkr.is/" + name - else: - file["file"] = file["cdn"] + "/" + name + cdn = cdn.replace("//cdn", "//media-files") + file["file"] = cdn + "/" + name return files, { "album_id" : self.album_id, diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py new file mode 100644 index 0000000..509108f --- /dev/null +++ b/gallery_dl/extractor/catbox.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://catbox.moe/""" + +from .common import GalleryExtractor +from .. import text + + +class CatboxAlbumExtractor(GalleryExtractor): + """Extractor for catbox albums""" + category = "catbox" + subcategory = "album" + root = "https://catbox.moe" + filename_fmt = "{filename}.{extension}" + directory_fmt = ("{category}", "{album_name} ({album_id})") + archive_fmt = "{album_id}_{filename}" + pattern = r"(?:https?://)?(?:www\.)?catbox\.moe(/c/[^/?#]+)" + test = ( + ("https://catbox.moe/c/1igcbe", { + "url": "35866a88c29462814f103bc22ec031eaeb380f8a", + "content": "70ddb9de3872e2d17cc27e48e6bf395e5c8c0b32", + "pattern": r"https://files\.catbox\.moe/\w+\.\w{3}$", + "count": 3, + "keyword": { + "album_id": "1igcbe", + "album_name": "test", + "date": "dt:2022-08-18 00:00:00", + "description": "album test &>", + }, + }), + ("https://www.catbox.moe/c/cd90s1"), + ("https://catbox.moe/c/w7tm47#"), + ) + + def metadata(self, page): + extr = text.extract_from(page) + return { + "album_id" : self.gallery_url.rpartition("/")[2], + "album_name" : text.unescape(extr("

", "<")), + "date" : text.parse_datetime(extr( + "

Created ", "<"), "%B %d %Y"), + "description": text.unescape(extr("

", "<")), + } + + def images(self, page): + return [ + ("https://files.catbox.moe/" + path, None) + for path in text.extract_iter( + page, ">https://files.catbox.moe/", "<") + ] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 6ccae7f..1b41101 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -795,12 +795,23 @@ SSL_CIPHERS = { } +urllib3 = requests.packages.urllib3 + # detect brotli support try: - BROTLI = requests.packages.urllib3.response.brotli is not None + BROTLI = urllib3.response.brotli is not None except AttributeError: BROTLI = False +# set (urllib3) warnings filter +action = config.get((), "warnings", "default") +if action: + try: + import warnings + warnings.simplefilter(action, urllib3.exceptions.HTTPWarning) + except Exception: + pass +del action # Undo automatic pyOpenSSL injection by requests pyopenssl = config.get((), "pyopenssl", False) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index ec0db68..8c2ed53 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -34,6 +34,7 @@ class DanbooruExtractor(BaseExtractor): self.per_page = iget("per-page", 200) self.request_interval_min = iget("request-interval-min", 0.0) self._pools = iget("pools") + self._popular_endpoint = iget("popular", "/explore/posts/popular.json") BaseExtractor.__init__(self, match) @@ -150,6 +151,7 @@ INSTANCES = { "headers": {"User-Agent": "gallery-dl/{} (by mikf)".format( __version__)}, "pools": "sort", + "popular": "/popular.json", "page-limit": 750, "per-page": 320, "request-interval-min": 1.0, @@ -308,7 +310,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): subcategory = "popular" directory_fmt = ("{category}", "popular", "{scale}", "{date}") archive_fmt = "P_{scale[0]}_{date}_{id}" - pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?" + pattern = BASE_PATTERN + r"/(?:explore/posts/)?popular(?:\?([^#]*))?" 
test = ( ("https://danbooru.donmai.us/explore/posts/popular"), (("https://danbooru.donmai.us/explore/posts/popular" @@ -316,7 +318,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): "range": "1-120", "count": 120, }), - ("https://e621.net/explore/posts/popular"), + ("https://e621.net/popular"), (("https://e621.net/explore/posts/popular" "?date=2019-06-01&scale=month"), { "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+", @@ -345,8 +347,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): def posts(self): if self.page_start is None: self.page_start = 1 - return self._pagination( - "/explore/posts/popular.json", self.params, True) + return self._pagination(self._popular_endpoint, self.params, True) class DanbooruFavoriteExtractor(DanbooruExtractor): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 39ae484..60f644d 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1128,11 +1128,18 @@ class DeviantartOAuthAPI(): self._folders((deviation,)) return deviation - def deviation_content(self, deviation_id, public=False): + def deviation_content(self, deviation_id, public=True): """Get extended content of a single Deviation""" endpoint = "/deviation/content" params = {"deviationid": deviation_id} - return self._call(endpoint, params=params, public=public) + content = self._call(endpoint, params=params, public=public) + if public and content["html"].startswith( + ' \d+)") test = ( ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { "count": 6, }), - ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { - "options": (("api", False),), - "count": 6, - }), ) def metadata(self): - url = "{}/index.php?page=pool&s=show&id={}".format( - self.root, self.pool_id) - page = self.request(url).text + url = self.root + "/index.php" + self._params = { + "page": "pool", + "s" : "show", + "id" : self.pool_id, + "pid" : self.page_start, + } + self._page = self.request(url, params=self._params).text - name, pos = text.extract(page, "

<h3>Now Viewing: ", "</h3>
") + name, pos = text.extract(self._page, "

<h3>Now Viewing: ", "</h3>
") if not name: raise exception.NotFoundError("pool") - self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos) return { "pool": text.parse_int(self.pool_id), @@ -114,9 +120,23 @@ class GelbooruPoolExtractor(GelbooruBase, } def posts(self): - params = {} - for params["id"] in util.advance(self.post_ids, self.page_start): - yield from self._api_request(params) + url = self.root + "/index.php" + params = self._params + + page = self._page + del self._page + data = {} + + while True: + num_ids = 0 + for data["id"] in text.extract_iter(page, '" id="p', '"'): + num_ids += 1 + yield from self._api_request(data) + + if num_ids < self.per_page: + return + params["pid"] += self.per_page + page = self.request(url, params=params).text class GelbooruPostExtractor(GelbooruBase, diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 35a3448..8214614 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -21,6 +21,9 @@ class GelbooruV02Extractor(booru.BooruExtractor): def __init__(self, match): booru.BooruExtractor.__init__(self, match) + self.api_key = self.config("api-key") + self.user_id = self.config("user-id") + try: self.api_root = INSTANCES[self.category]["api_root"] except KeyError: @@ -59,6 +62,24 @@ class GelbooruV02Extractor(booru.BooruExtractor): return params["pid"] += 1 + def _pagination_html(self, params): + url = self.root + "/index.php" + params["pid"] = self.page_start * self.per_page + + data = {} + while True: + num_ids = 0 + page = self.request(url, params=params).text + + for data["id"] in text.extract_iter(page, '" id="p', '"'): + num_ids += 1 + for post in self._api_request(data): + yield post.attrib + + if num_ids < self.per_page: + return + params["pid"] += self.per_page + @staticmethod def _prepare(post): post["date"] = text.parse_datetime( @@ -204,7 +225,12 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor): def __init__(self, match): GelbooruV02Extractor.__init__(self, match) self.pool_id = match.group(match.lastindex) - self.post_ids = () + + if self.category == "rule34": + self.posts = self._posts_pages + self.per_page = 45 + else: + self.post_ids = () def skip(self, num): self.page_start += num @@ -232,6 +258,13 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor): for post in self._api_request(params): yield post.attrib + def _posts_pages(self): + return self._pagination_html({ + "page": "pool", + "s" : "show", + "id" : self.pool_id, + }) + class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): subcategory = "favorite" @@ -265,27 +298,11 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): return {"favorite_id": text.parse_int(self.favorite_id)} def posts(self): - url = self.root + "/index.php" - params = { + return self._pagination_html({ "page": "favorites", "s" : "view", "id" : self.favorite_id, - "pid" : self.page_start * self.per_page, - } - - data = {} - while True: - num_ids = 0 - page = self.request(url, params=params).text - - for data["id"] in text.extract_iter(page, '" id="p', '"'): - num_ids += 1 - for post in self._api_request(data): - yield post.attrib - - if num_ids < self.per_page: - return - params["pid"] += self.per_page + }) class GelbooruV02PostExtractor(GelbooruV02Extractor): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index ca7e692..f8b0c3b 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -174,23 +174,27 @@ class HitomiTagExtractor(Extractor): } offset = 0 + total = None 
while True: headers["Referer"] = "{}/{}/{}.html?page={}".format( self.root, self.type, self.tag, offset // 100 + 1) headers["Range"] = "bytes={}-{}".format(offset, offset+99) - nozomi = self.request(nozomi_url, headers=headers).content + response = self.request(nozomi_url, headers=headers) - for gallery_id in decode_nozomi(nozomi): + for gallery_id in decode_nozomi(response.content): gallery_url = "{}/galleries/{}.html".format( self.root, gallery_id) yield Message.Queue, gallery_url, data - if len(nozomi) < 100: - return offset += 100 + if total is None: + total = text.parse_int( + response.headers["content-range"].rpartition("/")[2]) + if offset >= total: + return -@memcache() +@memcache(maxage=1800) def _parse_gg(extr): page = extr.request("https://ltn.hitomi.la/gg.js").text diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 4a2c3bb..d56af8b 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -270,6 +270,7 @@ class InstagramExtractor(Extractor): "post_shortcode": post["code"], "likes": post["like_count"], "pinned": post.get("timeline_pinned_user_ids", ()), + "date": text.parse_timestamp(post.get("taken_at")), } caption = post["caption"] @@ -399,6 +400,8 @@ class InstagramExtractor(Extractor): self.log.debug("Cursor: %s", self._cursor) def _pagination_api(self, endpoint, params=None): + if params is None: + params = {} while True: data = self._request_api(endpoint, params=params) yield from data["items"] @@ -509,7 +512,7 @@ class InstagramChannelExtractor(InstagramExtractor): class InstagramSavedExtractor(InstagramExtractor): """Extractor for ProfilePage saved media""" subcategory = "saved" - pattern = USER_PATTERN + r"/saved" + pattern = USER_PATTERN + r"/saved/?$" test = ("https://www.instagram.com/instagram/saved/",) def posts(self): @@ -518,6 +521,30 @@ class InstagramSavedExtractor(InstagramExtractor): return self._pagination_graphql(query_hash, variables) +class InstagramCollectionExtractor(InstagramExtractor): + """Extractor for ProfilePage saved collection media""" + subcategory = "collection" + pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)" + test = ( + "https://www.instagram.com/instagram/saved/collection_name/123456789/", + ) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.user, self.collection_name, self.collection_id = match.groups() + + def metadata(self): + return { + "collection_id" : self.collection_id, + "collection_name": text.unescape(self.collection_name), + } + + def posts(self): + endpoint = "/v1/feed/collection/{}/posts/".format(self.collection_id) + for item in self._pagination_api(endpoint): + yield item["media"] + + class InstagramTagExtractor(InstagramExtractor): """Extractor for TagPage""" subcategory = "tag" diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py index 6b2cf4c..00a32cd 100644 --- a/gallery_dl/extractor/itaku.py +++ b/gallery_dl/extractor/itaku.py @@ -101,9 +101,9 @@ class ItakuImageExtractor(ItakuExtractor): "/gallery_imgs/220504_oUNIAFT/xl.jpg", "liked_by_you": False, "maturity_rating": "SFW", - "num_comments": 2, - "num_likes": 80, - "num_reshares": 2, + "num_comments": int, + "num_likes": int, + "num_reshares": int, "obj_tags": 136446, "owner": 16775, "owner_avatar": "https://d1wmr8tlk3viaj.cloudfront.net" @@ -115,8 +115,9 @@ class ItakuImageExtractor(ItakuExtractor): "tags": list, "tags_character": ["hatsune_miku"], "tags_copyright": ["vocaloid"], - "tags_general" : ["twintails", "green_hair", "flag", 
"gloves", - "green_eyes", "female", "racing_miku"], + "tags_general" : ["female", "green_eyes", "twintails", + "green_hair", "gloves", "flag", + "racing_miku"], "title": "Racing Miku 2022 Ver.", "too_mature": False, "uncompressed_filesize": "0.62", diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index f1eb79f..816b561 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -440,20 +440,44 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor): class KemonopartyFavoriteExtractor(KemonopartyExtractor): """Extractor for kemono.party favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favorites" - test = ("https://kemono.party/favorites", { - "pattern": KemonopartyUserExtractor.pattern, - "url": "f4b5b796979bcba824af84206578c79101c7f0e1", - "count": 3, - }) + pattern = BASE_PATTERN + r"/favorites(?:/?\?([^#]+))?" + test = ( + ("https://kemono.party/favorites", { + "pattern": KemonopartyUserExtractor.pattern, + "url": "f4b5b796979bcba824af84206578c79101c7f0e1", + "count": 3, + }), + ("https://kemono.party/favorites?type=post", { + "pattern": KemonopartyPostExtractor.pattern, + "url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f", + "count": 3, + }), + ) + + def __init__(self, match): + KemonopartyExtractor.__init__(self, match) + self.favorites = (text.parse_query(match.group(2)).get("type") or + self.config("favorites") or + "artist") def items(self): self._prepare_ddosguard_cookies() self.login() - users = self.request(self.root + "/api/favorites").json() - for user in users: - user["_extractor"] = KemonopartyUserExtractor - url = "{}/{}/user/{}".format( - self.root, user["service"], user["id"]) - yield Message.Queue, url, user + if self.favorites == "artist": + users = self.request( + self.root + "/api/v1/account/favorites?type=artist").json() + for user in users: + user["_extractor"] = KemonopartyUserExtractor + url = "{}/{}/user/{}".format( + self.root, user["service"], user["id"]) + yield Message.Queue, url, user + + elif self.favorites == "post": + posts = self.request( + self.root + "/api/v1/account/favorites?type=post").json() + for post in posts: + post["_extractor"] = KemonopartyPostExtractor + url = "{}/{}/user/{}/post/{}".format( + self.root, post["service"], post["user"], post["id"]) + yield Message.Queue, url, post diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index b5db3dd..57db0c9 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -49,7 +49,9 @@ class LusciousAlbumExtractor(LusciousExtractor): r"/(?:albums|pictures/c/[^/?#]+/album)/[^/?#]+_(\d+)") test = ( ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { - "url": "7e4984a271a1072ac6483e4228a045895aff86f3", + "pattern": r"https://storage\.bhs\.cloud\.ovh\.net/v1/AUTH_\w+" + r"/images/NTRshouldbeillegal/277031" + r"/luscious_net_\d+_\d+\.jpg$", # "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", "keyword": { "album": { diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 6e780e8..493a8ef 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -179,12 +179,11 @@ class MastodonAPI(): try: access_token = INSTANCES[extractor.category]["access-token"] except (KeyError, TypeError): - raise exception.StopExtraction( - "Missing access token.\n" - "Run 'gallery-dl oauth:mastodon:%s' to obtain one.", - extractor.instance) - - self.headers = {"Authorization": "Bearer " + access_token} + 
pass + if access_token: + self.headers = {"Authorization": "Bearer " + access_token} + else: + self.headers = None def account_id_by_username(self, username): if username.startswith("id:"): @@ -232,6 +231,11 @@ class MastodonAPI(): if code < 400: return response + if code == 401: + raise exception.StopExtraction( + "Invalid or missing access token.\n" + "Run 'gallery-dl oauth:mastodon:%s' to obtain one.", + self.extractor.instance) if code == 404: raise exception.NotFoundError() if code == 429: diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 122ea46..2c8e72c 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -126,7 +126,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): username, password = self._get_auth_info() self._update_cookies(self._login_impl(username, password)) - @cache(maxage=150*24*3600, keyarg=1) + @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): if not username or not password: raise exception.AuthenticationError( diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 653822f..d6628c4 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -41,7 +41,8 @@ class OAuthBase(Extractor): stdout_write("Waiting for response. (Cancel with Ctrl+c)\n") server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(("localhost", self.config("port", 6414))) + server.bind((self.config("host", "localhost"), + self.config("port", 6414))) server.listen(1) # workaround for ctrl+c not working during server.accept on Windows diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index fba1312..225f0ff 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -122,7 +122,7 @@ class PhilomenaPostExtractor(PhilomenaExtractor): "tag_ids": list, "tags": list, "thumbnails_generated": True, - "updated_at": "2022-04-25T09:30:57Z", + "updated_at": r"re:\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ", "uploader": "Clover the Clever", "uploader_id": 211188, "upvotes": int, diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index e1846cc..8203885 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -51,13 +51,13 @@ class PoipikuExtractor(Extractor): thumb = extr('class="IllustItemThumbImg" src="', '"') if not thumb: break - elif thumb.startswith("/img/"): + elif thumb.startswith(("//img.poipiku.com/img/", "/img/")): continue post["num"] += 1 url = text.ensure_http_scheme(thumb[:-8]) yield Message.Url, url, text.nameext_from_url(url, post) - if not extr(' show all', '<'): + if not extr('> show all', '<'): continue url = self.root + "/f/ShowAppendFileF.jsp" @@ -131,7 +131,7 @@ class PoipikuPostExtractor(PoipikuExtractor): pattern = BASE_PATTERN + r"/(\d+)/(\d+)" test = ( ("https://poipiku.com/25049/5864576.html", { - "pattern": r"https://img\.poipiku\.com/user_img03/000025049" + "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049" r"/005864576_EWN1Y65gQ\.png$", "keyword": { "count": "1", @@ -146,7 +146,7 @@ class PoipikuPostExtractor(PoipikuExtractor): }, }), ("https://poipiku.com/2166245/6411749.html", { - "pattern": r"https://img\.poipiku\.com/user_img01/002166245" + "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245" r"/006411749_\w+\.jpeg$", "count": 4, "keyword": { diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 
6dfc907..cd8c238 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -16,13 +16,14 @@ class SkebExtractor(Extractor): category = "skeb" directory_fmt = ("{category}", "{creator[screen_name]}") filename_fmt = "{post_num}_{file_id}.{extension}" - archive_fmt = "{post_num}_{file_id}_{content_category}" + archive_fmt = "{post_num}_{_file_id}_{content_category}" root = "https://skeb.jp" def __init__(self, match): Extractor.__init__(self, match) self.user_name = match.group(1) self.thumbnails = self.config("thumbnails", False) + self.article = self.config("article", False) def items(self): for user_name, post_num in self.posts(): @@ -64,6 +65,7 @@ class SkebExtractor(Extractor): resp = self.request(url, headers=headers).json() creator = resp["creator"] post = { + "post_id" : resp["id"], "post_num" : post_num, "post_url" : self.root + resp["path"], "body" : resp["body"], @@ -102,12 +104,22 @@ class SkebExtractor(Extractor): if self.thumbnails and "og_image_url" in resp: post["content_category"] = "thumb" post["file_id"] = "thumb" + post["_file_id"] = str(resp["id"]) + "t" post["file_url"] = resp["og_image_url"] yield post + if self.article and "article_image_url" in resp: + url = resp["article_image_url"] + if url: + post["content_category"] = "article" + post["file_id"] = "article" + post["_file_id"] = str(resp["id"]) + "a" + post["file_url"] = url + yield post + for preview in resp["previews"]: post["content_category"] = "preview" - post["file_id"] = preview["id"] + post["file_id"] = post["_file_id"] = preview["id"] post["file_url"] = preview["url"] info = preview["information"] post["original"] = { diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index b0b8f3b..506db26 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -59,7 +59,7 @@ class SlidesharePresentationExtractor(GalleryExtractor): # mobile URL (("https://www.slideshare.net" "/mobile/uqudent/introduction-to-fixed-prosthodontics"), { - "url": "59993ad7b0cb93c73011547eedcd02c622649e9d", + "url": "43eda2adf4dd221a251c8df794dfb82649e94647", }), ) @@ -72,14 +72,14 @@ class SlidesharePresentationExtractor(GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) descr = extr('', '') - published = extr('') comments = extr('content="UserComments:', '"') likes = extr('content="UserLikes:', '"') views = extr('content="UserPageVisits:', '"') + title = extr('', '') + published = extr('') if descr.endswith("…"): - alt_descr = extr('id="slideshow-description-text"', '

</p>')
+            alt_descr = extr('slideshow-description-text"', '</p>
') if alt_descr: descr = text.remove_html(alt_descr.partition(">")[2]).strip() diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 98e914e..4010da3 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -111,13 +111,13 @@ class SmugmugImageExtractor(SmugmugExtractor): test = ( ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", { "url": "e6408fd2c64e721fd146130dceb56a971ceb4259", - "keyword": "460a773f5addadd3e216bda346fc524fe4eedc52", + "keyword": "b31a63d07c9c26eb0f79f52d60d171a98938f99b", "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0", }), # video ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", { "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee", - "keyword": "eb74e5cf6780d5152ab8f11b431ec1b17fa8f69b", + "keyword": "4cef98133ace511adc874c9d9abac5817ba0d856", }), ) diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index fcdf18f..545a95b 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -108,7 +108,7 @@ class TapasSeriesExtractor(TapasExtractor): test = ( ("https://tapas.io/series/just-leave-me-be", { "pattern": r"https://\w+\.cloudfront\.net/pc/\w\w/[0-9a-f-]+\.jpg", - "count": 127, + "count": 132, }), ("https://tapas.io/series/yona", { # mature "count": 26, diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index ded7fd1..b694fa0 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -64,6 +64,7 @@ class TumblrExtractor(Extractor): self.inline = self.config("inline", True) self.reblogs = self.config("reblogs", True) self.external = self.config("external", False) + self.original = self.config("original", True) if len(self.types) == 1: self.api.posts_type = next(iter(self.types)) @@ -101,8 +102,7 @@ class TumblrExtractor(Extractor): del post["trail"] post["blog"] = blog post["date"] = text.parse_timestamp(post["timestamp"]) - yield Message.Directory, post - post["num"] = 0 + posts = [] if "photos" in post: # type "photo" or "link" photos = post["photos"] @@ -110,18 +110,31 @@ class TumblrExtractor(Extractor): for photo in photos: post["photo"] = photo - photo.update(photo["original_size"]) + + best_photo = photo["original_size"] + for alt_photo in photo["alt_sizes"]: + if (alt_photo["height"] > best_photo["height"] or + alt_photo["width"] > best_photo["width"]): + best_photo = alt_photo + photo.update(best_photo) + + if self.original and "/s2048x3072/" in photo["url"] and ( + photo["width"] == 2048 or photo["height"] == 3072): + photo["url"] = self._original_image(photo["url"]) + del photo["original_size"] del photo["alt_sizes"] - yield self._prepare_image(photo["url"], post) + posts.append( + self._prepare_image(photo["url"], post.copy())) + del post["photo"] url = post.get("audio_url") # type "audio" if url and url.startswith("https://a.tumblr.com/"): - yield self._prepare(url, post) + posts.append(self._prepare(url, post.copy())) url = post.get("video_url") # type "video" if url: - yield self._prepare(_original_video(url), post) + posts.append(self._prepare(_original_video(url), post.copy())) if self.inline and "reblog" in post: # inline media # only "chat" posts are missing a "reblog" key in their @@ -129,16 +142,25 @@ class TumblrExtractor(Extractor): body = post["reblog"]["comment"] + post["reblog"]["tree_html"] for url in re.findall('= meta["last_page"]: diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index bdbdc8c..189c0c5 100644 --- 
a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -99,13 +99,14 @@ class WeiboExtractor(Extractor): else: yield pic["largest"].copy() - if "page_info" in status: - page_info = status["page_info"] - if "media_info" not in page_info or not self.videos: - return - media = max(page_info["media_info"]["playback_list"], - key=lambda m: m["meta"]["quality_index"]) - yield media["play_info"].copy() + if "page_info" in status and self.videos: + try: + media = max(status["page_info"]["media_info"]["playback_list"], + key=lambda m: m["meta"]["quality_index"]) + except KeyError: + pass + else: + yield media["play_info"].copy() def _status_by_id(self, status_id): url = "{}/ajax/statuses/show?id={}".format(self.root, status_id) @@ -147,14 +148,17 @@ class WeiboExtractor(Extractor): return yield from statuses - if "next_cursor" in data: + if "next_cursor" in data: # videos, newvideo params["cursor"] = data["next_cursor"] - elif "page" in params: + elif "page" in params: # home, article params["page"] += 1 - elif data["since_id"]: + elif data["since_id"]: # album params["sinceid"] = data["since_id"] - else: - params["since_id"] = statuses[-1]["id"] - 1 + else: # feed, last album page + try: + params["since_id"] = statuses[-1]["id"] - 1 + except KeyError: + return def _sina_visitor_system(self, response): self.log.info("Sina Visitor System") @@ -366,6 +370,10 @@ class WeiboStatusExtractor(WeiboExtractor): "pattern": r"https://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104" r"120005tc0E010\.mp4\?label=gif_mp4", }), + # missing 'playback_list' (#2792) + ("https://weibo.com/2909128931/4409545658754086", { + "count": 9, + }), ("https://m.weibo.cn/status/4339748116375525"), ("https://m.weibo.cn/5746766133/4339748116375525"), ) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py new file mode 100644 index 0000000..2b5acd8 --- /dev/null +++ b/gallery_dl/extractor/zerochan.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.zerochan.net/""" + +from .booru import BooruExtractor +from ..cache import cache +from .. 
import text, exception + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" + + +class ZerochanExtractor(BooruExtractor): + """Base class for zerochan extractors""" + category = "zerochan" + root = "https://www.zerochan.net" + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}" + cookiedomain = ".zerochan.net" + cookienames = ("z_id", "z_hash") + + def login(self): + if not self._check_cookies(self.cookienames): + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + # force legacy layout + self.session.cookies.set("v3", "0", domain=self.cookiedomain) + + @cache(maxage=90*86400, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = self.root + "/login" + headers = { + "Origin" : self.root, + "Referer" : url, + } + data = { + "ref" : "/", + "name" : username, + "password": password, + "login" : "Login", + } + + response = self.request(url, method="POST", headers=headers, data=data) + if not response.history: + raise exception.AuthenticationError() + + return response.cookies + + def _parse_entry_page(self, entry_id): + url = "{}/{}".format(self.root, entry_id) + extr = text.extract_from(self.request(url).text) + + return { + "id" : entry_id, + "author": extr('"author": "', '"'), + "file_url": extr('"contentUrl": "', '"'), + "date" : text.parse_datetime(extr( + '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"), + "width" : extr('"width": "', ' '), + "height": extr('"height": "', ' '), + "size" : extr('"contentSize": "', 'B'), + "path" : text.split_html(extr( + 'class="breadcrumbs', '
</p>
'))[3::2], + "tags" : extr('alt="Tags: ', '"').split(", ") + } + + +class ZerochanTagExtractor(ZerochanExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?" + test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", { + "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)", + "count": "> 24", + "keywords": { + "extension": r"re:jpg|png", + "file_url": "", + "filename": r"re:Perth.\(Kantai.Collection\).full.\d+", + "height": r"re:^\d+$", + "id": r"re:^\d+$", + "name": "Perth (Kantai Collection)", + "search_tags": "Perth (Kantai Collection)", + "size": r"re:^\d+k$", + "width": r"re:^\d+$", + }, + }) + + def __init__(self, match): + ZerochanExtractor.__init__(self, match) + self.search_tag, self.query = match.groups() + + def metadata(self): + return {"search_tags": text.unquote( + self.search_tag.replace("+", " "))} + + def posts(self): + url = self.root + "/" + self.search_tag + params = text.parse_query(self.query) + params["p"] = text.parse_int(params.get("p"), 1) + + while True: + page = self.request(url, params=params).text + thumbs = text.extract(page, '