| author | 2022-08-29 02:17:16 -0400 |
|---|---|
| committer | 2022-08-29 02:17:16 -0400 |
| commit | a768930761f7f20587ae40a8cacca0e55c85290a (patch) |
| tree | 5a4163db912b93fc45f717e5e43fd5be3e66f16c |
| parent | ae2a0f5622beaa6f402526f8a7b939419283a090 (diff) |
New upstream version 1.23.0 (upstream/1.23.0)
54 files changed, 1063 insertions, 238 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md index be9a4f7..61987d9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,59 @@ # Changelog +## 1.23.0 - 2022-08-28 +### Changes +- [twitter] update `user` and `author` metadata fields + - for URLs with a single username or ID like `https://twitter.com/USER` or a search with a single `from:` statement, `user` will now always refer to the user referenced in the URL. + - for all other URLs like `https://twitter.com/i/bookmarks`, `user` and `author` refer to the same user + - `author` will always refer to the original Tweet author +- [twitter] update `quote_id` and `quote_by` metadata fields + - `quote_id` is now non-zero for quoted Tweets and contains the Tweet ID of the quoting Tweet (was the other way round before) + - `quote_by` is only defined for quoted Tweets like before, but now contains the screen name of the user quoting this Tweet +- [skeb] improve archive IDs for thumbnails and article images +### Additions +- [artstation] add `num` and `count` metadata fields ([#2764](https://github.com/mikf/gallery-dl/issues/2764)) +- [catbox] add `album` extractor ([#2410](https://github.com/mikf/gallery-dl/issues/2410)) +- [blogger] emit metadata for posts without files ([#2789](https://github.com/mikf/gallery-dl/issues/2789)) +- [foolfuuka] update supported domains +- [gelbooru] add support for `api_key` and `user_id` ([#2767](https://github.com/mikf/gallery-dl/issues/2767)) +- [gelbooru] implement pagination for `pool` results ([#2853](https://github.com/mikf/gallery-dl/issues/2853)) +- [instagram] add support for a user's saved collections ([#2769](https://github.com/mikf/gallery-dl/issues/2769)) +- [instagram] provide `date` for directory format strings ([#2830](https://github.com/mikf/gallery-dl/issues/2830)) +- [kemonoparty] add `favorites` option ([#2826](https://github.com/mikf/gallery-dl/issues/2826), [#2831](https://github.com/mikf/gallery-dl/issues/2831)) +- [oauth] add `host` config option ([#2806](https://github.com/mikf/gallery-dl/issues/2806)) +- [rule34] implement pagination for `pool` results ([#2853](https://github.com/mikf/gallery-dl/issues/2853)) +- [skeb] add option to download `article` images ([#1031](https://github.com/mikf/gallery-dl/issues/1031)) +- [tumblr] download higher-quality images ([#2761](https://github.com/mikf/gallery-dl/issues/2761)) +- [tumblr] add `count` metadata field ([#2804](https://github.com/mikf/gallery-dl/issues/2804)) +- [wallhaven] implement `metadata` option ([#2803](https://github.com/mikf/gallery-dl/issues/2803)) +- [zerochan] add `tag` and `image` extractors ([#1434](https://github.com/mikf/gallery-dl/issues/1434)) +- [zerochan] implement login with username & password ([#1434](https://github.com/mikf/gallery-dl/issues/1434)) +- [postprocessor:metadata] implement `mode: modify` and `mode: delete` ([#2640](https://github.com/mikf/gallery-dl/issues/2640)) +- [formatter] add `g` conversion for slugifying a string ([#2410](https://github.com/mikf/gallery-dl/issues/2410)) +- [formatter] apply `:J` only to lists ([#2833](https://github.com/mikf/gallery-dl/issues/2833)) +- implement `path-metadata` option ([#2734](https://github.com/mikf/gallery-dl/issues/2734)) +- allow comments after input file URLs ([#2808](https://github.com/mikf/gallery-dl/issues/2808)) +- add global `warnings` option to control `urllib3` warning behavior ([#2762](https://github.com/mikf/gallery-dl/issues/2762)) +### Fixes +- [bunkr] fix extraction ([#2788](https://github.com/mikf/gallery-dl/issues/2788)) +- [deviantart] use public
access token for journals ([#2702](https://github.com/mikf/gallery-dl/issues/2702)) +- [e621] fix extraction of `popular` posts +- [fanbox] download cover images in original size ([#2784](https://github.com/mikf/gallery-dl/issues/2784)) +- [mastodon] allow downloading without access token ([#2782](https://github.com/mikf/gallery-dl/issues/2782)) +- [hitomi] update cache expiry time ([#2863](https://github.com/mikf/gallery-dl/issues/2863)) +- [hitomi] fix error when number of tag results is a multiple of 25 ([#2870](https://github.com/mikf/gallery-dl/issues/2870)) +- [mangahere] fix `page-reverse` option ([#2795](https://github.com/mikf/gallery-dl/issues/2795)) +- [poipiku] fix posts with more than one image ([#2796](https://github.com/mikf/gallery-dl/issues/2796)) +- [poipiku] update filter for static images ([#2796](https://github.com/mikf/gallery-dl/issues/2796)) +- [slideshare] fix metadata extraction +- [twitter] unescape `+` in search queries ([#2226](https://github.com/mikf/gallery-dl/issues/2226)) +- [twitter] fall back to unfiltered search ([#2766](https://github.com/mikf/gallery-dl/issues/2766)) +- [twitter] ignore invalid user entries ([#2850](https://github.com/mikf/gallery-dl/issues/2850)) +- [vk] prevent exceptions for broken/invalid photos ([#2774](https://github.com/mikf/gallery-dl/issues/2774)) +- [vsco] fix `collection` extraction +- [weibo] prevent exception for missing `playback_list` ([#2792](https://github.com/mikf/gallery-dl/issues/2792)) +- [weibo] prevent errors when paginating over album entries ([#2817](https://github.com/mikf/gallery-dl/issues/2817)) + ## 1.22.4 - 2022-07-15 ### Additions - [instagram] add `pinned` metadata field ([#2752](https://github.com/mikf/gallery-dl/issues/2752)) @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.22.4 +Version: 1.23.0 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__ | Executables built from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -251,7 +251,8 @@ and optional for ``subscribestar``, ``tapas``, ``tsumino``, -and ``twitter``. +``twitter``, +and ``zerochan``. You can set the necessary information in your configuration file (cf.
gallery-dl.conf_) @@ -66,8 +66,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__ | Executables built from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -218,7 +218,8 @@ and optional for ``subscribestar``, ``tapas``, ``tsumino``, -and ``twitter``. +``twitter``, +and ``zerochan``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index 751d470..d4efeed 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2022-07-15" "1.22.4" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2022-08-28" "1.23.0" "gallery-dl Manual" .\" disable hyphenation .nh diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 39550ad..642cb78 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2022-07-15" "1.22.4" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2022-08-28" "1.23.0" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -458,6 +458,8 @@ and optional for * \f[I]tsumino\f[] .br * \f[I]twitter\f[] +.br +* \f[I]zerochan\f[] These values can also be specified via the \f[I]-u/--username\f[] and \f[I]-p/--password\f[] command-line options or @@ -667,6 +669,21 @@ This can then be used in \f[I]filenames\f[], with a \f[I]metadata\f[] post processor, etc. +.SS extractor.*.path-metadata +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Insert a reference to the current \f[I]PathFormat\f[] +data structure into metadata dictionaries as the given name. + +For example, setting this option to \f[I]"gdl_path"\f[] would make it possible +to access the current file's filename as \f[I]"{gdl_path.filename}"\f[]. + + .SS extractor.*.category-transfer .IP "Type:" 6 \f[I]bool\f[] @@ -1516,6 +1533,19 @@ Selects which site layout to expect when parsing posts. * \f[I]"new"\f[]: Expect the *new* site layout +.SS extractor.gelbooru.api-key & .user-id +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]null\f[] + +.IP "Description:" 4 +Values from the API Access Credentials section found at the bottom of your +\f[I]Account Options\f[] +page. + + .SS extractor.generic.enabled .IP "Type:" 6 \f[I]bool\f[] @@ -1751,6 +1781,19 @@ Controls how to handle duplicate files in a post. Extract a user's direct messages as \f[I]dms\f[] metadata. +.SS extractor.kemonoparty.favorites +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"artist"\f[] + +.IP "Description:" 4 +Determines the type of favorites to be downloaded. + +Available types are \f[I]artist\f[] and \f[I]post\f[]. + + .SS extractor.kemonoparty.files .IP "Type:" 6 \f[I]list\f[] of \f[I]strings\f[] @@ -2007,6 +2050,17 @@ Store tokens received during OAuth authorizations in \f[I]cache\f[].
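For reference, the options documented in the hunks above all live in the standard configuration file. A minimal gallery-dl.conf sketch combining them, assuming hypothetical placeholder values for the Gelbooru credentials (the key and ID shown are not real):

```json
{
    "extractor": {
        "path-metadata": "gdl_path",
        "gelbooru": {
            "api-key": "your-api-key",
            "user-id": "12345"
        },
        "kemonoparty": {
            "favorites": "post"
        }
    }
}
```

With `path-metadata` set to `"gdl_path"`, format strings can then reference the current file's path data, e.g. `"{gdl_path.filename}"`, as described above.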
+.SS extractor.oauth.host +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"localhost"\f[] + +.IP "Description:" 4 +Host name / IP address to bind to during OAuth authorization. + + .SS extractor.oauth.port .IP "Type:" 6 \f[I]integer\f[] @@ -2424,6 +2478,17 @@ Download video embeds from external sites. Download videos. +.SS extractor.skeb.article +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Download article images. + + .SS extractor.skeb.sent-requests .IP "Type:" 6 \f[I]bool\f[] @@ -2502,6 +2567,21 @@ images from them. Search posts for inline images and videos. +.SS extractor.tumblr.original +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]true\f[] + +.IP "Description:" 4 +Download full-resolution \f[I]photo\f[] images. + +For each photo with "maximum" resolution +(width equal to 2048 or height equal to 3072), +use an extra HTTP request to find the URL to its full-resolution version. + + .SS extractor.tumblr.reblogs .IP "Type:" 6 \f[I]bool\f[] or \f[I]string\f[] @@ -2846,6 +2926,19 @@ to use your account's browsing settings and default filters when searching. See https://wallhaven.cc/help/api for more information. +.SS extractor.wallhaven.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract additional metadata (tags, uploader) + +Note: This requires 1 additional HTTP request for each post. + + .SS extractor.weasyl.api-key .IP "Type:" 6 \f[I]string\f[] @@ -3714,16 +3807,20 @@ See \f[I]metadata.event\f[] for a list of available events. \f[I]"json"\f[] .IP "Description:" 4 -Select how to write metadata. +Selects how to process metadata. .br -* \f[I]"json"\f[]: all metadata using \f[I]json.dump() +* \f[I]"json"\f[]: write metadata using \f[I]json.dump() <https://docs.python.org/3/library/json.html#json.dump>\f[] .br -* \f[I]"tags"\f[]: \f[I]tags\f[] separated by newlines +* \f[I]"tags"\f[]: write \f[I]tags\f[] separated by newlines .br -* \f[I]"custom"\f[]: result of applying \f[I]metadata.content-format\f[] +* \f[I]"custom"\f[]: write the result of applying \f[I]metadata.content-format\f[] to a file's metadata dictionary +.br +* \f[I]"modify"\f[]: add or modify metadata entries +.br +* \f[I]"delete"\f[]: remove metadata entries .SS metadata.filename @@ -3821,6 +3918,39 @@ When starting to download all files of a post, e.g. a Tweet on Twitter or a post on Patreon. +.SS metadata.fields +.IP "Type:" 6 +.br +* \f[I]list\f[] of \f[I]strings\f[] +.br +* \f[I]object\f[] (field name -> \f[I]format string\f[]) + +.IP "Example:" 4 +.br +* .. code:: json + +["blocked", "watching", "status[creator][name]"] + +.br +* .. code:: json + +{ +"blocked" : "***", +"watching" : "\\fE 'yes' if watching else 'no'", +"status[username]": "{status[creator][name]!l}" +} + + +.IP "Description:" 4 +.br +* \f[I]"mode": "delete"\f[]: +A list of metadata field names to remove. +.br +* \f[I]"mode": "modify"\f[]: +An object with metadata field names mapping to a \f[I]format string\f[] +whose result is assigned to said field name. + + .SS metadata.content-format .IP "Type:" 6 \f[I]string\f[] or \f[I]list\f[] of \f[I]strings\f[] @@ -4190,6 +4320,18 @@ The list of signal names to ignore, i.e. set as signal handler for. +.SS warnings +.IP "Type:" 6 +\f[I]string\f[] + +.IP "Default:" 9 +\f[I]"default"\f[] + +.IP "Description:" 4 +The \f[I]Warnings Filter action\f[] +used for (urllib3) warnings. 
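The new global `warnings` option takes a standard Python warnings-filter action (`"default"`, `"ignore"`, `"error"`, `"always"`, `"module"`, or `"once"`) and applies it to urllib3's `HTTPWarning` category, as the `gallery_dl/extractor/common.py` hunk further below shows. A minimal sketch for silencing those warnings; note that it is a top-level option, not nested under `"extractor"`:

```json
{
    "warnings": "ignore"
}
```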
+ + .SS pyopenssl .IP "Type:" 6 \f[I]bool\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 1492653..1e485ee 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -109,6 +109,11 @@ "include": "gallery", "layout": "auto" }, + "gelbooru": + { + "api-key": null, + "user-id": null + }, "gfycat": { "format": ["mp4", "webm", "mobile", "gif"] @@ -193,6 +198,7 @@ { "browser": true, "cache": true, + "host": "localhost", "port": 6414 }, "paheal": @@ -248,6 +254,12 @@ "username": null, "password": null }, + "skeb": + { + "article": false, + "sent-requests": false, + "thumbnails": false + }, "smugmug": { "videos": true @@ -273,6 +285,7 @@ "external": false, "inline": true, "posts": "all", + "original": true, "reblogs": true }, "twitter": @@ -302,7 +315,8 @@ }, "wallhaven": { - "api-key": null + "api-key": null, + "metadata": false }, "weasyl": { @@ -324,6 +338,11 @@ "module": null, "raw-options": null }, + "zerochan": + { + "username": null, + "password": null + }, "booru": { "tags": false, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 1e1d74d..6b9d68b 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.22.4 +Version: 1.23.0 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -99,8 +99,8 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.exe>`__ -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.22.4/gallery-dl.bin>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.exe>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.23.0/gallery-dl.bin>`__ | Executables built from the latest commit can be found at | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml @@ -251,7 +251,8 @@ and optional for ``subscribestar``, ``tapas``, ``tsumino``, -and ``twitter``. +``twitter``, +and ``zerochan``. You can set the necessary information in your configuration file (cf. gallery-dl.conf_) diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index b323e38..5f5084b 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -58,6 +58,7 @@ gallery_dl/extractor/behance.py gallery_dl/extractor/blogger.py gallery_dl/extractor/booru.py gallery_dl/extractor/bunkr.py +gallery_dl/extractor/catbox.py gallery_dl/extractor/comicvine.py gallery_dl/extractor/common.py gallery_dl/extractor/cyberdrop.py @@ -197,6 +198,7 @@ gallery_dl/extractor/wikieat.py gallery_dl/extractor/xhamster.py gallery_dl/extractor/xvideos.py gallery_dl/extractor/ytdl.py +gallery_dl/extractor/zerochan.py gallery_dl/postprocessor/__init__.py gallery_dl/postprocessor/classify.py gallery_dl/postprocessor/common.py diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 04ea54c..329e7ab 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -38,11 +38,11 @@ def parse_inputfile(file, log): Lines starting with '#' and empty lines will be ignored. Lines starting with '-' will be interpreted as a key-value pair separated by an '=', where 'key' is a dot-separated option name and 'value' is a - JSON-parsable value for it.
These config options will be applied while + JSON-parsable value. These configuration options will be applied while processing the next URL. Lines starting with '-G' are the same as above, except these options will - be valid for all following URLs, i.e. they are Global. - Everything else will be used as potential URL. + be applied for *all* following URLs, i.e. they are Global. + Everything else will be used as a potential URL. Example input file: @@ -57,7 +57,8 @@ def parse_inputfile(file, log): https://example.org/ # next URL uses default filename and 'skip' is false. - https://example.com/index.htm + https://example.com/index.htm # comment1 + https://example.com/404.htm # comment2 """ gconf = [] lconf = [] @@ -94,6 +95,10 @@ def parse_inputfile(file, log): else: # url + if " #" in line: + line = line.partition(" #")[0] + elif "\t#" in line: + line = line.partition("\t#")[0] if gconf or lconf: yield util.ExtendedUrl(line, gconf, lconf) gconf = [] diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 70cebb3..9e4507a 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -26,6 +26,7 @@ modules = [ "behance", "blogger", "bunkr", + "catbox", "comicvine", "cyberdrop", "danbooru", @@ -150,6 +151,7 @@ modules = [ "wikieat", "xhamster", "xvideos", + "zerochan", "booru", "moebooru", "foolfuuka", diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index 19b9d97..c0e8e67 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -32,9 +32,11 @@ class ArtstationExtractor(Extractor): data = self.metadata() for project in self.projects(): - for asset in self.get_project_assets(project["hash_id"]): + for num, asset in enumerate( + self.get_project_assets(project["hash_id"]), 1): asset.update(data) adict = asset["asset"] + asset["num"] = num yield Message.Directory, asset if adict["has_embedded_player"] and self.external: @@ -85,6 +87,7 @@ class ArtstationExtractor(Extractor): assets = data["assets"] del data["assets"] + data["count"] = len(assets) if len(assets) == 1: data["asset"] = assets[0] yield data diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 21ca991..e0885d2 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -67,9 +67,6 @@ class BloggerExtractor(Extractor): key=lambda x: x["format_id"], )["play_url"]) - if not files: - continue - post["author"] = post["author"]["displayName"] post["replies"] = post["replies"]["totalItems"] post["content"] = text.remove_html(content) diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 9904d0a..3091f57 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -16,10 +16,10 @@ import json class BunkrAlbumExtractor(LolisafeAlbumExtractor): """Extractor for bunkr.is albums""" category = "bunkr" - root = "https://app.bunkr.is" + root = "https://bunkr.is" pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:is|to)/a/([^/?#]+)" test = ( - ("https://app.bunkr.is/a/Lktg9Keq", { + ("https://bunkr.is/a/Lktg9Keq", { "pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { @@ -33,7 +33,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): }, }), # mp4 (#2239) - ("https://bunkr.is/a/ptRHaCn2", { + ("https://app.bunkr.is/a/ptRHaCn2", { "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4", "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471", 
}), @@ -70,16 +70,16 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): album = props["album"] files = props["files"] except Exception as exc: - self.log.debug(exc) + self.log.debug(exc.__class__.__name__, exc) self.root = self.root.replace("bunkr", "app.bunkr", 1) return self._fetch_album_api(album_id) for file in files: name = file["name"] + cdn = file["cdn"] if name.endswith(".mp4"): - file["file"] = "https://media-files.bunkr.is/" + name - else: - file["file"] = file["cdn"] + "/" + name + cdn = cdn.replace("//cdn", "//media-files") + file["file"] = cdn + "/" + name return files, { "album_id" : self.album_id, diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py new file mode 100644 index 0000000..509108f --- /dev/null +++ b/gallery_dl/extractor/catbox.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://catbox.moe/""" + +from .common import GalleryExtractor +from .. import text + + +class CatboxAlbumExtractor(GalleryExtractor): + """Extractor for catbox albums""" + category = "catbox" + subcategory = "album" + root = "https://catbox.moe" + filename_fmt = "{filename}.{extension}" + directory_fmt = ("{category}", "{album_name} ({album_id})") + archive_fmt = "{album_id}_{filename}" + pattern = r"(?:https?://)?(?:www\.)?catbox\.moe(/c/[^/?#]+)" + test = ( + ("https://catbox.moe/c/1igcbe", { + "url": "35866a88c29462814f103bc22ec031eaeb380f8a", + "content": "70ddb9de3872e2d17cc27e48e6bf395e5c8c0b32", + "pattern": r"https://files\.catbox\.moe/\w+\.\w{3}$", + "count": 3, + "keyword": { + "album_id": "1igcbe", + "album_name": "test", + "date": "dt:2022-08-18 00:00:00", + "description": "album test &>", + }, + }), + ("https://www.catbox.moe/c/cd90s1"), + ("https://catbox.moe/c/w7tm47#"), + ) + + def metadata(self, page): + extr = text.extract_from(page) + return { + "album_id" : self.gallery_url.rpartition("/")[2], + "album_name" : text.unescape(extr("<h1>", "<")), + "date" : text.parse_datetime(extr( + "<p>Created ", "<"), "%B %d %Y"), + "description": text.unescape(extr("<p>", "<")), + } + + def images(self, page): + return [ + ("https://files.catbox.moe/" + path, None) + for path in text.extract_iter( + page, ">https://files.catbox.moe/", "<") + ] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 6ccae7f..1b41101 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -795,12 +795,23 @@ SSL_CIPHERS = { } +urllib3 = requests.packages.urllib3 + # detect brotli support try: - BROTLI = requests.packages.urllib3.response.brotli is not None + BROTLI = urllib3.response.brotli is not None except AttributeError: BROTLI = False +# set (urllib3) warnings filter +action = config.get((), "warnings", "default") +if action: + try: + import warnings + warnings.simplefilter(action, urllib3.exceptions.HTTPWarning) + except Exception: + pass +del action # Undo automatic pyOpenSSL injection by requests pyopenssl = config.get((), "pyopenssl", False) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index ec0db68..8c2ed53 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -34,6 +34,7 @@ class DanbooruExtractor(BaseExtractor): self.per_page = iget("per-page", 200) self.request_interval_min = iget("request-interval-min", 
0.0) self._pools = iget("pools") + self._popular_endpoint = iget("popular", "/explore/posts/popular.json") BaseExtractor.__init__(self, match) @@ -150,6 +151,7 @@ INSTANCES = { "headers": {"User-Agent": "gallery-dl/{} (by mikf)".format( __version__)}, "pools": "sort", + "popular": "/popular.json", "page-limit": 750, "per-page": 320, "request-interval-min": 1.0, @@ -308,7 +310,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): subcategory = "popular" directory_fmt = ("{category}", "popular", "{scale}", "{date}") archive_fmt = "P_{scale[0]}_{date}_{id}" - pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?" + pattern = BASE_PATTERN + r"/(?:explore/posts/)?popular(?:\?([^#]*))?" test = ( ("https://danbooru.donmai.us/explore/posts/popular"), (("https://danbooru.donmai.us/explore/posts/popular" @@ -316,7 +318,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): "range": "1-120", "count": 120, }), - ("https://e621.net/explore/posts/popular"), + ("https://e621.net/popular"), (("https://e621.net/explore/posts/popular" "?date=2019-06-01&scale=month"), { "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+", @@ -345,8 +347,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): def posts(self): if self.page_start is None: self.page_start = 1 - return self._pagination( - "/explore/posts/popular.json", self.params, True) + return self._pagination(self._popular_endpoint, self.params, True) class DanbooruFavoriteExtractor(DanbooruExtractor): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 39ae484..60f644d 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1128,11 +1128,18 @@ class DeviantartOAuthAPI(): self._folders((deviation,)) return deviation - def deviation_content(self, deviation_id, public=False): + def deviation_content(self, deviation_id, public=True): """Get extended content of a single Deviation""" endpoint = "/deviation/content" params = {"deviationid": deviation_id} - return self._call(endpoint, params=params, public=public) + content = self._call(endpoint, params=params, public=public) + if public and content["html"].startswith( + ' <span class=\"username-with-symbol'): + if self.refresh_token_key: + content = self._call(endpoint, params=params, public=False) + else: + self.log.warning("Private Journal") + return content def deviation_download(self, deviation_id, public=True): """Get the original file download (if allowed)""" diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 11436cb..8481248 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -6,6 +6,7 @@ """Extractors for https://www.fanbox.cc/""" +import re from .common import Extractor, Message from .. 
import text @@ -78,6 +79,7 @@ class FanboxExtractor(Extractor): num = 0 cover_image = post.get("coverImageUrl") if cover_image: + cover_image = re.sub("/c/[0-9a-z_]+", "", cover_image) final_post = post.copy() final_post["isCoverImage"] = True final_post["fileUrl"] = cover_image diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 34b52ef..5e6da5b 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -88,9 +88,13 @@ BASE_PATTERN = FoolfuukaExtractor.update({ "root": "https://boards.fireden.net", "pattern": r"boards\.fireden\.net", }, - "nyafuu": { - "root": "https://archive.nyafuu.org", - "pattern": r"(?:archive\.)?nyafuu\.org", + "rozenarcana": { + "root": "https://archive.alice.al", + "pattern": r"(?:archive\.)?alice\.al", + }, + "tokyochronos": { + "root": "https://www.tokyochronos.net", + "pattern": r"(?:www\.)?tokyochronos\.net", }, "rbt": { "root": "https://rbt.asia", @@ -111,7 +115,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)" test = ( ("https://archive.4plebs.org/tg/thread/54059290", { - "url": "07452944164b602502b02b24521f8cee5c484d2a", + "url": "fd823f17b5001442b941fddcd9ec91bafedfbc79", }), ("https://archived.moe/gd/thread/309639/", { "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8", @@ -133,8 +137,11 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): ("https://boards.fireden.net/sci/thread/11264294/", { "url": "61cab625c95584a12a30049d054931d64f8d20aa", }), - ("https://archive.nyafuu.org/c/thread/2849220/", { - "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f", + ("https://archive.alice.al/c/thread/2849220/", { + "url": "632e2c8de05de6b3847685f4bf1b4e5c6c9e0ed5", + }), + ("https://www.tokyochronos.net/a/thread/241664141/", { + "url": "ae03852cf44e3dcfce5be70274cb1828e1dbb7d6", }), ("https://rbt.asia/g/thread/61487650/", { "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4", @@ -180,7 +187,8 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/"), ("https://desuarchive.org/a/"), ("https://boards.fireden.net/sci/"), - ("https://archive.nyafuu.org/c/"), + ("https://archive.alice.al/c/"), + ("https://www.tokyochronos.net/a/"), ("https://rbt.asia/g/"), ("https://thebarchive.com/b/"), ) @@ -223,7 +231,8 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): ("https://archiveofsins.com/_/search/text/test/"), ("https://desuarchive.org/_/search/text/test/"), ("https://boards.fireden.net/_/search/text/test/"), - ("https://archive.nyafuu.org/_/search/text/test/"), + ("https://archive.alice.al/_/search/text/test/"), + ("https://www.tokyochronos.net/_/search/text/test/"), ("https://rbt.asia/_/search/text/test/"), ("https://thebarchive.com/_/search/text/test/"), ) @@ -288,7 +297,8 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/gallery/"), ("https://desuarchive.org/a/gallery/5"), ("https://boards.fireden.net/sci/gallery/6"), - ("https://archive.nyafuu.org/c/gallery/7"), + ("https://archive.alice.al/c/gallery/7"), + ("https://www.tokyochronos.net/a/gallery/7"), ("https://rbt.asia/g/gallery/8"), ("https://thebarchive.com/b/gallery/9"), ) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index e8bee37..92f7ac2 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from . import gelbooru_v02 -from .. import text, util, exception +from .. 
import text, exception import binascii @@ -21,10 +21,15 @@ class GelbooruBase(): root = "https://gelbooru.com" def _api_request(self, params): + params["api_key"] = self.api_key + params["user_id"] = self.user_id + url = self.root + "/index.php?page=dapi&s=post&q=index&json=1" data = self.request(url, params=params).json() + if "post" not in data: return () + posts = data["post"] if not isinstance(posts, list): return (posts,) @@ -85,28 +90,29 @@ class GelbooruTagExtractor(GelbooruBase, class GelbooruPoolExtractor(GelbooruBase, gelbooru_v02.GelbooruV02PoolExtractor): - """Extractor for image-pools from gelbooru.com""" + """Extractor for gelbooru pools""" + per_page = 45 pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=pool&s=show&id=(?P<pool>\d+)") test = ( ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { "count": 6, }), - ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { - "options": (("api", False),), - "count": 6, - }), ) def metadata(self): - url = "{}/index.php?page=pool&s=show&id={}".format( - self.root, self.pool_id) - page = self.request(url).text + url = self.root + "/index.php" + self._params = { + "page": "pool", + "s" : "show", + "id" : self.pool_id, + "pid" : self.page_start, + } + self._page = self.request(url, params=self._params).text - name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>") + name, pos = text.extract(self._page, "<h3>Now Viewing: ", "</h3>") if not name: raise exception.NotFoundError("pool") - self.post_ids = text.extract_iter(page, 'class="" id="p', '"', pos) return { "pool": text.parse_int(self.pool_id), @@ -114,9 +120,23 @@ class GelbooruPoolExtractor(GelbooruBase, } def posts(self): - params = {} - for params["id"] in util.advance(self.post_ids, self.page_start): - yield from self._api_request(params) + url = self.root + "/index.php" + params = self._params + + page = self._page + del self._page + data = {} + + while True: + num_ids = 0 + for data["id"] in text.extract_iter(page, '" id="p', '"'): + num_ids += 1 + yield from self._api_request(data) + + if num_ids < self.per_page: + return + params["pid"] += self.per_page + page = self.request(url, params=params).text class GelbooruPostExtractor(GelbooruBase, diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 35a3448..8214614 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -21,6 +21,9 @@ class GelbooruV02Extractor(booru.BooruExtractor): def __init__(self, match): booru.BooruExtractor.__init__(self, match) + self.api_key = self.config("api-key") + self.user_id = self.config("user-id") + try: self.api_root = INSTANCES[self.category]["api_root"] except KeyError: @@ -59,6 +62,24 @@ class GelbooruV02Extractor(booru.BooruExtractor): return params["pid"] += 1 + def _pagination_html(self, params): + url = self.root + "/index.php" + params["pid"] = self.page_start * self.per_page + + data = {} + while True: + num_ids = 0 + page = self.request(url, params=params).text + + for data["id"] in text.extract_iter(page, '" id="p', '"'): + num_ids += 1 + for post in self._api_request(data): + yield post.attrib + + if num_ids < self.per_page: + return + params["pid"] += self.per_page + @staticmethod def _prepare(post): post["date"] = text.parse_datetime( @@ -204,7 +225,12 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor): def __init__(self, match): GelbooruV02Extractor.__init__(self, match) self.pool_id = match.group(match.lastindex) - self.post_ids = () + + if 
self.category == "rule34": + self.posts = self._posts_pages + self.per_page = 45 + else: + self.post_ids = () def skip(self, num): self.page_start += num @@ -232,6 +258,13 @@ class GelbooruV02PoolExtractor(GelbooruV02Extractor): for post in self._api_request(params): yield post.attrib + def _posts_pages(self): + return self._pagination_html({ + "page": "pool", + "s" : "show", + "id" : self.pool_id, + }) + class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): subcategory = "favorite" @@ -265,27 +298,11 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor): return {"favorite_id": text.parse_int(self.favorite_id)} def posts(self): - url = self.root + "/index.php" - params = { + return self._pagination_html({ "page": "favorites", "s" : "view", "id" : self.favorite_id, - "pid" : self.page_start * self.per_page, - } - - data = {} - while True: - num_ids = 0 - page = self.request(url, params=params).text - - for data["id"] in text.extract_iter(page, '" id="p', '"'): - num_ids += 1 - for post in self._api_request(data): - yield post.attrib - - if num_ids < self.per_page: - return - params["pid"] += self.per_page + }) class GelbooruV02PostExtractor(GelbooruV02Extractor): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index ca7e692..f8b0c3b 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -174,23 +174,27 @@ class HitomiTagExtractor(Extractor): } offset = 0 + total = None while True: headers["Referer"] = "{}/{}/{}.html?page={}".format( self.root, self.type, self.tag, offset // 100 + 1) headers["Range"] = "bytes={}-{}".format(offset, offset+99) - nozomi = self.request(nozomi_url, headers=headers).content + response = self.request(nozomi_url, headers=headers) - for gallery_id in decode_nozomi(nozomi): + for gallery_id in decode_nozomi(response.content): gallery_url = "{}/galleries/{}.html".format( self.root, gallery_id) yield Message.Queue, gallery_url, data - if len(nozomi) < 100: - return offset += 100 + if total is None: + total = text.parse_int( + response.headers["content-range"].rpartition("/")[2]) + if offset >= total: + return -@memcache() +@memcache(maxage=1800) def _parse_gg(extr): page = extr.request("https://ltn.hitomi.la/gg.js").text diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 4a2c3bb..d56af8b 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -270,6 +270,7 @@ class InstagramExtractor(Extractor): "post_shortcode": post["code"], "likes": post["like_count"], "pinned": post.get("timeline_pinned_user_ids", ()), + "date": text.parse_timestamp(post.get("taken_at")), } caption = post["caption"] @@ -399,6 +400,8 @@ class InstagramExtractor(Extractor): self.log.debug("Cursor: %s", self._cursor) def _pagination_api(self, endpoint, params=None): + if params is None: + params = {} while True: data = self._request_api(endpoint, params=params) yield from data["items"] @@ -509,7 +512,7 @@ class InstagramChannelExtractor(InstagramExtractor): class InstagramSavedExtractor(InstagramExtractor): """Extractor for ProfilePage saved media""" subcategory = "saved" - pattern = USER_PATTERN + r"/saved" + pattern = USER_PATTERN + r"/saved/?$" test = ("https://www.instagram.com/instagram/saved/",) def posts(self): @@ -518,6 +521,30 @@ class InstagramSavedExtractor(InstagramExtractor): return self._pagination_graphql(query_hash, variables) +class InstagramCollectionExtractor(InstagramExtractor): + """Extractor for ProfilePage saved collection media""" + 
subcategory = "collection" + pattern = USER_PATTERN + r"/saved/([^/?#]+)/([^/?#]+)" + test = ( + "https://www.instagram.com/instagram/saved/collection_name/123456789/", + ) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.user, self.collection_name, self.collection_id = match.groups() + + def metadata(self): + return { + "collection_id" : self.collection_id, + "collection_name": text.unescape(self.collection_name), + } + + def posts(self): + endpoint = "/v1/feed/collection/{}/posts/".format(self.collection_id) + for item in self._pagination_api(endpoint): + yield item["media"] + + class InstagramTagExtractor(InstagramExtractor): """Extractor for TagPage""" subcategory = "tag" diff --git a/gallery_dl/extractor/itaku.py b/gallery_dl/extractor/itaku.py index 6b2cf4c..00a32cd 100644 --- a/gallery_dl/extractor/itaku.py +++ b/gallery_dl/extractor/itaku.py @@ -101,9 +101,9 @@ class ItakuImageExtractor(ItakuExtractor): "/gallery_imgs/220504_oUNIAFT/xl.jpg", "liked_by_you": False, "maturity_rating": "SFW", - "num_comments": 2, - "num_likes": 80, - "num_reshares": 2, + "num_comments": int, + "num_likes": int, + "num_reshares": int, "obj_tags": 136446, "owner": 16775, "owner_avatar": "https://d1wmr8tlk3viaj.cloudfront.net" @@ -115,8 +115,9 @@ class ItakuImageExtractor(ItakuExtractor): "tags": list, "tags_character": ["hatsune_miku"], "tags_copyright": ["vocaloid"], - "tags_general" : ["twintails", "green_hair", "flag", "gloves", - "green_eyes", "female", "racing_miku"], + "tags_general" : ["female", "green_eyes", "twintails", + "green_hair", "gloves", "flag", + "racing_miku"], "title": "Racing Miku 2022 Ver.", "too_mature": False, "uncompressed_filesize": "0.62", diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index f1eb79f..816b561 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -440,20 +440,44 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor): class KemonopartyFavoriteExtractor(KemonopartyExtractor): """Extractor for kemono.party favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favorites" - test = ("https://kemono.party/favorites", { - "pattern": KemonopartyUserExtractor.pattern, - "url": "f4b5b796979bcba824af84206578c79101c7f0e1", - "count": 3, - }) + pattern = BASE_PATTERN + r"/favorites(?:/?\?([^#]+))?" 
+ test = ( + ("https://kemono.party/favorites", { + "pattern": KemonopartyUserExtractor.pattern, + "url": "f4b5b796979bcba824af84206578c79101c7f0e1", + "count": 3, + }), + ("https://kemono.party/favorites?type=post", { + "pattern": KemonopartyPostExtractor.pattern, + "url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f", + "count": 3, + }), + ) + + def __init__(self, match): + KemonopartyExtractor.__init__(self, match) + self.favorites = (text.parse_query(match.group(2)).get("type") or + self.config("favorites") or + "artist") def items(self): self._prepare_ddosguard_cookies() self.login() - users = self.request(self.root + "/api/favorites").json() - for user in users: - user["_extractor"] = KemonopartyUserExtractor - url = "{}/{}/user/{}".format( - self.root, user["service"], user["id"]) - yield Message.Queue, url, user + if self.favorites == "artist": + users = self.request( + self.root + "/api/v1/account/favorites?type=artist").json() + for user in users: + user["_extractor"] = KemonopartyUserExtractor + url = "{}/{}/user/{}".format( + self.root, user["service"], user["id"]) + yield Message.Queue, url, user + + elif self.favorites == "post": + posts = self.request( + self.root + "/api/v1/account/favorites?type=post").json() + for post in posts: + post["_extractor"] = KemonopartyPostExtractor + url = "{}/{}/user/{}/post/{}".format( + self.root, post["service"], post["user"], post["id"]) + yield Message.Queue, url, post diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index b5db3dd..57db0c9 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -49,7 +49,9 @@ class LusciousAlbumExtractor(LusciousExtractor): r"/(?:albums|pictures/c/[^/?#]+/album)/[^/?#]+_(\d+)") test = ( ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { - "url": "7e4984a271a1072ac6483e4228a045895aff86f3", + "pattern": r"https://storage\.bhs\.cloud\.ovh\.net/v1/AUTH_\w+" + r"/images/NTRshouldbeillegal/277031" + r"/luscious_net_\d+_\d+\.jpg$", # "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", "keyword": { "album": { diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 6e780e8..493a8ef 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -179,12 +179,11 @@ class MastodonAPI(): try: access_token = INSTANCES[extractor.category]["access-token"] except (KeyError, TypeError): - raise exception.StopExtraction( - "Missing access token.\n" - "Run 'gallery-dl oauth:mastodon:%s' to obtain one.", - extractor.instance) - - self.headers = {"Authorization": "Bearer " + access_token} + pass + if access_token: + self.headers = {"Authorization": "Bearer " + access_token} + else: + self.headers = None def account_id_by_username(self, username): if username.startswith("id:"): @@ -232,6 +231,11 @@ class MastodonAPI(): if code < 400: return response + if code == 401: + raise exception.StopExtraction( + "Invalid or missing access token.\n" + "Run 'gallery-dl oauth:mastodon:%s' to obtain one.", + self.extractor.instance) if code == 404: raise exception.NotFoundError() if code == 429: diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 122ea46..2c8e72c 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -126,7 +126,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): username, password = self._get_auth_info() self._update_cookies(self._login_impl(username, password)) - @cache(maxage=150*24*3600, keyarg=1) + @cache(maxage=90*24*3600, 
keyarg=1) def _login_impl(self, username, password): if not username or not password: raise exception.AuthenticationError( diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 653822f..d6628c4 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -41,7 +41,8 @@ class OAuthBase(Extractor): stdout_write("Waiting for response. (Cancel with Ctrl+c)\n") server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - server.bind(("localhost", self.config("port", 6414))) + server.bind((self.config("host", "localhost"), + self.config("port", 6414))) server.listen(1) # workaround for ctrl+c not working during server.accept on Windows diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index fba1312..225f0ff 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -122,7 +122,7 @@ class PhilomenaPostExtractor(PhilomenaExtractor): "tag_ids": list, "tags": list, "thumbnails_generated": True, - "updated_at": "2022-04-25T09:30:57Z", + "updated_at": r"re:\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ", "uploader": "Clover the Clever", "uploader_id": 211188, "upvotes": int, diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index e1846cc..8203885 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -51,13 +51,13 @@ class PoipikuExtractor(Extractor): thumb = extr('class="IllustItemThumbImg" src="', '"') if not thumb: break - elif thumb.startswith("/img/"): + elif thumb.startswith(("//img.poipiku.com/img/", "/img/")): continue post["num"] += 1 url = text.ensure_http_scheme(thumb[:-8]) yield Message.Url, url, text.nameext_from_url(url, post) - if not extr('</i> show all', '<'): + if not extr('> show all', '<'): continue url = self.root + "/f/ShowAppendFileF.jsp" @@ -131,7 +131,7 @@ class PoipikuPostExtractor(PoipikuExtractor): pattern = BASE_PATTERN + r"/(\d+)/(\d+)" test = ( ("https://poipiku.com/25049/5864576.html", { - "pattern": r"https://img\.poipiku\.com/user_img03/000025049" + "pattern": r"https://img\.poipiku\.com/user_img\d+/000025049" r"/005864576_EWN1Y65gQ\.png$", "keyword": { "count": "1", @@ -146,7 +146,7 @@ class PoipikuPostExtractor(PoipikuExtractor): }, }), ("https://poipiku.com/2166245/6411749.html", { - "pattern": r"https://img\.poipiku\.com/user_img01/002166245" + "pattern": r"https://img\.poipiku\.com/user_img\d+/002166245" r"/006411749_\w+\.jpeg$", "count": 4, "keyword": { diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 6dfc907..cd8c238 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -16,13 +16,14 @@ class SkebExtractor(Extractor): category = "skeb" directory_fmt = ("{category}", "{creator[screen_name]}") filename_fmt = "{post_num}_{file_id}.{extension}" - archive_fmt = "{post_num}_{file_id}_{content_category}" + archive_fmt = "{post_num}_{_file_id}_{content_category}" root = "https://skeb.jp" def __init__(self, match): Extractor.__init__(self, match) self.user_name = match.group(1) self.thumbnails = self.config("thumbnails", False) + self.article = self.config("article", False) def items(self): for user_name, post_num in self.posts(): @@ -64,6 +65,7 @@ class SkebExtractor(Extractor): resp = self.request(url, headers=headers).json() creator = resp["creator"] post = { + "post_id" : resp["id"], "post_num" : post_num, "post_url" : self.root + resp["path"], "body" : resp["body"], @@ -102,12 +104,22 @@ class 
SkebExtractor(Extractor): if self.thumbnails and "og_image_url" in resp: post["content_category"] = "thumb" post["file_id"] = "thumb" + post["_file_id"] = str(resp["id"]) + "t" post["file_url"] = resp["og_image_url"] yield post + if self.article and "article_image_url" in resp: + url = resp["article_image_url"] + if url: + post["content_category"] = "article" + post["file_id"] = "article" + post["_file_id"] = str(resp["id"]) + "a" + post["file_url"] = url + yield post + for preview in resp["previews"]: post["content_category"] = "preview" - post["file_id"] = preview["id"] + post["file_id"] = post["_file_id"] = preview["id"] post["file_url"] = preview["url"] info = preview["information"] post["original"] = { diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index b0b8f3b..506db26 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -59,7 +59,7 @@ class SlidesharePresentationExtractor(GalleryExtractor): # mobile URL (("https://www.slideshare.net" "/mobile/uqudent/introduction-to-fixed-prosthodontics"), { - "url": "59993ad7b0cb93c73011547eedcd02c622649e9d", + "url": "43eda2adf4dd221a251c8df794dfb82649e94647", }), ) @@ -72,14 +72,14 @@ class SlidesharePresentationExtractor(GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) descr = extr('<meta name="description" content="', '"') - title = extr('<span class="j-title-breadcrumb">', '</span>') - published = extr('<div class="metadata-item">', '</div>') comments = extr('content="UserComments:', '"') likes = extr('content="UserLikes:', '"') views = extr('content="UserPageVisits:', '"') + title = extr('<span class="j-title-breadcrumb">', '</span>') + published = extr('<div class="metadata-item">', '</div>') if descr.endswith("…"): - alt_descr = extr('id="slideshow-description-text"', '</p>') + alt_descr = extr('slideshow-description-text"', '</p>') if alt_descr: descr = text.remove_html(alt_descr.partition(">")[2]).strip() diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 98e914e..4010da3 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -111,13 +111,13 @@ class SmugmugImageExtractor(SmugmugExtractor): test = ( ("https://tdm.smugmug.com/Nature/Dove/i-kCsLJT6", { "url": "e6408fd2c64e721fd146130dceb56a971ceb4259", - "keyword": "460a773f5addadd3e216bda346fc524fe4eedc52", + "keyword": "b31a63d07c9c26eb0f79f52d60d171a98938f99b", "content": "ecbd9d7b4f75a637abc8d35319be9ec065a44eb0", }), # video ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", { "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee", - "keyword": "eb74e5cf6780d5152ab8f11b431ec1b17fa8f69b", + "keyword": "4cef98133ace511adc874c9d9abac5817ba0d856", }), ) diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index fcdf18f..545a95b 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -108,7 +108,7 @@ class TapasSeriesExtractor(TapasExtractor): test = ( ("https://tapas.io/series/just-leave-me-be", { "pattern": r"https://\w+\.cloudfront\.net/pc/\w\w/[0-9a-f-]+\.jpg", - "count": 127, + "count": 132, }), ("https://tapas.io/series/yona", { # mature "count": 26, diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index ded7fd1..b694fa0 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -64,6 +64,7 @@ class TumblrExtractor(Extractor): self.inline = self.config("inline", True) self.reblogs = self.config("reblogs", 
True) self.external = self.config("external", False) + self.original = self.config("original", True) if len(self.types) == 1: self.api.posts_type = next(iter(self.types)) @@ -101,8 +102,7 @@ class TumblrExtractor(Extractor): del post["trail"] post["blog"] = blog post["date"] = text.parse_timestamp(post["timestamp"]) - yield Message.Directory, post - post["num"] = 0 + posts = [] if "photos" in post: # type "photo" or "link" photos = post["photos"] @@ -110,18 +110,31 @@ class TumblrExtractor(Extractor): for photo in photos: post["photo"] = photo - photo.update(photo["original_size"]) + + best_photo = photo["original_size"] + for alt_photo in photo["alt_sizes"]: + if (alt_photo["height"] > best_photo["height"] or + alt_photo["width"] > best_photo["width"]): + best_photo = alt_photo + photo.update(best_photo) + + if self.original and "/s2048x3072/" in photo["url"] and ( + photo["width"] == 2048 or photo["height"] == 3072): + photo["url"] = self._original_image(photo["url"]) + del photo["original_size"] del photo["alt_sizes"] - yield self._prepare_image(photo["url"], post) + posts.append( + self._prepare_image(photo["url"], post.copy())) + del post["photo"] url = post.get("audio_url") # type "audio" if url and url.startswith("https://a.tumblr.com/"): - yield self._prepare(url, post) + posts.append(self._prepare(url, post.copy())) url = post.get("video_url") # type "video" if url: - yield self._prepare(_original_video(url), post) + posts.append(self._prepare(_original_video(url), post.copy())) if self.inline and "reblog" in post: # inline media # only "chat" posts are missing a "reblog" key in their @@ -129,16 +142,25 @@ class TumblrExtractor(Extractor): body = post["reblog"]["comment"] + post["reblog"]["tree_html"] for url in re.findall('<img src="([^"]+)"', body): url = _original_inline_image(url) - yield self._prepare_image(url, post) + posts.append(self._prepare_image(url, post.copy())) for url in re.findall('<source src="([^"]+)"', body): url = _original_video(url) - yield self._prepare(url, post) + posts.append(self._prepare(url, post.copy())) if self.external: # external links - post["extension"] = None url = post.get("permalink_url") or post.get("url") if url: - yield Message.Queue, url, post + post["extension"] = None + posts.append((Message.Queue, url, post.copy())) + del post["extension"] + + post["count"] = len(posts) + yield Message.Directory, post + + for num, (msg, url, post) in enumerate(posts, 1): + post["num"] = num + post["count"] = len(posts) + yield msg, url, post def posts(self): """Return an iterable containing all relevant posts""" @@ -167,14 +189,12 @@ class TumblrExtractor(Extractor): @staticmethod def _prepare(url, post): text.nameext_from_url(url, post) - post["num"] += 1 post["hash"] = post["filename"].partition("_")[2] return Message.Url, url, post @staticmethod def _prepare_image(url, post): text.nameext_from_url(url, post) - post["num"] += 1 parts = post["filename"].split("_") try: @@ -188,7 +208,7 @@ class TumblrExtractor(Extractor): @staticmethod def _prepare_avatar(url, post, blog): text.nameext_from_url(url, post) - post["num"] = 1 + post["num"] = post["count"] = 1 post["blog"] = blog post["reblogged"] = False post["type"] = post["id"] = post["hash"] = "avatar" @@ -200,6 +220,12 @@ class TumblrExtractor(Extractor): def _skip_reblog_same_blog(self, post): return self.blog != post.get("reblogged_root_uuid") + def _original_image(self, url): + url = url.replace("/s2048x3072/", "/s99999x99999/", 1) + headers = {"Accept": "text/html,*/*;q=0.8"} + response = 
self.request(url, headers=headers) + return text.extract(response.text, '" src="', '"')[0] + class TumblrUserExtractor(TumblrExtractor): """Extractor for all images from a tumblr-user""" @@ -279,6 +305,12 @@ class TumblrPostExtractor(TumblrExtractor): ("https://mikf123.tumblr.com/post/181022380064/chat-post", { "count": 0, }), + ("https://mikf123.tumblr.com/image/689860196535762944", { + "pattern": r"^https://\d+\.media\.tumblr\.com" + r"/134791621559a79793563b636b5fe2c6" + r"/8f1131551cef6e74-bc/s99999x99999" + r"/188cf9b8915b0d0911c6c743d152fc62e8f38491\.png$", + }), ("http://ziemniax.tumblr.com/post/109697912859/", { "exception": exception.NotFoundError, # HTML response (#297) }), diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 36b4806..0df4ea2 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache +import itertools import json BASE_PATTERN = ( @@ -40,7 +41,7 @@ class TwitterExtractor(Extractor): self.quoted = self.config("quoted", False) self.videos = self.config("videos", True) self.cards = self.config("cards", False) - self._user_id = None + self._user = self._user_obj = None self._user_cache = {} self._init_sizes() @@ -90,8 +91,9 @@ class TwitterExtractor(Extractor): if "in_reply_to_user_id_str" in data and ( not self.replies or ( self.replies == "self" and - (self._user_id or data["in_reply_to_user_id_str"]) != - data["user_id_str"] + data["user_id_str"] != + (self._user_obj["rest_id"] if self._user else + data["in_reply_to_user_id_str"]) ) ): self.log.debug("Skipping %s (reply)", data["id_str"]) @@ -229,11 +231,13 @@ class TwitterExtractor(Extractor): files.append({"url": url}) def _transform_tweet(self, tweet): - if "core" in tweet: - user = self._transform_user( - tweet["core"]["user_results"]["result"]) + if "author" in tweet: + author = tweet["author"] + elif "core" in tweet: + author = tweet["core"]["user_results"]["result"] else: - user = self._transform_user(tweet["user"]) + author = tweet["user"] + author = self._transform_user(author) if "legacy" in tweet: tweet = tweet["legacy"] @@ -245,12 +249,13 @@ class TwitterExtractor(Extractor): "retweet_id" : text.parse_int( tget("retweeted_status_id_str")), "quote_id" : text.parse_int( - tget("quoted_status_id_str")), + tget("quoted_by_id_str")), "reply_id" : text.parse_int( tget("in_reply_to_status_id_str")), "date" : text.parse_datetime( tweet["created_at"], "%a %b %d %H:%M:%S %z %Y"), - "user" : user, + "user" : self._user or author, + "author" : author, "lang" : tweet["lang"], "favorite_count": tget("favorite_count"), "quote_count" : tget("quote_count"), @@ -280,13 +285,8 @@ class TwitterExtractor(Extractor): if "in_reply_to_screen_name" in tweet: tdata["reply_to"] = tweet["in_reply_to_screen_name"] - if "quoted_by_id_str" in tweet: - tdata["quote_by"] = text.parse_int(tweet["quoted_by_id_str"]) - - if "author" in tweet: - tdata["author"] = self._transform_user(tweet["author"]) - else: - tdata["author"] = tdata["user"] + if "quoted_by" in tweet: + tdata["quote_by"] = tweet["quoted_by"] return tdata @@ -336,6 +336,10 @@ class TwitterExtractor(Extractor): return udata + def _assign_user(self, user): + self._user_obj = user + self._user = self._transform_user(user) + def _users_result(self, users): userfmt = self.config("users") if not userfmt or userfmt == "timeline": @@ -455,33 +459,24 @@ class TwitterTimelineExtractor(TwitterExtractor): tweet = 
@@ -455,33 +459,24 @@ class TwitterTimelineExtractor(TwitterExtractor):
         tweet = None
         for tweet in self._select_tweet_source()(self.user):
             yield tweet
-
         if tweet is None:
             return
 
-        # get username
-        if not self.user.startswith("id:"):
-            username = self.user
-        elif "core" in tweet:
-            username = (tweet["core"]["user_results"]["result"]
-                        ["legacy"]["screen_name"])
-        else:
-            username = tweet["user"]["screen_name"]
-
-        # get tweet data
-        if "legacy" in tweet:
-            tweet = tweet["legacy"]
-
         # build search query
-        query = "from:{} max_id:{}".format(username, tweet["id_str"])
+        query = "from:{} max_id:{}".format(
+            self._user["name"], tweet["rest_id"])
         if self.retweets:
             query += " include:retweets include:nativeretweets"
+
         if not self.textonly:
-            query += (" (filter:images OR"
-                      " filter:native_video OR"
-                      " card_name:animated_gif)")
+            # try to search for media-only tweets
+            tweet = None
+            for tweet in self.api.search_adaptive(query + " filter:links"):
+                yield tweet
+            if tweet is not None:
+                return
 
-        # yield search results starting from last tweet id
+        # yield unfiltered search results
         yield from self.api.search_adaptive(query)
 
     def _select_tweet_source(self):
@@ -625,7 +620,25 @@ class TwitterSearchExtractor(TwitterExtractor):
         return {"search": text.unquote(self.user)}
 
     def tweets(self):
-        return self.api.search_adaptive(text.unquote(self.user))
+        query = text.unquote(self.user.replace("+", " "))
+
+        user = None
+        for item in query.split():
+            item = item.strip("()")
+            if item.startswith("from:"):
+                if user:
+                    user = None
+                    break
+                else:
+                    user = item[5:]
+
+        if user is not None:
+            try:
+                self._assign_user(self.api.user_by_screen_name(user))
+            except KeyError:
+                pass
+
+        return self.api.search_adaptive(query)
 
 
 class TwitterEventExtractor(TwitterExtractor):
@@ -693,7 +706,7 @@ class TwitterTweetExtractor(TwitterExtractor):
         }),
         ("https://twitter.com/i/web/status/1424898916156284928", {
             "options": (("replies", "self"),),
-            "count": 0,
+            "count": 1,
         }),
         # "quoted" option (#854)
         ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", {
@@ -777,20 +790,38 @@ class TwitterTweetExtractor(TwitterExtractor):
 
     def tweets(self):
         if self.config("conversations", False):
-            return self.api.tweet_detail(self.tweet_id)
+            return self._tweets_conversation(self.tweet_id)
+        else:
+            return self._tweets_single(self.tweet_id)
 
+    def _tweets_single(self, tweet_id):
         tweets = []
-        tweet_id = self.tweet_id
+
         for tweet in self.api.tweet_detail(tweet_id):
             if tweet["rest_id"] == tweet_id or \
                     tweet.get("_retweet_id_str") == tweet_id:
+                self._assign_user(tweet["core"]["user_results"]["result"])
                 tweets.append(tweet)
 
                 tweet_id = tweet["legacy"].get("quoted_status_id_str")
                 if not tweet_id:
                     break
+
         return tweets
 
+    def _tweets_conversation(self, tweet_id):
+        tweets = self.api.tweet_detail(tweet_id)
+        buffer = []
+
+        for tweet in tweets:
+            buffer.append(tweet)
+            if tweet["rest_id"] == tweet_id or \
+                    tweet.get("_retweet_id_str") == tweet_id:
+                self._assign_user(tweet["core"]["user_results"]["result"])
+                break
+
+        return itertools.chain(buffer, tweets)
+
 
 class TwitterImageExtractor(Extractor):
     category = "twitter"
@@ -888,7 +919,6 @@ class TwitterAPI():
         self._nsfw_warning = True
         self._syndication = extractor.config("syndication")
         self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
-        self._user = None
 
         cookies = extractor.session.cookies
         cookiedomain = extractor.cookiedomain
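The `_tweets_conversation()` method above partially consumes a shared iterator: it buffers results only until the requested Tweet is found, so `_assign_user()` runs before anything is handed downstream, then chains the buffer back onto the remainder. A self-contained sketch of that pattern (with `print` standing in for `_assign_user()`):

```python
import itertools

def peek_until(tweets, target_id):
    """Buffer items until the target is seen, then return an iterator
    over everything; nothing is consumed twice or lost."""
    iterator = iter(tweets)
    buffer = []
    for tweet in iterator:
        buffer.append(tweet)
        if tweet["id"] == target_id:
            print("author:", tweet["user"])  # stand-in for _assign_user()
            break
    return itertools.chain(buffer, iterator)

thread = [{"id": 1, "user": "a"}, {"id": 2, "user": "b"}, {"id": 3, "user": "c"}]
print([t["id"] for t in peek_until(thread, 2)])  # author: b, then [1, 2, 3]
```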
@@ -1050,13 +1080,13 @@ def _user_id_by_screen_name(self, screen_name):
         if screen_name.startswith("id:"):
-            self._user = util.SENTINEL
             user_id = screen_name[3:]
+            user = self.user_by_rest_id(user_id)
         else:
             user = ()
             try:
-                user = self._user = self.user_by_screen_name(screen_name)
+                user = self.user_by_screen_name(screen_name)
                 user_id = user["rest_id"]
             except KeyError:
                 if "unavailable_message" in user:
@@ -1066,7 +1096,7 @@
             else:
                 raise exception.NotFoundError("user")
 
-        self.extractor._user_id = user_id
+        self.extractor._assign_user(user)
         return user_id
 
     @cache(maxage=3600)
@@ -1183,7 +1213,7 @@
             if quoted:
                 quoted = quoted.copy()
                 quoted["author"] = users[quoted["user_id_str"]]
-                quoted["user"] = tweet["user"]
+                quoted["quoted_by"] = tweet["user"]["screen_name"]
                 quoted["quoted_by_id_str"] = tweet["id_str"]
                 yield quoted
@@ -1226,17 +1256,10 @@
             except LookupError:
                 extr.log.debug(data)
 
-            if self._user:
-                user = self._user
-                if user is util.SENTINEL:
-                    try:
-                        user = self.user_by_rest_id(variables["userId"])
-                    except KeyError:
-                        raise exception.NotFoundError("user")
-                    user = user.get("legacy")
-                if not user:
-                    pass
-                elif user.get("blocked_by"):
+            user = extr._user_obj
+            if user:
+                user = user["legacy"]
+                if user.get("blocked_by"):
                     if self.headers["x-twitter-auth-type"] and \
                             extr.config("logout"):
                         guest_token = self._guest_token()
@@ -1322,7 +1345,7 @@
                 try:
                     legacy["retweeted_status_id_str"] = \
                         retweet["rest_id"]
-                    legacy["author"] = \
+                    tweet["author"] = \
                         retweet["core"]["user_results"]["result"]
                     if "extended_entities" in retweet["legacy"] and \
                             "extended_entities" not in legacy:
@@ -1336,9 +1359,9 @@
             if "quoted_status_result" in tweet:
                 try:
                     quoted = tweet["quoted_status_result"]["result"]
-                    quoted["legacy"]["author"] = \
-                        quoted["core"]["user_results"]["result"]
-                    quoted["core"] = tweet["core"]
+                    quoted["legacy"]["quoted_by"] = (
+                        tweet["core"]["user_results"]["result"]
+                        ["legacy"]["screen_name"])
+                    quoted["legacy"]["quoted_by_id_str"] = tweet["rest_id"]
                     yield quoted
                 except KeyError:
@@ -1374,10 +1397,14 @@
             if instr["type"] == "TimelineAddEntries":
                 for entry in instr["entries"]:
                     if entry["entryId"].startswith("user-"):
-                        user = (entry["content"]["itemContent"]
-                                ["user_results"]["result"])
-                        if "rest_id" in user:
-                            yield user
+                        try:
+                            user = (entry["content"]["itemContent"]
+                                    ["user_results"]["result"])
+                        except KeyError:
+                            pass
+                        else:
+                            if "rest_id" in user:
+                                yield user
                     elif entry["entryId"].startswith("cursor-bottom-"):
                         cursor = entry["content"]["value"]
             elif instr["type"] == "TimelineTerminateTimeline":
@@ -1439,6 +1466,6 @@
         return {
             "rest_id": tweet["id_str"],
             "legacy" : tweet,
-            "user"   : tweet["user"],
+            "core"   : {"user_results": {"result": tweet["user"]}},
             "_retweet_id_str": retweet_id,
         }
diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py
index c29d730..623ed94 100644
--- a/gallery_dl/extractor/unsplash.py
+++ b/gallery_dl/extractor/unsplash.py
@@ -84,7 +84,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
             "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz",
             "categories": list,
             "color": "#f3c08c",
-            "created_at": "2020-04-08T08:29:42-04:00",
+            "created_at": "2020-04-08T12:29:42Z",
             "date": "dt:2020-04-08 12:29:42",
             "description": "The Island",
             "downloads": int,
@@ -112,7 +112,7 @@ class UnsplashImageExtractor(UnsplashExtractor):
                 },
                 "title": "Beaver Dam, WI 53916, USA"
             },
-            "promoted_at": "2020-04-08T11:12:03-04:00",
+            "promoted_at": "2020-04-08T15:12:03Z",
             "sponsorship": None,
             "tags": list,
             "updated_at": str,
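Back in twitter.py, the `quoted_by`/`quoted_by_id_str` hunks above decorate a quoted Tweet before it is re-yielded, which is what surfaces downstream as the `quote_by` and `quote_id` metadata fields. A simplified illustration (the `quoted_status` lookup is hypothetical; only the assigned keys follow the patch):

```python
def expand_quoted(tweet, users):
    """Return a copy of the Tweet quoted by 'tweet', tagged with who
    quoted it and with the quoting Tweet's ID."""
    quoted = tweet.get("quoted_status")  # hypothetical lookup
    if not quoted:
        return None
    quoted = quoted.copy()
    quoted["author"] = users[quoted["user_id_str"]]     # original poster
    quoted["quoted_by"] = tweet["user"]["screen_name"]  # -> quote_by
    quoted["quoted_by_id_str"] = tweet["id_str"]        # -> quote_id
    return quoted

users = {"42": {"screen_name": "ORIGINAL"}}
tweet = {"id_str": "100", "user": {"screen_name": "QUOTER"},
         "quoted_status": {"user_id_str": "42"}}
print(expand_quoted(tweet, users)["quoted_by"])  # QUOTER
```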
a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -40,12 +40,17 @@ class VkExtractor(Extractor):
                 continue
 
             try:
+                photo["url"] = photo[size + "src"]
+            except KeyError:
+                self.log.warning("no photo URL found (%s)", photo.get("id"))
+                continue
+
+            try:
                 _, photo["width"], photo["height"] = photo[size]
             except ValueError:
                 # photo without width/height entries (#2535)
                 photo["width"] = photo["height"] = 0
 
-            photo["url"] = photo[size + "src"]
             photo["id"] = photo["id"].rpartition("_")[2]
             photo.update(data)
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 756384b..668be0f 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -169,7 +169,7 @@ class VscoCollectionExtractor(VscoExtractor):
         return self._pagination(url, params, tkn, "medias", (
             data["medias"]["byId"][mid["id"]]["media"]
             for mid in data
-            ["collections"]["byCollectionId"][cid]["byPage"]["1"]["collection"]
+            ["collections"]["byId"][cid]["1"]["collection"]
         ))
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 37eab24..0ad8523 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -189,7 +189,7 @@ class WallhavenAPI():
 
     def collections(self, username):
         endpoint = "/v1/collections/" + username
-        return self._pagination(endpoint)
+        return self._pagination(endpoint, metadata=False)
 
     def search(self, params):
         endpoint = "/v1/search"
@@ -200,13 +200,20 @@
         return self.extractor.request(
             url, headers=self.headers, params=params).json()
 
-    def _pagination(self, endpoint, params=None):
+    def _pagination(self, endpoint, params=None, metadata=None):
         if params is None:
             params = {}
+        if metadata is None:
+            metadata = self.extractor.config("metadata")
 
         while True:
             data = self._call(endpoint, params)
-            yield from data["data"]
+
+            if metadata:
+                for wp in data["data"]:
+                    yield self.info(str(wp["id"]))
+            else:
+                yield from data["data"]
 
             meta = data.get("meta")
             if not meta or meta["current_page"] >= meta["last_page"]:
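Schematically, the new `metadata` switch in `WallhavenAPI._pagination()` above trades one extra `/v1/w/<id>` request per wallpaper for the complete `info()` record (`api` stands for a `WallhavenAPI` instance; both methods are the ones shown in the hunk):

```python
def wallpapers(api, endpoint, metadata):
    """Yield paginated results, optionally upgraded to full objects."""
    for wp in api._pagination(endpoint, metadata=False):
        if metadata:
            yield api.info(str(wp["id"]))  # full record, one extra request
        else:
            yield wp                       # plain search/collection entry
```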
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index bdbdc8c..189c0c5 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -99,13 +99,14 @@ class WeiboExtractor(Extractor):
             else:
                 yield pic["largest"].copy()
 
-        if "page_info" in status:
-            page_info = status["page_info"]
-            if "media_info" not in page_info or not self.videos:
-                return
-            media = max(page_info["media_info"]["playback_list"],
-                        key=lambda m: m["meta"]["quality_index"])
-            yield media["play_info"].copy()
+        if "page_info" in status and self.videos:
+            try:
+                media = max(status["page_info"]["media_info"]["playback_list"],
+                            key=lambda m: m["meta"]["quality_index"])
+            except KeyError:
+                pass
+            else:
+                yield media["play_info"].copy()
 
     def _status_by_id(self, status_id):
         url = "{}/ajax/statuses/show?id={}".format(self.root, status_id)
@@ -147,14 +148,17 @@ class WeiboExtractor(Extractor):
                 return
             yield from statuses
 
-            if "next_cursor" in data:
+            if "next_cursor" in data:   # videos, newvideo
                 params["cursor"] = data["next_cursor"]
-            elif "page" in params:
+            elif "page" in params:      # home, article
                 params["page"] += 1
-            elif data["since_id"]:
+            elif data["since_id"]:      # album
                 params["sinceid"] = data["since_id"]
-            else:
-                params["since_id"] = statuses[-1]["id"] - 1
+            else:                       # feed, last album page
+                try:
+                    params["since_id"] = statuses[-1]["id"] - 1
+                except KeyError:
+                    return
 
     def _sina_visitor_system(self, response):
         self.log.info("Sina Visitor System")
@@ -366,6 +370,10 @@ class WeiboStatusExtractor(WeiboExtractor):
             "pattern": r"https://g\.us\.sinaimg.cn/o0/qNZcaAAglx07Wuf921CM0104"
                        r"120005tc0E010\.mp4\?label=gif_mp4",
         }),
+        # missing 'playback_list' (#2792)
+        ("https://weibo.com/2909128931/4409545658754086", {
+            "count": 9,
+        }),
         ("https://m.weibo.cn/status/4339748116375525"),
         ("https://m.weibo.cn/5746766133/4339748116375525"),
     )
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
new file mode 100644
index 0000000..2b5acd8
--- /dev/null
+++ b/gallery_dl/extractor/zerochan.py
@@ -0,0 +1,156 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.zerochan.net/"""
+
+from .booru import BooruExtractor
+from ..cache import cache
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
+
+
+class ZerochanExtractor(BooruExtractor):
+    """Base class for zerochan extractors"""
+    category = "zerochan"
+    root = "https://www.zerochan.net"
+    filename_fmt = "{id}.{extension}"
+    archive_fmt = "{id}"
+    cookiedomain = ".zerochan.net"
+    cookienames = ("z_id", "z_hash")
+
+    def login(self):
+        if not self._check_cookies(self.cookienames):
+            username, password = self._get_auth_info()
+            if username:
+                self._update_cookies(self._login_impl(username, password))
+        # force legacy layout
+        self.session.cookies.set("v3", "0", domain=self.cookiedomain)
+
+    @cache(maxage=90*86400, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        url = self.root + "/login"
+        headers = {
+            "Origin" : self.root,
+            "Referer": url,
+        }
+        data = {
+            "ref"     : "/",
+            "name"    : username,
+            "password": password,
+            "login"   : "Login",
+        }
+
+        response = self.request(url, method="POST", headers=headers, data=data)
+        if not response.history:
+            raise exception.AuthenticationError()
+
+        return response.cookies
+
+    def _parse_entry_page(self, entry_id):
+        url = "{}/{}".format(self.root, entry_id)
+        extr = text.extract_from(self.request(url).text)
+
+        return {
+            "id"    : entry_id,
+            "author": extr('"author": "', '"'),
+            "file_url": extr('"contentUrl": "', '"'),
+            "date"  : text.parse_datetime(extr(
+                '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
+            "width" : extr('"width": "', ' '),
+            "height": extr('"height": "', ' '),
+            "size"  : extr('"contentSize": "', 'B'),
+            "path"  : text.split_html(extr(
+                'class="breadcrumbs', '</p>'))[3::2],
+            "tags"  : extr('alt="Tags: ', '"').split(", ")
+        }
+
+
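`_parse_entry_page()` above leans on `text.extract_from()`, which returns a closure that walks a single string from left to right, consuming up to the next `(begin, end)` pair on each call. A toy example with made-up page content:

```python
from gallery_dl import text

page = '{"author": "alice", "contentUrl": "https://example.org/a.jpg"}'
extr = text.extract_from(page)

print(extr('"author": "', '"'))      # alice
print(extr('"contentUrl": "', '"'))  # https://example.org/a.jpg
print(extr('"missing": "', '"'))     # '' (the default when nothing matches)
```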
+class ZerochanTagExtractor(ZerochanExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    pattern = BASE_PATTERN + r"/(?!\d+$)([^/?#]+)/?(?:\?([^#]+))?"
+    test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", {
+        "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)",
+        "count": "> 24",
+        "keywords": {
+            "extension": r"re:jpg|png",
+            "file_url": "",
+            "filename": r"re:Perth.\(Kantai.Collection\).full.\d+",
+            "height": r"re:^\d+$",
+            "id": r"re:^\d+$",
+            "name": "Perth (Kantai Collection)",
+            "search_tags": "Perth (Kantai Collection)",
+            "size": r"re:^\d+k$",
+            "width": r"re:^\d+$",
+        },
+    })
+
+    def __init__(self, match):
+        ZerochanExtractor.__init__(self, match)
+        self.search_tag, self.query = match.groups()
+
+    def metadata(self):
+        return {"search_tags": text.unquote(
+            self.search_tag.replace("+", " "))}
+
+    def posts(self):
+        url = self.root + "/" + self.search_tag
+        params = text.parse_query(self.query)
+        params["p"] = text.parse_int(params.get("p"), 1)
+
+        while True:
+            page = self.request(url, params=params).text
+            thumbs = text.extract(page, '<ul id="thumbs', '</ul>')[0]
+            extr = text.extract_from(thumbs)
+
+            while True:
+                post = extr('<li class="', '>')
+                if not post:
+                    break
+                yield {
+                    "id"    : extr('href="/', '"'),
+                    "name"  : extr('alt="', '"'),
+                    "width" : extr('title="', 'x'),
+                    "height": extr('', ' '),
+                    "size"  : extr('', 'B'),
+                    "file_url": "https://static." + extr(
+                        '<a href="https://static.', '"'),
+                }
+
+            if 'rel="next"' not in page:
+                break
+            params["p"] += 1
+
+
+class ZerochanImageExtractor(ZerochanExtractor):
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"/(\d+)"
+    test = ("https://www.zerochan.net/2920445", {
+        "pattern": r"https://static\.zerochan\.net/"
+                   r"Perth\.%28Kantai\.Collection%29\.full.2920445\.jpg",
+        "keyword": {
+            "author": "YukinoTokisaki",
+            "date": "dt:2020-04-24 21:33:44",
+            "file_url": str,
+            "filename": "Perth.(Kantai.Collection).full.2920445",
+            "height": "1366",
+            "id": "2920445",
+            "size": "1929k",
+            "width": "1920",
+        },
+    })
+
+    def __init__(self, match):
+        ZerochanExtractor.__init__(self, match)
+        self.image_id = match.group(1)
+
+    def posts(self):
+        return (self._parse_entry_page(self.image_id),)
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index bc4d837..dd32b8a 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -296,12 +296,14 @@ def _parse_maxlen(format_spec, default):
 
 def _parse_join(format_spec, default):
     separator, _, format_spec = format_spec.partition(_SEPARATOR)
-    separator = separator[1:]
+    join = separator[1:].join
     fmt = _build_format_func(format_spec, default)
 
-    def join(obj):
-        return fmt(separator.join(obj))
-    return join
+    def apply_join(obj):
+        if isinstance(obj, str):
+            return fmt(obj)
+        return fmt(join(obj))
+    return apply_join
 
 
 def _parse_replace(format_spec, default):
@@ -379,6 +381,7 @@ _CONVERSIONS = {
     "T": util.datetime_to_timestamp_string,
     "d": text.parse_timestamp,
     "U": text.unescape,
+    "g": text.slugify,
     "S": util.to_string,
     "s": str,
     "r": repr,
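The two formatter changes above, seen from the user side: `:J<separator>/` now joins only lists and passes strings through unchanged, and the new `!g` conversion slugifies its input. Illustrative calls (keys and values are made up):

```python
from gallery_dl import formatter

fmt = formatter.parse("{tags:J, /}")
print(fmt.format_map({"tags": ["a", "b", "c"]}))  # a, b, c  (list: joined)
print(fmt.format_map({"tags": "kept-as-is"}))     # kept-as-is (string: no-op)

print(formatter.parse("{title!g}").format_map({"title": "Hello World!"}))
# hello-world
```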
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 9636bef..7b22b1d 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -35,10 +35,13 @@ class Job():
         self.status = 0
         self.url_key = extr.config("url-metadata")
 
+        path_key = extr.config("path-metadata")
+        path_proxy = output.PathfmtProxy(self)
+
         self._logger_extra = {
             "job"      : self,
             "extractor": extr,
-            "path"     : output.PathfmtProxy(self),
+            "path"     : path_proxy,
             "keywords" : output.KwdictProxy(self),
         }
         extr.log = self._wrap_logger(extr.log)
@@ -58,6 +61,8 @@ class Job():
         kwdict = extr.config("keywords")
         if kwdict:
             self.kwdict.update(kwdict)
+        if path_key:
+            self.kwdict[path_key] = path_proxy
 
         # predicates
         self.pred_url = self._prepare_predicates("image", True)
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index e7c66cf..3017f85 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -73,6 +73,12 @@ class PathfmtProxy():
         pathfmt = object.__getattribute__(self, "job").pathfmt
         return pathfmt.__dict__.get(name) if pathfmt else None
 
+    def __str__(self):
+        pathfmt = object.__getattribute__(self, "job").pathfmt
+        if pathfmt:
+            return pathfmt.path or pathfmt.directory
+        return ""
+
 
 class KwdictProxy():
     __slots__ = ("job",)
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 2d16db8..d9baed3 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -30,6 +30,17 @@ class MetadataPP(PostProcessor):
         elif mode == "tags":
             self.write = self._write_tags
             ext = "txt"
+        elif mode == "modify":
+            self.run = self._run_modify
+            self.fields = {
+                name: formatter.parse(value, None, util.identity).format_map
+                for name, value in options.get("fields").items()
+            }
+            ext = None
+        elif mode == "delete":
+            self.run = self._run_delete
+            self.fields = options.get("fields")
+            ext = None
         else:
             self.write = self._write_json
             self.indent = options.get("indent", 4)
@@ -99,7 +110,7 @@ class MetadataPP(PostProcessor):
             with open(path, "w", encoding="utf-8") as fp:
                 self.write(fp, pathfmt.kwdict)
         except FileNotFoundError:
-            os.makedirs(directory)
+            os.makedirs(directory, exist_ok=True)
             with open(path, "w", encoding="utf-8") as fp:
                 self.write(fp, pathfmt.kwdict)
 
@@ -114,6 +125,32 @@ class MetadataPP(PostProcessor):
     def _run_stdout(self, pathfmt):
         self.write(sys.stdout, pathfmt.kwdict)
 
+    def _run_modify(self, pathfmt):
+        kwdict = pathfmt.kwdict
+        for key, func in self.fields.items():
+            obj = kwdict
+            try:
+                while "[" in key:
+                    name, _, key = key.partition("[")
+                    obj = obj[name]
+                    key = key.rstrip("]")
+                obj[key] = func(kwdict)
+            except Exception:
+                pass
+
+    def _run_delete(self, pathfmt):
+        kwdict = pathfmt.kwdict
+        for key in self.fields:
+            obj = kwdict
+            try:
+                while "[" in key:
+                    name, _, key = key.partition("[")
+                    obj = obj[name]
+                    key = key.rstrip("]")
+                del obj[key]
+            except Exception:
+                pass
+
     def _directory(self, pathfmt):
         return pathfmt.realdirectory
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 97ef3ac..79cf016 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -39,6 +39,16 @@ def split_html(txt):
         return []
 
 
+def slugify(value):
+    """Convert a string to a URL slug
+
+    Adapted from:
+    https://github.com/django/django/blob/master/django/utils/text.py
+    """
+    value = re.sub(r"[^\w\s-]", "", str(value).lower())
+    return re.sub(r"[-\s]+", "-", value).strip("-_")
+
+
 def ensure_http_scheme(url, scheme="https://"):
     """Prepend 'scheme' to 'url' if it doesn't have one"""
     if url and not url.startswith(("https://", "http://")):
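For reference, `slugify()` as defined above first strips every character outside `[\w\s-]` from the lowercased input, then collapses runs of whitespace and hyphens into single hyphens and trims `-`/`_` from both ends, which explains its less obvious results:

```python
from gallery_dl import text

print(text.slugify("_-H#e:l#l:o+\t+W?o!rl=d-_"))  # hello-world
print(text.slugify("_Hello_World_"))              # hello_world ('_' is \w)
print(text.slugify(None))                         # none (via str(None))
print(text.slugify(2.3))                          # 23 ('.' is dropped)
```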
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 009ee08..4ba1cba 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -161,13 +161,16 @@ def delete_items(obj, keys):
 
 def enumerate_reversed(iterable, start=0, length=None):
     """Enumerate 'iterable' and return its elements in reverse order"""
-    start -= 1
     if length is None:
         length = len(iterable)
-    return zip(
-        range(length - start, start, -1),
-        reversed(iterable),
-    )
+
+    try:
+        iterable = zip(range(start-1+length, start-1, -1), reversed(iterable))
+    except TypeError:
+        iterable = list(zip(range(start, start+length), iterable))
+        iterable.reverse()
+
+    return iterable
 
 
 def number_to_string(value, numbers=(int, float)):
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 76f879c..d12d088 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.22.4"
+__version__ = "1.23.0"
diff --git a/test/test_formatter.py b/test/test_formatter.py
index aec091a..b335332 100644
--- a/test/test_formatter.py
+++ b/test/test_formatter.py
@@ -58,6 +58,7 @@ class TestFormatter(unittest.TestCase):
         self._run_test("{dt!T}", "1262304000")
         self._run_test("{l!j}", '["a", "b", "c"]')
         self._run_test("{dt!j}", '"2010-01-01 00:00:00"')
+        self._run_test("{a!g}", "hello-world")
 
         with self.assertRaises(KeyError):
             self._run_test("{a!q}", "hello world")
@@ -177,7 +178,7 @@ class TestFormatter(unittest.TestCase):
         self._run_test("{l:J - />20}", "       a - b - c")
 
         self._run_test("{a:J/}" , self.kwdict["a"])
-        self._run_test("{a:J, /}" , ", ".join(self.kwdict["a"]))
+        self._run_test("{a:J, /}" , self.kwdict["a"])
 
     def test_replace(self):
         self._run_test("{a:Rh/C/}" , "CElLo wOrLd")
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index 7a216bb..42babd3 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -339,6 +339,52 @@ class MetadataTest(BasePostprocessorTest):
             {"category": "test", "extension": "ext", "filename": "file"}
             """)
 
+    def test_metadata_modify(self):
+        kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}}
+        self._create({
+            "mode": "modify",
+            "fields": {
+                "foo"     : "{filename}-{foo!s}",
+                "foo2"    : "\fE bar['bax'] + 122",
+                "bar[baz]": "{_now}",
+                "bar[ba2]": "test",
+            },
+        }, kwdict)
+        pdict = self.pathfmt.kwdict
+
+        self.assertIsNot(kwdict, pdict)
+        self.assertEqual(pdict["foo"], kwdict["foo"])
+        self.assertEqual(pdict["bar"], kwdict["bar"])
+
+        self._trigger()
+
+        self.assertEqual(pdict["foo"] , "file-0")
+        self.assertEqual(pdict["foo2"] , 123)
+        self.assertEqual(pdict["bar"]["ba2"], "test")
+        self.assertIsInstance(pdict["bar"]["baz"], datetime)
+
+    def test_metadata_delete(self):
+        kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}}
+        self._create({"mode": "delete", "fields": ["foo", "bar[baz]"]}, kwdict)
+        pdict = self.pathfmt.kwdict
+
+        self.assertIsNot(kwdict, pdict)
+        self.assertEqual(pdict["foo"], kwdict["foo"])
+        self.assertEqual(pdict["bar"], kwdict["bar"])
+
+        del kwdict["foo"]
+        del kwdict["bar"]["baz"]
+
+        self._trigger()
+        self.assertNotIn("foo", pdict)
+        self.assertNotIn("baz", pdict["bar"])
+        self.assertEqual(kwdict["bar"], pdict["bar"])
+
+        self._trigger()
+        self.assertNotIn("foo", pdict)
+        self.assertNotIn("baz", pdict["bar"])
+        self.assertEqual(kwdict["bar"], pdict["bar"])
+
     @staticmethod
     def _output(mock):
         return "".join(
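The `bar[baz]`-style field names exercised by the new tests above are resolved by the `while "[" in key` walk from `_run_modify()`/`_run_delete()`. Pulled out as a standalone helper (illustrative, not code from the patch):

```python
def resolve(kwdict, key):
    """Descend into nested dicts for keys like 'bar[baz]'."""
    obj = kwdict
    while "[" in key:
        name, _, key = key.partition("[")
        obj = obj[name]
        key = key.rstrip("]")
    return obj, key

data = {"foo": 0, "bar": {"baz": 3}}
obj, key = resolve(data, "bar[baz]")
del obj[key]       # what mode "delete" does for the field "bar[baz]"
print(data)        # {'foo': 0, 'bar': {}}
```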
diff --git a/test/test_text.py b/test/test_text.py
index ffed726..0ac7767 100644
--- a/test/test_text.py
+++ b/test/test_text.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -75,6 +75,23 @@ class TestText(unittest.TestCase):
         for value in INVALID:
             self.assertEqual(f(value), empty)
 
+    def test_slugify(self, f=text.slugify):
+        self.assertEqual(f("Hello World"), "hello-world")
+        self.assertEqual(f("-HeLLo---World-"), "hello-world")
+        self.assertEqual(f("_-H#e:l#l:o+\t+W?o!rl=d-_"), "hello-world")
+        self.assertEqual(f("_Hello_World_"), "hello_world")
+
+        self.assertEqual(f(""), "")
+        self.assertEqual(f("-"), "")
+        self.assertEqual(f("--"), "")
+
+        self.assertEqual(f(()), "")
+        self.assertEqual(f([]), "")
+        self.assertEqual(f({}), "")
+        self.assertEqual(f(None), "none")
+        self.assertEqual(f(1), "1")
+        self.assertEqual(f(2.3), "23")
+
     def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
         result = "https://example.org/filename.ext"
diff --git a/test/test_util.py b/test/test_util.py
index 7ab1175..2921ea2 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -15,6 +15,7 @@ import io
 import random
 import string
 import datetime
+import itertools
 import http.cookiejar
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -541,6 +542,41 @@ class TestOther(unittest.TestCase):
         r = util.filter_dict(d)
         self.assertEqual(r, {"foo": 123})
 
+    def test_enumerate_reversed(self):
+
+        seq = [11, 22, 33]
+        result = [(3, 33), (2, 22), (1, 11)]
+
+        def gen():
+            for i in seq:
+                yield i
+
+        def gen_2():
+            yield from seq
+
+        def assertEqual(it1, it2):
+            ae = self.assertEqual
+            for i1, i2 in itertools.zip_longest(it1, it2):
+                ae(i1, i2)
+
+        assertEqual(
+            util.enumerate_reversed(seq), [(2, 33), (1, 22), (0, 11)])
+        assertEqual(
+            util.enumerate_reversed(seq, 1), result)
+        assertEqual(
+            util.enumerate_reversed(seq, 2), [(4, 33), (3, 22), (2, 11)])
+
+        assertEqual(
+            util.enumerate_reversed(gen(), 0, len(seq)),
+            [(2, 33), (1, 22), (0, 11)])
+        assertEqual(
+            util.enumerate_reversed(gen(), 1, len(seq)), result)
+        assertEqual(
+            util.enumerate_reversed(gen_2(), 1, len(seq)), result)
+        assertEqual(
+            util.enumerate_reversed(gen_2(), 2, len(seq)),
+            [(4, 33), (3, 22), (2, 11)])
+
     def test_number_to_string(self, f=util.number_to_string):
         self.assertEqual(f(1) , "1")
         self.assertEqual(f(1.0) , "1.0")
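Finally, the rewritten `util.enumerate_reversed()` tries `len()` and `reversed()` first and falls back to buffering the whole iterable, which is why the generator cases in the tests above must pass an explicit length. Usage sketch:

```python
from gallery_dl import util

pages = ["a", "b", "c"]
print(list(util.enumerate_reversed(pages, 1)))
# [(3, 'c'), (2, 'b'), (1, 'a')]

gen = (p for p in pages)                  # no len(), no reversed()
print(list(util.enumerate_reversed(gen, 1, 3)))
# [(3, 'c'), (2, 'b'), (1, 'a')]  (buffered fallback)
```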
