| field | value |
|---|---|
| author | 2022-12-24 17:14:46 -0500 |
| committer | 2022-12-24 17:14:46 -0500 |
| commit | ebdfcd3cd3f76534a590ba08933ff7ea54813316 (patch) |
| tree | 35db6003766dff695cf8a5aa24f47629b602b7c0 |
| parent | 3338dfce719c999467ffe08fd45663be8190057a (diff) |
New upstream version 1.24.2 (upstream/1.24.2)
57 files changed, 1072 insertions, 524 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76d65cd..700efb1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,40 @@
 # Changelog
 
+## 1.24.2 - 2022-12-18
+### Additions
+- [2chen] support `.club` URLs ([#3406](https://github.com/mikf/gallery-dl/issues/3406))
+- [deviantart] extract sta.sh URLs from `text_content` ([#3366](https://github.com/mikf/gallery-dl/issues/3366))
+- [deviantart] add `/view` URL support ([#3367](https://github.com/mikf/gallery-dl/issues/3367))
+- [e621] implement `threshold` option to control pagination ([#3413](https://github.com/mikf/gallery-dl/issues/3413))
+- [fapello] add `post`, `user` and `path` extractors ([#3065](https://github.com/mikf/gallery-dl/issues/3065), [#3360](https://github.com/mikf/gallery-dl/issues/3360), [#3415](https://github.com/mikf/gallery-dl/issues/3415))
+- [imgur] add support for imgur.io URLs ([#3419](https://github.com/mikf/gallery-dl/issues/3419))
+- [lynxchan] add generic extractors for lynxchan imageboards ([#3389](https://github.com/mikf/gallery-dl/issues/3389), [#3394](https://github.com/mikf/gallery-dl/issues/3394))
+- [mangafox] extract more metadata ([#3167](https://github.com/mikf/gallery-dl/issues/3167))
+- [pixiv] extract `date_url` metadata ([#3405](https://github.com/mikf/gallery-dl/issues/3405))
+- [soundgasm] add `audio` and `user` extractors ([#3384](https://github.com/mikf/gallery-dl/issues/3384), [#3388](https://github.com/mikf/gallery-dl/issues/3388))
+- [webmshare] add `video` extractor ([#2410](https://github.com/mikf/gallery-dl/issues/2410))
+- support Firefox containers for `--cookies-from-browser` ([#3346](https://github.com/mikf/gallery-dl/issues/3346))
+### Fixes
+- [2chen] fix file URLs
+- [bunkr] update domain ([#3391](https://github.com/mikf/gallery-dl/issues/3391))
+- [exhentai] fix pagination
+- [imagetwist] fix extraction
+- [imgth] rewrite
+- [instagram] prevent post `date` overwriting file `date` ([#3392](https://github.com/mikf/gallery-dl/issues/3392))
+- [khinsider] fix metadata extraction
+- [komikcast] update domain and fix extraction
+- [reddit] increase `id-max` default value ([#3397](https://github.com/mikf/gallery-dl/issues/3397))
+- [seiga] raise error when redirected to login page ([#3401](https://github.com/mikf/gallery-dl/issues/3401))
+- [sexcom] fix video URLs ([#3408](https://github.com/mikf/gallery-dl/issues/3408), [#3414](https://github.com/mikf/gallery-dl/issues/3414))
+- [twitter] update `search` pagination ([#544](https://github.com/mikf/gallery-dl/issues/544))
+- [warosu] fix and update
+- [zerochan] update for layout v3
+- restore paths for archived files ([#3362](https://github.com/mikf/gallery-dl/issues/3362), [#3377](https://github.com/mikf/gallery-dl/issues/3377))
+- use `util.NONE` as `keyword-default` default value ([#3334](https://github.com/mikf/gallery-dl/issues/3334))
+### Removals
+- [foolslide] remove `kireicake`
+- [kissgoddess] remove module
+
 ## 1.24.1 - 2022-12-04
 ### Additions
 - [artstation] add `pro-first` option ([#3273](https://github.com/mikf/gallery-dl/issues/3273))
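The Firefox container item above pairs with a fourth entry in the `cookies` config value (documented in the man-page hunk further down). A minimal sketch, not part of this diff, of selecting a container programmatically with the same `config.set` call the `gallery_dl/__init__.py` hunk below ends up making; the container names are examples:

```python
# Sketch only: mirrors config.set((), "cookies", (browser, profile,
# keyring, container)) from the gallery_dl/__init__.py hunk below.
from gallery_dl import config

# Same effect as: gallery-dl --cookies-from-browser firefox::Personal
# profile and keyring stay unset; "Personal" is an example container name.
config.set((), "cookies", ("firefox", None, None, "Personal"))

# "none" restricts loading to cookies that belong to no container at all.
config.set((), "cookies", ("firefox", None, None, "none"))
```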
diff --git a/PKG-INFO b/PKG-INFO
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.24.1
+Version: 1.24.2
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -103,8 +103,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.1/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.1/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.2/gallery-dl.exe>`__
+  (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.2/gallery-dl.bin>`__
 
 | Executables build from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -66,8 +66,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.1/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.1/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.2/gallery-dl.exe>`__
+  (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.2/gallery-dl.bin>`__
 
 | Executables build from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl
index eb5c0f4..13ee2ea 100644
--- a/data/completion/_gallery-dl
+++ b/data/completion/_gallery-dl
@@ -16,7 +16,7 @@ _arguments -C -S \
 --user-agent'[User-Agent request header]':'<ua>' \
 --clear-cache'[Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)]':'<module>' \
 --cookies'[File to load additional cookies from]':'<file>':_files \
---cookies-from-browser'[Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"]':'<browser[+keyring][:profile]>' \
+--cookies-from-browser'[Name of the browser to load cookies from, with optional keyring name prefixed with "+", profile prefixed with ":", and container prefixed with "::" ("none" for no container)]':'<browser[+keyring][:profile][::container]>' \
 {-q,--quiet}'[Activate quiet mode]' \
 {-v,--verbose}'[Print various debugging information]' \
 {-g,--get-urls}'[Print URLs instead of downloading]' \
diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish
index 87e625a..50ad132 100644
--- a/data/completion/gallery-dl.fish
+++ b/data/completion/gallery-dl.fish
@@ -10,7 +10,7 @@ complete -c gallery-dl -x -l 'source-address' -d 'Client-side IP address to bind
 complete -c gallery-dl -x -l 'user-agent' -d 'User-Agent request header'
 complete -c gallery-dl -x -l 'clear-cache' -d 'Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything)'
 complete -c gallery-dl -r -F -l 'cookies' -d 'File to load additional cookies from'
-complete -c gallery-dl -x -l 'cookies-from-browser' -d 'Name of the browser to load cookies from, with optional keyring name prefixed with "+" and profile prefixed with ":"'
+complete -c gallery-dl -x -l 'cookies-from-browser' -d 'Name of the browser to load cookies from, with optional keyring name prefixed with "+", profile prefixed with ":", and container prefixed with "::" ("none" for no container)'
 complete -c gallery-dl -s 'q' -l 'quiet' -d 'Activate quiet mode'
 complete -c gallery-dl -s 'v' -l 'verbose' -d 'Print various debugging information'
 complete -c gallery-dl -s 'g' -l 'get-urls' -d 'Print URLs instead of downloading'
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index aac3757..d85b1c9 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2022-12-04" "1.24.1" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2022-12-18" "1.24.2" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -50,8 +50,8 @@ Delete cached login sessions, cookies, etc. for MODULE (ALL to delete everything
 .B "\-\-cookies" \f[I]FILE\f[]
 File to load additional cookies from
 .TP
-.B "\-\-cookies\-from\-browser" \f[I]BROWSER[+KEYRING][:PROFILE]\f[]
-Name of the browser to load cookies from, with optional keyring name prefixed with '+' and profile prefixed with ':'
+.B "\-\-cookies\-from\-browser" \f[I]BROWSER[+KEYRING][:PROFILE][::CONTAINER]\f[]
+Name of the browser to load cookies from, with optional keyring name prefixed with '+', profile prefixed with ':', and container prefixed with '::' ('none' for no container)
 .TP
 .B "\-q, \-\-quiet"
 Activate quiet mode
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 6565e96..36b2c84 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2022-12-04" "1.24.1" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2022-12-18" "1.24.2" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -519,7 +519,7 @@ Source to read additional cookies from. This can be
 }
 .br
 
-* A \f[I]list\f[] with up to 3 entries specifying a browser profile.
+* A \f[I]list\f[] with up to 4 entries specifying a browser profile.
 .br
 
 * The first entry is the browser name
@@ -527,10 +527,13 @@ Source to read additional cookies from. This can be
 * The optional second entry is a profile name or an absolute path to a profile directory
 .br
 * The optional third entry is the keyring to retrieve passwords for decrypting cookies from
+.br
+* The optional fourth entry is a (Firefox) container name (\f[I]"none"\f[] for only cookies with no container)
 
 .. code:: json
 
 ["firefox"]
+["firefox", null, null, "Personal"]
 ["chromium", "Private", "kwallet"]
@@ -1121,7 +1124,7 @@ Download embedded videos hosted on https://www.blogger.com/
 \f[I]string\f[]
 
 .IP "Default:" 9
-\f[I]"auto"\f[]
+\f[I]null\f[]
 
 .IP "Example:" 4
 "cyberdrop.to"
@@ -1158,6 +1161,24 @@ Extract additional metadata (notes, artist commentary, parent, children)
 
 Note: This requires 1 additional HTTP request for each post.
 
+.SS extractor.danbooru.threshold
+.IP "Type:" 6
+\f[I]string\f[] or \f[I]int\f[]
+
+.IP "Default:" 9
+\f[I]"auto"\f[]
+
+.IP "Description:" 4
+Stop paginating over API results if the length of a batch of returned
+posts is less than the specified number. Defaults to the per-page limit
+of the current instance, which is 320 for \f[I]e621\f[] and 200 for
+everything else.
+
+Note: Changing this setting is normally not necessary. When the value is
+greater than the per-page limit, gallery-dl will stop after the first
+batch. The value cannot be less than 1.
+
+
 .SS extractor.danbooru.ugoira
 .IP "Type:" 6
 \f[I]bool\f[]
@@ -1961,7 +1982,7 @@ the first in the list gets chosen (usually mp3).
 \f[I]string\f[]
 
 .IP "Default:" 9
-\f[I]"auto"\f[]
+\f[I]null\f[]
 
 .IP "Description:" 4
 Specifies the domain used by a \f[I]lolisafe\f[] extractor
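The `threshold` description above fully determines the stop condition; a minimal standalone sketch of that cutoff follows, assuming a hypothetical `fetch_page` callable that performs one API request (the real logic lives in the `danbooru.py` hunk later in this diff):

```python
def paginate(fetch_page, per_page=200, threshold=None):
    """Yield posts until a batch comes back shorter than `threshold`.

    per_page mirrors the instance limits named above (320 for e621,
    200 otherwise); threshold=None corresponds to the "auto" default.
    """
    if not isinstance(threshold, int):
        threshold = per_page        # "auto": stop on any short batch
    elif threshold < 1:
        threshold = 1               # the value cannot be less than 1

    pagenum = 1
    while True:
        posts = fetch_page(pagenum)
        yield from posts
        if len(posts) < threshold:  # short batch -> no further pages
            return
        pagenum += 1
```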
diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf
index 6b12721..98974e9 100644
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@@ -55,7 +55,7 @@
         },
         "cyberdrop":
         {
-            "domain": "auto"
+            "domain": null
         },
         "danbooru":
         {
@@ -251,8 +251,8 @@
             "date-min": 0,
             "date-max": 253402210800,
             "date-format": "%Y-%m-%dT%H:%M:%S",
-            "id-min": "0",
-            "id-max": "zik0zj",
+            "id-min": null,
+            "id-max": null,
             "recursion": 0,
             "videos": true
         },
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 3b3201e..03c1930 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.24.1
+Version: 1.24.2
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -103,8 +103,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.1/gallery-dl.exe>`__
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.1/gallery-dl.bin>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.24.2/gallery-dl.exe>`__
+  (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.24.2/gallery-dl.bin>`__
 
 | Executables build from the latest commit can be found at
 | https://github.com/mikf/gallery-dl/actions/workflows/executables.yml
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index ffa0e95..556dc49 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -74,6 +74,7 @@ gallery_dl/extractor/fallenangels.py
 gallery_dl/extractor/fanbox.py
 gallery_dl/extractor/fantia.py
 gallery_dl/extractor/fapachi.py
+gallery_dl/extractor/fapello.py
 gallery_dl/extractor/flickr.py
 gallery_dl/extractor/foolfuuka.py
 gallery_dl/extractor/foolslide.py
@@ -112,14 +113,13 @@ gallery_dl/extractor/kabeuchi.py
 gallery_dl/extractor/keenspot.py
 gallery_dl/extractor/kemonoparty.py
 gallery_dl/extractor/khinsider.py
-gallery_dl/extractor/kissgoddess.py
-gallery_dl/extractor/kohlchan.py
 gallery_dl/extractor/komikcast.py
 gallery_dl/extractor/lightroom.py
 gallery_dl/extractor/lineblog.py
 gallery_dl/extractor/livedoor.py
 gallery_dl/extractor/lolisafe.py
 gallery_dl/extractor/luscious.py
+gallery_dl/extractor/lynxchan.py
 gallery_dl/extractor/mangadex.py
 gallery_dl/extractor/mangafox.py
 gallery_dl/extractor/mangahere.py
@@ -176,6 +176,7 @@ gallery_dl/extractor/skeb.py
 gallery_dl/extractor/slickpic.py
 gallery_dl/extractor/slideshare.py
 gallery_dl/extractor/smugmug.py
+gallery_dl/extractor/soundgasm.py
 gallery_dl/extractor/speakerdeck.py
 gallery_dl/extractor/subscribestar.py
 gallery_dl/extractor/tapas.py
@@ -197,6 +198,7 @@ gallery_dl/extractor/wallhaven.py
 gallery_dl/extractor/wallpapercave.py
 gallery_dl/extractor/warosu.py
 gallery_dl/extractor/weasyl.py
+gallery_dl/extractor/webmshare.py
 gallery_dl/extractor/webtoons.py
 gallery_dl/extractor/weibo.py
 gallery_dl/extractor/wikiart.py
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 3701d6f..611b2b9 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -66,7 +66,12 @@ def main():
     if args.cookies_from_browser:
         browser, _, profile = args.cookies_from_browser.partition(":")
         browser, _, keyring = browser.partition("+")
-        config.set((), "cookies", (browser, profile, keyring))
+        if profile.startswith(":"):
+            container = profile[1:]
+            profile = None
+        else:
+            profile, _, container = profile.partition("::")
+        config.set((), "cookies", (browser, profile, keyring, container))
     for opts in args.options:
         config.set(*opts)
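To make the partition logic above concrete, here is a standalone walk-through with the same steps pulled into a hypothetical `parse_spec` helper; the expected tuples follow directly from the code (this sketch is not part of the diff):

```python
def parse_spec(value):
    # identical sequence of partitions as in main() above
    browser, _, profile = value.partition(":")
    browser, _, keyring = browser.partition("+")
    if profile.startswith(":"):               # "browser::container"
        container = profile[1:]
        profile = None
    else:
        profile, _, container = profile.partition("::")
    return browser, profile, keyring, container

assert parse_spec("firefox") == ("firefox", "", "", "")
assert parse_spec("firefox::Personal") == ("firefox", None, "", "Personal")
assert parse_spec("firefox:work::Shopping") == ("firefox", "work", "", "Shopping")
assert parse_spec("chromium+kwallet:Private") == ("chromium", "Private", "kwallet", "")
```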
+ uid = "%userContextId={}".format(container_id) + parameters = (uid, uid + "&%") + + set_cookie = cookiejar.set_cookie for name, value, domain, path, secure, expires in db.execute( - "SELECT name, value, host, path, isSecure, expiry " - "FROM moz_cookies"): + sql, parameters): set_cookie(Cookie( 0, name, value, None, False, domain, bool(domain), domain.startswith("."), @@ -79,9 +90,10 @@ def load_cookies_safari(cookiejar, profile=None): def load_cookies_chrome(cookiejar, browser_name, profile, keyring): config = _get_chromium_based_browser_settings(browser_name) + path = _chrome_cookies_database(profile, config) + logger.debug("Extracting cookies from %s", path) - with _chrome_cookies_database(profile, config) as db: - + with DatabaseCopy(path) as db: db.text_factory = bytes decryptor = get_cookie_decryptor( config["directory"], config["keyring"], keyring=keyring) @@ -134,8 +146,8 @@ def load_cookies_chrome(cookiejar, browser_name, profile, keyring): # -------------------------------------------------------------------- # firefox -def _firefox_cookies_database(profile=None): - if profile is None: +def _firefox_cookies_database(profile=None, container=None): + if not profile: search_root = _firefox_browser_directory() elif _is_path(profile): search_root = profile @@ -146,14 +158,45 @@ def _firefox_cookies_database(profile=None): if path is None: raise FileNotFoundError("Unable to find Firefox cookies database in " "{}".format(search_root)) - logger.debug("Extracting cookies from %s", path) - return DatabaseCopy(path) + + if container == "none": + container_id = False + logger.debug("Only loading cookies not belonging to any container") + + elif container: + containers_path = os.path.join( + os.path.dirname(path), "containers.json") + + try: + with open(containers_path) as containers: + identities = json.load(containers)["identities"] + except OSError: + logger.error("Unable to read Firefox container database at %s", + containers_path) + raise + except KeyError: + identities = () + + for context in identities: + if container == context.get("name") or container == text.extr( + context.get("l10nID", ""), "userContext", ".label"): + container_id = context["userContextId"] + break + else: + raise ValueError("Unable to find Firefox container {}".format( + container)) + logger.debug("Only loading cookies from container '%s' (ID %s)", + container, container_id) + else: + container_id = None + + return path, container_id def _firefox_browser_directory(): if sys.platform in ("win32", "cygwin"): - return os.path.expandvars(R"%APPDATA%\Mozilla\Firefox\Profiles") + return os.path.expandvars(r"%APPDATA%\Mozilla\Firefox\Profiles") if sys.platform == "darwin": return os.path.expanduser("~/Library/Application Support/Firefox") return os.path.expanduser("~/.mozilla/firefox") @@ -237,7 +280,7 @@ def _safari_parse_cookies_record(data, cookiejar): cookiejar.set_cookie(Cookie( 0, name, value, None, False, - domain, bool(domain), domain.startswith('.'), + domain, bool(domain), domain.startswith("."), path, bool(path), is_secure, expiration_date, False, None, None, {}, )) @@ -265,9 +308,7 @@ def _chrome_cookies_database(profile, config): if path is None: raise FileNotFoundError("Unable to find {} cookies database in " "'{}'".format(config["browser"], search_root)) - - logger.debug("Extracting cookies from %s", path) - return DatabaseCopy(path) + return path def _get_chromium_based_browser_settings(browser_name): @@ -937,11 +978,12 @@ def _is_path(value): return os.path.sep in value -def 
diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py
index 76a085a..d9674d8 100644
--- a/gallery_dl/extractor/2chen.py
+++ b/gallery_dl/extractor/2chen.py
@@ -17,18 +17,22 @@ class _2chenThreadExtractor(Extractor):
     directory_fmt = ("{category}", "{board}", "{thread} {title}")
     filename_fmt = "{time} {filename}.{extension}"
     archive_fmt = "{board}_{thread}_{hash}_{time}"
-    root = "https://2chen.moe"
-    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)"
+    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)/(\d+)"
     test = (
         ("https://2chen.moe/tv/496715", {
+            "pattern": r"https://2chen\.su/assets/images/src/\w{40}\.\w+$",
             "count": ">= 179",
         }),
+        ("https://2chen.club/tv/1", {
+            "count": 5,
+        }),
         # 404
         ("https://2chen.moe/jp/303786"),
     )
 
     def __init__(self, match):
         Extractor.__init__(self, match)
+        self.root = text.root_from_url(match.group(0))
         self.board, self.thread = match.groups()
 
     def items(self):
@@ -36,13 +40,19 @@ class _2chenThreadExtractor(Extractor):
         page = self.request(url, encoding="utf-8", notfound="thread").text
         data = self.metadata(page)
         yield Message.Directory, data
+
         for post in self.posts(page):
-            if not post["url"]:
+
+            url = post["url"]
+            if not url:
                 continue
+            if url[0] == "/":
+                url = self.root + url
+            post["url"] = url = url.partition("?")[0]
+
             post.update(data)
-            post["url"] = self.root + post["url"]
             post["time"] = text.parse_int(post["date"].timestamp())
-            yield Message.Url, post["url"], text.nameext_from_url(
+            yield Message.Url, url, text.nameext_from_url(
                 post["filename"], post)
 
     def metadata(self, page):
@@ -78,18 +88,19 @@ class _2chenBoardExtractor(Extractor):
     """Extractor for 2chen boards"""
     category = "2chen"
     subcategory = "board"
-    root = "https://2chen.moe"
-    pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)(?:/catalog|/?$)"
+    pattern = r"(?:https?://)?2chen\.(?:moe|club)/([^/?#]+)(?:/catalog|/?$)"
     test = (
         ("https://2chen.moe/co/", {
             "pattern": _2chenThreadExtractor.pattern
         }),
         ("https://2chen.moe/co"),
-        ("https://2chen.moe/co/catalog")
+        ("https://2chen.club/tv"),
+        ("https://2chen.moe/co/catalog"),
     )
 
     def __init__(self, match):
         Extractor.__init__(self, match)
+        self.root = text.root_from_url(match.group(0))
         self.board = match.group(1)
 
     def items(self):
diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py
index 28acc3d..f86691d 100644
--- a/gallery_dl/extractor/35photo.py
+++ b/gallery_dl/extractor/35photo.py
@@ -146,6 +146,7 @@ class _35photoTagExtractor(_35photoExtractor):
     test = ("https://35photo.pro/tags/landscape/", {
         "range": "1-25",
         "count": 25,
+        "archive": False,
     })
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index 1e020c2..0e128c3 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -92,8 +92,8 @@ class _8chanThreadExtractor(_8chanExtractor):
             "uniquePosters": 9,
             "usesCustomCss": True,
             "usesCustomJs": False,
-            "wsPort": 8880,
-            "wssPort": 2087,
+            "?wsPort": 8880,
+            "?wssPort": 2087,
         },
     }),
("https://8chan.se/vhs/res/4.html"), diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index d2bbcbb..444075c 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -39,6 +39,7 @@ modules = [ "fallenangels", "fanbox", "fantia", + "fapello", "fapachi", "flickr", "furaffinity", @@ -74,13 +75,12 @@ modules = [ "keenspot", "kemonoparty", "khinsider", - "kissgoddess", - "kohlchan", "komikcast", "lightroom", "lineblog", "livedoor", "luscious", + "lynxchan", "mangadex", "mangafox", "mangahere", @@ -131,6 +131,7 @@ modules = [ "slickpic", "slideshare", "smugmug", + "soundgasm", "speakerdeck", "subscribestar", "tapas", @@ -151,6 +152,7 @@ modules = [ "wallpapercave", "warosu", "weasyl", + "webmshare", "webtoons", "weibo", "wikiart", diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 7e9a422..882c2b3 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://bunkr.is/""" +"""Extractors for https://bunkr.ru/""" from .lolisafe import LolisafeAlbumExtractor from .. import text @@ -14,13 +14,13 @@ import json class BunkrAlbumExtractor(LolisafeAlbumExtractor): - """Extractor for bunkr.is albums""" + """Extractor for bunkr.ru albums""" category = "bunkr" - root = "https://bunkr.is" - pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:is|to)/a/([^/?#]+)" + root = "https://bunkr.ru" + pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:ru|is|to)/a/([^/?#]+)" test = ( - ("https://bunkr.is/a/Lktg9Keq", { - "pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png", + ("https://bunkr.ru/a/Lktg9Keq", { + "pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { "album_id": "Lktg9Keq", @@ -34,64 +34,46 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): }), # mp4 (#2239) ("https://app.bunkr.is/a/ptRHaCn2", { - "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4", + "pattern": r"https://media-files\.bunkr\.ru/_-RnHoW69L\.mp4", "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471", }), # cdn4 ("https://bunkr.is/a/iXTTc1o2", { - "pattern": r"https://(cdn|media-files)4\.bunkr\.is/", + "pattern": r"https://(cdn|media-files)4\.bunkr\.ru/", "content": "da29aae371b7adc8c5ef8e6991b66b69823791e8", }), ("https://bunkr.to/a/Lktg9Keq"), ) def fetch_album(self, album_id): - if "//app." 
in self.root: - return self._fetch_album_api(album_id) - else: - return self._fetch_album_site(album_id) - - def _fetch_album_api(self, album_id): - files, data = LolisafeAlbumExtractor.fetch_album(self, album_id) - - for file in files: - url = file["file"] - if url.endswith(".mp4"): - file["file"] = url.replace( - "//cdn.bunkr.is/", "//media-files.bunkr.is/", 1) - else: - file["_fallback"] = (url.replace("//cdn.", "//cdn3.", 1),) - - return files, data - - def _fetch_album_site(self, album_id): - url = self.root + "/a/" + self.album_id + root = self.root try: data = json.loads(text.extr( - self.request(url).text, + self.request(root + "/a/" + self.album_id).text, 'id="__NEXT_DATA__" type="application/json">', '<')) album = data["props"]["pageProps"]["album"] files = album["files"] except Exception as exc: - self.log.debug(exc.__class__.__name__, exc) - self.root = self.root.replace("bunkr", "app.bunkr", 1) - return self._fetch_album_api(album_id) - - headers = {"Referer": "https://stream.bunkr.is/"} + self.log.debug("%s: %s", exc.__class__.__name__, exc) + self.root = root.replace("://", "://app.", 1) + files, data = LolisafeAlbumExtractor.fetch_album(self, album_id) + else: + for file in files: + file["file"] = file["cdn"] + "/" + file["name"] + data = { + "album_id" : self.album_id, + "album_name" : text.unescape(album["name"]), + "description": text.unescape(album["description"]), + "count" : len(files), + } + headers = {"Referer": root.replace("://", "://stream.", 1) + "/"} for file in files: - name = file["name"] - cdn = file["cdn"] - if name.endswith((".mp4", ".m4v", ".mov", ".webm", - ".zip", ".rar", ".7z")): - cdn = cdn.replace("//cdn", "//media-files", 1) + if file["file"].endswith( + (".mp4", ".m4v", ".mov", ".webm", ".zip", ".rar", ".7z")): file["_http_headers"] = headers - file["file"] = cdn + "/" + name + file["file"] = file["file"].replace( + "://cdn", "://media-files", 1) - return files, { - "album_id" : self.album_id, - "album_name" : text.unescape(album["name"]), - "description": text.unescape(album["description"]), - "count" : len(files), - } + return files, data diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 5a44780..ef17176 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -41,6 +41,11 @@ class DanbooruExtractor(BaseExtractor): self.ugoira = self.config("ugoira", False) self.external = self.config("external", False) self.extended_metadata = self.config("metadata", False) + threshold = self.config("threshold") + if isinstance(threshold, int): + self.threshold = 1 if threshold < 1 else threshold + else: + self.threshold = self.per_page username, api_key = self._get_auth_info() if username: @@ -126,7 +131,7 @@ class DanbooruExtractor(BaseExtractor): posts = posts["posts"] yield from posts - if len(posts) < self.per_page: + if len(posts) < self.threshold: return if pagenum: diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 45beddf..aa78cfb 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -124,9 +124,20 @@ class DeviantartExtractor(Extractor): deviation["_journal"] = journal["html"] yield self.commit_journal(deviation, journal) - if self.extra: - txt = (deviation.get("description", "") + - deviation.get("_journal", "")) + if not self.extra: + continue + + # ref: https://www.deviantart.com + # /developers/http/v1/20210526/object/editor_text + # the value of "features" is a JSON string with forward + # slashes 
escaped + text_content = \ + deviation["text_content"]["body"]["features"].replace( + "\\/", "/") if "text_content" in deviation else None + for txt in (text_content, deviation.get("description"), + deviation.get("_journal")): + if txt is None: + continue for match in DeviantartStashExtractor.pattern.finditer(txt): url = text.ensure_http_scheme(match.group(0)) deviation["_extractor"] = DeviantartStashExtractor @@ -854,7 +865,9 @@ class DeviantartDeviationExtractor(DeviantartExtractor): """Extractor for single deviations""" subcategory = "deviation" archive_fmt = "g_{_username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)" + pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)" + r"|(?:https?://)?(?:www\.)?deviantart\.com/" + r"(?:view/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)(\d+)") test = ( (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), { "options": (("original", 0),), @@ -896,19 +909,13 @@ class DeviantartDeviationExtractor(DeviantartExtractor): "range": "2-", "count": 4, }), - # video - ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", { - "pattern": r"https://wixmp-.+wixmp.com/v/mp4/.+\.720p\.\w+.mp4", - "keyword": { - "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5", - "extension": "mp4", - "target": { - "duration": 306, - "filesize": 19367585, - "quality": "720p", - "src": str, - }, - } + # sta.sh URL from deviation["text_content"]["body"]["features"] + (("https://www.deviantart.com" + "/cimar-wildehopps/art/Honorary-Vixen-859809305"), { + "options": (("extra", 1),), + "pattern": ("text:<!DOCTYPE html>\n|" + + DeviantartStashExtractor.pattern), + "count": 2, }), # journal ("https://www.deviantart.com/shimoda7/journal/ARTility-583755752", { @@ -920,12 +927,28 @@ class DeviantartDeviationExtractor(DeviantartExtractor): "url": "e2e0044bd255304412179b6118536dbd9bb3bb0e", "pattern": "text:<!DOCTYPE html>\n", }), + # /view/ URLs + ("https://deviantart.com/view/904858796/", { + "content": "8770ec40ad1c1d60f6b602b16301d124f612948f", + }), + ("http://www.deviantart.com/view/890672057", { + "content": "1497e13d925caeb13a250cd666b779a640209236", + }), + ("https://www.deviantart.com/view/706871727", { + "content": "3f62ae0c2fca2294ac28e41888ea06bb37c22c65", + }), + ("https://www.deviantart.com/view/1", { + "exception": exception.NotFoundError, + }), # old-style URLs ("https://shimoda7.deviantart.com" "/art/For-the-sake-of-a-memory-10073852"), ("https://myria-moon.deviantart.com" "/art/Aime-Moi-part-en-vadrouille-261986576"), ("https://zzz.deviantart.com/art/zzz-1234567890"), + # old /view/ URLs from the Wayback Machine + ("https://www.deviantart.com/view.php?id=14864502"), + ("http://www.deviantart.com/view-full.php?id=100842"), ) skip = Extractor.skip @@ -933,11 +956,12 @@ class DeviantartDeviationExtractor(DeviantartExtractor): def __init__(self, match): DeviantartExtractor.__init__(self, match) self.type = match.group(3) - self.deviation_id = match.group(4) + self.deviation_id = match.group(4) or match.group(5) def deviations(self): url = "{}/{}/{}/{}".format( - self.root, self.user, self.type, self.deviation_id) + self.root, self.user or "u", self.type or "art", self.deviation_id) + uuid = text.extract(self._limited_request(url).text, '"deviationUuid\\":\\"', '\\')[0] if not uuid: diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index b4dadc7..ad3f16b 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# 
Copyright 2021 Mike Fährmann +# Copyright 2021-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -92,16 +92,29 @@ class EromeAlbumExtractor(EromeExtractor): """Extractor for albums on erome.com""" subcategory = "album" pattern = BASE_PATTERN + r"/a/(\w+)" - test = ("https://www.erome.com/a/TyFMI7ik", { - "pattern": r"https://s\d+\.erome\.com/\d+/TyFMI7ik/\w+", - "count": 9, - "keyword": { - "album_id": "TyFMI7ik", - "num": int, - "title": "Ryan Ryans", - "user": "xanub", - }, - }) + test = ( + ("https://www.erome.com/a/NQgdlWvk", { + "pattern": r"https://v\d+\.erome\.com/\d+" + r"/NQgdlWvk/j7jlzmYB_480p\.mp4", + "count": 1, + "keyword": { + "album_id": "NQgdlWvk", + "num": 1, + "title": "porn", + "user": "yYgWBZw8o8qsMzM", + }, + }), + ("https://www.erome.com/a/TdbZ4ogi", { + "pattern": r"https://s\d+\.erome\.com/\d+/TdbZ4ogi/\w+", + "count": 6, + "keyword": { + "album_id": "TdbZ4ogi", + "num": int, + "title": "82e78cfbb461ad87198f927fcb1fda9a1efac9ff.", + "user": "yYgWBZw8o8qsMzM", + }, + }), + ) def albums(self): return (self.item,) @@ -110,7 +123,7 @@ class EromeAlbumExtractor(EromeExtractor): class EromeUserExtractor(EromeExtractor): subcategory = "user" pattern = BASE_PATTERN + r"/(?!a/|search\?)([^/?#]+)" - test = ("https://www.erome.com/xanub", { + test = ("https://www.erome.com/yYgWBZw8o8qsMzM", { "range": "1-25", "count": 25, }) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index a546f68..dccc74e 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -117,9 +117,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): r"|/s/([\da-f]{10})/(\d+)-(\d+))") test = ( ("https://exhentai.org/g/1200119/d55c44d3d0/", { + "options": (("original", False),), "keyword": { "cost": int, - "date": "dt:2018-03-18 20:15:00", + "date": "dt:2018-03-18 20:14:00", "eh_category": "Non-H", "expunged": False, "favorites": r"re:^[12]\d$", @@ -150,7 +151,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "uploader": "klorpa", "width": int, }, - "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff", + "content": ("2c68cff8a7ca540a78c36fdbf5fbae0260484f87", + "e9891a4c017ed0bb734cd1efba5cd03f594d31ff"), }), ("https://exhentai.org/g/960461/4f0e369d82/", { "exception": exception.NotFoundError, @@ -159,9 +161,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "exception": exception.AuthorizationError, }), ("https://exhentai.org/s/f68367b4c8/1200119-3", { + "options": (("original", False),), "count": 2, }), ("https://e-hentai.org/s/f68367b4c8/1200119-3", { + "options": (("original", False),), "count": 2, }), ("https://g.e-hentai.org/g/1200119/d55c44d3d0/"), @@ -516,7 +520,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): data["gallery_token"] = gallery.group(3) yield Message.Queue, url + "/", data - next_url = text.extr(page, 'nexturl = "', '"', None) + next_url = text.extr(page, 'nexturl="', '"', None) if next_url is not None: if not next_url: return diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py new file mode 100644 index 0000000..d6fcb4b --- /dev/null +++ b/gallery_dl/extractor/fapello.py @@ -0,0 +1,152 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extractors for https://fapello.com/""" + +from .common import Extractor, Message +from .. import text, exception + + +class FapelloPostExtractor(Extractor): + """Extractor for individual posts on fapello.com""" + category = "fapello" + subcategory = "post" + directory_fmt = ("{category}", "{model}") + filename_fmt = "{model}_{id}.{extension}" + archive_fmt = "{type}_{model}_{id}" + pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)") + test = ( + ("https://fapello.com/carrykey/530/", { + "pattern": (r"https://fapello\.com/content/c/a" + r"/carrykey/1000/carrykey_0530\.jpg"), + "keyword": { + "model": "carrykey", + "id" : 530, + "type" : "photo", + "thumbnail": "", + }, + }), + ("https://fapello.com/vladislava-661/693/", { + "pattern": (r"https://cdn\.fapello\.com/content/v/l" + r"/vladislava-661/1000/vladislava-661_0693\.mp4"), + "keyword": { + "model": "vladislava-661", + "id" : 693, + "type" : "video", + "thumbnail": ("https://fapello.com/content/v/l" + "/vladislava-661/1000/vladislava-661_0693.jpg"), + }, + }), + ("https://fapello.com/carrykey/000/", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.model, self.id = match.groups() + + def items(self): + url = "https://fapello.com/{}/{}/".format(self.model, self.id) + page = text.extr( + self.request(url, allow_redirects=False).text, + 'class="uk-align-center"', "</div>", None) + if page is None: + raise exception.NotFoundError("post") + + data = { + "model": self.model, + "id" : text.parse_int(self.id), + "type" : "video" if 'type="video' in page else "photo", + "thumbnail": text.extr(page, 'poster="', '"'), + } + url = text.extr(page, 'src="', '"') + yield Message.Directory, data + yield Message.Url, url, text.nameext_from_url(url, data) + + +class FapelloModelExtractor(Extractor): + """Extractor for all posts from a fapello model""" + category = "fapello" + subcategory = "model" + pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" + r"/(?!top-(?:likes|followers)|popular_videos" + r"|videos|trending|search/?$)" + r"([^/?#]+)/?$") + test = ( + ("https://fapello.com/hyoon/", { + "pattern": FapelloPostExtractor.pattern, + "range" : "1-50", + "count" : 50, + }), + ("https://fapello.com/kobaebeefboo/"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.model = match.group(1) + + def items(self): + num = 1 + data = {"_extractor": FapelloPostExtractor} + while True: + url = "https://fapello.com/ajax/model/{}/page-{}/".format( + self.model, num) + page = self.request(url).text + if not page: + return + + for url in text.extract_iter(page, '<a href="', '"'): + yield Message.Queue, url, data + num += 1 + + +class FapelloPathExtractor(Extractor): + """Extractor for models and posts from fapello.com paths""" + category = "fapello" + subcategory = "path" + pattern = (r"(?:https?://)?(?:www\.)?fapello\.com" + r"/(?!search/?$)(top-(?:likes|followers)|videos|trending" + r"|popular_videos/[^/?#]+)/?$") + test = ( + ("https://fapello.com/top-likes/", { + "pattern": FapelloModelExtractor.pattern, + "range" : "1-10", + "count" : 10, + }), + ("https://fapello.com/videos/", { + "pattern": FapelloPostExtractor.pattern, + "range" : "1-10", + "count" : 10, + }), + ("https://fapello.com/top-followers/"), + ("https://fapello.com/trending/"), + ("https://fapello.com/popular_videos/twelve_hours/"), + ("https://fapello.com/popular_videos/week/"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) 
+ self.path = match.group(1) + + def items(self): + num = 1 + if self.path in ("top-likes", "top-followers"): + data = {"_extractor": FapelloModelExtractor} + else: + data = {"_extractor": FapelloPostExtractor} + + while True: + page = self.request("https://fapello.com/ajax/{}/page-{}/".format( + self.path, num)).text + if not page: + return + + for item in text.extract_iter( + page, 'uk-transition-toggle">', "</a>"): + yield Message.Queue, text.extr(item, '<a href="', '"'), data + num += 1 diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 81671ec..2290cc2 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -39,10 +39,6 @@ class FoolslideExtractor(BaseExtractor): BASE_PATTERN = FoolslideExtractor.update({ - "kireicake": { - "root": "https://reader.kireicake.com", - "pattern": r"reader\.kireicake\.com", - }, "powermanga": { "root": "https://read.powermanga.org", "pattern": r"read(?:er)?\.powermanga\.org", @@ -64,10 +60,6 @@ class FoolslideChapterExtractor(FoolslideExtractor): archive_fmt = "{id}" pattern = BASE_PATTERN + r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" test = ( - ("https://reader.kireicake.com/read/wonderland/en/1/1/", { - "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e", - "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6", - }), (("https://read.powermanga.org" "/read/one_piece_digital_colour_comics/en/0/75/"), { "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384", @@ -123,10 +115,6 @@ class FoolslideMangaExtractor(FoolslideExtractor): categorytransfer = True pattern = BASE_PATTERN + r"(/series/[^/?#]+)" test = ( - ("https://reader.kireicake.com/series/wonderland/", { - "url": "d067b649af1cc88fa8c8b698fde04a10909fd169", - "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb", - }), (("https://read.powermanga.org" "/series/one_piece_digital_colour_comics/"), { "count": ">= 1", diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index da87b8f..facd3db 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -174,7 +174,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor): pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" test = ( ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { - "content": "5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c", + "content": ("5c6ae9ee13e6d4bc9cb8bdce224c84e67fbfa36c", + "622e80be3f496672c44aab5c47fbc6941c61bc79"), "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", "count": 2, }), diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 6fcfc55..207562a 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -200,7 +200,7 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): return self.request(self.page_url).cookies def get_info(self, page): - url , pos = text.extract(page, 'center;"><img src="', '"') + url , pos = text.extract(page, '<img src="', '"') filename, pos = text.extract(page, ' alt="', '"', pos) return url, filename diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py index 7e4cce4..9ae22a9 100644 --- a/gallery_dl/extractor/imgth.py +++ b/gallery_dl/extractor/imgth.py @@ -1,60 +1,73 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free 
Software Foundation. -"""Extract images from https://imgth.com/""" +"""Extractors for https://imgth.com/""" -from .common import Extractor, Message +from .common import GalleryExtractor from .. import text -class ImgthGalleryExtractor(Extractor): +class ImgthGalleryExtractor(GalleryExtractor): """Extractor for image galleries from imgth.com""" category = "imgth" - subcategory = "gallery" - directory_fmt = ("{category}", "{gallery_id} {title}") - filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" - archive_fmt = "{gallery_id}_{num}" - pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)" - test = ("http://imgth.com/gallery/37/wallpaper-anime", { - "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", - "keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2", - }) + root = "https://imgth.com" + pattern = r"(?:https?://)?(?:www\.)?imgth\.com/gallery/(\d+)" + test = ( + ("https://imgth.com/gallery/37/wallpaper-anime", { + "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", + "pattern": r"https://imgth\.com/images/2009/11/25" + r"/wallpaper-anime_\w+\.jpg", + "keyword": { + "count": 12, + "date": "dt:2009-11-25 18:21:00", + "extension": "jpg", + "filename": r"re:wallpaper-anime_\w+", + "gallery_id": 37, + "num": int, + "title": "Wallpaper anime", + "user": "celebrities", + }, + }), + ("https://www.imgth.com/gallery/37/wallpaper-anime"), + ) def __init__(self, match): - Extractor.__init__(self, match) - self.gid = match.group(1) - self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/" + self.gallery_id = gid = match.group(1) + url = "{}/gallery/{}/g/".format(self.root, gid) + GalleryExtractor.__init__(self, match, url) - def items(self): - page = self.request(self.url_base + "0").text - data = self.metadata(page) - yield Message.Directory, data - for data["num"], url in enumerate(self.images(page), 1): - yield Message.Url, url, text.nameext_from_url(url, data) + def metadata(self, page): + extr = text.extract_from(page) + return { + "gallery_id": text.parse_int(self.gallery_id), + "title": text.unescape(extr("<h1>", "</h1>")), + "count": text.parse_int(extr( + "total of images in this gallery: ", " ")), + "date" : text.parse_datetime( + extr("created on ", " by <") + .replace("th, ", " ", 1).replace("nd, ", " ", 1) + .replace("st, ", " ", 1), "%B %d %Y at %H:%M"), + "user" : text.unescape(extr(">", "<")), + } def images(self, page): - """Yield all image urls for this gallery""" pnum = 0 + while True: thumbs = text.extr(page, '<ul class="thumbnails">', '</ul>') for url in text.extract_iter(thumbs, '<img src="', '"'): - yield "https://imgth.com/images" + url[24:] + path = url.partition("/thumbs/")[2] + yield ("{}/images/{}".format(self.root, path), None) + if '<li class="next">' not in page: return - pnum += 1 - page = self.request(self.url_base + str(pnum)).text - def metadata(self, page): - """Collect metadata for extractor-job""" - return text.extract_all(page, ( - ("title", '<h1>', '</h1>'), - ("count", 'total of images in this gallery: ', ' '), - ("date" , 'created on ', ' by <'), - (None , 'href="/users/', ''), - ("user" , '>', '<'), - ), values={"gallery_id": self.gid})[0] + pnum += 1 + url = "{}/gallery/{}/g/page/{}".format( + self.root, self.gallery_id, pnum) + page = self.request(url).text diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index fd78ce2..42d0a7b 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -12,7 +12,7 @@ from .common import Extractor, Message from .. 
import text, exception -BASE_PATTERN = r"(?:https?://)?(?:www\.|[im]\.)?imgur\.com" +BASE_PATTERN = r"(?:https?://)?(?:www\.|[im]\.)?imgur\.(?:com|io)" class ImgurExtractor(Extractor): @@ -114,7 +114,9 @@ class ImgurImageExtractor(ImgurExtractor): ("https://www.imgur.com/21yMxCS"), # www ("https://m.imgur.com/21yMxCS"), # mobile ("https://imgur.com/zxaY6"), # 5 character key + ("https://imgur.io/zxaY6"), # .io ("https://i.imgur.com/21yMxCS.png"), # direct link + ("https://i.imgur.io/21yMxCS.png"), # direct link .io ("https://i.imgur.com/21yMxCSh.png"), # direct link thumbnail ("https://i.imgur.com/zxaY6.gif"), # direct link (short) ("https://i.imgur.com/zxaY6s.gif"), # direct link (short; thumb) @@ -205,7 +207,8 @@ class ImgurAlbumExtractor(ImgurExtractor): "count": 0, }), ("https://www.imgur.com/a/TcBmP"), # www - ("https://m.imgur.com/a/TcBmP"), # mobile + ("https://imgur.io/a/TcBmP"), # .io + ("https://m.imgur.com/a/TcBmP"), # mobile ) def items(self): @@ -248,6 +251,7 @@ class ImgurGalleryExtractor(ImgurExtractor): }), ("https://imgur.com/t/unmuted/26sEhNr"), ("https://imgur.com/t/cat/qSB8NbN"), + ("https://imgur.io/t/cat/qSB8NbN"), # .io ) def items(self): diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 24ad873..db9f3fb 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -65,6 +65,10 @@ class InstagramExtractor(Extractor): post["count"] = len(files) yield Message.Directory, post + + if "date" in post: + del post["date"] + for file in files: file.update(post) @@ -93,10 +97,6 @@ class InstagramExtractor(Extractor): url = response.url if "/accounts/login/" in url: - if self._username: - self.log.debug("Invalidating cached login session for " - "'%s'", self._username) - _login_impl.invalidate(self._username) page = "login" elif "/challenge/" in url: page = "challenge" @@ -117,11 +117,9 @@ class InstagramExtractor(Extractor): return response def login(self): - self._username = None if not self._check_cookies(self.cookienames): username, password = self._get_auth_info() if username: - self._username = username self._update_cookies(_login_impl(self, username, password)) else: self._logged_in = False diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py index d5cca1c..0c3b002 100644 --- a/gallery_dl/extractor/khinsider.py +++ b/gallery_dl/extractor/khinsider.py @@ -65,7 +65,7 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): "count": text.parse_int(extr("Number of Files: <b>", "<")), "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]), "date" : extr("Date Added: <b>", "<"), - "type" : extr("Album type: <b>", "<"), + "type" : text.remove_html(extr("Album type: <b>", "</b>")), }} def tracks(self, page): diff --git a/gallery_dl/extractor/kissgoddess.py b/gallery_dl/extractor/kissgoddess.py deleted file mode 100644 index 4ec685c..0000000 --- a/gallery_dl/extractor/kissgoddess.py +++ /dev/null @@ -1,82 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2022 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://kissgoddess.com/""" - -from .common import GalleryExtractor, Extractor, Message -from .. 
import text, exception - - -class KissgoddessGalleryExtractor(GalleryExtractor): - """Extractor for image galleries on kissgoddess.com""" - category = "kissgoddess" - root = "https://kissgoddess.com" - pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/album/(\d+)" - test = ("https://kissgoddess.com/album/18285.html", { - "pattern": r"https://pic\.kissgoddess\.com" - r"/gallery/16473/18285/s/\d+\.jpg", - "count": 19, - "keyword": { - "gallery_id": 18285, - "title": "[Young Champion Extra] 2016.02 No.03 菜乃花 安枝瞳 葉月あや", - }, - }) - - def __init__(self, match): - self.gallery_id = match.group(1) - url = "{}/album/{}.html".format(self.root, self.gallery_id) - GalleryExtractor.__init__(self, match, url) - - def metadata(self, page): - return { - "gallery_id": text.parse_int(self.gallery_id), - "title" : text.extr( - page, '<title>', "<")[0].rpartition(" | "), - } - - def images(self, page): - pnum = 1 - - while page: - for url in text.extract_iter(page, "<img src='", "'"): - yield url, None - for url in text.extract_iter(page, "<img data-original='", "'"): - yield url, None - - pnum += 1 - url = "{}/album/{}_{}.html".format( - self.root, self.gallery_id, pnum) - try: - page = self.request(url).text - except exception.HttpError: - return - - -class KissgoddessModelExtractor(Extractor): - """Extractor for all galleries of a model on kissgoddess.com""" - category = "kissgoddess" - subcategory = "model" - root = "https://kissgoddess.com" - pattern = r"(?:https?://)?(?:www\.)?kissgoddess\.com/people/([^./?#]+)" - test = ("https://kissgoddess.com/people/aya-hazuki.html", { - "pattern": KissgoddessGalleryExtractor.pattern, - "count": ">= 7", - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.model = match.group(1) - - def items(self): - url = "{}/people/{}.html".format(self.root, self.model) - page = self.request(url).text - - data = {"_extractor": KissgoddessGalleryExtractor} - for path in text.extract_iter(page, 'thumb"><a href="/album/', '"'): - url = self.root + "/album/" + path - yield Message.Queue, url, data diff --git a/gallery_dl/extractor/kohlchan.py b/gallery_dl/extractor/kohlchan.py deleted file mode 100644 index c96dedc..0000000 --- a/gallery_dl/extractor/kohlchan.py +++ /dev/null @@ -1,78 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://kohlchan.net/""" - -from .common import Extractor, Message -from .. 
import text -import itertools - - -class KohlchanThreadExtractor(Extractor): - """Extractor for Kohlchan threads""" - category = "kohlchan" - subcategory = "thread" - directory_fmt = ("{category}", "{boardUri}", - "{threadId} {subject|message[:50]}") - filename_fmt = "{postId}{num:?-//} {filename}.{extension}" - archive_fmt = "{boardUri}_{postId}_{num}" - pattern = r"(?:https?://)?kohlchan\.net/([^/?#]+)/res/(\d+)" - test = ("https://kohlchan.net/a/res/4594.html", { - "pattern": r"https://kohlchan\.net/\.media/[0-9a-f]{64}(\.\w+)?$", - "count": ">= 80", - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board, self.thread = match.groups() - - def items(self): - url = "https://kohlchan.net/{}/res/{}.json".format( - self.board, self.thread) - thread = self.request(url).json() - thread["postId"] = thread["threadId"] - posts = thread.pop("posts") - - yield Message.Directory, thread - - for post in itertools.chain((thread,), posts): - files = post.pop("files", ()) - if files: - thread.update(post) - for num, file in enumerate(files): - file.update(thread) - file["num"] = num - url = "https://kohlchan.net" + file["path"] - text.nameext_from_url(file["originalName"], file) - yield Message.Url, url, file - - -class KohlchanBoardExtractor(Extractor): - """Extractor for Kohlchan boards""" - category = "kohlchan" - subcategory = "board" - pattern = (r"(?:https?://)?kohlchan\.net" - r"/([^/?#]+)/(?:(?:catalog|\d+)\.html)?$") - test = ( - ("https://kohlchan.net/a/", { - "pattern": KohlchanThreadExtractor.pattern, - "count": ">= 100", - }), - ("https://kohlchan.net/a/2.html"), - ("https://kohlchan.net/a/catalog.html"), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board = match.group(1) - - def items(self): - url = "https://kohlchan.net/{}/catalog.json".format(self.board) - for thread in self.request(url).json(): - url = "https://kohlchan.net/{}/res/{}.html".format( - self.board, thread["threadId"]) - thread["_extractor"] = KohlchanThreadExtractor - yield Message.Queue, url, thread diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index a9eebf4..04373c4 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -6,19 +6,19 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://komikcast.me/""" +"""Extractors for https://komikcast.site/""" from .common import ChapterExtractor, MangaExtractor from .. 
import text import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:me|com)" +BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)" class KomikcastBase(): """Base class for komikcast extractors""" category = "komikcast" - root = "https://komikcast.me" + root = "https://komikcast.site" @staticmethod def parse_chapter_string(chapter_string, data=None): @@ -46,23 +46,23 @@ class KomikcastBase(): class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): - """Extractor for manga-chapters from komikcast.me""" + """Extractor for manga-chapters from komikcast.site""" pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)" test = ( - (("https://komikcast.me/chapter" + (("https://komikcast.site/chapter" "/apotheosis-chapter-02-2-bahasa-indonesia/"), { - "url": "74eca5c9b27b896816497f9b2d847f2a1fcfc209", + "url": "f6b43fbc027697749b3ea1c14931c83f878d7936", "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", }), (("https://komikcast.me/chapter" "/soul-land-ii-chapter-300-1-bahasa-indonesia/"), { - "url": "243a5250e210b40d17217e83b7547cefea5638bd", + "url": "efd00a9bd95461272d51990d7bc54b79ff3ff2e6", "keyword": "cb646cfed3d45105bd645ab38b2e9f7d8c436436", }), ) def metadata(self, page): - info = text.extr(page, "<title>", " – Komikcast<") + info = text.extr(page, "<title>", " - Komikcast<") return self.parse_chapter_string(info) @staticmethod @@ -76,12 +76,12 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): - """Extractor for manga from komikcast.me""" + """Extractor for manga from komikcast.site""" chapterclass = KomikcastChapterExtractor pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$" test = ( - ("https://komikcast.me/komik/090-eko-to-issho/", { - "url": "08204f0a703ec5272121abcf0632ecacba1e588f", + ("https://komikcast.site/komik/090-eko-to-issho/", { + "url": "19d3d50d532e84be6280a3d61ff0fd0ca04dd6b4", "keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1", }), ("https://komikcast.me/tonari-no-kashiwagi-san/"), @@ -101,7 +101,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): @staticmethod def metadata(page): """Return a dict with general metadata""" - manga , pos = text.extract(page, "<title>" , " – Komikcast<") + manga , pos = text.extract(page, "<title>" , " - Komikcast<") genres, pos = text.extract( page, 'class="komik_info-content-genre">', "</span>", pos) author, pos = text.extract(page, ">Author:", "</span>", pos) diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 9caf6d7..5d236c3 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -23,7 +23,7 @@ BASE_PATTERN = LolisafeExtractor.update({ "xbunkr": { "root": "https://xbunkr.com", "pattern": r"xbunkr\.com", - } + }, }) @@ -47,9 +47,9 @@ class LolisafeAlbumExtractor(LolisafeExtractor): self.album_id = match.group(match.lastindex) domain = self.config("domain") - if domain is None or domain == "auto": + if domain == "auto": self.root = text.root_from_url(match.group(0)) - else: + elif domain: self.root = text.ensure_http_scheme(domain) def items(self): diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py new file mode 100644 index 0000000..bbcf9c0 --- /dev/null +++ b/gallery_dl/extractor/lynxchan.py @@ -0,0 +1,104 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software 
diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py
new file mode 100644
index 0000000..bbcf9c0
--- /dev/null
+++ b/gallery_dl/extractor/lynxchan.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for LynxChan Imageboards"""
+
+from .common import BaseExtractor, Message
+from .. import text
+import itertools
+
+
+class LynxchanExtractor(BaseExtractor):
+    """Base class for LynxChan extractors"""
+    basecategory = "lynxchan"
+
+
+BASE_PATTERN = LynxchanExtractor.update({
+    "kohlchan": {
+        "root": "https://kohlchan.net",
+        "pattern": r"kohlchan\.net"
+    },
+    "endchan": {
+        "root": None,
+        "pattern": r"endchan\.(?:org|net|gg)",
+    },
+})
+
+
+class LynxchanThreadExtractor(LynxchanExtractor):
+    """Extractor for LynxChan threads"""
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{boardUri}",
+                     "{threadId} {subject|message[:50]}")
+    filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
+    archive_fmt = "{boardUri}_{postId}_{num}"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
+    test = (
+        ("https://kohlchan.net/a/res/4594.html", {
+            "pattern": r"https://kohlchan\.net/\.media/[0-9a-f]{64}(\.\w+)?$",
+            "count": ">= 80",
+        }),
+        ("https://endchan.org/yuri/res/193483.html", {
+            "pattern": r"https://endchan\.org/\.media/[^.]+(\.\w+)?$",
+            "count"  : ">= 19",
+        }),
+        ("https://endchan.org/yuri/res/33621.html"),
+    )
+
+    def __init__(self, match):
+        LynxchanExtractor.__init__(self, match)
+        index = match.lastindex
+        self.board = match.group(index-1)
+        self.thread = match.group(index)
+
+    def items(self):
+        url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+        thread = self.request(url).json()
+        thread["postId"] = thread["threadId"]
+        posts = thread.pop("posts", ())
+
+        yield Message.Directory, thread
+        for post in itertools.chain((thread,), posts):
+            files = post.pop("files", ())
+            if files:
+                thread.update(post)
+                for num, file in enumerate(files):
+                    file.update(thread)
+                    file["num"] = num
+                    url = self.root + file["path"]
+                    text.nameext_from_url(file["originalName"], file)
+                    yield Message.Url, url, file
+
+
+class LynxchanBoardExtractor(LynxchanExtractor):
+    """Extractor for LynxChan boards"""
+    subcategory = "board"
+    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
+    test = (
+        ("https://kohlchan.net/a/", {
+            "pattern": LynxchanThreadExtractor.pattern,
+            "count": ">= 100",
+        }),
+        ("https://kohlchan.net/a/2.html"),
+        ("https://kohlchan.net/a/catalog.html"),
+        ("https://endchan.org/yuri/", {
+            "pattern": LynxchanThreadExtractor.pattern,
+            "count"  : ">= 9",
+        }),
+        ("https://endchan.org/yuri/catalog.html"),
+    )
+
+    def __init__(self, match):
+        LynxchanExtractor.__init__(self, match)
+        self.board = match.group(match.lastindex)
+
+    def items(self):
+        url = "{}/{}/catalog.json".format(self.root, self.board)
+        for thread in self.request(url).json():
+            url = "{}/{}/res/{}.html".format(
+                self.root, self.board, thread["threadId"])
+            thread["_extractor"] = LynxchanThreadExtractor
+            yield Message.Queue, url, thread
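Since this generic module replaces the kohlchan-specific extractor removed above, here is a standalone sketch of the two LynxChan JSON endpoints it relies on (requests-based, with kohlchan.net as the example instance):

    import requests

    root, board = "https://kohlchan.net", "a"

    # <root>/<board>/catalog.json lists every thread on a board
    for thread in requests.get("{}/{}/catalog.json".format(root, board)).json():
        # <root>/<board>/res/<id>.json returns the opening post plus a
        # "posts" list; any post may carry "files" with "path"/"originalName"
        url = "{}/{}/res/{}.json".format(root, board, thread["threadId"])
        data = requests.get(url).json()
        for post in [data] + data.get("posts", []):
            for file in post.get("files", ()):
                print(root + file["path"], file["originalName"])
        break  # one thread is enough for a demonstration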
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 0bc3527..dae203e 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -109,7 +109,7 @@ class MangadexChapterExtractor(MangadexExtractor):
         }),
         # 'externalUrl', but still downloadable (#2503)
         ("https://mangadex.org/chapter/364728a4-6909-4164-9eea-6b56354f7c78", {
-            "count": 39,
+            "count": 0,  # 404
         }),
     )
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index 4808105..0818fd9 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -10,7 +10,6 @@
 
 from .common import ChapterExtractor, MangaExtractor
 from .. import text
-import re
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.|m\.)?(?:fanfox\.net|mangafox\.me)"
 
@@ -44,14 +43,14 @@ class MangafoxChapterExtractor(ChapterExtractor):
         cid  , pos = text.extract(page, "var chapter_id =", ";", pos)
 
         return {
-            "manga": text.unescape(manga),
-            "volume": text.parse_int(self.volume),
-            "chapter": text.parse_int(self.chapter),
-            "chapter_minor": self.minor or "",
+            "manga"         : text.unescape(manga),
+            "volume"        : text.parse_int(self.volume),
+            "chapter"       : text.parse_int(self.chapter),
+            "chapter_minor" : self.minor or "",
             "chapter_string": self.cstr,
-            "count": text.parse_int(count),
-            "sid": text.parse_int(sid),
-            "cid": text.parse_int(cid),
+            "count"         : text.parse_int(count),
+            "sid"           : text.parse_int(sid),
+            "cid"           : text.parse_int(cid),
         }
 
     def images(self, page):
@@ -76,6 +75,25 @@ class MangafoxMangaExtractor(MangaExtractor):
         ("https://fanfox.net/manga/kanojo_mo_kanojo", {
             "pattern": MangafoxChapterExtractor.pattern,
             "count": ">=60",
+            "keyword": {
+                "author": "HIROYUKI",
+                "chapter": int,
+                "chapter_minor": r"re:^(\.\d+)?$",
+                "chapter_string": r"re:(v\d+/)?c\d+",
+                "date": "type:datetime",
+                "description": "High school boy Naoya gets a confession from M"
+                               "omi, a cute and friendly girl. However, Naoya "
+                               "already has a girlfriend, Seki... but Momi is "
+                               "too good a catch to let go. Momi and Nagoya's "
+                               "goal becomes clear: convince Seki to accept be"
+                               "ing an item with the two of them. Will she bud"
+                               "ge?",
+                "lang": "en",
+                "language": "English",
+                "manga": "Kanojo mo Kanojo",
+                "tags": ["Comedy", "Romance", "School Life", "Shounen"],
+                "volume": int,
+            },
         }),
         ("https://mangafox.me/manga/shangri_la_frontier", {
             "pattern": MangafoxChapterExtractor.pattern,
@@ -85,34 +103,41 @@ class MangafoxMangaExtractor(MangaExtractor):
     )
 
     def chapters(self, page):
-        match_info = re.compile(r"Ch (\d+)(\S*)(?: (.*))?").match
-        manga, pos = text.extract(page, '<p class="title">', '</p>')
-        author, pos = text.extract(page, '<p>Author(s):', '</p>', pos)
+        results = []
+        chapter_match = MangafoxChapterExtractor.pattern.match
+
+        extr = text.extract_from(page)
+        manga = extr('<p class="title">', '</p>')
+        author = extr('<p>Author(s):', '</p>')
+        extr('<dd class="chlist">', '')
+
+        genres, _, summary = text.extr(
+            page, '<div class="manga-genres">', '</section>'
+        ).partition('<div class="manga-summary">')
+
         data = {
-            "manga"   : text.unescape(manga),
-            "author"  : text.remove_html(author),
-            "lang"    : "en",
-            "language": "English",
+            "manga"      : text.unescape(manga),
+            "author"     : text.remove_html(author),
+            "description": text.unescape(text.remove_html(summary)),
+            "tags"       : text.split_html(genres),
+            "lang"       : "en",
+            "language"   : "English",
         }
 
-        results = []
-        pos = page.index('<dd class="chlist">')
         while True:
-            url, pos = text.extract(page, '<a href="//', '"', pos)
-            if url == 'mangafox.la?f=mobile':
+            url = "https://" + extr('<a href="//', '"')
+            match = chapter_match(url)
+            if not match:
                 return results
-            info, pos = text.extract(page, '>', '<span', pos)
-            date, pos = text.extract(page, 'right">', '</span>', pos)
-
-            match = match_info(text.unescape(info))
-            if match:
-                chapter, minor, title = match.groups()
-                chapter_minor = minor
-            else:
-                chapter, _, minor = url[:-7].rpartition("/c")[2].partition(".")
-                chapter_minor = "." + minor
-
-            data["chapter"] = text.parse_int(chapter)
-            data["chapter_minor"] = chapter_minor if minor else ""
-            data["date"] = date
-            results.append(("https://" + url, data.copy()))
+            _, cstr, volume, chapter, minor = match.groups()
+
+            chapter = {
+                "volume"        : text.parse_int(volume),
+                "chapter"       : text.parse_int(chapter),
+                "chapter_minor" : minor or "",
+                "chapter_string": cstr,
+                "date"          : text.parse_datetime(
+                    extr('right">', '</span>'), "%b %d, %Y"),
+            }
+            chapter.update(data)
+            results.append((url, chapter))
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 9cd95bb..134361d 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -88,20 +88,32 @@ class PixivExtractor(Extractor):
                 url = ugoira["zip_urls"]["medium"].replace(
                     "_ugoira600x600", "_ugoira1920x1080")
                 work["frames"] = ugoira["frames"]
+                work["date_url"] = self._date_from_url(url)
                 work["_http_adjust_extension"] = False
                 yield Message.Url, url, text.nameext_from_url(url, work)
 
         elif work["page_count"] == 1:
             url = meta_single_page["original_image_url"]
+            work["date_url"] = self._date_from_url(url)
             yield Message.Url, url, text.nameext_from_url(url, work)
 
         else:
             for work["num"], img in enumerate(meta_pages):
                 url = img["image_urls"]["original"]
+                work["date_url"] = self._date_from_url(url)
                 work["suffix"] = "_p{:02}".format(work["num"])
                 yield Message.Url, url, text.nameext_from_url(url, work)
 
     @staticmethod
+    def _date_from_url(url, offset=timedelta(hours=9)):
+        try:
+            _, _, _, _, _, y, m, d, H, M, S, _ = url.split("/")
+            return datetime(
+                int(y), int(m), int(d), int(H), int(M), int(S)) - offset
+        except Exception:
+            return None
+
+    @staticmethod
     def _make_work(kind, url, user):
         p = url.split("/")
         return {
@@ -309,6 +321,10 @@ class PixivWorkExtractor(PixivExtractor):
         ("https://www.pixiv.net/artworks/966412", {
             "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba",
             "content": "69a8edfb717400d1c2e146ab2b30d2c235440c5a",
+            "keyword": {
+                "date"    : "dt:2008-06-12 15:29:13",
+                "date_url": "dt:2008-06-12 15:29:13",
+            },
         }),
         (("http://www.pixiv.net/member_illust.php"
           "?mode=medium&illust_id=966411"), {
@@ -318,7 +334,11 @@ class PixivWorkExtractor(PixivExtractor):
         (("https://www.pixiv.net/member_illust.php"
           "?mode=medium&illust_id=66806629"), {
             "url": "7267695a985c4db8759bebcf8d21dbdd2d2317ef",
-            "keywords": {"frames": list},
+            "keyword": {
+                "frames"  : list,
+                "date"    : "dt:2018-01-14 15:06:08",
+                "date_url": "dt:2018-01-15 04:24:48",
+            },
         }),
         # related works (#1237)
         ("https://www.pixiv.net/artworks/966412", {
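The new _date_from_url() above recovers a timestamp from the date components embedded in pixiv image URLs, which are in JST, hence the nine-hour offset. A standalone sketch of the same split-and-subtract logic (the host and filename in this URL are illustrative):

    from datetime import datetime, timedelta

    # i.pximg.net paths embed the upload time as /YYYY/MM/DD/hh/mm/ss/ (JST)
    url = ("https://i.pximg.net/img-original"
           "/img/2018/01/15/13/24/48/66806629_p0.png")

    _, _, _, _, _, y, m, d, H, M, S, _ = url.split("/")
    date_url = datetime(int(y), int(m), int(d),
                        int(H), int(M), int(S)) - timedelta(hours=9)
    print(date_url)  # 2018-01-15 04:24:48 (UTC), matching the test above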
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 0ec8478..204562e 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -428,7 +428,7 @@ class RedditAPI():
 
     def _pagination(self, endpoint, params):
         id_min = self._parse_id("id-min", 0)
-        id_max = self._parse_id("id-max", 2147483647)
+        id_max = self._parse_id("id-max", float("inf"))
         date_min, date_max = self.extractor._get_date_min_max(0, 253402210800)
 
         while True:
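For context on the id-max change above: reddit submission ids are base36 strings, and current ids already exceed the old 32-bit default, so newer posts were silently filtered out; float("inf") compares greater than any integer. A quick illustration (the id value is made up):

    id_max = float("inf")          # new default: effectively no upper bound

    post_id = int("zzzzzz", 36)    # an illustrative base36 id -> 2176782335
    print(post_id <= id_max)       # True  - the post is kept
    print(post_id <= 2147483647)   # False - the old default would skip it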
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 53e5e79..ad4282c 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -72,7 +72,7 @@ class RedgifsUserExtractor(RedgifsExtractor):
     pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?#]+)"
     test = ("https://www.redgifs.com/users/Natalifiction", {
         "pattern": r"https://\w+\.redgifs\.com/[A-Za-z]+\.mp4",
-        "count": ">= 120",
+        "count": ">= 100",
     })
 
     def metadata(self):
@@ -89,7 +89,7 @@ class RedgifsSearchExtractor(RedgifsExtractor):
     pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/browse/?\?([^#]+)"
     test = (
         ("https://www.redgifs.com/browse?tags=JAV", {
-            "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.mp4",
+            "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)",
             "range": "1-10",
             "count": 10,
         }),
diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py
index 22c9487..7b8d2a3 100644
--- a/gallery_dl/extractor/seiga.py
+++ b/gallery_dl/extractor/seiga.py
@@ -44,7 +44,11 @@ class SeigaExtractor(Extractor):
         url = "{}/image/source/{}".format(self.root, image_id)
         response = self.request(
             url, method="HEAD", allow_redirects=False, notfound="image")
-        return response.headers["Location"].replace("/o/", "/priv/", 1)
+        location = response.headers["location"]
+        if "nicovideo.jp/login" in location:
+            raise exception.StopExtraction(
+                "HTTP redirect to login page (%s)", location.partition("?")[0])
+        return location.replace("/o/", "/priv/", 1)
 
 
 class SeigaUserExtractor(SeigaExtractor):
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index aa6726d..486bf92 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -74,9 +74,7 @@ class SexcomExtractor(Extractor):
             path = text.extr(info, "src: '", "'")
             data["filename"] = path.rpartition("/")[2]
             data["extension"] = "mp4"
-            if "'HD'" in info:
-                path += "/hd"
-            data["url"] = self.root + path
+            data["url"] = path
         else:
             iframe = extr('<iframe', '>')
             src = (text.extr(iframe, ' src="', '"') or
diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py
index ae4e2e8..3727c0b 100644
--- a/gallery_dl/extractor/slickpic.py
+++ b/gallery_dl/extractor/slickpic.py
@@ -43,7 +43,8 @@ class SlickpicAlbumExtractor(SlickpicExtractor):
         }),
         ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", {
             "range": "34",
-            "content": ("52b5a310587de1048030ab13a912f6a3a9cc7dab",
+            "content": ("276eb2c902187bb177ae8013e310e1d6641fba9a",
+                        "52b5a310587de1048030ab13a912f6a3a9cc7dab",
                         "cec6630e659dc72db1ee1a9a6f3b525189261988",
                         "6f81e1e74c6cd6db36844e7211eef8e7cd30055d",
                         "22e83645fc242bc3584eca7ec982c8a53a4d8a44"),
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 2264fe4..713d4c4 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -117,7 +117,7 @@ class SmugmugImageExtractor(SmugmugExtractor):
         # video
         ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", {
             "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee",
-            "keyword": "4cef98133ace511adc874c9d9abac5817ba0d856",
+            "keyword": "2b545184592c282b365fcbb7df6ca7952b8a3173",
         }),
     )
diff --git a/gallery_dl/extractor/soundgasm.py b/gallery_dl/extractor/soundgasm.py
new file mode 100644
index 0000000..1afb92c
--- /dev/null
+++ b/gallery_dl/extractor/soundgasm.py
@@ -0,0 +1,93 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://soundgasm.net/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class SoundgasmAudioExtractor(Extractor):
+    """Extractor for audio clips from soundgasm.net"""
+    category = "soundgasm"
+    subcategory = "audio"
+    root = "https://soundgasm.net"
+    directory_fmt = ("{category}", "{user}")
+    filename_fmt = "{title}.{extension}"
+    archive_fmt = "{user}_{slug}"
+    pattern = (r"(?:https?://)?(?:www\.)?soundgasm\.net"
+               r"/u(?:ser)?/([^/?#]+)/([^/?#]+)")
+    test = (
+        (("https://soundgasm.net/u/ClassWarAndPuppies2"
+          "/687-Otto-von-Toontown-12822"), {
+            "pattern": r"https://media\.soundgasm\.net/sounds"
+                       r"/26cb2b23b2f2c6094b40ee3a9167271e274b570a\.m4a",
+            "keyword": {
+                "description": "We celebrate today’s important prisoner swap, "
+                               "and finally bring the 2022 mid-terms to a clos"
+                               "e with Raphael Warnock’s defeat of Herschel Wa"
+                               "lker in Georgia. Then, we take a look at the Q"
+                               "anon-addled attempt to overthrow the German go"
+                               "vernment and install Heinrich XIII Prince of R"
+                               "euss as kaiser.",
+                "extension": "m4a",
+                "filename": "26cb2b23b2f2c6094b40ee3a9167271e274b570a",
+                "slug": "687-Otto-von-Toontown-12822",
+                "title": "687 - Otto von Toontown (12/8/22)",
+                "user": "ClassWarAndPuppies2",
+            },
+        }),
+        ("https://www.soundgasm.net/user/ClassWarAndPuppies2"
+         "/687-Otto-von-Toontown-12822"),
+    )
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.user, self.slug = match.groups()
+
+    def items(self):
+        url = "{}/u/{}/{}".format(self.root, self.user, self.slug)
+        extr = text.extract_from(self.request(url).text)
+
+        data = {
+            "user" : self.user,
+            "slug" : self.slug,
+            "title": text.unescape(extr('aria-label="title">', "<")),
+            "description": text.unescape(text.remove_html(extr(
+                'class="jp-description">', '</div>'))),
+        }
+
+        formats = extr('"setMedia", {', '}')
+        url = text.extr(formats, ': "', '"')
+
+        yield Message.Directory, data
+        yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class SoundgasmUserExtractor(Extractor):
+    """Extractor for all sounds from a soundgasm user"""
+    category = "soundgasm"
+    subcategory = "user"
+    root = "https://soundgasm.net"
+    pattern = (r"(?:https?://)?(?:www\.)?soundgasm\.net"
+               r"/u(?:ser)?/([^/?#]+)/?$")
+    test = ("https://soundgasm.net/u/fierce-aphrodite", {
+        "pattern": SoundgasmAudioExtractor.pattern,
+        "count"  : ">= 15",
+    })
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.user = match.group(1)
+
+    def items(self):
+        page = self.request(self.root + "/user/" + self.user).text
+        data = {"_extractor": SoundgasmAudioExtractor}
+        for sound in text.extract_iter(
+                page, 'class="sound-details">', "</a>"):
+            yield Message.Queue, text.extr(sound, '<a href="', '"'), data
diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py
index f010f92..30bf2f1 100644
--- a/gallery_dl/extractor/twibooru.py
+++ b/gallery_dl/extractor/twibooru.py
@@ -83,7 +83,7 @@ class TwibooruPostExtractor(TwibooruExtractor):
             "tag_ids": list,
             "tags": list,
             "thumbnails_generated": True,
-            "updated_at": "2022-09-21T14:31:50.441Z",
+            "updated_at": "2022-11-27T00:34:50.483Z",
             "upvotes": int,
             "view_url": "https://cdn.twibooru.org/img/2020/7/8/1/full.png",
             "width": 576,
"count": ">=1", }) def metadata(self): @@ -759,7 +759,7 @@ class TwitterTweetExtractor(TwitterExtractor): # retweet with missing media entities (#1555) ("https://twitter.com/morino_ya/status/1392763691599237121", { "options": (("retweets", True),), - "count": 4, + "count": 0, # private }), # deleted quote tweet (#2225) ("https://twitter.com/i/web/status/1460044411165888515", { @@ -782,7 +782,7 @@ class TwitterTweetExtractor(TwitterExtractor): # '?format=...&name=...'-style URLs ("https://twitter.com/poco_dandy/status/1150646424461176832", { "options": (("cards", True),), - "pattern": r"https://pbs.twimg.com/card_img/157\d+/\w+" + "pattern": r"https://pbs.twimg.com/card_img/157\d+/[\w-]+" r"\?format=(jpg|png)&name=orig$", "range": "1-2", }), @@ -886,7 +886,7 @@ class TwitterBackgroundExtractor(TwitterExtractor): def tweets(self): self.api._user_id_by_screen_name(self.user) - user = user = self._user_obj + user = self._user_obj try: url = user["legacy"]["profile_banner_url"] @@ -1216,15 +1216,16 @@ class TwitterAPI(): original_retweets = (self.extractor.retweets == "original") while True: - cursor = tweet = None data = self._call(endpoint, params) instr = data["timeline"]["instructions"] if not instr: return - tweet_ids = [] + tweets = data["globalObjects"]["tweets"] users = data["globalObjects"]["users"] + tweet_id = cursor = None + tweet_ids = [] # collect tweet IDs and cursor value for entry in instr[0]["addEntries"]["entries"]: @@ -1243,7 +1244,7 @@ class TwitterAPI(): cursor = entry["content"]["operation"]["cursor"] if not cursor.get("stopOnEmptyResponse", True): # keep going even if there are no tweets - tweet = True + tweet_id = True cursor = cursor["value"] elif entry_startswith("conversationThread-"): @@ -1292,7 +1293,7 @@ class TwitterAPI(): cursor = (instr[-1]["replaceEntry"]["entry"] ["content"]["operation"]["cursor"]["value"]) - if not cursor or not tweet: + if not cursor or (not tweets and not tweet_id): return params["cursor"] = cursor diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index 8bea18c..b298c27 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -78,11 +78,11 @@ class UnsplashImageExtractor(UnsplashExtractor): pattern = BASE_PATTERN + r"/photos/([^/?#]+)" test = ("https://unsplash.com/photos/lsoogGC_5dg", { "pattern": r"https://images\.unsplash\.com/photo-1586348943529-" - r"beaae6c28db9\?ixid=\w+&ixlib=rb-1.2.1", + r"beaae6c28db9\?ixid=\w+&ixlib=rb-4.0.3", "keyword": { "alt_description": "re:silhouette of trees near body of water ", "blur_hash": "LZP4uQS4jboe%#o0WCa}2doJNaaz", - "categories": list, + "? 
categories": list, "color": "#f3c08c", "created_at": "2020-04-08T12:29:42Z", "date": "dt:2020-04-08 12:29:42", @@ -108,9 +108,8 @@ class UnsplashImageExtractor(UnsplashExtractor): "name": "Beaver Dam, WI 53916, USA", "position": { "latitude": 43.457769, - "longitude": -88.837329 + "longitude": -88.837329, }, - "title": "Beaver Dam, WI 53916, USA" }, "promoted_at": "2020-04-08T15:12:03Z", "sponsorship": None, @@ -149,7 +148,7 @@ class UnsplashUserExtractor(UnsplashExtractor): pattern = BASE_PATTERN + r"/@(\w+)/?$" test = ("https://unsplash.com/@davehoefler", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "range": "1-30", "count": 30, }) @@ -166,7 +165,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor): pattern = BASE_PATTERN + r"/@(\w+)/likes" test = ("https://unsplash.com/@davehoefler/likes", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "range": "1-30", "count": 30, }) @@ -184,7 +183,7 @@ class UnsplashCollectionExtractor(UnsplashExtractor): test = ( ("https://unsplash.com/collections/3178572/winter", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "keyword": {"collection_id": "3178572", "collection_title": "winter"}, "range": "1-30", @@ -212,8 +211,9 @@ class UnsplashSearchExtractor(UnsplashExtractor): subcategory = "search" pattern = BASE_PATTERN + r"/s/photos/([^/?#]+)(?:\?([^#]+))?" test = ("https://unsplash.com/s/photos/hair-style", { - "pattern": r"https://images\.unsplash\.com/((flagged/)?photo-\d+-\w+" - r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", + "pattern": r"https://(images|plus)\.unsplash\.com" + r"/((flagged/|premium_)?photo-\d+-\w+" + r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-4\.0\.3$", "range": "1-30", "count": 30, }) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 677680f..bdedfcb 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -1,21 +1,22 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://warosu.org/""" +"""Extractors for https://warosu.org/""" from .common import Extractor, Message from .. 
import text class WarosuThreadExtractor(Extractor): - """Extractor for images from threads on warosu.org""" + """Extractor for threads on warosu.org""" category = "warosu" subcategory = "thread" + root = "https://warosu.org" directory_fmt = ("{category}", "{board}", "{thread} - {title}") filename_fmt = "{tim}-{filename}.{extension}" archive_fmt = "{board}_{thread}_{tim}" @@ -31,7 +32,6 @@ class WarosuThreadExtractor(Extractor): "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c", }), ) - root = "https://warosu.org" def __init__(self, match): Extractor.__init__(self, match) @@ -40,12 +40,12 @@ class WarosuThreadExtractor(Extractor): def items(self): url = "{}/{}/thread/{}".format(self.root, self.board, self.thread) page = self.request(url).text - data = self.get_metadata(page) + data = self.metadata(page) posts = self.posts(page) if not data["title"]: - title = text.remove_html(posts[0]["com"]) - data["title"] = text.unescape(title)[:50] + data["title"] = text.unescape(text.remove_html( + posts[0]["com"]))[:50] yield Message.Directory, data for post in posts: @@ -55,25 +55,24 @@ class WarosuThreadExtractor(Extractor): post.update(data) yield Message.Url, post["image"], post - def get_metadata(self, page): - """Collect metadata for extractor-job""" + def metadata(self, page): boardname = text.extr(page, "<title>", "</title>") title = text.extr(page, 'filetitle" itemprop="name">', '<') return { - "board": self.board, + "board" : self.board, "board_name": boardname.rpartition(" - ")[2], - "thread": self.thread, - "title": title, + "thread" : self.thread, + "title" : title, } def posts(self, page): - """Build a list of all post-objects""" + """Build a list of all post objects""" page = text.extr(page, '<div class="content">', '<table>') needle = '<table itemscope itemtype="http://schema.org/Comment">' return [self.parse(post) for post in page.split(needle)] def parse(self, post): - """Build post-object by extracting data from an HTML post""" + """Build post object by extracting data from an HTML post""" data = self._extract_post(post) if "<span>File:" in post: self._extract_image(post, data) @@ -84,24 +83,23 @@ class WarosuThreadExtractor(Extractor): @staticmethod def _extract_post(post): - data = text.extract_all(post, ( - ("no" , 'id="p', '"'), - ("name", '<span itemprop="name">', '</span>'), - ("time", '<span class="posttime" title="', '000">'), - ("now" , '', '<'), - ("com" , '<blockquote><p itemprop="text">', '</p></blockquote>'), - ))[0] - data["com"] = text.unescape(text.remove_html(data["com"].strip())) - return data + extr = text.extract_from(post) + return { + "no" : extr('id="p', '"'), + "name": extr('<span itemprop="name">', "</span>"), + "time": extr('<span class="posttime" title="', '000">'), + "now" : extr("", "<"), + "com" : text.unescape(text.remove_html(extr( + '<blockquote><p itemprop="text">', '</p></blockquote>' + ).strip())), + } @staticmethod def _extract_image(post, data): - text.extract_all(post, ( - ("fsize" , '<span>File: ', ', '), - ("w" , '', 'x'), - ("h" , '', ', '), - ("filename", '', '<'), - ("image" , '<br />\n<a href="', '"'), - ), 0, data) - data["filename"] = text.unquote(data["filename"].rpartition(".")[0]) - data["image"] = "https:" + data["image"] + extr = text.extract_from(post) + data["fsize"] = extr("<span>File: ", ", ") + data["w"] = extr("", "x") + data["h"] = extr("", ", ") + data["filename"] = text.unquote(extr("", "<").rpartition(".")[0]) + extr("<br />", "") + data["image"] = "https:" + extr('<a href="', '"') diff --git 
a/gallery_dl/extractor/webmshare.py b/gallery_dl/extractor/webmshare.py new file mode 100644 index 0000000..b038425 --- /dev/null +++ b/gallery_dl/extractor/webmshare.py @@ -0,0 +1,84 @@ +# -*- coding: utf-8 -*- + +# Copyright 2022 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://webmshare.com/""" + +from .common import Extractor, Message +from .. import text + + +class WebmshareVideoExtractor(Extractor): + """Extractor for webmshare videos""" + category = "webmshare" + subcategory = "video" + root = "https://webmshare.com" + filename_fmt = "{id}{title:? //}.{extension}" + archive_fmt = "{id}" + pattern = (r"(?:https?://)?(?:s\d+\.)?webmshare\.com" + r"/(?:play/|download-webm/)?(\w{3,})") + test = ( + ("https://webmshare.com/O9mWY", { + "keyword": { + "date": "dt:2022-12-04 00:00:00", + "extension": "webm", + "filename": "O9mWY", + "height": 568, + "id": "O9mWY", + "thumb": "https://s1.webmshare.com/t/O9mWY.jpg", + "title": "Yeah buddy over here", + "url": "https://s1.webmshare.com/O9mWY.webm", + "views": int, + "width": 320, + }, + }), + ("https://s1.webmshare.com/zBGAg.webm", { + "keyword": { + "date": "dt:2018-12-07 00:00:00", + "height": 1080, + "id": "zBGAg", + "thumb": "https://s1.webmshare.com/t/zBGAg.jpg", + "title": "", + "url": "https://s1.webmshare.com/zBGAg.webm", + "views": int, + "width": 1920, + }, + }), + ("https://webmshare.com/play/zBGAg"), + ("https://webmshare.com/download-webm/zBGAg"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.video_id = match.group(1) + + def items(self): + url = "{}/{}".format(self.root, self.video_id) + extr = text.extract_from(self.request(url).text) + + data = { + "title": text.unescape(extr( + 'property="og:title" content="', '"').rpartition(" — ")[0]), + "thumb": "https:" + extr('property="og:image" content="', '"'), + "url" : "https:" + extr('property="og:video" content="', '"'), + "width": text.parse_int(extr( + 'property="og:video:width" content="', '"')), + "height": text.parse_int(extr( + 'property="og:video:height" content="', '"')), + "date" : text.parse_datetime(extr( + "<small>Added ", "<"), "%B %d, %Y"), + "views": text.parse_int(extr('glyphicon-eye-open"></span>', '<')), + "id" : self.video_id, + "filename" : self.video_id, + "extension": "webm", + } + + if data["title"] == "webmshare": + data["title"] = "" + + yield Message.Directory, data + yield Message.Url, data["url"], data diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 8a22fcb..21f7c21 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -57,6 +57,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor): }), (("https://www.webtoons.com/en/challenge/punderworld" "/happy-earth-day-/viewer?title_no=312584&episode_no=40"), { + "exception": exception.NotFoundError, "keyword": { "comic": "punderworld", "description": str, diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index c0d43fe..74da615 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -11,8 +11,6 @@ from .booru import BooruExtractor from ..cache import cache from .. 
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index ca05fa5..8a45330 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -18,8 +18,10 @@ import operator
 import functools
 from . import text, util
 
+NONE = util.NONE
 
-def parse(format_string, default=None, fmt=format):
+
+def parse(format_string, default=NONE, fmt=format):
     key = format_string, default, fmt
 
     try:
@@ -88,7 +90,7 @@ class StringFormatter():
     Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r")
     """
 
-    def __init__(self, format_string, default=None, fmt=format):
+    def __init__(self, format_string, default=NONE, fmt=format):
         self.default = default
         self.format = fmt
         self.result = []
@@ -193,7 +195,7 @@ class StringFormatter():
 class TemplateFormatter(StringFormatter):
     """Read format_string from file"""
 
-    def __init__(self, path, default=None, fmt=format):
+    def __init__(self, path, default=NONE, fmt=format):
         with open(util.expand_path(path)) as fp:
             format_string = fp.read()
         StringFormatter.__init__(self, format_string, default, fmt)
@@ -202,23 +204,23 @@ class TemplateFormatter(StringFormatter):
 class ExpressionFormatter():
     """Generate text by evaluating a Python expression"""
 
-    def __init__(self, expression, default=None, fmt=None):
+    def __init__(self, expression, default=NONE, fmt=None):
         self.format_map = util.compile_expression(expression)
 
 
 class ModuleFormatter():
     """Generate text by calling an external function"""
 
-    def __init__(self, function_spec, default=None, fmt=None):
+    def __init__(self, function_spec, default=NONE, fmt=None):
         module_name, _, function_name = function_spec.partition(":")
         module = __import__(module_name)
         self.format_map = getattr(module, function_name)
 
 
 class FStringFormatter():
-    """Generate text by evaluaring an f-string literal"""
+    """Generate text by evaluating an f-string literal"""
 
-    def __init__(self, fstring, default=None, fmt=None):
+    def __init__(self, fstring, default=NONE, fmt=None):
         self.format_map = util.compile_expression("f'''" + fstring + "'''")
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 4d9a358..91e9169 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -142,10 +142,12 @@ def build_parser():
     )
     general.add_argument(
         "--cookies-from-browser",
-        dest="cookies_from_browser", metavar="BROWSER[+KEYRING][:PROFILE]",
+        dest="cookies_from_browser",
+        metavar="BROWSER[+KEYRING][:PROFILE][::CONTAINER]",
         help=("Name of the browser to load cookies from, "
-              "with optional keyring name prefixed with '+' and "
-              "profile prefixed with ':'"),
+              "with optional keyring name prefixed with '+', "
+              "profile prefixed with ':', and "
+              "container prefixed with '::' ('none' for no container)"),
     )
 
     output = parser.add_argument_group("Output Options")
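Example invocations for the extended --cookies-from-browser syntax above (the profile and container names here are illustrative):

    gallery-dl --cookies-from-browser firefox URL
    gallery-dl --cookies-from-browser firefox::Personal URL
    gallery-dl --cookies-from-browser firefox:myprofile::none URL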
"/Perth.%28Kantai.Collection%29.full.2920445.jpg", "filename": "Perth.(Kantai.Collection).full.2920445", - "height": "1366", - "id": "2920445", - "size": "1929k", - "width": "1920", + "height": 1366, + "id": 2920445, + "path": ["Kantai Collection", "Perth (Kantai Collection)"], + "size": 1975296, + "tags": [ + "Mangaka:YeFan 葉凡", + "Game:Kantai Collection", + "Character:Perth (Kantai Collection)", + "Theme:Blonde Hair", + "Theme:Braids", + "Theme:Coat", + "Theme:Female", + "Theme:Firefighter Outfit", + "Theme:Group", + "Theme:Long Sleeves", + "Theme:Personification", + "Theme:Pins", + "Theme:Ribbon", + "Theme:Shirt", + "Theme:Short Hair", + ], + "uploader": "YukinoTokisaki", + "width": 1920, }, }) @@ -181,5 +214,5 @@ class ZerochanImageExtractor(ZerochanExtractor): def posts(self): post = self._parse_entry_html(self.image_id) if self.config("metadata"): - post.update(self._parse_entry_xml(self.image_id)) + post.update(self._parse_entry_json(self.image_id)) return (post,) diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index ca05fa5..8a45330 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -18,8 +18,10 @@ import operator import functools from . import text, util +NONE = util.NONE -def parse(format_string, default=None, fmt=format): + +def parse(format_string, default=NONE, fmt=format): key = format_string, default, fmt try: @@ -88,7 +90,7 @@ class StringFormatter(): Example: {f:R /_/} -> "f_o_o_b_a_r" (if "f" is "f o o b a r") """ - def __init__(self, format_string, default=None, fmt=format): + def __init__(self, format_string, default=NONE, fmt=format): self.default = default self.format = fmt self.result = [] @@ -193,7 +195,7 @@ class StringFormatter(): class TemplateFormatter(StringFormatter): """Read format_string from file""" - def __init__(self, path, default=None, fmt=format): + def __init__(self, path, default=NONE, fmt=format): with open(util.expand_path(path)) as fp: format_string = fp.read() StringFormatter.__init__(self, format_string, default, fmt) @@ -202,23 +204,23 @@ class TemplateFormatter(StringFormatter): class ExpressionFormatter(): """Generate text by evaluating a Python expression""" - def __init__(self, expression, default=None, fmt=None): + def __init__(self, expression, default=NONE, fmt=None): self.format_map = util.compile_expression(expression) class ModuleFormatter(): """Generate text by calling an external function""" - def __init__(self, function_spec, default=None, fmt=None): + def __init__(self, function_spec, default=NONE, fmt=None): module_name, _, function_name = function_spec.partition(":") module = __import__(module_name) self.format_map = getattr(module, function_name) class FStringFormatter(): - """Generate text by evaluaring an f-string literal""" + """Generate text by evaluating an f-string literal""" - def __init__(self, fstring, default=None, fmt=None): + def __init__(self, fstring, default=NONE, fmt=None): self.format_map = util.compile_expression("f'''" + fstring + "'''") diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 4d9a358..91e9169 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -142,10 +142,12 @@ def build_parser(): ) general.add_argument( "--cookies-from-browser", - dest="cookies_from_browser", metavar="BROWSER[+KEYRING][:PROFILE]", + dest="cookies_from_browser", + metavar="BROWSER[+KEYRING][:PROFILE][::CONTAINER]", help=("Name of the browser to load cookies from, " - "with optional keyring name prefixed with '+' and " - "profile prefixed with ':'"), + "with optional 
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index d289009..d832185 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.24.1"
+__version__ = "1.24.2"
diff --git a/test/test_util.py b/test/test_util.py
index 2921ea2..4b8f9ae 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -618,10 +618,21 @@ class TestOther(unittest.TestCase):
         obj = util.NONE
 
         self.assertFalse(obj)
+        self.assertEqual(len(obj), 0)
         self.assertEqual(str(obj), str(None))
         self.assertEqual(repr(obj), repr(None))
+        self.assertEqual(format(obj), str(None))
+        self.assertEqual(format(obj, "%F"), str(None))
 
         self.assertIs(obj.attr, obj)
         self.assertIs(obj["key"], obj)
+        self.assertIs(obj(), obj)
+        self.assertIs(obj(1, "a"), obj)
+        self.assertIs(obj(foo="bar"), obj)
+
+        i = 0
+        for _ in obj:
+            i += 1
+        self.assertEqual(i, 0)
 
 
 class TestExtractor():
