35 files changed, 582 insertions, 255 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a4ce4ba..34607f2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,45 @@
 # Changelog
 
+## 1.26.1 - 2023-10-21
+### Extractors
+#### Additions
+- [bunkr] add extractor for media URLs ([#4684](https://github.com/mikf/gallery-dl/issues/4684))
+- [chevereto] add generic extractors for `chevereto` sites ([#4664](https://github.com/mikf/gallery-dl/issues/4664))
+  - `deltaporno.com` ([#1381](https://github.com/mikf/gallery-dl/issues/1381))
+  - `img.kiwi`
+  - `jpgfish`
+  - `pixl.li` ([#3179](https://github.com/mikf/gallery-dl/issues/3179), [#4357](https://github.com/mikf/gallery-dl/issues/4357))
+- [deviantart] implement `"group": "skip"` ([#4630](https://github.com/mikf/gallery-dl/issues/4630))
+- [fantia] add `content_count` and `content_num` metadata fields ([#4627](https://github.com/mikf/gallery-dl/issues/4627))
+- [imgbb] add `displayname` and `user_id` metadata ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
+- [kemonoparty] support post revisions; add `revisions` option ([#4498](https://github.com/mikf/gallery-dl/issues/4498), [#4597](https://github.com/mikf/gallery-dl/issues/4597))
+- [kemonoparty] support searches ([#3385](https://github.com/mikf/gallery-dl/issues/3385), [#4057](https://github.com/mikf/gallery-dl/issues/4057))
+- [kemonoparty] support discord URLs with channel IDs ([#4662](https://github.com/mikf/gallery-dl/issues/4662))
+- [moebooru] add `metadata` option ([#4646](https://github.com/mikf/gallery-dl/issues/4646))
+- [newgrounds] support multi-image posts ([#4642](https://github.com/mikf/gallery-dl/issues/4642))
+- [sankaku] support `/posts/` URLs ([#4688](https://github.com/mikf/gallery-dl/issues/4688))
+- [twitter] add `sensitive` metadata field ([#4619](https://github.com/mikf/gallery-dl/issues/4619))
+#### Fixes
+- [4chanarchives] disable Referer headers by default ([#4686](https://github.com/mikf/gallery-dl/issues/4686))
+- [bunkr] fix `/d/` file URLs ([#4685](https://github.com/mikf/gallery-dl/issues/4685))
+- [deviantart] expand nested comment replies ([#4653](https://github.com/mikf/gallery-dl/issues/4653))
+- [deviantart] disable `jwt` ([#4652](https://github.com/mikf/gallery-dl/issues/4652))
+- [hentaifoundry] fix `.swf` file downloads ([#4641](https://github.com/mikf/gallery-dl/issues/4641))
+- [imgbb] fix `user` metadata extraction ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
+- [imgbb] update pagination end condition ([#4626](https://github.com/mikf/gallery-dl/issues/4626))
+- [kemonoparty] update API endpoints ([#4676](https://github.com/mikf/gallery-dl/issues/4676), [#4677](https://github.com/mikf/gallery-dl/issues/4677))
+- [patreon] update `campaign_id` path ([#4639](https://github.com/mikf/gallery-dl/issues/4639))
+- [reddit] fix wrong previews ([#4649](https://github.com/mikf/gallery-dl/issues/4649))
+- [redgifs] fix `niches` extraction ([#4666](https://github.com/mikf/gallery-dl/issues/4666), [#4667](https://github.com/mikf/gallery-dl/issues/4667))
+- [twitter] fix crash due to missing `source` ([#4620](https://github.com/mikf/gallery-dl/issues/4620))
+- [warosu] fix extraction ([#4634](https://github.com/mikf/gallery-dl/issues/4634))
+### Post Processors
+#### Additions
+- support `{_filename}`, `{_directory}`, and `{_path}` replacement fields for `--exec` ([#4633](https://github.com/mikf/gallery-dl/issues/4633))
+### Miscellaneous
+#### Improvements
+- avoid temporary copies with `--cookies-from-browser` by opening cookie databases in read-only mode
+
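The last changelog bullet deserves a brief illustration before the file-by-file diff: SQLite can open a database file read-only and immutable through a URI filename, which avoids both the temporary copy and lock contention with a running browser. A minimal sketch of the technique from `gallery_dl/cookies.py`, with a hypothetical database path:

```python
import os
import sqlite3

def open_db_readonly(path):
    """Open an SQLite database without copying or locking it.

    Percent-encode characters that are special in URIs, then connect
    with mode=ro&immutable=1 (see https://www.sqlite.org/uri.html).
    """
    path = path.replace("?", "%3f").replace("#", "%23")
    if os.name == "nt":
        # Windows paths need a leading slash in URI form
        path = "/" + os.path.abspath(path)
    uri = "file:{}?mode=ro&immutable=1".format(path)
    return sqlite3.connect(uri, uri=True)

# db = open_db_readonly("cookies.sqlite")  # e.g. a Firefox cookie database
```

As the `cookies.py` hunk below shows, gallery-dl keeps the old temporary-copy code as a fallback for when this read-only connection attempt fails.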
 ## 1.26.0 - 2023-10-03
 ### Extractors
 #### Additions

diff --git a/PKG-INFO b/PKG-INFO
@@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery_dl -Version: 1.26.0 +Version: 1.26.1 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -109,9 +109,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.0/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.0/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.bin>`__ Nightly Builds

diff --git a/README.rst b/README.rst
@@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.0/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.0/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.bin>`__ Nightly Builds diff --git a/data/completion/_gallery-dl b/data/completion/_gallery-dl index 94d2f06..c1425bb 100644 --- a/data/completion/_gallery-dl +++ b/data/completion/_gallery-dl @@ -73,8 +73,8 @@ _arguments -C -S \ --write-infojson'[==SUPPRESS==]' \ --write-tags'[Write image tags to separate text files]' \ --mtime-from-date'[Set file modification times according to "date" metadata]' \ ---exec'[Execute CMD for each downloaded file. Example: --exec "convert {} {}.png && rm {}"]':'<cmd>' \ ---exec-after'[Execute CMD after all files were downloaded successfully. Example: --exec-after "cd {} && convert * ../doc.pdf"]':'<cmd>' \ +--exec'[Execute CMD for each downloaded file. Supported replacement fields are {} or {_path}, {_directory}, {_filename}. Example: --exec "convert {} {}.png && rm {}"]':'<cmd>' \ +--exec-after'[Execute CMD after all files were downloaded successfully. Example: --exec-after "cd {_directory} && convert * ../doc.pdf"]':'<cmd>' \ {-P,--postprocessor}'[Activate the specified post processor]':'<name>' \ {-O,--postprocessor-option}'[Additional "<key>=<value>" post processor options]':'<opt>' && rc=0 diff --git a/data/completion/gallery-dl.fish b/data/completion/gallery-dl.fish index 00e7b24..593ab89 100644 --- a/data/completion/gallery-dl.fish +++ b/data/completion/gallery-dl.fish @@ -67,7 +67,7 @@ complete -c gallery-dl -l 'write-info-json' -d 'Write gallery metadata to a info complete -c gallery-dl -l 'write-infojson' -d '==SUPPRESS==' complete -c gallery-dl -l 'write-tags' -d 'Write image tags to separate text files' complete -c gallery-dl -l 'mtime-from-date' -d 'Set file modification times according to "date" metadata' -complete -c gallery-dl -x -l 'exec' -d 'Execute CMD for each downloaded file. Example: --exec "convert {} {}.png && rm {}"' -complete -c gallery-dl -x -l 'exec-after' -d 'Execute CMD after all files were downloaded successfully. 
Example: --exec-after "cd {} && convert * ../doc.pdf"' +complete -c gallery-dl -x -l 'exec' -d 'Execute CMD for each downloaded file. Supported replacement fields are {} or {_path}, {_directory}, {_filename}. Example: --exec "convert {} {}.png && rm {}"' +complete -c gallery-dl -x -l 'exec-after' -d 'Execute CMD after all files were downloaded successfully. Example: --exec-after "cd {_directory} && convert * ../doc.pdf"' complete -c gallery-dl -x -s 'P' -l 'postprocessor' -d 'Activate the specified post processor' complete -c gallery-dl -x -s 'O' -l 'postprocessor-option' -d 'Additional "<key>=<value>" post processor options' diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1 index c2eedb7..27f13af 100644 --- a/data/man/gallery-dl.1 +++ b/data/man/gallery-dl.1 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL" "1" "2023-10-03" "1.26.0" "gallery-dl Manual" +.TH "GALLERY-DL" "1" "2023-10-21" "1.26.1" "gallery-dl Manual" .\" disable hyphenation .nh @@ -216,10 +216,10 @@ Write image tags to separate text files Set file modification times according to 'date' metadata .TP .B "\-\-exec" \f[I]CMD\f[] -Execute CMD for each downloaded file. Example: --exec "convert {} {}.png && rm {}" +Execute CMD for each downloaded file. Supported replacement fields are {} or {_path}, {_directory}, {_filename}. Example: --exec "convert {} {}.png && rm {}" .TP .B "\-\-exec\-after" \f[I]CMD\f[] -Execute CMD after all files were downloaded successfully. Example: --exec-after "cd {} && convert * ../doc.pdf" +Execute CMD after all files were downloaded successfully. Example: --exec-after "cd {_directory} && convert * ../doc.pdf" .TP .B "\-P, \-\-postprocessor" \f[I]NAME\f[] Activate the specified post processor diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5 index 19a5812..9083d24 100644 --- a/data/man/gallery-dl.conf.5 +++ b/data/man/gallery-dl.conf.5 @@ -1,4 +1,4 @@ -.TH "GALLERY-DL.CONF" "5" "2023-10-03" "1.26.0" "gallery-dl Manual" +.TH "GALLERY-DL.CONF" "5" "2023-10-21" "1.26.1" "gallery-dl Manual" .\" disable hyphenation .nh .\" disable justification (adjust text to left margin only) @@ -1529,7 +1529,10 @@ Use with caution. .SS extractor.deviantart.group .IP "Type:" 6 -\f[I]bool\f[] +.br +* \f[I]bool\f[] +.br +* \f[I]string\f[] .IP "Default:" 9 \f[I]true\f[] @@ -1538,6 +1541,14 @@ Use with caution. Check whether the profile name in a given URL belongs to a group or a regular user. +When disabled, assume every given profile name +belongs to a regular user. + +Special values: + +.br +* \f[I]"skip"\f[]: Skip groups + .SS extractor.deviantart.include .IP "Type:" 6 @@ -1589,13 +1600,15 @@ literature and status updates. \f[I]bool\f[] .IP "Default:" 9 -\f[I]true\f[] +\f[I]false\f[] .IP "Description:" 4 Update \f[I]JSON Web Tokens\f[] (the \f[I]token\f[] URL parameter) of otherwise non-downloadable, low-resolution images to be able to download them in full resolution. +Note: No longer functional as of 2023-10-11 + .SS extractor.deviantart.mature .IP "Type:" 6 @@ -2415,7 +2428,20 @@ Limit the number of posts to download. \f[I]false\f[] .IP "Description:" 4 -Extract \f[I]username\f[] metadata +Extract \f[I]username\f[] metadata. + + +.SS extractor.kemonoparty.revisions +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract post revisions. + +Note: This requires 1 additional HTTP request per post. .SS extractor.khinsider.format @@ -2625,6 +2651,19 @@ Fetch media from renoted notes. Fetch media from replies to other notes. 
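The man-page entries above (and the `[moebooru]` section that follows) boil down to a handful of new or changed config keys. A sketch of the equivalent settings, written here as a Python dict mirroring the JSON in gallery-dl's configuration file; values are illustrative, not defaults, and `[moebooru]` stands for a concrete instance category such as `yandere`:

```python
# Options introduced or changed in 1.26.1, as they would nest inside
# e.g. ~/.config/gallery-dl/config.json.
settings = {
    "extractor": {
        "deviantart": {
            "group": "skip",  # new special value: skip group profiles
            "jwt": False,     # new default; non-functional since 2023-10-11
        },
        "kemonoparty": {
            "revisions": True,  # costs 1 additional HTTP request per post
        },
        "yandere": {  # any moebooru instance
            "pool": {"metadata": True},  # not supported by all instances
        },
    },
}
```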
+.SS extractor.[moebooru].pool.metadata +.IP "Type:" 6 +\f[I]bool\f[] + +.IP "Default:" 9 +\f[I]false\f[] + +.IP "Description:" 4 +Extract extended \f[I]pool\f[] metadata. + +Note: Not supported by all \f[I]moebooru\f[] instances. + + .SS extractor.newgrounds.flash .IP "Type:" 6 \f[I]bool\f[] diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 2eac0a1..9f12652 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -85,7 +85,7 @@ "group": true, "include": "gallery", "journals": "html", - "jwt": true, + "jwt": false, "mature": true, "metadata": false, "original": true, diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO index 022a2d6..95861dc 100644 --- a/gallery_dl.egg-info/PKG-INFO +++ b/gallery_dl.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: gallery-dl -Version: 1.26.0 +Version: 1.26.1 Summary: Command-line program to download image galleries and collections from several image hosting sites Home-page: https://github.com/mikf/gallery-dl Download-URL: https://github.com/mikf/gallery-dl/releases/latest @@ -109,9 +109,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.0/gallery-dl.exe>`__ +- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.exe>`__ (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__) -- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.0/gallery-dl.bin>`__ +- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.bin>`__ Nightly Builds diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt index e319eef..fb6cb4b 100644 --- a/gallery_dl.egg-info/SOURCES.txt +++ b/gallery_dl.egg-info/SOURCES.txt @@ -60,6 +60,7 @@ gallery_dl/extractor/blogger.py gallery_dl/extractor/booru.py gallery_dl/extractor/bunkr.py gallery_dl/extractor/catbox.py +gallery_dl/extractor/chevereto.py gallery_dl/extractor/comicvine.py gallery_dl/extractor/common.py gallery_dl/extractor/cyberdrop.py @@ -111,7 +112,6 @@ gallery_dl/extractor/instagram.py gallery_dl/extractor/issuu.py gallery_dl/extractor/itaku.py gallery_dl/extractor/itchio.py -gallery_dl/extractor/jpgfish.py gallery_dl/extractor/jschan.py gallery_dl/extractor/kabeuchi.py gallery_dl/extractor/keenspot.py diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index c5c5667..416cc9a 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -47,7 +47,7 @@ def load_cookies(cookiejar, browser_specification): def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None): path, container_id = _firefox_cookies_database(profile, container) - with DatabaseCopy(path) as db: + with DatabaseConnection(path) as db: sql = ("SELECT name, value, host, path, isSecure, expiry " "FROM moz_cookies") @@ -100,7 +100,7 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, path = _chrome_cookies_database(profile, config) _log_debug("Extracting cookies from %s", path) - with DatabaseCopy(path) as db: + with DatabaseConnection(path) as db: db.text_factory = bytes decryptor = get_cookie_decryptor( config["directory"], config["keyring"], keyring) @@ -814,7 +814,7 @@ class DataParser: self.skip_to(len(self._data), description) -class DatabaseCopy(): +class DatabaseConnection(): def __init__(self, path): self.path = path @@ -823,12 +823,26 @@ class DatabaseCopy(): def 
__enter__(self): try: + # https://www.sqlite.org/uri.html#the_uri_path + path = self.path.replace("?", "%3f").replace("#", "%23") + if util.WINDOWS: + path = "/" + os.path.abspath(path) + + uri = "file:{}?mode=ro&immutable=1".format(path) + self.database = sqlite3.connect( + uri, uri=True, isolation_level=None, check_same_thread=False) + return self.database + except Exception as exc: + _log_debug("Falling back to temporary database copy (%s: %s)", + exc.__class__.__name__, exc) + + try: self.directory = tempfile.TemporaryDirectory(prefix="gallery-dl-") path_copy = os.path.join(self.directory.name, "copy.sqlite") shutil.copyfile(self.path, path_copy) - self.database = db = sqlite3.connect( + self.database = sqlite3.connect( path_copy, isolation_level=None, check_same_thread=False) - return db + return self.database except BaseException: if self.directory: self.directory.cleanup() @@ -836,7 +850,8 @@ class DatabaseCopy(): def __exit__(self, exc, value, tb): self.database.close() - self.directory.cleanup() + if self.directory: + self.directory.cleanup() def Popen_communicate(*args): diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py index f018d3e..27ac7c5 100644 --- a/gallery_dl/extractor/4chanarchives.py +++ b/gallery_dl/extractor/4chanarchives.py @@ -20,6 +20,7 @@ class _4chanarchivesThreadExtractor(Extractor): directory_fmt = ("{category}", "{board}", "{thread} - {title}") filename_fmt = "{no}-{filename}.{extension}" archive_fmt = "{board}_{thread}_{no}" + referer = False pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)" example = "https://4chanarchives.com/board/a/thread/12345/" diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 3abe74b..1c1473a 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -28,6 +28,7 @@ modules = [ "blogger", "bunkr", "catbox", + "chevereto", "comicvine", "cyberdrop", "danbooru", @@ -73,7 +74,6 @@ modules = [ "issuu", "itaku", "itchio", - "jpgfish", "jschan", "kabeuchi", "keenspot", diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 5509f5a..26123b8 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -12,6 +12,8 @@ from .lolisafe import LolisafeAlbumExtractor from .. 
import text from urllib.parse import urlsplit, urlunsplit +BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)" + MEDIA_DOMAIN_OVERRIDES = { "cdn9.bunkr.ru" : "c9.bunkr.ru", "cdn12.bunkr.ru": "media-files12.bunkr.la", @@ -28,7 +30,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): """Extractor for bunkrr.su albums""" category = "bunkr" root = "https://bunkrr.su" - pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)" + pattern = BASE_PATTERN + r"/a/([^/?#]+)" example = "https://bunkrr.su/a/ID" def fetch_album(self, album_id): @@ -53,11 +55,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): for url in urls: if url.startswith("/"): try: - page = self.request(self.root + text.unescape(url)).text - if url[1] == "v": - url = text.extr(page, '<source src="', '"') - else: - url = text.extr(page, '<img src="', '"') + url = self._extract_file(text.unescape(url)) except Exception as exc: self.log.error("%s: %s", exc.__class__.__name__, exc) continue @@ -72,3 +70,37 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): url = urlunsplit((scheme, domain, path, query, fragment)) yield {"file": text.unescape(url)} + + def _extract_file(self, path): + page = self.request(self.root + path).text + if path[1] == "v": + url = text.extr(page, '<source src="', '"') + else: + url = text.extr(page, '<img src="', '"') + if not url: + url = text.rextract( + page, ' href="', '"', page.rindex("Download"))[0] + return url + + +class BunkrMediaExtractor(BunkrAlbumExtractor): + """Extractor for bunkrr.su media links""" + subcategory = "media" + directory_fmt = ("{category}",) + pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)" + example = "https://bunkrr.su/v/FILENAME" + + def fetch_album(self, album_id): + try: + url = self._extract_file(urlsplit(self.url).path) + except Exception as exc: + self.log.error("%s: %s", exc.__class__.__name__, exc) + return (), {} + + return ({"file": text.unescape(url)},), { + "album_id" : "", + "album_name" : "", + "album_size" : -1, + "description": "", + "count" : 1, + } diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py new file mode 100644 index 0000000..21166bd --- /dev/null +++ b/gallery_dl/extractor/chevereto.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Chevereto galleries""" + +from .common import BaseExtractor, Message +from .. 
import text + + +class CheveretoExtractor(BaseExtractor): + """Base class for chevereto extractors""" + basecategory = "chevereto" + directory_fmt = ("{category}", "{user}", "{album}",) + archive_fmt = "{id}" + + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.path = match.group(match.lastindex) + + def _pagination(self, url): + while url: + page = self.request(url).text + + for item in text.extract_iter( + page, '<div class="list-item-image ', 'image-container'): + yield text.extr(item, '<a href="', '"') + + url = text.extr(page, '<a data-pagination="next" href="', '" ><') + + +BASE_PATTERN = CheveretoExtractor.update({ + "jpgfish": { + "root": "https://jpg2.su", + "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)", + }, + "pixl": { + "root": "https://pixl.li", + "pattern": r"pixl\.(?:li|is)", + }, + "imgkiwi": { + "root": "https://img.kiwi", + "pattern": r"img\.kiwi", + }, + "deltaporno": { + "root": "https://gallery.deltaporno.com", + "pattern": r"gallery\.deltaporno\.com", + }, +}) + + +class CheveretoImageExtractor(CheveretoExtractor): + """Extractor for chevereto Images""" + subcategory = "image" + pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)" + example = "https://jpg2.su/img/TITLE.ID" + + def items(self): + url = self.root + self.path + extr = text.extract_from(self.request(url).text) + + image = { + "id" : self.path.rpartition(".")[2], + "url" : extr('<meta property="og:image" content="', '"'), + "album": text.extr(extr("Added to <a", "/a>"), ">", "<"), + "user" : extr('username: "', '"'), + } + + text.nameext_from_url(image["url"], image) + yield Message.Directory, image + yield Message.Url, image["url"], image + + +class CheveretoAlbumExtractor(CheveretoExtractor): + """Extractor for chevereto Albums""" + subcategory = "album" + pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)" + example = "https://jpg2.su/album/TITLE.ID" + + def items(self): + url = self.root + self.path + data = {"_extractor": CheveretoImageExtractor} + + if self.path.endswith("/sub"): + albums = self._pagination(url) + else: + albums = (url,) + + for album in albums: + for image in self._pagination(album): + yield Message.Queue, image, data + + +class CheveretoUserExtractor(CheveretoExtractor): + """Extractor for chevereto Users""" + subcategory = "user" + pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)" + example = "https://jpg2.su/USER" + + def items(self): + url = self.root + self.path + + if self.path.endswith("/albums"): + data = {"_extractor": CheveretoAlbumExtractor} + else: + data = {"_extractor": CheveretoImageExtractor} + + for url in self._pagination(url): + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 9421096..2c37ef1 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -42,7 +42,7 @@ class DeviantartExtractor(Extractor): self.offset = 0 def _init(self): - self.jwt = self.config("jwt", True) + self.jwt = self.config("jwt", False) self.flat = self.config("flat", True) self.extra = self.config("extra", False) self.quality = self.config("quality", "100") @@ -91,14 +91,20 @@ class DeviantartExtractor(Extractor): return True def items(self): - if self.user and self.config("group", True): - profile = self.api.user_profile(self.user) - self.group = not profile - if self.group: - self.subcategory = "group-" + self.subcategory - self.user = self.user.lower() - else: - self.user = profile["user"]["username"] + if self.user: + 
group = self.config("group", True) + if group: + profile = self.api.user_profile(self.user) + if profile: + self.user = profile["user"]["username"] + self.group = False + elif group == "skip": + self.log.info("Skipping group '%s'", self.user) + raise exception.StopExtraction() + else: + self.subcategory = "group-" + self.subcategory + self.user = self.user.lower() + self.group = True for deviation in self.deviations(): if isinstance(deviation, tuple): @@ -228,7 +234,7 @@ class DeviantartExtractor(Extractor): if self.comments: deviation["comments"] = ( - self.api.comments(deviation["deviationid"], target="deviation") + self._extract_comments(deviation["deviationid"], "deviation") if deviation["stats"]["comments"] else () ) @@ -395,6 +401,28 @@ class DeviantartExtractor(Extractor): binascii.b2a_base64(payload).rstrip(b"=\n").decode()) ) + def _extract_comments(self, target_id, target_type="deviation"): + results = None + comment_ids = [None] + + while comment_ids: + comments = self.api.comments( + target_id, target_type, comment_ids.pop()) + + if results: + results.extend(comments) + else: + results = comments + + # parent comments, i.e. nodes with at least one child + parents = {c["parentid"] for c in comments} + # comments with more than one reply + replies = {c["commentid"] for c in comments if c["replies"]} + # add comment UUIDs with replies that are not parent to any node + comment_ids.extend(replies - parents) + + return results + def _limited_request(self, url, **kwargs): """Limits HTTP requests to one every 2 seconds""" kwargs["fatal"] = None @@ -698,7 +726,7 @@ class DeviantartStatusExtractor(DeviantartExtractor): deviation["stats"] = {"comments": comments_count} if self.comments: deviation["comments"] = ( - self.api.comments(deviation["statusid"], target="status") + self._extract_comments(deviation["statusid"], "status") if comments_count else () ) @@ -1072,11 +1100,17 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination_list(endpoint, params) - def comments(self, id, target, offset=0): + def comments(self, target_id, target_type="deviation", + comment_id=None, offset=0): """Fetch comments posted on a target""" - endpoint = "/comments/{}/{}".format(target, id) - params = {"maxdepth": "5", "offset": offset, "limit": 50, - "mature_content": self.mature} + endpoint = "/comments/{}/{}".format(target_type, target_id) + params = { + "commentid" : comment_id, + "maxdepth" : "5", + "offset" : offset, + "limit" : 50, + "mature_content": self.mature, + } return self._pagination_list(endpoint, params=params, key="thread") def deviation(self, deviation_id, public=None): diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index f1d51e2..4a67695 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -42,7 +42,11 @@ class FantiaExtractor(Extractor): post = self._get_post_data(post_id) post["num"] = 0 - for content in self._get_post_contents(post): + contents = self._get_post_contents(post) + post["content_count"] = len(contents) + post["content_num"] = 0 + + for content in contents: files = self._process_content(post, content) yield Message.Directory, post @@ -59,6 +63,8 @@ class FantiaExtractor(Extractor): post["content_filename"] or file["file_url"], post) yield Message.Url, file["file_url"], post + post["content_num"] += 1 + def posts(self): """Return post IDs""" @@ -131,6 +137,7 @@ class FantiaExtractor(Extractor): post["content_filename"] = content.get("filename") or "" post["content_id"] = content["id"] 
post["content_comment"] = content.get("comment") or "" + post["content_num"] += 1 post["plan"] = content["plan"] or self._empty_plan files = [] diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index 4c02000..8ba23c2 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -72,13 +72,11 @@ class HentaifoundryExtractor(Extractor): extr = text.extract_from(page, page.index('id="picBox"')) data = { + "index" : text.parse_int(path.rsplit("/", 2)[1]), "title" : text.unescape(extr('class="imageTitle">', '<')), "artist" : text.unescape(extr('/profile">', '<')), - "width" : text.parse_int(extr('width="', '"')), - "height" : text.parse_int(extr('height="', '"')), - "index" : text.parse_int(path.rsplit("/", 2)[1]), - "src" : text.urljoin(self.root, text.unescape(extr( - 'src="', '"'))), + "_body" : extr( + '<div class="boxbody"', '<div class="boxfooter"'), "description": text.unescape(text.remove_html(extr( '>Description</div>', '</section>') .replace("\r\n", "\n"), "", "")), @@ -92,6 +90,20 @@ class HentaifoundryExtractor(Extractor): ">Tags </span>", "</div>")), } + body = data["_body"] + if "<object " in body: + data["src"] = text.urljoin(self.root, text.unescape(text.extr( + body, 'name="movie" value="', '"'))) + data["width"] = text.parse_int(text.extr( + body, "name='width' value='", "'")) + data["height"] = text.parse_int(text.extr( + body, "name='height' value='", "'")) + else: + data["src"] = text.urljoin(self.root, text.unescape(text.extr( + body, 'src="', '"'))) + data["width"] = text.parse_int(text.extr(body, 'width="', '"')) + data["height"] = text.parse_int(text.extr(body, 'height="', '"')) + return text.nameext_from_url(data["src"], data) def _parse_story(self, html): diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 1b74180..6c0684e 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -84,6 +84,13 @@ class ImgbbExtractor(Extractor): raise exception.AuthenticationError() return self.cookies + def _extract_resource(self, page): + return util.json_loads(text.extr( + page, "CHV.obj.resource=", "};") + "}") + + def _extract_user(self, page): + return self._extract_resource(page).get("user") or {} + def _pagination(self, page, endpoint, params): data = None seek, pos = text.extract(page, 'data-seek="', '"') @@ -99,7 +106,7 @@ class ImgbbExtractor(Extractor): for img in text.extract_iter(page, "data-object='", "'"): yield util.json_loads(text.unquote(img)) if data: - if params["seek"] == data["seekEnd"]: + if not data["seekEnd"] or params["seek"] == data["seekEnd"]: return params["seek"] = data["seekEnd"] params["page"] += 1 @@ -124,12 +131,14 @@ class ImgbbAlbumExtractor(ImgbbExtractor): self.page_url = "https://ibb.co/album/" + self.album_id def metadata(self, page): - album, pos = text.extract(page, '"og:title" content="', '"') - user , pos = text.extract(page, 'rel="author">', '<', pos) + album = text.extr(page, '"og:title" content="', '"') + user = self._extract_user(page) return { - "album_id" : self.album_id, - "album_name": text.unescape(album), - "user" : user.lower() if user else "", + "album_id" : self.album_id, + "album_name" : text.unescape(album), + "user" : user.get("username") or "", + "user_id" : user.get("id") or "", + "displayname": user.get("name") or "", } def images(self, page): @@ -158,7 +167,12 @@ class ImgbbUserExtractor(ImgbbExtractor): self.page_url = "https://{}.imgbb.com/".format(self.user) def metadata(self, page): - 
return {"user": self.user} + user = self._extract_user(page) + return { + "user" : user.get("username") or self.user, + "user_id" : user.get("id") or "", + "displayname": user.get("name") or "", + } def images(self, page): user = text.extr(page, '.obj.resource={"id":"', '"') @@ -181,15 +195,20 @@ class ImgbbImageExtractor(ImgbbExtractor): def items(self): url = "https://ibb.co/" + self.image_id - extr = text.extract_from(self.request(url).text) + page = self.request(url).text + extr = text.extract_from(page) + user = self._extract_user(page) image = { "id" : self.image_id, - "title" : text.unescape(extr('"og:title" content="', '"')), + "title" : text.unescape(extr( + '"og:title" content="', ' hosted at ImgBB"')), "url" : extr('"og:image" content="', '"'), "width" : text.parse_int(extr('"og:image:width" content="', '"')), "height": text.parse_int(extr('"og:image:height" content="', '"')), - "user" : extr('rel="author">', '<').lower(), + "user" : user.get("username") or "", + "user_id" : user.get("id") or "", + "displayname": user.get("name") or "", } image["extension"] = text.ext_from_url(image["url"]) diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py deleted file mode 100644 index 8862a7b..0000000 --- a/gallery_dl/extractor/jpgfish.py +++ /dev/null @@ -1,105 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://jpg1.su/""" - -from .common import Extractor, Message -from .. import text - -BASE_PATTERN = r"(?:https?://)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)" - - -class JpgfishExtractor(Extractor): - """Base class for jpgfish extractors""" - category = "jpgfish" - root = "https://jpg1.su" - directory_fmt = ("{category}", "{user}", "{album}",) - archive_fmt = "{id}" - - def _pagination(self, url): - while url: - page = self.request(url).text - - for item in text.extract_iter( - page, '<div class="list-item-image ', 'image-container'): - yield text.extract(item, '<a href="', '"')[0] - - url = text.extract( - page, '<a data-pagination="next" href="', '" ><')[0] - - -class JpgfishImageExtractor(JpgfishExtractor): - """Extractor for jpgfish Images""" - subcategory = "image" - pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))" - example = "https://jpg1.su/img/TITLE.ID" - - def __init__(self, match): - JpgfishExtractor.__init__(self, match) - self.path, self.image_id = match.groups() - - def items(self): - url = "{}/img/{}".format(self.root, self.path) - extr = text.extract_from(self.request(url).text) - - image = { - "id" : self.image_id, - "url" : extr('<meta property="og:image" content="', '"'), - "album": text.extract(extr( - "Added to <a", "/a>"), ">", "<")[0] or "", - "user" : extr('username: "', '"'), - } - - text.nameext_from_url(image["url"], image) - yield Message.Directory, image - yield Message.Url, image["url"], image - - -class JpgfishAlbumExtractor(JpgfishExtractor): - """Extractor for jpgfish Albums""" - subcategory = "album" - pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?" 
- example = "https://jpg1.su/album/TITLE.ID" - - def __init__(self, match): - JpgfishExtractor.__init__(self, match) - self.album, self.sub_albums = match.groups() - - def items(self): - url = "{}/a/{}".format(self.root, self.album) - data = {"_extractor": JpgfishImageExtractor} - - if self.sub_albums: - albums = self._pagination(url + "/sub") - else: - albums = (url,) - - for album in albums: - for image in self._pagination(album): - yield Message.Queue, image, data - - -class JpgfishUserExtractor(JpgfishExtractor): - """Extractor for jpgfish Users""" - subcategory = "user" - pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?" - example = "https://jpg1.su/USER" - - def __init__(self, match): - JpgfishExtractor.__init__(self, match) - self.user, self.albums = match.groups() - - def items(self): - url = "{}/{}".format(self.root, self.user) - - if self.albums: - url += "/albums" - data = {"_extractor": JpgfishAlbumExtractor} - else: - data = {"_extractor": JpgfishImageExtractor} - - for url in self._pagination(url): - yield Message.Queue, url, data diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 894c671..1596cfb 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. import text, exception -from ..cache import cache +from ..cache import cache, memcache import itertools import re @@ -70,8 +70,7 @@ class KemonopartyExtractor(Extractor): self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers post["date"] = text.parse_datetime( - post["published"] or post["added"], - "%a, %d %b %Y %H:%M:%S %Z") + post["published"] or post["added"], "%Y-%m-%dT%H:%M:%S") if username: post["username"] = username if comments: @@ -197,14 +196,25 @@ class KemonopartyExtractor(Extractor): dms = [] for dm in text.extract_iter(page, "<article", "</article>"): + footer = text.extr(dm, "<footer", "</footer>") dms.append({ - "body": text.unescape(text.extract( + "body": text.unescape(text.extr( dm, "<pre>", "</pre></", - )[0].strip()), - "date": text.extr(dm, 'datetime="', '"'), + ).strip()), + "date": text.extr(footer, 'Published: ', '\n'), }) return dms + @memcache(keyarg=1) + def _discord_channels(self, server): + url = "{}/api/v1/discord/channel/lookup/{}".format( + self.root, server) + return self.request(url).json() + + @memcache(keyarg=1) + def _post_revisions(self, url): + return self.request(url + "/revisions").json() + def _validate(response): return (response.headers["content-length"] != "9" or @@ -214,48 +224,82 @@ def _validate(response): class KemonopartyUserExtractor(KemonopartyExtractor): """Extractor for all posts from a kemono.party user listing""" subcategory = "user" - pattern = USER_PATTERN + r"/?(?:\?o=(\d+))?(?:$|[?#])" + pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|[?#])" example = "https://kemono.party/SERVICE/user/12345" def __init__(self, match): - _, _, service, user_id, offset = match.groups() + _, _, service, user_id, self.query = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) - self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id) + self.api_url = "{}/api/v1/{}/user/{}".format( + self.root, service, user_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) - self.offset = text.parse_int(offset) def posts(self): url = self.api_url - params = {"o": self.offset} + params = text.parse_query(self.query) + params["o"] = 
text.parse_int(params.get("o")) + revisions = self.config("revisions") while True: posts = self.request(url, params=params).json() - yield from posts - cnt = len(posts) - if cnt < 25: - return - params["o"] += cnt + if revisions: + for post in posts: + post["revision_id"] = 0 + yield post + post_url = "{}/post/{}".format(self.api_url, post["id"]) + try: + revs = self._post_revisions(post_url) + except exception.HttpError: + pass + else: + yield from revs + else: + yield from posts + + if len(posts) < 50: + break + params["o"] += 50 class KemonopartyPostExtractor(KemonopartyExtractor): """Extractor for a single kemono.party post""" subcategory = "post" - pattern = USER_PATTERN + r"/post/([^/?#]+)" + pattern = USER_PATTERN + r"/post/([^/?#]+)(/revisions?(?:/(\d*))?)?" example = "https://kemono.party/SERVICE/user/12345/post/12345" def __init__(self, match): - _, _, service, user_id, post_id = match.groups() + _, _, service, user_id, post_id, self.revision, self.revision_id = \ + match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) - self.api_url = "{}/api/{}/user/{}/post/{}".format( + self.api_url = "{}/api/v1/{}/user/{}/post/{}".format( self.root, service, user_id, post_id) self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) def posts(self): - posts = self.request(self.api_url).json() - return (posts[0],) if len(posts) > 1 else posts + if not self.revision: + post = self.request(self.api_url).json() + if self.config("revisions"): + post["revision_id"] = 0 + try: + revs = self._post_revisions(self.api_url) + except exception.HttpError: + pass + else: + return itertools.chain((post,), revs) + return (post,) + + revs = self._post_revisions(self.api_url) + if not self.revision_id: + return revs + + for rev in revs: + if str(rev["revision_id"]) == self.revision_id: + return (rev,) + + raise exception.NotFoundError("revision") class KemonopartyDiscordExtractor(KemonopartyExtractor): @@ -270,11 +314,29 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - _, _, self.server, self.channel, self.channel_name = match.groups() + _, _, self.server, self.channel_id, self.channel = match.groups() + self.channel_name = "" def items(self): self._prepare_ddosguard_cookies() + if self.channel_id: + self.channel_name = self.channel + else: + if self.channel.isdecimal() and len(self.channel) >= 16: + key = "id" + else: + key = "name" + + for channel in self._discord_channels(self.server): + if channel[key] == self.channel: + break + else: + raise exception.NotFoundError("channel") + + self.channel_id = channel["id"] + self.channel_name = channel["name"] + find_inline = re.compile( r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall @@ -299,7 +361,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): post["channel_name"] = self.channel_name post["date"] = text.parse_datetime( - post["published"], "%a, %d %b %Y %H:%M:%S %Z") + post["published"], "%Y-%m-%dT%H:%M:%S.%f") post["count"] = len(files) yield Message.Directory, post @@ -319,27 +381,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): yield Message.Url, url, post def posts(self): - if self.channel is None: - url = "{}/api/discord/channels/lookup?q={}".format( - self.root, self.server) - for channel in self.request(url).json(): - if channel["name"] == self.channel_name: - self.channel = channel["id"] - break - else: - raise 
exception.NotFoundError("channel") - - url = "{}/api/discord/channel/{}".format(self.root, self.channel) - params = {"skip": 0} + url = "{}/api/v1/discord/channel/{}".format( + self.root, self.channel_id) + params = {"o": 0} while True: posts = self.request(url, params=params).json() yield from posts - cnt = len(posts) - if cnt < 25: + if len(posts) < 150: break - params["skip"] += cnt + params["o"] += 150 class KemonopartyDiscordServerExtractor(KemonopartyExtractor): @@ -352,11 +404,7 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor): self.server = match.group(3) def items(self): - url = "{}/api/discord/channels/lookup?q={}".format( - self.root, self.server) - channels = self.request(url).json() - - for channel in channels: + for channel in self._discord_channels(self.server): url = "{}/discord/server/{}/channel/{}#{}".format( self.root, self.server, channel["id"], channel["name"]) channel["_extractor"] = KemonopartyDiscordExtractor diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 145dd51..e97d273 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -124,6 +124,11 @@ class MoebooruPoolExtractor(MoebooruExtractor): self.pool_id = match.group(match.lastindex) def metadata(self): + if self.config("metadata"): + url = "{}/pool/show/{}.json".format(self.root, self.pool_id) + pool = self.request(url).json() + pool.pop("posts", None) + return {"pool": pool} return {"pool": text.parse_int(self.pool_id)} def posts(self): diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 1bcc915..a6971e8 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -54,14 +54,31 @@ class NewgroundsExtractor(Extractor): if metadata: post.update(metadata) yield Message.Directory, post + post["num"] = 0 yield Message.Url, url, text.nameext_from_url(url, post) - for num, url in enumerate(text.extract_iter( - post["_comment"], 'data-smartload-src="', '"'), 1): - post["num"] = num - post["_index"] = "{}_{:>02}".format(post["index"], num) + if "_multi" in post: + for data in post["_multi"]: + post["num"] += 1 + post["_index"] = "{}_{:>02}".format( + post["index"], post["num"]) + post.update(data) + url = data["image"] + + text.nameext_from_url(url, post) + yield Message.Url, url, post + + if "_fallback" in post: + del post["_fallback"] + + for url in text.extract_iter( + post["_comment"], 'data-smartload-src="', '"'): + post["num"] += 1 + post["_index"] = "{}_{:>02}".format( + post["index"], post["num"]) url = text.ensure_http_scheme(url) - yield Message.Url, url, text.nameext_from_url(url, post) + text.nameext_from_url(url, post) + yield Message.Url, url, post else: self.log.warning( "Unable to get download URL for '%s'", post_url) @@ -153,8 +170,7 @@ class NewgroundsExtractor(Extractor): data["post_url"] = post_url return data - @staticmethod - def _extract_image_data(extr, url): + def _extract_image_data(self, extr, url): full = text.extract_from(util.json_loads(extr( '"full_image_text":', '});'))) data = { @@ -172,8 +188,34 @@ class NewgroundsExtractor(Extractor): index = data["url"].rpartition("/")[2].partition("_")[0] data["index"] = text.parse_int(index) data["_index"] = index + + image_data = extr("let imageData =", "\n];") + if image_data: + data["_multi"] = self._extract_images_multi(image_data) + else: + art_images = extr('<div class="art-images', '\n</div>') + if art_images: + data["_multi"] = self._extract_images_art(art_images, data) + return data + 
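The multi-image code above, like the imgbb `CHV.obj.resource` parsing earlier, relies on one trick: slice a JavaScript array or object literal out of the page, then re-append the bracket that the end marker consumed before JSON-decoding it. A self-contained sketch with a made-up HTML excerpt; note that `_extract_images_multi()` additionally skips `imageData[0]`, which duplicates the main image already yielded:

```python
import json

html = """<script>
let imageData =
[{"image": "https://example.org/img/1.png"},
 {"image": "https://example.org/img/2.png"}
];
</script>"""

start = html.index("let imageData =") + len("let imageData =")
end = html.index("\n];", start)  # the "\n];" marker eats the closing "]"
image_data = json.loads(html[start:end] + "]")  # so put it back

print([entry["image"] for entry in image_data])
# ['https://example.org/img/1.png', 'https://example.org/img/2.png']
```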
def _extract_images_multi(self, html): + data = util.json_loads(html + "]") + yield from data[1:] + + def _extract_images_art(self, html, data): + ext = text.ext_from_url(data["url"]) + for url in text.extract_iter(html, 'data-smartload-src="', '"'): + url = text.ensure_http_scheme(url) + url = url.replace("/medium_views/", "/images/", 1) + if text.ext_from_url(url) == "webp": + yield { + "image" : url.replace(".webp", "." + ext), + "_fallback": (url,), + } + else: + yield {"image": url} + @staticmethod def _extract_audio_data(extr, url): index = url.split("/")[5] diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 729ceaf..6ac9a83 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -277,7 +277,7 @@ class PatreonCreatorExtractor(PatreonExtractor): try: data = self._extract_bootstrap(page) - campaign_id = data["creator"]["data"]["id"] + campaign_id = data["campaign"]["data"]["id"] except (KeyError, ValueError): raise exception.NotFoundError("creator") diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 8553312..cd2ba3d 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -125,7 +125,8 @@ class RedditExtractor(Extractor): if match: extra.append(match.group(1)) elif not match_user(url) and not match_subreddit(url): - if previews and "preview" in data: + if previews and "comment" not in data and \ + "preview" in data: data["_fallback"] = self._previews(data) yield Message.Queue, text.unescape(url), data if "_fallback" in data: diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index e246405..6185acb 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -146,11 +146,17 @@ class RedgifsCollectionsExtractor(RedgifsExtractor): class RedgifsNichesExtractor(RedgifsExtractor): """Extractor for redgifs niches""" subcategory = "niches" - pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)" + pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)/?" 
+ r"(?:\?([^#]+))?$") example = "https://www.redgifs.com/niches/NAME" + def __init__(self, match): + RedgifsExtractor.__init__(self, match) + self.query = match.group(2) + def gifs(self): - return self.api.niches(self.key) + order = text.parse_query(self.query).get("order") + return self.api.niches(self.key, order or "new") class RedgifsSearchExtractor(RedgifsExtractor): @@ -232,9 +238,10 @@ class RedgifsAPI(): endpoint = "/v2/users/{}/collections".format(user) return self._pagination(endpoint, key="collections") - def niches(self, niche): + def niches(self, niche, order): endpoint = "/v2/niches/{}/gifs".format(niche) - return self._pagination(endpoint) + params = {"count": 30, "order": order} + return self._pagination(endpoint, params) def search(self, params): endpoint = "/v2/gifs/search" diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 745a351..dc35511 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor): """Extractor for single posts from sankaku.app""" subcategory = "post" archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/post/show/([0-9a-f]+)" + pattern = BASE_PATTERN + r"/post(?:s|/show)/([0-9a-f]+)" example = "https://sankaku.app/post/show/12345" def __init__(self, match): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 3895c74..61e871e 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -306,6 +306,7 @@ class TwitterExtractor(Extractor): "user" : self._user or author, "lang" : tweet["lang"], "source" : text.extr(source, ">", "<"), + "sensitive" : tget("possibly_sensitive"), "favorite_count": tget("favorite_count"), "quote_count" : tget("quote_count"), "reply_count" : tget("reply_count"), @@ -451,6 +452,7 @@ class TwitterExtractor(Extractor): "id_str": id_str, "lang": None, "user": user, + "source": "><", "entities": {}, "extended_entities": { "media": [ diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py index 6f152ed..8e6b842 100644 --- a/gallery_dl/extractor/warosu.py +++ b/gallery_dl/extractor/warosu.py @@ -47,7 +47,7 @@ class WarosuThreadExtractor(Extractor): def metadata(self, page): boardname = text.extr(page, "<title>", "</title>") - title = text.extr(page, 'filetitle" itemprop="name">', '<') + title = text.unescape(text.extr(page, "class=filetitle>", "<")) return { "board" : self.board, "board_name": boardname.rpartition(" - ")[2], @@ -57,39 +57,37 @@ class WarosuThreadExtractor(Extractor): def posts(self, page): """Build a list of all post objects""" - page = text.extr(page, '<div class="content">', '<table>') - needle = '<table itemscope itemtype="http://schema.org/Comment">' + page = text.extr(page, "<div class=content", "</form>") + needle = "<table>" return [self.parse(post) for post in page.split(needle)] def parse(self, post): """Build post object by extracting data from an HTML post""" data = self._extract_post(post) - if "<span>File:" in post: + if "<span> File:" in post: self._extract_image(post, data) part = data["image"].rpartition("/")[2] data["tim"], _, data["extension"] = part.partition(".") data["ext"] = "." 
+ data["extension"] return data - @staticmethod - def _extract_post(post): + def _extract_post(self, post): extr = text.extract_from(post) return { - "no" : extr('id="p', '"'), - "name": extr('<span itemprop="name">', "</span>"), - "time": extr('<span class="posttime" title="', '000">'), - "now" : extr("", "<"), + "no" : extr("id=p", ">"), + "name": extr("class=postername>", "<").strip(), + "time": extr("class=posttime title=", "000>"), + "now" : extr("", "<").strip(), "com" : text.unescape(text.remove_html(extr( - '<blockquote><p itemprop="text">', '</p></blockquote>' - ).strip())), + "<blockquote>", "</blockquote>").strip())), } - @staticmethod - def _extract_image(post, data): + def _extract_image(self, post, data): extr = text.extract_from(post) - data["fsize"] = extr("<span>File: ", ", ") + data["fsize"] = extr("<span> File: ", ", ") data["w"] = extr("", "x") data["h"] = extr("", ", ") - data["filename"] = text.unquote(extr("", "<").rpartition(".")[0]) - extr("<br />", "") - data["image"] = "https:" + extr('<a href="', '"') + data["filename"] = text.unquote(extr( + "", "<").rstrip().rpartition(".")[0]) + extr("<br>", "") + data["image"] = self.root + extr("<a href=", ">") diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 08e6e70..1982b71 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -510,6 +510,8 @@ def build_parser(): dest="postprocessors", metavar="CMD", action=AppendCommandAction, const={"name": "exec"}, help=("Execute CMD for each downloaded file. " + "Supported replacement fields are " + "{} or {_path}, {_directory}, {_filename}. " "Example: --exec \"convert {} {}.png && rm {}\""), ) postprocessor.add_argument( @@ -518,7 +520,8 @@ def build_parser(): action=AppendCommandAction, const={ "name": "exec", "event": "finalize"}, help=("Execute CMD after all files were downloaded successfully. " - "Example: --exec-after \"cd {} && convert * ../doc.pdf\""), + "Example: --exec-after \"cd {_directory} " + "&& convert * ../doc.pdf\""), ) postprocessor.add_argument( "-P", "--postprocessor", diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index afa828c..e7ed2f6 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -12,6 +12,7 @@ from .common import PostProcessor from .. 
import util, formatter import subprocess import os +import re if util.WINDOWS: @@ -32,6 +33,7 @@ class ExecPP(PostProcessor): args = options["command"] if isinstance(args, str): self.args = args + self._sub = re.compile(r"\{(_directory|_filename|_path|)\}").sub execute = self.exec_string else: self.args = [formatter.parse(arg) for arg in args] @@ -69,11 +71,8 @@ if archive and archive.check(pathfmt.kwdict): return - if pathfmt.realpath: - args = self.args.replace("{}", quote(pathfmt.realpath)) - else: - args = self.args.replace("{}", quote(pathfmt.realdirectory)) - + self.pathfmt = pathfmt + args = self._sub(self._replace, self.args) self._exec(args, True) if archive: @@ -90,5 +89,13 @@ self.log.debug("Running '%s'", args) subprocess.Popen(args, shell=shell) + def _replace(self, match): + name = match.group(1) + if name == "_directory": + return quote(self.pathfmt.realdirectory) + if name == "_filename": + return quote(self.pathfmt.filename) + return quote(self.pathfmt.realpath) + __postprocessor__ = ExecPP diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d06d9d6..593cffa 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.26.0" +__version__ = "1.26.1" diff --git a/test/test_extractor.py b/test/test_extractor.py index 9387f5b..29ccf97 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -238,7 +238,7 @@ class TestExtractorWait(unittest.TestCase): until = datetime.fromtimestamp(until) o = self._isotime_to_seconds(output) u = self._isotime_to_seconds(until.time().isoformat()[:8]) - self.assertLess(o-u, 1.0) + self.assertLessEqual(o-u, 1.0) @staticmethod def _isotime_to_seconds(isotime): diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index c00144e..b64df88 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -168,7 +168,7 @@ class ExecTest(BasePostprocessorTest): def test_command_string(self): self._create({ - "command": "echo {} && rm {};", + "command": "echo {} {_path} {_directory} {_filename} && rm {};", }) with patch("subprocess.Popen") as p: @@ -178,7 +178,11 @@ self._trigger(("after",)) p.assert_called_once_with( - "echo {0} && rm {0};".format(self.pathfmt.realpath), shell=True) + "echo {0} {0} {1} {2} && rm {0};".format( + self.pathfmt.realpath, + self.pathfmt.realdirectory, + self.pathfmt.filename), + shell=True) i.wait.assert_called_once_with() def test_command_list(self): diff --git a/test/test_results.py b/test/test_results.py index 4fb22c7..f275bbf 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -201,6 +201,9 @@ class TestExtractorResults(unittest.TestCase): self.assertEqual(str(value), test[3:], msg=key) elif test.startswith("type:"): self.assertEqual(type(value).__name__, test[5:], msg=key) + elif test.startswith("len:"): + self.assertIsInstance(value, (list, tuple), msg=key) + self.assertEqual(len(value), int(test[4:]), msg=key) else: self.assertEqual(value, test, msg=key) else:
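To round off the `--exec` changes from `gallery_dl/option.py`, `gallery_dl/postprocessor/exec.py`, and the tests above: the new replacement fields are handled by a single regex substitution. A sketch of that logic with made-up paths; the real `ExecPP` additionally shell-quotes every substituted value:

```python
import re

# Same pattern as ExecPP: the empty alternative also matches plain "{}",
# so "{}" and "{_path}" expand identically.
SUB = re.compile(r"\{(_directory|_filename|_path|)\}").sub

def expand(command, directory, filename):
    path = directory + filename
    def repl(match):
        name = match.group(1)
        if name == "_directory":
            return directory
        if name == "_filename":
            return filename
        return path  # "" (plain "{}") or "_path"
    return SUB(repl, command)

print(expand("convert {} {_directory}thumb_{_filename}",
             "/tmp/gallery-dl/", "image.png"))
# convert /tmp/gallery-dl/image.png /tmp/gallery-dl/thumb_image.png
```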
