31 files changed, 409 insertions, 110 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 34607f2..ad34930 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,44 @@
 # Changelog
 
+## 1.26.2 - 2023-11-04
+### Extractors
+#### Additions
+- [4archive] add `thread` and `board` extractors ([#1262](https://github.com/mikf/gallery-dl/issues/1262), [#2418](https://github.com/mikf/gallery-dl/issues/2418), [#4400](https://github.com/mikf/gallery-dl/issues/4400), [#4710](https://github.com/mikf/gallery-dl/issues/4710), [#4714](https://github.com/mikf/gallery-dl/issues/4714))
+- [hitomi] recognize `imageset` gallery URLs ([#4756](https://github.com/mikf/gallery-dl/issues/4756))
+- [kemonoparty] add `revision_index` metadata field ([#4727](https://github.com/mikf/gallery-dl/issues/4727))
+- [misskey] support `misskey.design` ([#4713](https://github.com/mikf/gallery-dl/issues/4713))
+- [reddit] support Reddit Mobile share links ([#4693](https://github.com/mikf/gallery-dl/issues/4693))
+- [sankaku] support `/posts/` tag search URLs ([#4740](https://github.com/mikf/gallery-dl/issues/4740))
+- [twitter] recognize `fixupx.com` URLs ([#4755](https://github.com/mikf/gallery-dl/issues/4755))
+#### Fixes
+- [exhentai] update to site layout changes ([#4730](https://github.com/mikf/gallery-dl/issues/4730), [#4754](https://github.com/mikf/gallery-dl/issues/4754))
+- [exhentai] provide fallback URLs ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4745](https://github.com/mikf/gallery-dl/issues/4745))
+- [exhentai] disable `DH` ciphers to avoid `DH_KEY_TOO_SMALL` errors ([#1021](https://github.com/mikf/gallery-dl/issues/1021), [#4593](https://github.com/mikf/gallery-dl/issues/4593))
+- [idolcomplex] disable sending Referer headers ([#4726](https://github.com/mikf/gallery-dl/issues/4726))
+- [instagram] update API headers
+- [kemonoparty] fix parsing of non-standard `date` values ([#4676](https://github.com/mikf/gallery-dl/issues/4676))
+- [patreon] fix `campaign_id` extraction ([#4699](https://github.com/mikf/gallery-dl/issues/4699), [#4715](https://github.com/mikf/gallery-dl/issues/4715), [#4736](https://github.com/mikf/gallery-dl/issues/4736), [#4738](https://github.com/mikf/gallery-dl/issues/4738))
+- [pixiv] load cookies for non-OAuth URLs ([#4760](https://github.com/mikf/gallery-dl/issues/4760))
+- [twitter] fix avatars without `date` information ([#4696](https://github.com/mikf/gallery-dl/issues/4696))
+- [twitter] restore truncated retweet texts ([#3430](https://github.com/mikf/gallery-dl/issues/3430), [#4690](https://github.com/mikf/gallery-dl/issues/4690))
+- [weibo] fix Sina Visitor requests
+#### Improvements
+- [behance] unescape embed URLs ([#4742](https://github.com/mikf/gallery-dl/issues/4742))
+- [fantia] simplify `tags` to a list of strings ([#4752](https://github.com/mikf/gallery-dl/issues/4752))
+- [kemonoparty] limit `title` length ([#4741](https://github.com/mikf/gallery-dl/issues/4741))
+- [nijie] set 1-2s delay between requests to avoid 429 errors
+- [patreon] provide ways to manually specify a user's campaign_id
+  - `https://www.patreon.com/id:12345`
+  - `https://www.patreon.com/USER?c=12345`
+  - `https://www.patreon.com/USER?campaign_id=12345`
+- [twitter] cache `user_by_…` results ([#4719](https://github.com/mikf/gallery-dl/issues/4719))
+### Post Processors
+#### Fixes
+- [metadata] ignore non-string tag values ([#4764](https://github.com/mikf/gallery-dl/issues/4764))
+### Miscellaneous
+#### Fixes
+- prevent crash when `stdout.line_buffering` is not defined ([#642](https://github.com/mikf/gallery-dl/issues/642))
+
 ## 1.26.1 - 2023-10-21
 ### Extractors
 #### Additions
diff --git a/PKG-INFO b/PKG-INFO
--- a/PKG-INFO
+++ b/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery_dl
-Version: 1.26.1
+Version: 1.26.2
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -109,9 +109,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.bin>`__
 
 Nightly Builds
diff --git a/README.rst b/README.rst
--- a/README.rst
+++ b/README.rst
@@ -72,9 +72,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.bin>`__
 
 Nightly Builds
diff --git a/data/man/gallery-dl.1 b/data/man/gallery-dl.1
index 27f13af..5a9ec79 100644
--- a/data/man/gallery-dl.1
+++ b/data/man/gallery-dl.1
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL" "1" "2023-10-21" "1.26.1" "gallery-dl Manual"
+.TH "GALLERY-DL" "1" "2023-11-04" "1.26.2" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
diff --git a/data/man/gallery-dl.conf.5 b/data/man/gallery-dl.conf.5
index 9083d24..46a8ea6 100644
--- a/data/man/gallery-dl.conf.5
+++ b/data/man/gallery-dl.conf.5
@@ -1,4 +1,4 @@
-.TH "GALLERY-DL.CONF" "5" "2023-10-21" "1.26.1" "gallery-dl Manual"
+.TH "GALLERY-DL.CONF" "5" "2023-11-04" "1.26.2" "gallery-dl Manual"
 .\" disable hyphenation
 .nh
 .\" disable justification (adjust text to left margin only)
@@ -6036,9 +6036,19 @@
 section of your account's preferences
 .br
 * click the "are you a developer? create an app..." button
 .br
-* fill out the form, choose "installed app", preferably set
-"http://localhost:6414/" as "redirect uri" and finally click
-"create app"
+* fill out the form:
+
+.br
+* choose a name
+.br
+* select "installed app"
+.br
+* set \f[I]http://localhost:6414/\f[] as "redirect uri"
+.br
+* solve the "I'm not a robot" reCAPTCHA if needed
+.br
+* click "create app"
+
 .br
 * copy the client id (third line, under your application's name and
 "installed app") and put it in your configuration file
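The man-page hunk above ends with copying the client id into your configuration file. A minimal sketch of writing such a config from Python — the client-id value is a placeholder, not a real credential:

    import json

    # "XXXXXXXXXXXXXX" stands in for the client id copied from
    # https://www.reddit.com/prefs/apps
    config = {"extractor": {"reddit": {"client-id": "XXXXXXXXXXXXXX"}}}

    with open("gallery-dl.conf", "w", encoding="utf-8") as fp:
        json.dump(config, fp, indent=4)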
diff --git a/gallery_dl.egg-info/PKG-INFO b/gallery_dl.egg-info/PKG-INFO
index 95861dc..37876de 100644
--- a/gallery_dl.egg-info/PKG-INFO
+++ b/gallery_dl.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: gallery-dl
-Version: 1.26.1
+Version: 1.26.2
 Summary: Command-line program to download image galleries and collections from several image hosting sites
 Home-page: https://github.com/mikf/gallery-dl
 Download-URL: https://github.com/mikf/gallery-dl/releases/latest
@@ -109,9 +109,9 @@ Standalone Executable
 Prebuilt executable files with a Python interpreter and
 required Python packages included are available for
 
-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.exe>`__
   (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.1/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.2/gallery-dl.bin>`__
 
 Nightly Builds
diff --git a/gallery_dl.egg-info/SOURCES.txt b/gallery_dl.egg-info/SOURCES.txt
index fb6cb4b..699dce5 100644
--- a/gallery_dl.egg-info/SOURCES.txt
+++ b/gallery_dl.egg-info/SOURCES.txt
@@ -44,6 +44,7 @@ gallery_dl/extractor/2chan.py
 gallery_dl/extractor/2chen.py
 gallery_dl/extractor/35photo.py
 gallery_dl/extractor/3dbooru.py
+gallery_dl/extractor/4archive.py
 gallery_dl/extractor/4chan.py
 gallery_dl/extractor/4chanarchives.py
 gallery_dl/extractor/500px.py
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
new file mode 100644
index 0000000..d198369
--- /dev/null
+++ b/gallery_dl/extractor/4archive.py
@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://4archive.org/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+class _4archiveThreadExtractor(Extractor):
+    """Extractor for 4archive threads"""
+    category = "4archive"
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{board}", "{thread} {title}")
+    filename_fmt = "{no} {filename}.{extension}"
+    archive_fmt = "{board}_{thread}_{no}"
+    root = "https://4archive.org"
+    referer = False
+    pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)"
+    example = "https://4archive.org/board/a/thread/12345/"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board, self.thread = match.groups()
+
+    def items(self):
+        url = "{}/board/{}/thread/{}".format(
+            self.root, self.board, self.thread)
+        page = self.request(url).text
+        data = self.metadata(page)
+        posts = self.posts(page)
+
+        if not data["title"]:
+            data["title"] = posts[0]["com"][:50]
+
+        for post in posts:
+            post.update(data)
+            post["time"] = int(util.datetime_to_timestamp(post["date"]))
+            yield Message.Directory, post
+            if "url" in post:
+                yield Message.Url, post["url"], text.nameext_from_url(
+                    post["filename"], post)
+
+    def metadata(self, page):
+        return {
+            "board" : self.board,
+            "thread": text.parse_int(self.thread),
+            "title" : text.unescape(text.extr(
+                page, 'class="subject">', "</span>"))
+        }
+
+    def posts(self, page):
+        return [
+            self.parse(post)
+            for post in page.split('class="postContainer')[1:]
+        ]
+
+    @staticmethod
+    def parse(post):
+        extr = text.extract_from(post)
+        data = {
+            "name": extr('class="name">', "</span>"),
+            "date": text.parse_datetime(
+                extr('class="dateTime postNum" >', "<").strip(),
+                "%Y-%m-%d %H:%M:%S"),
+            "no"  : text.parse_int(extr('href="#p', '"')),
+        }
+        if 'class="file"' in post:
+            extr('class="fileText"', ">File: <a")
+            data.update({
+                "url"     : extr('href="', '"'),
+                "filename": extr(
+                    'rel="noreferrer noopener"', "</a>").strip()[1:],
+                "size"    : text.parse_bytes(extr(" (", ", ")[:-1]),
+                "width"   : text.parse_int(extr("", "x")),
+                "height"  : text.parse_int(extr("", "px")),
+            })
+        extr("<blockquote ", "")
+        data["com"] = text.unescape(text.remove_html(
+            extr(">", "</blockquote>")))
+        return data
+
+
+class _4archiveBoardExtractor(Extractor):
+    """Extractor for 4archive boards"""
+    category = "4archive"
+    subcategory = "board"
+    root = "https://4archive.org"
+    pattern = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$"
+    example = "https://4archive.org/board/a/"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.board = match.group(1)
+        self.num = text.parse_int(match.group(2), 1)
+
+    def items(self):
+        data = {"_extractor": _4archiveThreadExtractor}
+        while True:
+            url = "{}/board/{}/{}".format(self.root, self.board, self.num)
+            page = self.request(url).text
+            if 'class="thread"' not in page:
+                return
+            for thread in text.extract_iter(
+                    page, 'class="thread" id="t', '"'):
+                url = "{}/board/{}/thread/{}".format(
+                    self.root, self.board, thread)
+                yield Message.Queue, url, data
+            self.num += 1
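A quick sanity check of the new extractor's URL handling, using nothing but the regular expressions from the diff above:

    import re

    THREAD = r"(?:https?://)?4archive\.org/board/([^/?#]+)/thread/(\d+)"
    BOARD  = r"(?:https?://)?4archive\.org/board/([^/?#]+)(?:/(\d+))?/?$"

    m = re.match(THREAD, "https://4archive.org/board/a/thread/12345")
    assert m.groups() == ("a", "12345")   # board, thread id

    m = re.match(BOARD, "https://4archive.org/board/a/2")
    assert m.groups() == ("a", "2")       # board, page number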
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 1c1473a..22e4fe3 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -15,6 +15,7 @@ modules = [
     "35photo",
     "3dbooru",
+    "4archive",
     "4chan",
     "4chanarchives",
     "500px",
     "8chan",
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index fc5f9ef..a92918e 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -170,7 +170,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
             elif mtype == "EmbedModule":
                 embed = module.get("originalEmbed") or module.get("fluidEmbed")
                 if embed:
-                    append(("ytdl:" + text.extr(embed, 'src="', '"'), module))
+                    embed = text.unescape(text.extr(embed, 'src="', '"'))
+                    append(("ytdl:" + embed, module))
 
         return result
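The behance fix above runs the embed URL through `text.unescape()` before prefixing it with `ytdl:`; an HTML-escaped `&amp;` in the `src` attribute would otherwise leak into the URL handed to ytdl. The same effect illustrated with the standard library (`text.unescape` behaves like `html.unescape` for this case; the URL is a made-up example):

    import html

    embed = '<iframe src="https://player.example.org/video/123?a=1&amp;b=2">'
    src = embed.partition('src="')[2].partition('"')[0]

    print(src)                 # https://player.example.org/video/123?a=1&amp;b=2
    print(html.unescape(src))  # https://player.example.org/video/123?a=1&b=2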
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 0d67df7..3bec424 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -35,6 +35,7 @@ class Extractor():
     root = ""
     cookies_domain = ""
     referer = True
+    ciphers = None
     tls12 = True
     browser = None
     request_interval = 0.0
@@ -305,6 +306,7 @@ class Extractor():
         headers["User-Agent"] = useragent
         headers["Accept"] = "*/*"
         headers["Accept-Language"] = "en-US,en;q=0.5"
+        ssl_ciphers = self.ciphers
 
         if BROTLI:
             headers["Accept-Encoding"] = "gzip, deflate, br"
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 44bfe7d..182910c 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -27,6 +27,7 @@ class ExhentaiExtractor(Extractor):
     cookies_names = ("ipb_member_id", "ipb_pass_hash")
     root = "https://exhentai.org"
     request_interval = 5.0
+    ciphers = "DEFAULT:!DH"
 
     LIMIT = False
@@ -112,12 +113,15 @@
     def __init__(self, match):
         ExhentaiExtractor.__init__(self, match)
-        self.key = {}
-        self.count = 0
         self.gallery_id = text.parse_int(match.group(2) or match.group(5))
         self.gallery_token = match.group(3)
         self.image_token = match.group(4)
         self.image_num = text.parse_int(match.group(6), 1)
+        self.key_start = None
+        self.key_show = None
+        self.key_next = None
+        self.api_url = ""
+        self.count = 0
 
     def _init(self):
         source = self.config("source")
@@ -145,17 +149,17 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             gpage = self._gallery_page()
             self.image_token = text.extr(gpage, 'hentai.org/s/', '"')
             if not self.image_token:
-                self.log.error("Failed to extract initial image token")
                 self.log.debug("Page content:\n%s", gpage)
-                return
+                raise exception.StopExtraction(
+                    "Failed to extract initial image token")
             ipage = self._image_page()
         else:
             ipage = self._image_page()
             part = text.extr(ipage, 'hentai.org/g/', '"')
             if not part:
-                self.log.error("Failed to extract gallery token")
                 self.log.debug("Page content:\n%s", ipage)
-                return
+                raise exception.StopExtraction(
+                    "Failed to extract gallery token")
             self.gallery_token = part.split("/")[1]
             gpage = self._gallery_page()
@@ -176,7 +180,7 @@
             data.update(image)
             if self.limits:
                 self._check_limits(data)
-            if "/fullimg.php" in url:
+            if "/fullimg" in url:
                 data["_http_validate"] = _validate_response
             else:
                 data["_http_validate"] = None
@@ -208,6 +212,8 @@
     def metadata_from_page(self, page):
         extr = text.extract_from(page)
 
+        self.api_url = extr('var api_url = "', '"') or (self.root + "/api.php")
+
         data = {
             "gid"       : self.gallery_id,
             "token"     : self.gallery_token,
@@ -225,7 +231,7 @@
                 '>Visible:</td><td class="gdt2">', '<'),
             "language"  : extr('>Language:</td><td class="gdt2">', ' '),
             "filesize"  : text.parse_bytes(extr(
-                '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")),
+                '>File Size:</td><td class="gdt2">', '<').rstrip("Bbi")),
             "filecount" : extr('>Length:</td><td class="gdt2">', ' '),
             "favorites" : extr('id="favcount">', ' '),
             "rating"    : extr(">Average: ", "<"),
@@ -251,14 +257,13 @@
         return data
 
     def metadata_from_api(self):
-        url = self.root + "/api.php"
         data = {
-            "method": "gdata",
-            "gidlist": ((self.gallery_id, self.gallery_token),),
+            "method"   : "gdata",
+            "gidlist"  : ((self.gallery_id, self.gallery_token),),
             "namespace": 1,
         }
 
-        data = self.request(url, method="POST", json=data).json()
+        data = self.request(self.api_url, method="POST", json=data).json()
         if "error" in data:
             raise exception.StopExtraction(data["error"])
@@ -269,54 +274,70 @@
         pos = page.index('<div id="i3"><a onclick="return load_image(') + 26
         extr = text.extract_from(page, pos)
 
-        self.key["next"] = extr("'", "'")
+        self.key_next = extr("'", "'")
         iurl = extr('<img id="img" src="', '"')
-        orig = extr('hentai.org/fullimg.php', '"')
+        nl = extr(" nl(", ")").strip("\"'")
+        orig = extr('hentai.org/fullimg', '"')
 
         try:
             if self.original and orig:
-                url = self.root + "/fullimg.php" + text.unescape(orig)
+                url = self.root + "/fullimg" + text.unescape(orig)
                 data = self._parse_original_info(extr('ownload original', '<'))
+                data["_fallback"] = ("{}?nl={}".format(url, nl),)
             else:
                 url = iurl
                 data = self._parse_image_info(url)
+                data["_fallback"] = self._fallback(
+                    None, self.image_num, nl)
         except IndexError:
             self.log.debug("Page content:\n%s", page)
             raise exception.StopExtraction(
                 "Unable to parse image info for '%s'", url)
 
         data["num"] = self.image_num
-        data["image_token"] = self.key["start"] = extr('var startkey="', '";')
-        self.key["show"] = extr('var showkey="', '";')
+        data["image_token"] = self.key_start = extr('var startkey="', '";')
+        self.key_show = extr('var showkey="', '";')
 
         self._check_509(iurl, data)
-        return url, text.nameext_from_url(iurl, data)
+        return url, text.nameext_from_url(url, data)
 
     def images_from_api(self):
         """Get image url and data from api calls"""
-        api_url = self.root + "/api.php"
-        nextkey = self.key["next"]
+        api_url = self.api_url
+        nextkey = self.key_next
         request = {
             "method" : "showpage",
             "gid"    : self.gallery_id,
+            "page"   : 0,
             "imgkey" : nextkey,
-            "showkey": self.key["show"],
+            "showkey": self.key_show,
         }
+
         for request["page"] in range(self.image_num + 1, self.count + 1):
             page = self.request(api_url, method="POST", json=request).json()
+
+            i3 = page["i3"]
+            i6 = page["i6"]
+
             imgkey = nextkey
-            nextkey, pos = text.extract(page["i3"], "'", "'")
-            imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
-            origurl, pos = text.extract(page["i7"], '<a href="', '"')
+            nextkey, pos = text.extract(i3, "'", "'")
+            imgurl , pos = text.extract(i3, 'id="img" src="', '"', pos)
+            nl     , pos = text.extract(i3, " nl(", ")", pos)
+            nl = (nl or "").strip("\"'")
 
             try:
-                if self.original and origurl:
+                pos = i6.find("hentai.org/fullimg")
+                if self.original and pos >= 0:
+                    origurl, pos = text.rextract(i6, '"', '"', pos)
                     url = text.unescape(origurl)
                     data = self._parse_original_info(text.extract(
-                        page["i7"], "ownload original", "<", pos)[0])
+                        i6, "ownload original", "<", pos)[0])
+                    data["_fallback"] = ("{}?nl={}".format(url, nl),)
                 else:
                     url = imgurl
                     data = self._parse_image_info(url)
+                    data["_fallback"] = self._fallback(
+                        imgkey, request["page"], nl)
             except IndexError:
                 self.log.debug("Page content:\n%s", page)
                 raise exception.StopExtraction(
@@ -326,7 +347,7 @@
             data["image_token"] = imgkey
 
             self._check_509(imgurl, data)
-            yield url, text.nameext_from_url(imgurl, data)
+            yield url, text.nameext_from_url(url, data)
 
             request["imgkey"] = nextkey
@@ -390,6 +411,14 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.NotFoundError("image page")
         return page
 
+    def _fallback(self, imgkey, num, nl):
+        url = "{}/s/{}/{}-{}?nl={}".format(
+            self.root, imgkey or self.key_start, self.gallery_id, num, nl)
+        page = self.request(url, fatal=False).text
+        if page.startswith(("Invalid page", "Keep trying")):
+            return
+        yield self.image_from_page(page)[0]
+
     @staticmethod
     def _parse_image_info(url):
         for part in url.split("/")[4:]:
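common.py now threads an optional `ciphers` attribute into session setup, and exhentai.py sets it to "DEFAULT:!DH" so that OpenSSL never negotiates the undersized DH keys behind `DH_KEY_TOO_SMALL`. Outside gallery-dl, the same effect can be approximated with a custom requests transport adapter — a sketch, not gallery-dl's actual plumbing:

    import ssl

    import requests
    from requests.adapters import HTTPAdapter


    class CipherAdapter(HTTPAdapter):
        """HTTPAdapter that restricts the OpenSSL cipher list."""

        def __init__(self, ciphers, **kwargs):
            self.ciphers = ciphers
            super().__init__(**kwargs)

        def init_poolmanager(self, *args, **kwargs):
            ctx = ssl.create_default_context()
            ctx.set_ciphers(self.ciphers)  # e.g. "DEFAULT:!DH"
            kwargs["ssl_context"] = ctx
            return super().init_poolmanager(*args, **kwargs)


    session = requests.Session()
    session.mount("https://exhentai.org", CipherAdapter("DEFAULT:!DH"))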
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 4a67695..6218f19 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -108,7 +108,7 @@ class FantiaExtractor(Extractor):
             "fanclub_user_name": resp["fanclub"]["user"]["name"],
             "fanclub_name": resp["fanclub"]["name"],
             "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]),
-            "tags": resp["tags"],
+            "tags": [t["name"] for t in resp["tags"]],
             "_data": resp,
         }
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index bc49ca3..88f5708 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
     category = "hitomi"
     root = "https://hitomi.la"
     pattern = (r"(?:https?://)?hitomi\.la"
-               r"/(?:manga|doujinshi|cg|gamecg|galleries|reader)"
+               r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)"
               r"/(?:[^/?#]+-)?(\d+)")
     example = "https://hitomi.la/manga/TITLE-867789.html"
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index 16e4097..b7b6ef1 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -22,6 +22,7 @@ class IdolcomplexExtractor(SankakuExtractor):
     cookies_domain = "idol.sankakucomplex.com"
     cookies_names = ("login", "pass_hash")
     root = "https://" + cookies_domain
+    referer = False
     request_interval = 5.0
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index c704183..b0789be 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -778,13 +778,15 @@ class InstagramRestAPI():
         kwargs["headers"] = {
             "Accept"          : "*/*",
             "X-CSRFToken"     : extr.csrf_token,
-            "X-Instagram-AJAX": "1006242110",
             "X-IG-App-ID"     : "936619743392459",
-            "X-ASBD-ID"       : "198387",
+            "X-ASBD-ID"       : "129477",
             "X-IG-WWW-Claim"  : extr.www_claim,
             "X-Requested-With": "XMLHttpRequest",
-            "Alt-Used"        : "www.instagram.com",
+            "Connection"      : "keep-alive",
             "Referer"         : extr.root + "/",
+            "Sec-Fetch-Dest"  : "empty",
+            "Sec-Fetch-Mode"  : "cors",
+            "Sec-Fetch-Site"  : "same-origin",
         }
         return extr.request(url, **kwargs).json()
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 1596cfb..cba6211 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -24,7 +24,7 @@ class KemonopartyExtractor(Extractor):
     category = "kemonoparty"
     root = "https://kemono.party"
     directory_fmt = ("{category}", "{service}", "{user}")
-    filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}"
+    filename_fmt = "{id}_{title[:180]}_{num:>02}_{filename[:180]}.{extension}"
     archive_fmt = "{service}_{user}_{id}_{num}"
     cookies_domain = ".kemono.party"
@@ -69,8 +69,9 @@ class KemonopartyExtractor(Extractor):
                 headers["Referer"] = "{}/{}/user/{}/post/{}".format(
                     self.root, post["service"], post["user"], post["id"])
             post["_http_headers"] = headers
-            post["date"] = text.parse_datetime(
-                post["published"] or post["added"], "%Y-%m-%dT%H:%M:%S")
+            post["date"] = self._parse_datetime(
+                post["published"] or post["added"])
+
             if username:
                 post["username"] = username
             if comments:
@@ -205,6 +206,11 @@ class KemonopartyExtractor(Extractor):
         })
         return dms
 
+    def _parse_datetime(self, date_string):
+        if len(date_string) > 19:
+            date_string = date_string[:19]
+        return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S")
+
     @memcache(keyarg=1)
     def _discord_channels(self, server):
         url = "{}/api/v1/discord/channel/lookup/{}".format(
@@ -213,7 +219,14 @@
     @memcache(keyarg=1)
     def _post_revisions(self, url):
-        return self.request(url + "/revisions").json()
+        revs = self.request(url + "/revisions").json()
+
+        idx = len(revs)
+        for rev in revs:
+            rev["revision_index"] = idx
+            idx -= 1
+
+        return revs
 
 
 def _validate(response):
@@ -247,13 +260,15 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
             if revisions:
                 for post in posts:
                     post["revision_id"] = 0
-                    yield post
                     post_url = "{}/post/{}".format(self.api_url, post["id"])
                     try:
                         revs = self._post_revisions(post_url)
                     except exception.HttpError:
-                        pass
+                        post["revision_index"] = 1
+                        yield post
                     else:
+                        post["revision_index"] = len(revs) + 1
+                        yield post
                         yield from revs
             else:
                 yield from posts
@@ -286,8 +301,9 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
             try:
                 revs = self._post_revisions(self.api_url)
             except exception.HttpError:
-                pass
+                post["revision_index"] = 1
             else:
+                post["revision_index"] = len(revs) + 1
                 return itertools.chain((post,), revs)
         return (post,)
@@ -360,8 +376,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
                     "name": path, "type": "inline", "hash": ""})
             post["channel_name"] = self.channel_name
-            post["date"] = text.parse_datetime(
-                post["published"], "%Y-%m-%dT%H:%M:%S.%f")
+            post["date"] = self._parse_datetime(post["published"])
             post["count"] = len(files)
             yield Message.Directory, post
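The new `_parse_datetime()` helper above handles the non-standard `date` values from #4676 by cutting everything past seconds precision before parsing, which also unifies the two formats the Discord extractor used to special-case. The idea in isolation, with only the standard library:

    from datetime import datetime

    def parse_datetime(date_string, fmt="%Y-%m-%dT%H:%M:%S"):
        # values sometimes carry fractional seconds of varying width;
        # keep only "YYYY-mm-ddTHH:MM:SS" (19 characters)
        if len(date_string) > 19:
            date_string = date_string[:19]
        return datetime.strptime(date_string, fmt)

    print(parse_datetime("2023-10-01T12:34:56"))         # standard value
    print(parse_datetime("2023-10-01T12:34:56.789123"))  # non-standard value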
diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py
index 95b83b6..5385f8a 100644
--- a/gallery_dl/extractor/misskey.py
+++ b/gallery_dl/extractor/misskey.py
@@ -70,6 +70,10 @@ BASE_PATTERN = MisskeyExtractor.update({
         "root": "https://misskey.io",
         "pattern": r"misskey\.io",
     },
+    "misskey.design": {
+        "root": "https://misskey.design",
+        "pattern": r"misskey\.design",
+    },
     "lesbian.energy": {
         "root": "https://lesbian.energy",
         "pattern": r"lesbian\.energy",
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index b902404..76c5404 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -19,6 +19,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
     directory_fmt = ("{category}", "{user_id}")
     filename_fmt = "{image_id}_p{num}.{extension}"
     archive_fmt = "{image_id}_{num}"
+    request_interval = (1.0, 2.0)
 
     def __init__(self, match):
         BaseExtractor.__init__(self, match)
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 6ac9a83..6aef9cb 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -249,8 +249,23 @@ class PatreonExtractor(Extractor):
         return [genmap[ft] for ft in filetypes]
 
     def _extract_bootstrap(self, page):
-        return util.json_loads(text.extr(
-            page, "window.patreon.bootstrap,", "});") + "}")
+        bootstrap = text.extr(
+            page, 'window.patreon = {"bootstrap":', '},"apiServer"')
+        if bootstrap:
+            return util.json_loads(bootstrap + "}")
+
+        bootstrap = text.extr(page, "window.patreon.bootstrap,", "});")
+        if bootstrap:
+            return util.json_loads(bootstrap + "}")
+
+        data = text.extr(page, "window.patreon = {", "};\n")
+        if data:
+            try:
+                return util.json_loads("{" + data + "}")["bootstrap"]
+            except Exception:
+                pass
+
+        raise exception.StopExtraction("Unable to extract bootstrap data")
 
 
 class PatreonCreatorExtractor(PatreonExtractor):
@@ -267,34 +282,52 @@
 
     def posts(self):
         query = text.parse_query(self.query)
+        campaign_id = self._get_campaign_id(query)
+        filters = self._get_filters(query)
+
+        self.log.debug("campaign_id: %s", campaign_id)
+
+        url = self._build_url("posts", (
+            "&filter[campaign_id]=" + campaign_id +
+            "&filter[contains_exclusive_posts]=true"
+            "&filter[is_draft]=false" + filters +
+            "&sort=" + query.get("sort", "-published_at")
+        ))
+        return self._pagination(url)
 
-        creator_id = query.get("u")
-        if creator_id:
-            url = "{}/user/posts?u={}".format(self.root, creator_id)
+    def _get_campaign_id(self, query):
+        if self.creator.startswith("id:"):
+            return self.creator[3:]
+
+        campaign_id = query.get("c") or query.get("campaign_id")
+        if campaign_id:
+            return campaign_id
+
+        user_id = query.get("u")
+        if user_id:
+            url = "{}/user/posts?u={}".format(self.root, user_id)
         else:
             url = "{}/{}/posts".format(self.root, self.creator)
         page = self.request(url, notfound="creator").text
 
         try:
+            data = None
             data = self._extract_bootstrap(page)
-            campaign_id = data["campaign"]["data"]["id"]
-        except (KeyError, ValueError):
-            raise exception.NotFoundError("creator")
-
-        filters = "".join(
+            return data["campaign"]["data"]["id"]
+        except (KeyError, ValueError) as exc:
+            if data:
+                self.log.debug(data)
+            raise exception.StopExtraction(
+                "Unable to extract campaign ID (%s: %s)",
+                exc.__class__.__name__, exc)
+
+    def _get_filters(self, query):
+        return "".join(
             "&filter[{}={}".format(key[8:], text.escape(value))
             for key, value in query.items()
             if key.startswith("filters[")
         )
 
-        url = self._build_url("posts", (
-            "&filter[campaign_id]=" + campaign_id +
-            "&filter[contains_exclusive_posts]=true"
-            "&filter[is_draft]=false" + filters +
-            "&sort=" + query.get("sort", "-published_at")
-        ))
-        return self._pagination(url)
-
 
 class PatreonUserExtractor(PatreonExtractor):
     """Extractor for media from creators supported by you"""
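`_get_campaign_id()` above gives the three manual override forms from the changelog precedence over page scraping: an `id:` prefix in the creator slug, then a `c` or `campaign_id` query parameter. A standalone sketch of that precedence (URL parsing only, no network access):

    from urllib.parse import parse_qs, urlsplit

    def campaign_id_from_url(url):
        parts = urlsplit(url)
        creator = parts.path.strip("/")
        if creator.startswith("id:"):
            return creator[3:]
        query = parse_qs(parts.query)
        for key in ("c", "campaign_id"):
            if key in query:
                return query[key][0]
        return None  # fall back to extracting bootstrap data

    assert campaign_id_from_url("https://www.patreon.com/id:12345") == "12345"
    assert campaign_id_from_url("https://www.patreon.com/USER?c=12345") == "12345"
    assert campaign_id_from_url(
        "https://www.patreon.com/USER?campaign_id=12345") == "12345"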
{", "};\n") + if data: + try: + return util.json_loads("{" + data + "}")["bootstrap"] + except Exception: + pass + + raise exception.StopExtraction("Unable to extract bootstrap data") class PatreonCreatorExtractor(PatreonExtractor): @@ -267,34 +282,52 @@ class PatreonCreatorExtractor(PatreonExtractor): def posts(self): query = text.parse_query(self.query) + campaign_id = self._get_campaign_id(query) + filters = self._get_filters(query) + + self.log.debug("campaign_id: %s", campaign_id) + + url = self._build_url("posts", ( + "&filter[campaign_id]=" + campaign_id + + "&filter[contains_exclusive_posts]=true" + "&filter[is_draft]=false" + filters + + "&sort=" + query.get("sort", "-published_at") + )) + return self._pagination(url) - creator_id = query.get("u") - if creator_id: - url = "{}/user/posts?u={}".format(self.root, creator_id) + def _get_campaign_id(self, query): + if self.creator.startswith("id:"): + return self.creator[3:] + + campaign_id = query.get("c") or query.get("campaign_id") + if campaign_id: + return campaign_id + + user_id = query.get("u") + if user_id: + url = "{}/user/posts?u={}".format(self.root, user_id) else: url = "{}/{}/posts".format(self.root, self.creator) page = self.request(url, notfound="creator").text try: + data = None data = self._extract_bootstrap(page) - campaign_id = data["campaign"]["data"]["id"] - except (KeyError, ValueError): - raise exception.NotFoundError("creator") - - filters = "".join( + return data["campaign"]["data"]["id"] + except (KeyError, ValueError) as exc: + if data: + self.log.debug(data) + raise exception.StopExtraction( + "Unable to extract campaign ID (%s: %s)", + exc.__class__.__name__, exc) + + def _get_filters(self, query): + return "".join( "&filter[{}={}".format(key[8:], text.escape(value)) for key, value in query.items() if key.startswith("filters[") ) - url = self._build_url("posts", ( - "&filter[campaign_id]=" + campaign_id + - "&filter[contains_exclusive_posts]=true" - "&filter[is_draft]=false" + filters + - "&sort=" + query.get("sort", "-published_at") - )) - return self._pagination(url) - class PatreonUserExtractor(PatreonExtractor): """Extractor for media from creators supported by you""" diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 18a3ceb..411d191 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -517,6 +517,7 @@ class PixivPixivisionExtractor(PixivExtractor): directory_fmt = ("{category}", "pixivision", "{pixivision_id} {pixivision_title}") archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}" + cookies_domain = ".pixiv.net" pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)" example = "https://www.pixivision.net/en/a/12345" @@ -549,6 +550,9 @@ class PixivSeriesExtractor(PixivExtractor): directory_fmt = ("{category}", "{user[id]} {user[account]}", "{series[id]} {series[title]}") filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}" + cookies_domain = ".pixiv.net" + browser = "firefox" + tls12 = False pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)" example = "https://www.pixiv.net/user/12345/series/12345" diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index cd2ba3d..c0bf5b3 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -292,6 +292,29 @@ class RedditImageExtractor(Extractor): yield Message.Url, url, data +class RedditRedirectExtractor(Extractor): + """Extractor for personalized share URLs produced by the mobile app""" + category = "reddit" + subcategory = 
"redirect" + pattern = (r"(?:https?://)?(?:" + r"(?:\w+\.)?reddit\.com/(?:(?:r)/([^/?#]+)))" + r"/s/([a-zA-Z0-9]{10})") + example = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ" + + def __init__(self, match): + Extractor.__init__(self, match) + self.subreddit = match.group(1) + self.share_url = match.group(2) + + def items(self): + url = "https://www.reddit.com/r/" + self.subreddit + "/s/" + \ + self.share_url + data = {"_extractor": RedditSubmissionExtractor} + response = self.request(url, method="HEAD", allow_redirects=False, + notfound="submission") + yield Message.Queue, response.headers["Location"], data + + class RedditAPI(): """Interface for the Reddit API diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index dc35511..bebea2a 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -87,7 +87,7 @@ class SankakuTagExtractor(SankakuExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/?\?([^#]*)" + pattern = BASE_PATTERN + r"(?:/posts)?/?\?([^#]*)" example = "https://sankaku.app/?tags=TAG" def __init__(self, match): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 61e871e..4766ae5 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -10,12 +10,13 @@ from .common import Extractor, Message from .. import text, util, exception -from ..cache import cache +from ..cache import cache, memcache import itertools import json import re -BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:(?:[fv]x)?twitter|x)\.com" +BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?" + r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com") class TwitterExtractor(Extractor): @@ -272,25 +273,23 @@ class TwitterExtractor(Extractor): author = tweet["user"] author = self._transform_user(author) - if "note_tweet" in tweet: - note = tweet["note_tweet"]["note_tweet_results"]["result"] - else: - note = None - - source = tweet["source"] - if "legacy" in tweet: - tweet = tweet["legacy"] + legacy = tweet["legacy"] + else: + legacy = tweet + tget = legacy.get - tweet_id = int(tweet["id_str"]) + tweet_id = int(legacy["id_str"]) if tweet_id >= 300000000000000: date = text.parse_timestamp( ((tweet_id >> 22) + 1288834974657) // 1000) else: - date = text.parse_datetime( - tweet["created_at"], "%a %b %d %H:%M:%S %z %Y") + try: + date = text.parse_datetime( + legacy["created_at"], "%a %b %d %H:%M:%S %z %Y") + except Exception: + date = util.NONE - tget = tweet.get tdata = { "tweet_id" : tweet_id, "retweet_id" : text.parse_int( @@ -304,8 +303,8 @@ class TwitterExtractor(Extractor): "date" : date, "author" : author, "user" : self._user or author, - "lang" : tweet["lang"], - "source" : text.extr(source, ">", "<"), + "lang" : legacy["lang"], + "source" : text.extr(tweet["source"], ">", "<"), "sensitive" : tget("possibly_sensitive"), "favorite_count": tget("favorite_count"), "quote_count" : tget("quote_count"), @@ -313,7 +312,13 @@ class TwitterExtractor(Extractor): "retweet_count" : tget("retweet_count"), } - entities = note["entity_set"] if note else tweet["entities"] + if "note_tweet" in tweet: + note = tweet["note_tweet"]["note_tweet_results"]["result"] + content = note["text"] + entities = note["entity_set"] + else: + content = tget("full_text") or tget("text") or "" + entities = legacy["entities"] hashtags = entities.get("hashtags") if hashtags: @@ -327,8 +332,7 @@ class TwitterExtractor(Extractor): "nick": u["name"], } 
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 168d5a0..ed05e1f 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -191,7 +191,7 @@ class WeiboExtractor(Extractor):
         headers = {"Referer": response.url}
         data = {
             "cb": "gen_callback",
-            "fp": '{"os":"1","browser":"Gecko91,0,0,0","fonts":"undefined",'
+            "fp": '{"os":"1","browser":"Gecko109,0,0,0","fonts":"undefined",'
                   '"screenInfo":"1920*1080*24","plugins":""}',
         }
@@ -203,8 +203,8 @@
         params = {
             "a"    : "incarnate",
             "t"    : data["tid"],
-            "w"    : "2",
-            "c"    : "{:>03}".format(data["confidence"]),
+            "w"    : "3" if data.get("new_tid") else "2",
+            "c"    : "{:>03}".format(data.get("confidence") or 100),
             "gc"   : "",
             "cb"   : "cross_domain",
             "from" : "weibo",
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 4f2ee26..9508ff3 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -254,14 +254,14 @@ def stderr_write_flush(s):
     sys.stderr.flush()
 
-if sys.stdout.line_buffering:
+if getattr(sys.stdout, "line_buffering", None):
     def stdout_write(s):
         sys.stdout.write(s)
 else:
     stdout_write = stdout_write_flush
 
-if sys.stderr.line_buffering:
+if getattr(sys.stderr, "line_buffering", None):
     def stderr_write(s):
         sys.stderr.write(s)
 else:
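The output.py hunk guards against `sys.stdout` objects that have no `line_buffering` attribute at all — frozen or windowed builds and embedding applications often install minimal writer objects. A small reproduction of why `getattr` is the right check, using a bare stand-in stream:

    class NullWriter:
        """Stand-in for the stripped-down stdout some environments install."""
        def write(self, s):
            pass
        def flush(self):
            pass

    stream = NullWriter()  # no line_buffering attribute

    # old check -- raises AttributeError on such streams:
    #     if stream.line_buffering: ...

    # new check -- falls back to the explicitly flushing writer:
    if getattr(stream, "line_buffering", None):
        write = stream.write
    else:
        def write(s):
            stream.write(s)
            stream.flush()

    write("hello\n")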
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 5004bed..18d00e1 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -189,7 +189,7 @@ class MetadataPP(PostProcessor):
         tags = []
         extend = tags.extend
         for tagdict in taglists:
-            extend([x for x in tagdict.values() if x is not None])
+            extend([x for x in tagdict.values() if isinstance(x, str)])
         tags.sort()
         fp.write("\n".join(tags) + "\n")
@@ -206,7 +206,8 @@
             sort_keys=options.get("sort", False),
             separators=options.get("separators"),
             indent=options.get("indent", indent),
-            check_circular=False, default=str,
+            check_circular=False,
+            default=util.json_default,
         )
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 62e7b4a..6255d49 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -223,8 +223,14 @@ def datetime_to_timestamp_string(dt):
         return ""
 
+def json_default(obj):
+    if isinstance(obj, CustomNone):
+        return None
+    return str(obj)
+
+
 json_loads = json._default_decoder.decode
-json_dumps = json.JSONEncoder(default=str).encode
+json_dumps = json.JSONEncoder(default=json_default).encode
 
 def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4):
@@ -233,7 +239,7 @@
         obj, fp,
         ensure_ascii=ensure_ascii,
         indent=indent,
-        default=str,
+        default=json_default,
         sort_keys=True,
     )
     fp.write("\n")
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 593cffa..5050174 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.26.1"
+__version__ = "1.26.2"
diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py
index b64df88..fb1d739 100644
--- a/test/test_postprocessor.py
+++ b/test/test_postprocessor.py
@@ -365,8 +365,8 @@ class MetadataTest(BasePostprocessorTest):
         self._create(
             {"mode": "tags"},
             {"tags": [
-                {"g": "foobar1", "m": "foobar2"},
-                {"g": None, "m": "foobarbaz"}
+                {"g": "foobar1", "m": "foobar2", "u": True},
+                {"g": None, "m": "foobarbaz", "u": [3, 4]},
             ]},
         )
         with patch("builtins.open", mock_open()) as m:
diff --git a/test/test_util.py b/test/test_util.py
index 0813a0b..780f475 100644
--- a/test/test_util.py
+++ b/test/test_util.py
@@ -750,6 +750,7 @@ def hash(value):
         self.assertIs(obj(), obj)
         self.assertIs(obj(1, "a"), obj)
         self.assertIs(obj(foo="bar"), obj)
+        self.assertEqual(util.json_dumps(obj), "null")
 
         i = 0
         for _ in obj:
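`util.json_default()` keeps the old `str()` fallback for arbitrary objects but maps the `CustomNone` sentinel to a real JSON `null`, which is exactly what the new test_util assertion checks. The behavior, sketched with a stand-in sentinel class:

    import json

    class CustomNone:
        """Stand-in for gallery-dl's util.NONE sentinel."""

    def json_default(obj):
        if isinstance(obj, CustomNone):
            return None
        return str(obj)

    print(json.dumps(CustomNone(), default=json_default))  # null
    print(json.dumps(object(), default=json_default))      # "<object object at 0x...>"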
