Diffstat (limited to 'gallery_dl')
 gallery_dl/extractor/__init__.py    |   1
 gallery_dl/extractor/blogger.py     |   1
 gallery_dl/extractor/bunkr.py       |  13
 gallery_dl/extractor/fanbox.py      |  12
 gallery_dl/extractor/fantia.py      | 142
 gallery_dl/extractor/furaffinity.py |  15
 gallery_dl/extractor/imagehosts.py  |  31
 gallery_dl/extractor/instagram.py   |   2
 gallery_dl/extractor/jpgfish.py     |  23
 gallery_dl/extractor/jschan.py      |  94
 gallery_dl/extractor/kemonoparty.py |  46
 gallery_dl/extractor/pixiv.py       |  99
 gallery_dl/extractor/pornhub.py     |  23
 gallery_dl/extractor/reddit.py      |   5
 gallery_dl/extractor/redgifs.py     |  73
 gallery_dl/extractor/senmanga.py    |  96
 gallery_dl/extractor/twitter.py     | 119
 gallery_dl/extractor/vipergirls.py  |  94
 gallery_dl/extractor/wallhaven.py   |  24
 gallery_dl/extractor/weibo.py       |   4
 gallery_dl/formatter.py             |   1
 gallery_dl/version.py               |   2
22 files changed, 648 insertions(+), 272 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 3e47c3e..a344fe4 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -76,6 +76,7 @@ modules = [
     "itaku",
     "itchio",
     "jpgfish",
+    "jschan",
     "kabeuchi",
     "keenspot",
     "kemonoparty",
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index eafc8af..3ceada8 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -44,6 +44,7 @@ class BloggerExtractor(Extractor):
     findall_image = re.compile(
         r'src="(https?://(?:'
         r'blogger\.googleusercontent\.com/img|'
+        r'lh\d+\.googleusercontent\.com/|'
         r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
     findall_video = re.compile(
         r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 7c66fb0..5c8c530 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,19 +6,19 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://bunkr.la/"""
+"""Extractors for https://bunkrr.su/"""
 
 from .lolisafe import LolisafeAlbumExtractor
 from .. import text
 
 
 class BunkrAlbumExtractor(LolisafeAlbumExtractor):
-    """Extractor for bunkr.la albums"""
+    """Extractor for bunkrr.su albums"""
     category = "bunkr"
-    root = "https://bunkr.la"
-    pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:la|[sr]u|is|to)/a/([^/?#]+)"
+    root = "https://bunkrr.su"
+    pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)"
     test = (
-        ("https://bunkr.la/a/Lktg9Keq", {
+        ("https://bunkrr.su/a/Lktg9Keq", {
             "pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png",
             "content": "0c8768055e4e20e7c7259608b67799171b691140",
             "keyword": {
@@ -52,6 +52,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
                 "num": int,
             },
         }),
+        ("https://bunkrr.su/a/Lktg9Keq"),
         ("https://bunkr.la/a/Lktg9Keq"),
         ("https://bunkr.su/a/Lktg9Keq"),
         ("https://bunkr.ru/a/Lktg9Keq"),
@@ -70,7 +71,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         cdn = None
         files = []
         append = files.append
-        headers = {"Referer": self.root.replace("://", "://stream.", 1) + "/"}
+        headers = {"Referer": self.root + "/"}
 
         pos = page.index('class="grid-images')
         for url in text.extract_iter(page, '<a href="', '"', pos):
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 4ca0852..373529f 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -214,9 +214,15 @@ class FanboxExtractor(Extractor):
             # to a proper Fanbox URL
             url = "https://www.pixiv.net/fanbox/"+content_id
             # resolve redirect
-            response = self.request(url, method="HEAD", allow_redirects=False)
-            url = response.headers["Location"]
-            final_post["_extractor"] = FanboxPostExtractor
+            try:
+                url = self.request(url, method="HEAD",
+                                   allow_redirects=False).headers["location"]
+            except Exception as exc:
+                url = None
+                self.log.warning("Unable to extract fanbox embed %s (%s: %s)",
+                                 content_id, exc.__class__.__name__, exc)
+            else:
+                final_post["_extractor"] = FanboxPostExtractor
         elif provider == "twitter":
             url = "https://twitter.com/_/status/"+content_id
         elif provider == "google_forms":
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 13dfead..35c4cc4 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -24,6 +24,14 @@ class FantiaExtractor(Extractor):
             "Accept" : "application/json, text/plain, */*",
             "Referer": self.root,
         }
+        _empty_plan = {
+            "id"   : 0,
+            "price": 0,
+            "limit": 0,
+            "name" : "",
+            "description": "",
+            "thumb": self.root + "/images/fallback/plan/thumb_default.png",
+        }
 
         if self._warning:
             if not self._check_cookies(("_session_id",)):
@@ -31,15 +39,29 @@ class FantiaExtractor(Extractor):
             FantiaExtractor._warning = False
 
         for post_id in self.posts():
-            full_response, post = self._get_post_data(post_id)
-            yield Message.Directory, post
+            post = self._get_post_data(post_id)
             post["num"] = 0
-            for url, url_data in self._get_urls_from_post(full_response, post):
-                post["num"] += 1
-                fname = url_data["content_filename"] or url
-                text.nameext_from_url(fname, url_data)
-                url_data["file_url"] = url
-                yield Message.Url, url, url_data
+
+            for content in self._get_post_contents(post):
+                post["content_category"] = content["category"]
+                post["content_title"] = content["title"]
+                post["content_filename"] = content.get("filename", "")
+                post["content_id"] = content["id"]
+                post["plan"] = content["plan"] or _empty_plan
+                yield Message.Directory, post
+
+                if content["visible_status"] != "visible":
+                    self.log.warning(
+                        "Unable to download '%s' files from "
+                        "%s#post-content-id-%s", content["visible_status"],
+                        post["post_url"], content["id"])
+
+                for url in self._get_content_urls(post, content):
+                    text.nameext_from_url(
+                        post["content_filename"] or url, post)
+                    post["file_url"] = url
+                    post["num"] += 1
+                    yield Message.Url, url, post
 
     def posts(self):
         """Return post IDs"""
@@ -71,7 +93,7 @@ class FantiaExtractor(Extractor):
         """Fetch and process post data"""
         url = self.root+"/api/v1/posts/"+post_id
         resp = self.request(url, headers=self.headers).json()["post"]
-        post = {
+        return {
             "post_id": resp["id"],
             "post_url": self.root + "/posts/" + str(resp["id"]),
             "post_title": resp["title"],
@@ -85,55 +107,65 @@ class FantiaExtractor(Extractor):
             "fanclub_user_name": resp["fanclub"]["user"]["name"],
             "fanclub_name": resp["fanclub"]["name"],
             "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]),
-            "tags": resp["tags"]
+            "tags": resp["tags"],
+            "_data": resp,
         }
 
-        return resp, post
-
-    def _get_urls_from_post(self, resp, post):
+    def _get_post_contents(self, post):
+        contents = post["_data"]["post_contents"]
+
+        try:
+            url = post["_data"]["thumb"]["original"]
+        except Exception:
+            pass
+        else:
+            contents.insert(0, {
+                "id": "thumb",
+                "title": "thumb",
+                "category": "thumb",
+                "download_uri": url,
+                "visible_status": "visible",
+                "plan": None,
+            })
+
+        return contents
+
+    def _get_content_urls(self, post, content):
         """Extract individual URL data from the response"""
-        if "thumb" in resp and resp["thumb"] and "original" in resp["thumb"]:
-            post["content_filename"] = ""
-            post["content_category"] = "thumb"
-            post["file_id"] = "thumb"
-            yield resp["thumb"]["original"], post
-
-        for content in resp["post_contents"]:
-            post["content_category"] = content["category"]
-            post["content_title"] = content["title"]
-            post["content_filename"] = content.get("filename", "")
-            post["content_id"] = content["id"]
-
-            if "comment" in content:
-                post["content_comment"] = content["comment"]
-
-            if "post_content_photos" in content:
-                for photo in content["post_content_photos"]:
-                    post["file_id"] = photo["id"]
-                    yield photo["url"]["original"], post
-
-            if "download_uri" in content:
-                post["file_id"] = content["id"]
-                yield self.root+"/"+content["download_uri"], post
-
-            if content["category"] == "blog" and "comment" in content:
-                comment_json = util.json_loads(content["comment"])
-                ops = comment_json.get("ops", ())
-
-                # collect blogpost text first
-                blog_text = ""
-                for op in ops:
-                    insert = op.get("insert")
-                    if isinstance(insert, str):
-                        blog_text += insert
-                post["blogpost_text"] = blog_text
-
-                # collect images
-                for op in ops:
-                    insert = op.get("insert")
-                    if isinstance(insert, dict) and "fantiaImage" in insert:
-                        img = insert["fantiaImage"]
-                        post["file_id"] = img["id"]
-                        yield "https://fantia.jp" + img["original_url"], post
+        if "comment" in content:
+            post["content_comment"] = content["comment"]
+
+        if "post_content_photos" in content:
+            for photo in content["post_content_photos"]:
+                post["file_id"] = photo["id"]
+                yield photo["url"]["original"]
+
+        if "download_uri" in content:
+            post["file_id"] = content["id"]
+            url = content["download_uri"]
+            if url[0] == "/":
+                url = self.root + url
+            yield url
+
+        if content["category"] == "blog" and "comment" in content:
+            comment_json = util.json_loads(content["comment"])
+            ops = comment_json.get("ops") or ()
+
+            # collect blogpost text first
+            blog_text = ""
+            for op in ops:
+                insert = op.get("insert")
+                if isinstance(insert, str):
+                    blog_text += insert
+            post["blogpost_text"] = blog_text
+
+            # collect images
+            for op in ops:
+                insert = op.get("insert")
+                if isinstance(insert, dict) and "fantiaImage" in insert:
+                    img = insert["fantiaImage"]
+                    post["file_id"] = img["id"]
+                    yield self.root + img["original_url"]
 
 
 class FantiaCreatorExtractor(FantiaExtractor):
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index cc43cec..9f5cbba 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -159,7 +159,13 @@ class FuraffinityExtractor(Extractor):
 
         while path:
             page = self.request(self.root + path).text
-            yield from text.extract_iter(page, 'id="sid-', '"')
+            extr = text.extract_from(page)
+            while True:
+                post_id = extr('id="sid-', '"')
+                if not post_id:
+                    break
+                self._favorite_id = text.parse_int(extr('data-fav-id="', '"'))
+                yield post_id
             path = text.extr(page, 'right" href="', '"')
 
     def _pagination_search(self, query):
@@ -241,6 +247,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor):
     test = ("https://www.furaffinity.net/favorites/mirlinthloth/", {
         "pattern": r"https://d\d?\.f(uraffinity|acdn)\.net"
                    r"/art/[^/]+/\d+/\d+.\w+\.\w+",
+        "keyword": {"favorite_id": int},
         "range": "45-50",
         "count": 6,
     })
@@ -248,6 +255,12 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor):
     def posts(self):
         return self._pagination_favorites()
 
+    def _parse_post(self, post_id):
+        post = FuraffinityExtractor._parse_post(self, post_id)
+        if post:
+            post["favorite_id"] = self._favorite_id
+        return post
+
 
 class FuraffinitySearchExtractor(FuraffinityExtractor):
     """Extractor for furaffinity search results"""
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index df4ff26..a6e848c 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -164,17 +164,17 @@ class AcidimgImageExtractor(ImagehostImageExtractor):
     pattern = r"(?:https?://)?((?:www\.)?acidimg\.cc/img-([a-z0-9]+)\.html)"
     test = ("https://acidimg.cc/img-5acb6b9de4640.html", {
         "url": "f132a630006e8d84f52d59555191ed82b3b64c04",
-        "keyword": "a8bb9ab8b2f6844071945d31f8c6e04724051f37",
+        "keyword": "135347ab4345002fc013863c0d9419ba32d98f78",
         "content": "0c8768055e4e20e7c7259608b67799171b691140",
     })
     params = "simple"
     encoding = "utf-8"
 
     def get_info(self, page):
-        url, pos = text.extract(page, "<img class='centred' src='", "'")
+        url, pos = text.extract(page, '<img class="centred" src="', '"')
         if not url:
             raise exception.NotFoundError("image")
-        filename, pos = text.extract(page, " alt='", "'", pos)
+        filename, pos = text.extract(page, ' alt="', '"', pos)
         return url, (filename + splitext(url)[1]) if filename else url
 
 
@@ -295,19 +295,38 @@ class PostimgImageExtractor(ImagehostImageExtractor):
     """Extractor for single images from postimages.org"""
     category = "postimg"
     pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)"
-               r"/(?:image/)?([^/?#]+)/?)")
+               r"/(?!gallery/)(?:image/)?([^/?#]+)/?)")
     test = ("https://postimg.cc/Wtn2b3hC", {
-        "url": "0794cfda9b8951a8ac3aa692472484200254ab86",
+        "url": "72f3c8b1d6c6601a20ad58f35635494b4891a99e",
        "keyword": "2d05808d04e4e83e33200db83521af06e3147a84",
        "content": "cfaa8def53ed1a575e0c665c9d6d8cf2aac7a0ee",
     })
 
     def get_info(self, page):
-        url     , pos = text.extract(page, 'id="main-image" src="', '"')
+        pos = page.index(' id="download"')
+        url     , pos = text.rextract(page, ' href="', '"', pos)
         filename, pos = text.extract(page, 'class="imagename">', '<', pos)
         return url, text.unescape(filename)
 
 
+class PostimgGalleryExtractor(ImagehostImageExtractor):
+    """Extractor for images galleries from postimages.org"""
+    category = "postimg"
+    subcategory = "gallery"
+    pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)"
+               r"/(?:gallery/)([^/?#]+)/?)")
+    test = ("https://postimg.cc/gallery/wxpDLgX", {
+        "pattern": PostimgImageExtractor.pattern,
+        "count": 22,
+    })
+
+    def items(self):
+        page = self.request(self.page_url).text
+        data = {"_extractor": PostimgImageExtractor}
+        for url in text.extract_iter(page, ' class="thumb"><a href="', '"'):
+            yield Message.Queue, url, data
+
+
 class TurboimagehostImageExtractor(ImagehostImageExtractor):
     """Extractor for single images from www.turboimagehost.com"""
     category = "turboimagehost"
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 677cbdd..faeffa6 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -864,7 +864,7 @@ class InstagramRestAPI():
 
     def user_tagged(self, user_id):
         endpoint = "/v1/usertags/{}/feed/".format(user_id)
-        params = {"count": 50}
+        params = {"count": 20}
         return self._pagination(endpoint, params)
 
     def _call(self, endpoint, **kwargs):
diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py
index cdcf35c..b8d425a 100644
--- a/gallery_dl/extractor/jpgfish.py
+++ b/gallery_dl/extractor/jpgfish.py
@@ -4,18 +4,18 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for https://jpg.fishing/"""
+"""Extractors for https://jpg.pet/"""
 
 from .common import Extractor, Message
 from .. import text
 
-BASE_PATTERN = r"(?:https?://)?jpg\.(?:fishing|church)"
+BASE_PATTERN = r"(?:https?://)?jpg\.(?:pet|fish(?:ing)?|church)"
 
 
 class JpgfishExtractor(Extractor):
     """Base class for jpgfish extractors"""
     category = "jpgfish"
-    root = "https://jpg.fishing"
+    root = "https://jpg.pet"
     directory_fmt = ("{category}", "{user}", "{album}",)
     archive_fmt = "{id}"
 
@@ -36,7 +36,7 @@ class JpgfishImageExtractor(JpgfishExtractor):
     subcategory = "image"
     pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
     test = (
-        ("https://jpg.fishing/img/funnymeme.LecXGS", {
+        ("https://jpg.pet/img/funnymeme.LecXGS", {
             "pattern": r"https://simp3\.jpg\.church/images/funnymeme\.jpg",
             "content": "098e5e9b17ad634358426e0ffd1c93871474d13c",
             "keyword": {
@@ -52,7 +52,9 @@ class JpgfishImageExtractor(JpgfishExtractor):
             "pattern": r"https://simp2\.jpg\.church/hannahowo_00457\.jpg",
             "keyword": {"album": "401-500"},
         }),
-        ("https://jpg.church/img/hannahowo-00424.au64iA"),
+        ("https://jpg.fishing/img/funnymeme.LecXGS"),
+        ("https://jpg.fish/img/funnymeme.LecXGS"),
+        ("https://jpg.church/img/funnymeme.LecXGS"),
     )
 
     def __init__(self, match):
@@ -81,13 +83,13 @@ class JpgfishAlbumExtractor(JpgfishExtractor):
     subcategory = "album"
     pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
     test = (
-        ("https://jpg.fishing/album/CDilP/?sort=date_desc&page=1", {
+        ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1", {
             "count": 2,
         }),
-        ("https://jpg.church/a/gunggingnsk.N9OOI", {
+        ("https://jpg.fishing/a/gunggingnsk.N9OOI", {
             "count": 114,
         }),
-        ("https://jpg.church/a/101-200.aNJ6A/", {
+        ("https://jpg.fish/a/101-200.aNJ6A/", {
             "count": 100,
         }),
         ("https://jpg.church/a/hannahowo.aNTdH/sub", {
@@ -118,12 +120,15 @@ class JpgfishUserExtractor(JpgfishExtractor):
     subcategory = "user"
     pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
     test = (
-        ("https://jpg.fishing/exearco", {
+        ("https://jpg.pet/exearco", {
             "count": 3,
         }),
         ("https://jpg.church/exearco/albums", {
             "count": 1,
        }),
+        ("https://jpg.fishing/exearco"),
+        ("https://jpg.fish/exearco"),
+        ("https://jpg.church/exearco"),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/jschan.py b/gallery_dl/extractor/jschan.py
new file mode 100644
index 0000000..fe758fa
--- /dev/null
+++ b/gallery_dl/extractor/jschan.py
@@ -0,0 +1,94 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for jschan Imageboards"""
+
+from .common import BaseExtractor, Message
+from .. import text
+import itertools
+
+
+class JschanExtractor(BaseExtractor):
+    basecategory = "jschan"
+
+
+BASE_PATTERN = JschanExtractor.update({
+    "94chan": {
+        "root": "https://94chan.org",
+        "pattern": r"94chan\.org"
+    }
+})
+
+
+class JschanThreadExtractor(JschanExtractor):
+    """Extractor for jschan threads"""
+    subcategory = "thread"
+    directory_fmt = ("{category}", "{board}",
+                     "{threadId} {subject|nomarkup[:50]}")
+    filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
+    archive_fmt = "{board}_{postId}_{num}"
+    pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)\.html"
+    test = (
+        ("https://94chan.org/art/thread/25.html", {
+            "pattern": r"https://94chan.org/file/[0-9a-f]{64}(\.\w+)?",
+            "count": ">= 15"
+        })
+    )
+
+    def __init__(self, match):
+        JschanExtractor.__init__(self, match)
+        index = match.lastindex
+        self.board = match.group(index-1)
+        self.thread = match.group(index)
+
+    def items(self):
+        url = "{}/{}/thread/{}.json".format(
+            self.root, self.board, self.thread)
+        thread = self.request(url).json()
+        thread["threadId"] = thread["postId"]
+        posts = thread.pop("replies", ())
+
+        yield Message.Directory, thread
+        for post in itertools.chain((thread,), posts):
+            files = post.pop("files", ())
+            if files:
+                thread.update(post)
+                thread["count"] = len(files)
+                for num, file in enumerate(files):
+                    url = self.root + "/file/" + file["filename"]
+                    file.update(thread)
+                    file["num"] = num
+                    file["siteFilename"] = file["filename"]
+                    text.nameext_from_url(file["originalFilename"], file)
+                    yield Message.Url, url, file
+
+
+class JschanBoardExtractor(JschanExtractor):
+    """Extractor for jschan boards"""
+    subcategory = "board"
+    pattern = (BASE_PATTERN + r"/([^/?#]+)"
+               r"(?:/index\.html|/catalog\.html|/\d+\.html|/?$)")
+    test = (
+        ("https://94chan.org/art/", {
+            "pattern": JschanThreadExtractor.pattern,
+            "count": ">= 30"
+        }),
+        ("https://94chan.org/art/2.html"),
+        ("https://94chan.org/art/catalog.html"),
+        ("https://94chan.org/art/index.html"),
+    )
+
+    def __init__(self, match):
+        JschanExtractor.__init__(self, match)
+        self.board = match.group(match.lastindex)
+
+    def items(self):
+        url = "{}/{}/catalog.json".format(self.root, self.board)
+        for thread in self.request(url).json():
+            url = "{}/{}/thread/{}.html".format(
+                self.root, self.board, thread["postId"])
+            thread["_extractor"] = JschanThreadExtractor
+            yield Message.Queue, url, thread
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 915fbe6..5aeefeb 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -14,7 +14,7 @@ from ..cache import cache
 import itertools
 import re
 
-BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.party"
+BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
 USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
 HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})"
 
@@ -29,10 +29,11 @@ class KemonopartyExtractor(Extractor):
     cookiedomain = ".kemono.party"
 
     def __init__(self, match):
-        if match.group(1) == "coomer":
-            self.category = "coomerparty"
-            self.cookiedomain = ".coomer.party"
+        domain = match.group(1)
+        tld = match.group(2)
+        self.category = domain + "party"
         self.root = text.root_from_url(match.group(0))
+        self.cookiedomain = ".{}.{}".format(domain, tld)
         Extractor.__init__(self, match)
         self.session.headers["Referer"] = self.root + "/"
 
@@ -40,7 +41,7 @@ class KemonopartyExtractor(Extractor):
         self._prepare_ddosguard_cookies()
 
         self._find_inline = re.compile(
-            r'src="(?:https?://(?:kemono|coomer)\.party)?(/inline/[^"]+'
+            r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
            r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
         find_hash = re.compile(HASH_PATTERN).match
         generators = self._build_file_generators(self.config("files"))
@@ -224,11 +225,12 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
             "options": (("max-posts", 25),),
             "count": "< 100",
         }),
+        ("https://kemono.su/subscribestar/user/alcorart"),
         ("https://kemono.party/subscribestar/user/alcorart"),
     )
 
     def __init__(self, match):
-        _, service, user_id, offset = match.groups()
+        _, _, service, user_id, offset = match.groups()
         self.subcategory = service
         KemonopartyExtractor.__init__(self, match)
         self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id)
@@ -329,13 +331,14 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
                        r"f51c10adc9dabd86e92bd52339f298b9\.txt",
             "content": "da39a3ee5e6b4b0d3255bfef95601890afd80709",  # empty
         }),
+        ("https://kemono.su/subscribestar/user/alcorart/post/184330"),
         ("https://kemono.party/subscribestar/user/alcorart/post/184330"),
         ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"),
         ("https://beta.kemono.party/subscribestar/user/alcorart/post/184330"),
     )
 
     def __init__(self, match):
-        _, service, user_id, post_id = match.groups()
+        _, _, service, user_id, post_id = match.groups()
         self.subcategory = service
         KemonopartyExtractor.__init__(self, match)
         self.api_url = "{}/api/{}/user/{}/post/{}".format(
@@ -361,9 +364,9 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
             "count": 4,
             "keyword": {"channel_name": "finish-work"},
         }),
-        (("https://kemono.party/discord"
+        (("https://kemono.su/discord"
           "/server/256559665620451329/channel/462437519519383555#"), {
-            "pattern": r"https://kemono\.party/data/("
+            "pattern": r"https://kemono\.su/data/("
                        r"e3/77/e377e3525164559484ace2e64425b0cec1db08.*\.png|"
                        r"51/45/51453640a5e0a4d23fbf57fb85390f9c5ec154.*\.gif)",
             "keyword": {"hash": "re:e377e3525164559484ace2e64425b0cec1db08"
@@ -382,7 +385,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
 
     def __init__(self, match):
         KemonopartyExtractor.__init__(self, match)
-        _, self.server, self.channel, self.channel_name = match.groups()
+        _, _, self.server, self.channel, self.channel_name = match.groups()
 
     def items(self):
         self._prepare_ddosguard_cookies()
@@ -457,14 +460,20 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
 class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
     subcategory = "discord-server"
     pattern = BASE_PATTERN + r"/discord/server/(\d+)$"
-    test = ("https://kemono.party/discord/server/488668827274444803", {
-        "pattern": KemonopartyDiscordExtractor.pattern,
-        "count": 13,
-    })
+    test = (
+        ("https://kemono.party/discord/server/488668827274444803", {
+            "pattern": KemonopartyDiscordExtractor.pattern,
+            "count": 13,
+        }),
+        ("https://kemono.su/discord/server/488668827274444803", {
+            "pattern": KemonopartyDiscordExtractor.pattern,
+            "count": 13,
+        }),
+    )
 
     def __init__(self, match):
         KemonopartyExtractor.__init__(self, match)
-        self.server = match.group(2)
+        self.server = match.group(3)
 
     def items(self):
         url = "{}/api/discord/channels/lookup?q={}".format(
@@ -493,11 +502,16 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
             "url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f",
             "count": 3,
         }),
+        ("https://kemono.su/favorites?type=post", {
+            "pattern": KemonopartyPostExtractor.pattern,
+            "url": "4be8e84cb384a907a8e7997baaf6287b451783b5",
+            "count": 3,
+        }),
     )
 
     def __init__(self, match):
         KemonopartyExtractor.__init__(self, match)
-        self.favorites = (text.parse_query(match.group(2)).get("type") or
+        self.favorites = (text.parse_query(match.group(3)).get("type") or
                           self.config("favorites") or
                           "artist")
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index cdaf595..861959e 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -15,6 +15,9 @@ from datetime import datetime, timedelta
 import itertools
 import hashlib
 
+BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)"
+
 
 class PixivExtractor(Extractor):
     """Base class for pixiv extractors"""
@@ -150,7 +153,7 @@ class PixivExtractor(Extractor):
 class PixivUserExtractor(PixivExtractor):
     """Extractor for a pixiv user profile"""
     subcategory = "user"
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:"
+    pattern = (BASE_PATTERN + r"/(?:"
               r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id="
               r")(\d+)(?:$|[?#])")
     test = (
@@ -168,18 +171,19 @@ class PixivUserExtractor(PixivExtractor):
     def items(self):
         base = "{}/users/{}/".format(self.root, self.user_id)
         return self._dispatch_extractors((
-            (PixivAvatarExtractor    , base + "avatar"),
-            (PixivBackgroundExtractor, base + "background"),
-            (PixivArtworksExtractor  , base + "artworks"),
-            (PixivFavoriteExtractor  , base + "bookmarks/artworks"),
-            (PixivNovelUserExtractor , base + "novels"),
+            (PixivAvatarExtractor       , base + "avatar"),
+            (PixivBackgroundExtractor   , base + "background"),
+            (PixivArtworksExtractor     , base + "artworks"),
+            (PixivFavoriteExtractor     , base + "bookmarks/artworks"),
+            (PixivNovelBookmarkExtractor, base + "bookmarks/novels"),
+            (PixivNovelUserExtractor    , base + "novels"),
         ), ("artworks",))
 
 
 class PixivArtworksExtractor(PixivExtractor):
     """Extractor for artworks of a pixiv user"""
     subcategory = "artworks"
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:"
+    pattern = (BASE_PATTERN + r"/(?:"
               r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)"
               r"(?:/([^/?#]+))?/?(?:$|[?#])"
               r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)")
@@ -240,8 +244,7 @@ class PixivAvatarExtractor(PixivExtractor):
     subcategory = "avatar"
     filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}"
     archive_fmt = "avatar_{user[id]}_{date}"
-    pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
-               r"/(?:en/)?users/(\d+)/avatar")
+    pattern = USER_PATTERN + r"/avatar"
     test = ("https://www.pixiv.net/en/users/173530/avatar", {
         "content": "4e57544480cc2036ea9608103e8f024fa737fe66",
     })
@@ -261,8 +264,7 @@ class PixivBackgroundExtractor(PixivExtractor):
     subcategory = "background"
     filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}"
     archive_fmt = "background_{user[id]}_{date}"
-    pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
-               r"/(?:en/)?users/(\d+)/background")
+    pattern = USER_PATTERN + "/background"
     test = ("https://www.pixiv.net/en/users/194921/background", {
         "pattern": r"https://i\.pximg\.net/background/img/2021/01/30/16/12/02"
                    r"/194921_af1f71e557a42f499213d4b9eaccc0f8\.jpg",
@@ -376,12 +378,12 @@ class PixivWorkExtractor(PixivExtractor):
 
 
 class PixivFavoriteExtractor(PixivExtractor):
-    """Extractor for all favorites/bookmarks of a pixiv-user"""
+    """Extractor for all favorites/bookmarks of a pixiv user"""
     subcategory = "favorite"
     directory_fmt = ("{category}", "bookmarks",
                      "{user_bookmark[id]} {user_bookmark[account]}")
     archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}"
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?"
+    pattern = (BASE_PATTERN + r"/(?:(?:en/)?"
               r"users/(\d+)/(bookmarks/artworks|following)(?:/([^/?#]+))?"
               r"|bookmark\.php)(?:\?([^#]*))?")
     test = (
@@ -484,8 +486,7 @@ class PixivRankingExtractor(PixivExtractor):
     archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}"
     directory_fmt = ("{category}", "rankings",
                      "{ranking[mode]}", "{ranking[date]}")
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
-               r"/ranking\.php(?:\?([^#]*))?")
+    pattern = BASE_PATTERN + r"/ranking\.php(?:\?([^#]*))?"
     test = (
         ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"),
         ("https://www.pixiv.net/ranking.php"),
@@ -550,8 +551,7 @@ class PixivSearchExtractor(PixivExtractor):
     subcategory = "search"
     archive_fmt = "s_{search[word]}_{id}{num}.{extension}"
     directory_fmt = ("{category}", "search", "{search[word]}")
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
-               r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?"
+    pattern = (BASE_PATTERN + r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?"
               r"|search\.php)(?:\?([^#]+))?")
     test = (
         ("https://www.pixiv.net/en/tags/Original", {
@@ -634,8 +634,7 @@ class PixivFollowExtractor(PixivExtractor):
     subcategory = "follow"
     archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}"
     directory_fmt = ("{category}", "following")
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
-               r"/bookmark_new_illust\.php")
+    pattern = BASE_PATTERN + r"/bookmark_new_illust\.php"
     test = (
         ("https://www.pixiv.net/bookmark_new_illust.php"),
         ("https://touch.pixiv.net/bookmark_new_illust.php"),
@@ -697,8 +696,7 @@ class PixivSeriesExtractor(PixivExtractor):
     directory_fmt = ("{category}", "{user[id]} {user[account]}",
                      "{series[id]} {series[title]}")
     filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
-    pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
-               r"/user/(\d+)/series/(\d+)")
+    pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
     test = ("https://www.pixiv.net/user/10509347/series/21859", {
         "range": "1-10",
         "count": 10,
@@ -755,8 +753,7 @@ class PixivNovelExtractor(PixivExtractor):
     """Extractor for pixiv novels"""
     subcategory = "novel"
     request_interval = 1.0
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
-               r"/n(?:ovel/show\.php\?id=|/)(\d+)")
+    pattern = BASE_PATTERN + r"/n(?:ovel/show\.php\?id=|/)(\d+)"
     test = (
         ("https://www.pixiv.net/novel/show.php?id=19612040", {
             "count": 1,
@@ -799,6 +796,12 @@ class PixivNovelExtractor(PixivExtractor):
             "options": (("embeds", True),),
             "count": 3,
         }),
+        # full series
+        ("https://www.pixiv.net/novel/show.php?id=19612040", {
+            "options": (("full-series", True),),
+            "count": 4,
+        }),
+        # short URL
         ("https://www.pixiv.net/n/19612040"),
     )
 
@@ -862,7 +865,7 @@ class PixivNovelExtractor(PixivExtractor):
 
             illusts = {}
             for marker in text.extract_iter(content, "[", "]"):
-                if marker.startswith("[jumpuri:"):
+                if marker.startswith("[jumpuri:If you would like to "):
                     desktop = True
                 elif marker.startswith("pixivimage:"):
                     illusts[marker[11:].partition("-")[0]] = None
@@ -895,14 +898,17 @@ class PixivNovelExtractor(PixivExtractor):
                 yield Message.Queue, url, novel
 
     def novels(self):
-        return (self.api.novel_detail(self.novel_id),)
+        novel = self.api.novel_detail(self.novel_id)
+        if self.config("full-series") and novel["series"]:
+            self.subcategory = PixivNovelSeriesExtractor.subcategory
+            return self.api.novel_series(novel["series"]["id"])
+        return (novel,)
 
 
 class PixivNovelUserExtractor(PixivNovelExtractor):
     """Extractor for pixiv users' novels"""
     subcategory = "novel-user"
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
-               r"/(?:en/)?users/(\d+)/novels")
+    pattern = USER_PATTERN + r"/novels"
     test = ("https://www.pixiv.net/en/users/77055466/novels", {
         "pattern": "^text:",
         "range": "1-5",
@@ -916,8 +922,7 @@ class PixivNovelUserExtractor(PixivNovelExtractor):
 class PixivNovelSeriesExtractor(PixivNovelExtractor):
     """Extractor for pixiv novel series"""
     subcategory = "novel-series"
-    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
-               r"/novel/series/(\d+)")
+    pattern = BASE_PATTERN + r"/novel/series/(\d+)"
     test = ("https://www.pixiv.net/novel/series/10278364", {
         "count": 4,
         "content": "b06abed001b3f6ccfb1579699e9a238b46d38ea2",
@@ -927,6 +932,37 @@ class PixivNovelSeriesExtractor(PixivNovelExtractor):
         return self.api.novel_series(self.novel_id)
 
 
+class PixivNovelBookmarkExtractor(PixivNovelExtractor):
+    """Extractor for bookmarked pixiv novels"""
+    subcategory = "novel-bookmark"
+    pattern = (USER_PATTERN + r"/bookmarks/novels"
+               r"(?:/([^/?#]+))?(?:/?\?([^#]+))?")
+    test = (
+        ("https://www.pixiv.net/en/users/77055466/bookmarks/novels", {
+            "count": 1,
+            "content": "7194e8faa876b2b536f185ee271a2b6e46c69089",
+        }),
+        ("https://www.pixiv.net/en/users/11/bookmarks/novels/TAG?rest=hide"),
+    )
+
+    def __init__(self, match):
+        PixivNovelExtractor.__init__(self, match)
+        self.user_id, self.tag, self.query = match.groups()
+
+    def novels(self):
+        if self.tag:
+            tag = text.unquote(self.tag)
+        else:
+            tag = None
+
+        if text.parse_query(self.query).get("rest") == "hide":
+            restrict = "private"
+        else:
+            restrict = "public"
+
+        return self.api.user_bookmarks_novel(self.user_id, tag, restrict)
+
+
 class PixivSketchExtractor(Extractor):
     """Extractor for user pages on sketch.pixiv.net"""
     category = "pixiv"
@@ -1113,6 +1149,11 @@ class PixivAppAPI():
         params = {"user_id": user_id, "tag": tag, "restrict": restrict}
         return self._pagination("/v1/user/bookmarks/illust", params)
 
+    def user_bookmarks_novel(self, user_id, tag=None, restrict="public"):
+        """Return novels bookmarked by a user"""
+        params = {"user_id": user_id, "tag": tag, "restrict": restrict}
+        return self._pagination("/v1/user/bookmarks/novel", params, "novels")
+
     def user_bookmark_tags_illust(self, user_id, restrict="public"):
         """Return bookmark tags defined by a user"""
         params = {"user_id": user_id, "restrict": restrict}
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index f8497c0..f19e33c 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
 from .common import Extractor, Message
 from .. import text, exception
 
-
 BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com"
 
 
@@ -146,10 +145,20 @@ class PornhubUserExtractor(PornhubExtractor):
 
         data = {"_extractor": PornhubGalleryExtractor}
         while True:
-            page = self.request(
-                url, method="POST", headers=headers, params=params).text
-            if not page:
-                return
-            for gid in text.extract_iter(page, 'id="albumphoto', '"'):
+            response = self.request(
+                url, method="POST", headers=headers, params=params,
+                allow_redirects=False)
+
+            if 300 <= response.status_code < 400:
+                url = "{}{}/photos/{}/ajax".format(
+                    self.root, response.headers["location"],
+                    self.cat or "public")
+                continue
+
+            gid = None
+            for gid in text.extract_iter(response.text, 'id="albumphoto', '"'):
                 yield Message.Queue, self.root + "/album/" + gid, data
+            if gid is None:
+                return
+
             params["page"] += 1
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 3f09e13..9a57dcf 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -56,7 +56,10 @@ class RedditExtractor(Extractor):
             submission["num"] = 0
 
             if "crosspost_parent_list" in submission:
-                media = submission["crosspost_parent_list"][-1]
+                try:
+                    media = submission["crosspost_parent_list"][-1]
+                except Exception:
+                    media = submission
             else:
                 media = submission
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index eaaef7d..bfd18b5 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020-2022 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -16,7 +16,8 @@ from ..cache import memcache
 class RedgifsExtractor(Extractor):
     """Base class for redgifs extractors"""
     category = "redgifs"
-    filename_fmt = "{category}_{id}.{extension}"
+    filename_fmt = \
+        "{category}_{gallery:?//[:11]}{num:?_/_/>02}{id}.{extension}"
     archive_fmt = "{id}"
     root = "https://www.redgifs.com"
 
@@ -34,16 +35,32 @@ class RedgifsExtractor(Extractor):
 
     def items(self):
         metadata = self.metadata()
+
         for gif in self.gifs():
-            url = self._process(gif)
-            if not url:
-                self.log.warning("Skipping '%s' (format not available)",
-                                 gif["id"])
-                continue
+
+            gallery = gif.get("gallery")
+            if gallery:
+                gifs = self.api.gallery(gallery)["gifs"]
+                enum = 1
+                cnt = len(gifs)
+            else:
+                gifs = (gif,)
+                enum = 0
+                cnt = 1
 
             gif.update(metadata)
+            gif["count"] = cnt
             yield Message.Directory, gif
-            yield Message.Url, url, gif
+
+            for num, gif in enumerate(gifs, enum):
+                url = self._process(gif)
+                if not url:
+                    self.log.warning(
+                        "Skipping '%s' (format not available)", gif["id"])
+                    continue
+                gif["num"] = num
+                gif["count"] = cnt
+                yield Message.Url, url, gif
 
     def _process(self, gif):
         gif["_fallback"] = formats = self._formats(gif)
@@ -145,21 +162,36 @@ class RedgifsSearchExtractor(RedgifsExtractor):
     """Extractor for redgifs search results"""
     subcategory = "search"
     directory_fmt = ("{category}", "Search", "{search}")
-    pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/browse/?\?([^#]+)"
+    pattern = (r"(?:https?://)?(?:\w+\.)?redgifs\.com"
+               r"/(?:gifs/([^/?#]+)|browse)(?:/?\?([^#]+))?")
     test = (
+        ("https://www.redgifs.com/gifs/jav", {
+            "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)",
+            "range": "1-10",
+            "count": 10,
+        }),
         ("https://www.redgifs.com/browse?tags=JAV", {
             "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)",
             "range": "1-10",
             "count": 10,
         }),
-        ("https://v3.redgifs.com/browse?tags=JAV"),
+        ("https://www.redgifs.com/gifs/jav?order=best&verified=1"),
         ("https://www.redgifs.com/browse?type=i&verified=y&order=top7"),
+        ("https://v3.redgifs.com/browse?tags=JAV"),
     )
 
+    def __init__(self, match):
+        RedgifsExtractor.__init__(self, match)
+        self.search, self.query = match.groups()
+
     def metadata(self):
-        self.params = params = text.parse_query(self.key)
-        search = params.get("tags") or params.get("order") or "trending"
-        return {"search": search}
+        self.params = text.parse_query(self.query)
+        if self.search:
+            self.params["tags"] = text.unquote(self.search)
+
+        return {"search": (self.params.get("tags") or
+                           self.params.get("order") or
+                           "trending")}
 
     def gifs(self):
         return self.api.search(self.params)
@@ -178,6 +210,16 @@ class RedgifsImageExtractor(RedgifsExtractor):
                    r"/FoolishForkedAbyssiniancat\.mp4",
         "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533",
         }),
+        # gallery (#4021)
+        ("https://www.redgifs.com/watch/desertedbaregraywolf", {
+            "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.jpg",
+            "count": 4,
+            "keyword": {
+                "num": int,
+                "count": 4,
+                "gallery": "187ad979693-1922-fc66-0000-a96fb07b8a5d",
+            },
+        }),
         ("https://redgifs.com/ifr/FoolishForkedAbyssiniancat"),
         ("https://i.redgifs.com/i/FoolishForkedAbyssiniancat"),
         ("https://www.gifdeliverynetwork.com/foolishforkedabyssiniancat"),
@@ -207,6 +249,10 @@ class RedgifsAPI():
         endpoint = "/v2/gifs/" + gif_id.lower()
         return self._call(endpoint)["gif"]
 
+    def gallery(self, gallery_id):
+        endpoint = "/v2/gallery/" + gallery_id
+        return self._call(endpoint)
+
     def user(self, user, order="best"):
         endpoint = "/v2/users/{}/search".format(user.lower())
         params = {"order": order}
@@ -228,7 +274,6 @@ class RedgifsAPI():
     def search(self, params):
         endpoint = "/v2/gifs/search"
         params["search_text"] = params.pop("tags", None)
-        params.pop("needSendGtm", None)
         return self._pagination(endpoint, params)
 
     def _call(self, endpoint, params=None):
diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py
index 34177b4..6d025f4 100644
--- a/gallery_dl/extractor/senmanga.py
+++ b/gallery_dl/extractor/senmanga.py
@@ -1,64 +1,88 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract manga-chapters from from https://raw.senmanga.com/"""
+"""Extractors for https://raw.senmanga.com/"""
 
-from .common import Extractor, Message
+from .common import ChapterExtractor
 from .. import text
 
 
-class SenmangaChapterExtractor(Extractor):
-    """Extractor for manga-chapters from raw.senmanga.com"""
+class SenmangaChapterExtractor(ChapterExtractor):
+    """Extractor for manga chapters from raw.senmanga.com"""
     category = "senmanga"
-    subcategory = "chapter"
-    directory_fmt = ("{category}", "{manga}", "{chapter_string}")
-    filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
-    archive_fmt = "{manga}_{chapter_string}_{page}"
-    pattern = r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"
+    root = "https://raw.senmanga.com"
+    pattern = r"(?:https?://)?raw\.senmanga\.com(/[^/?#]+/[^/?#]+)"
     test = (
-        ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
+        ("https://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
+            "pattern": r"https://raw\.senmanga\.com/viewer"
+                       r"/Bokura-wa-Minna-Kawaisou/37A/[12]",
             "url": "5f95140ff511d8497e2ec08fa7267c6bb231faec",
-            "keyword": "705d941a150765edb33cd2707074bd703a93788c",
             "content": "556a16d5ca3441d7a5807b6b5ac06ec458a3e4ba",
+            "keyword": {
+                "chapter": "37A",
+                "count": 2,
+                "extension": "",
+                "filename": "re:[12]",
+                "lang": "ja",
+                "language": "Japanese",
+                "manga": "Bokura wa Minna Kawaisou",
+                "page": int,
+            },
         }),
         ("http://raw.senmanga.com/Love-Lab/2016-03/1", {
+            "pattern": r"https://raw\.senmanga\.com/viewer"
+                       r"/Love-Lab/2016-03/\d",
             "url": "8347b9f00c14b864dd3c19a1f5ae52adb2ef00de",
-            "keyword": "8a8ab2529ba2edfc83a6b3a8bede1d6c580db7b4",
+            "keyword": {
+                "chapter": "2016-03",
+                "count": 9,
+                "extension": "",
+                "filename": r"re:\d",
+                "manga": "Renai Lab 恋愛ラボ",
+            },
+        }),
+        ("https://raw.senmanga.com/akabane-honeko-no-bodyguard/1", {
+            "pattern": r"https://i\d\.wp\.com/kumacdn.club/image-new-2/a"
+                       r"/akabane-honeko-no-bodyguard/chapter-1"
+                       r"/\d+-[0-9a-f]{13}\.jpg",
+            "keyword": {
+                "chapter": "1",
+                "count": 65,
+                "extension": "jpg",
+                "filename": r"re:\d+-\w+",
+                "manga": "Akabane Honeko no Bodyguard",
+            },
         }),
     )
-    root = "https://raw.senmanga.com"
 
     def __init__(self, match):
-        Extractor.__init__(self, match)
-        part = match.group(1)
-        self.chapter_url = "{}/{}/".format(self.root, part)
-        self.img_url = "{}/viewer/{}/".format(self.root, part)
-        self.session.headers["Referer"] = self.chapter_url
+        ChapterExtractor.__init__(self, match)
+        self.session.headers["Referer"] = self.gallery_url
 
-    def items(self):
-        data = self.metadata()
-        yield Message.Directory, data
-        for data["page"] in range(1, data["count"]+1):
-            data["extension"] = None
-            yield Message.Url, self.img_url + str(data["page"]), data
+        # select "All pages" viewer
+        self.session.cookies.set(
+            "viewer", "1", domain="raw.senmanga.com")
 
-    def metadata(self):
-        """Collect metadata for extractor-job"""
-        page = self.request(self.chapter_url).text
-        self.session.cookies.clear()
-        title, pos = text.extract(page, '<title>', '</title>')
-        count, pos = text.extract(page, '</select> of ', '\n', pos)
+    def metadata(self, page):
+        title = text.extr(page, "<title>", "</title>")
         manga, _, chapter = title.partition(" - Chapter ")
         return {
-            "manga": text.unescape(manga).replace("-", " "),
-            "chapter_string": chapter.partition(" - Page ")[0],
-            "count": text.parse_int(count),
-            "lang": "jp",
-            "language": "Japanese",
+            "manga"   : text.unescape(manga).replace("-", " "),
+            "chapter" : chapter.partition(" - Page ")[0],
+            "chapter_minor": "",
+            "lang"    : "ja",
+            "language": "Japanese",
         }
+
+    def images(self, page):
+        return [
+            (url, None)
+            for url in text.extract_iter(
+                page, '<img class="picture" src="', '"')
+        ]
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index c47021e..710bde3 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -295,6 +295,8 @@ class TwitterExtractor(Extractor):
                 tget("quoted_by_id_str")),
             "reply_id"  : text.parse_int(
                 tget("in_reply_to_status_id_str")),
+            "conversation_id": text.parse_int(
+                tget("conversation_id_str")),
             "date"      : date,
             "author"    : author,
             "user"      : self._user or author,
@@ -664,8 +666,8 @@ class TwitterSearchExtractor(TwitterExtractor):
     subcategory = "search"
     pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)"
     test = ("https://twitter.com/search?q=nature", {
-        "range": "1-40",
-        "count": 40,
+        "range": "1-20",
+        "count": 20,
         "archive": False,
     })
 
@@ -1058,7 +1060,7 @@ class TwitterAPI():
 
     def __init__(self, extractor):
         self.extractor = extractor
-        self.root = "https://api.twitter.com"
+        self.root = "https://twitter.com/i/api"
         self._nsfw_warning = True
         self._syndication = self.extractor.syndication
         self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
@@ -1077,6 +1079,10 @@ class TwitterAPI():
 
         auth_token = cookies.get("auth_token", domain=cookiedomain)
 
+        search = extractor.config("search-endpoint")
+        if search == "graphql" or not auth_token and search in ("auto", None):
+            self.search_adaptive = self.search_timeline
+
         self.headers = {
             "Accept": "*/*",
             "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
@@ -1087,7 +1093,6 @@ class TwitterAPI():
             "x-twitter-client-language": "en",
             "x-twitter-active-user": "yes",
             "x-csrf-token": csrf_token,
-            "Origin": "https://twitter.com",
             "Referer": "https://twitter.com/",
         }
         self.params = {
@@ -1131,47 +1136,44 @@ class TwitterAPI():
             "enrichments,superFollowMetadata,unmentionInfo,editControl,"
             "collab_control,vibe",
         }
-        self.variables = {
-            "withDownvotePerspective": False,
-            "withReactionsMetadata": False,
-            "withReactionsPerspective": False,
-        }
         self.features = {
-            "blue_business_profile_image_shape_enabled": False,
-            "responsive_web_twitter_blue_verified_badge_is_enabled": True,
+            "hidden_profile_likes_enabled": False,
             "responsive_web_graphql_exclude_directive_enabled": True,
             "verified_phone_label_enabled": False,
-            "responsive_web_graphql_skip_user_profile_"
-            "image_extensions_enabled": False,
+            "subscriptions_verification_info_verified_since_enabled": True,
+            "highlights_tweets_tab_ui_enabled": True,
+            "creator_subscriptions_tweet_preview_api_enabled": True,
+            "responsive_web_graphql_"
+            "skip_user_profile_image_extensions_enabled": False,
             "responsive_web_graphql_timeline_navigation_enabled": True,
         }
         self.features_pagination = {
-            "blue_business_profile_image_shape_enabled": False,
-            "responsive_web_twitter_blue_verified_badge_is_enabled": True,
+            "rweb_lists_timeline_redesign_enabled": True,
             "responsive_web_graphql_exclude_directive_enabled": True,
             "verified_phone_label_enabled": False,
+            "creator_subscriptions_tweet_preview_api_enabled": True,
             "responsive_web_graphql_timeline_navigation_enabled": True,
             "responsive_web_graphql_skip_user_profile_"
             "image_extensions_enabled": False,
             "tweetypie_unmention_optimization_enabled": True,
-            "vibe_api_enabled": True,
             "responsive_web_edit_tweet_api_enabled": True,
             "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True,
             "view_counts_everywhere_api_enabled": True,
             "longform_notetweets_consumption_enabled": True,
             "tweet_awards_web_tipping_enabled": False,
-            "freedom_of_speech_not_reach_fetch_enabled": False,
+            "freedom_of_speech_not_reach_fetch_enabled": True,
             "standardized_nudges_misinfo": True,
             "tweet_with_visibility_results_prefer_gql_"
            "limited_actions_policy_enabled": False,
            "interactive_text_enabled": True,
            "responsive_web_text_conversations_enabled": False,
-            "longform_notetweets_richtext_consumption_enabled": False,
+            "longform_notetweets_rich_text_read_enabled": True,
+            "longform_notetweets_inline_media_enabled": False,
             "responsive_web_enhance_cards_enabled": False,
         }
 
     def tweet_detail(self, tweet_id):
-        endpoint = "/graphql/AV_lPTkN6Fc6LgerQpK8Zg/TweetDetail"
+        endpoint = "/graphql/JlLZj42Ltr2qwjasw-l5lQ/TweetDetail"
         variables = {
             "focalTweetId": tweet_id,
             "referrer": "profile",
@@ -1179,9 +1181,7 @@ class TwitterAPI():
             "includePromotedContent": True,
             "withCommunity": True,
             "withQuickPromoteEligibilityTweetFields": True,
-            "withBirdwatchNotes": False,
-            "withSuperFollowsUserFields": True,
-            "withSuperFollowsTweetFields": True,
+            "withBirdwatchNotes": True,
             "withVoice": True,
             "withV2Timeline": True,
         }
@@ -1189,7 +1189,7 @@ class TwitterAPI():
             endpoint, variables,
             ("threaded_conversation_with_injections_v2",))
 
     def user_tweets(self, screen_name):
-        endpoint = "/graphql/BeHK76TOCY3P8nO-FWocjA/UserTweets"
+        endpoint = "/graphql/-AY51QoFpVf-w7TxjQ6lpw/UserTweets"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1201,7 +1201,7 @@ class TwitterAPI():
         return self._pagination_tweets(endpoint, variables)
 
     def user_tweets_and_replies(self, screen_name):
-        endpoint = "/graphql/eZVlZu_1gwb6hMUDXBnZoQ/UserTweetsAndReplies"
+        endpoint = "/graphql/urrCZMyyIh1FkSFi2cdPUA/UserTweetsAndReplies"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1213,7 +1213,7 @@ class TwitterAPI():
         return self._pagination_tweets(endpoint, variables)
 
     def user_media(self, screen_name):
-        endpoint = "/graphql/d_ONZLUHGCsErBCriRsLXg/UserMedia"
+        endpoint = "/graphql/lo965xQZdN2-eSM1Jc-W_A/UserMedia"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1246,7 +1246,7 @@ class TwitterAPI():
             features=False)
 
     def user_likes(self, screen_name):
-        endpoint = "/graphql/fN4-E0MjFJ9Cn7IYConL7g/Likes"
+        endpoint = "/graphql/6JET1d0iHsIzW0Zjs3OOwQ/Likes"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1259,7 +1259,7 @@ class TwitterAPI():
         return self._pagination_tweets(endpoint, variables)
 
     def user_bookmarks(self):
-        endpoint = "/graphql/RV1g3b8n_SGOHwkqKYSCFw/Bookmarks"
+        endpoint = "/graphql/YNtYqNuki6_oiVwx0uP8mQ/Bookmarks"
         variables = {
             "count": 100,
         }
@@ -1270,7 +1270,7 @@ class TwitterAPI():
             features=features)
 
     def list_latest_tweets_timeline(self, list_id):
-        endpoint = "/graphql/5DAiJG3bD77SiWEs4xViBw/ListLatestTweetsTimeline"
+        endpoint = "/graphql/ZBbXrl37E6za5ml-DIpmgg/ListLatestTweetsTimeline"
         variables = {
             "listId": list_id,
             "count": 100,
@@ -1288,6 +1288,24 @@ class TwitterAPI():
             params["spelling_corrections"] = "1"
         return self._pagination_legacy(endpoint, params)
 
+    def search_timeline(self, query):
+        endpoint = "/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline"
+        variables = {
+            "rawQuery": query,
+            "count": 20,
+            "product": "Latest",
+            "withDownvotePerspective": False,
+            "withReactionsMetadata": False,
+            "withReactionsPerspective": False,
+        }
+        features = self.features_pagination.copy()
+        features["blue_business_profile_image_shape_enabled"] = False
+        features["vibe_api_enabled"] = True
+        return self._pagination_tweets(
+            endpoint, variables,
+            ("search_by_raw_query", "search_timeline", "timeline"),
+            features=features)
+
     def live_event_timeline(self, event_id):
         endpoint = "/2/live_event/timeline/{}.json".format(event_id)
         params = self.params.copy()
@@ -1305,11 +1323,10 @@ class TwitterAPI():
                 ["twitter_objects"]["live_events"][event_id])
 
     def list_by_rest_id(self, list_id):
-        endpoint = "/graphql/D0EoyrDcct2MEqC-LnPzFg/ListByRestId"
+        endpoint = "/graphql/AmCdeFUvlrKAO96yHr-GCg/ListByRestId"
         params = {
             "variables": self._json_dumps({
                 "listId": list_id,
-                "withSuperFollowsUserFields": True,
             }),
             "features": self._json_dumps(self.features),
         }
@@ -1319,7 +1336,7 @@ class TwitterAPI():
             raise exception.NotFoundError("list")
 
     def list_members(self, list_id):
-        endpoint = "/graphql/tzsIIbGUH9RyFCVmtO2W2w/ListMembers"
+        endpoint = "/graphql/a_ZQomd3MMk1crWkeiQBPg/ListMembers"
         variables = {
             "listId": list_id,
             "count": 100,
@@ -1329,7 +1346,7 @@ class TwitterAPI():
             endpoint, variables, ("list", "members_timeline", "timeline"))
 
     def user_following(self, screen_name):
-        endpoint = "/graphql/FaBzCqZXuQCb4PhB0RHqHw/Following"
+        endpoint = "/graphql/JPZiqKjET7_M1r5Tlr8pyA/Following"
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
@@ -1338,18 +1355,20 @@ class TwitterAPI():
         return self._pagination_users(endpoint, variables)
 
     def user_by_rest_id(self, rest_id):
-        endpoint = "/graphql/S2BkcAyFMG--jef2N6Dgzw/UserByRestId"
+        endpoint = "/graphql/1YAM811Q8Ry4XyPpJclURQ/UserByRestId"
+        features = self.features.copy()
+        features["blue_business_profile_image_shape_enabled"] = True
         params = {
             "variables": self._json_dumps({
                 "userId": rest_id,
                 "withSafetyModeUserFields": True,
             }),
-            "features": self._json_dumps(self.features),
+            "features": self._json_dumps(features),
         }
         return self._call(endpoint, params)["data"]["user"]["result"]
 
     def user_by_screen_name(self, screen_name):
-        endpoint = "/graphql/k26ASEiniqy4eXMdknTSoQ/UserByScreenName"
+        endpoint = "/graphql/XA6F1nJELYg65hxOC2Ekmg/UserByScreenName"
         params = {
             "variables": self._json_dumps({
                 "screen_name": screen_name,
@@ -1380,7 +1399,9 @@ class TwitterAPI():
     def _guest_token(self):
         endpoint = "/1.1/guest/activate.json"
         self.extractor.log.info("Requesting guest token")
-        return str(self._call(endpoint, None, "POST", False)["guest_token"])
+        return str(self._call(
+            endpoint, None, "POST", False, "https://api.twitter.com",
+        )["guest_token"])
 
     def _authenticate_guest(self):
         guest_token = self._guest_token()
@@ -1389,8 +1410,8 @@ class TwitterAPI():
         self.extractor.session.cookies.set(
             "gt", guest_token, domain=self.extractor.cookiedomain)
 
-    def _call(self, endpoint, params, method="GET", auth=True):
-        url = self.root + endpoint
+    def _call(self, endpoint, params, method="GET", auth=True, root=None):
+        url = (root or self.root) + endpoint
 
         while True:
             if not self.headers["x-twitter-auth-type"] and auth:
@@ -1416,6 +1437,12 @@ class TwitterAPI():
                 self.extractor.wait(until=until, seconds=seconds)
                 continue
 
+            if response.status_code == 403 and \
+                    not self.headers["x-twitter-auth-type"] and \
+                    endpoint == "/2/search/adaptive.json":
+                raise exception.AuthorizationError(
+                    "Login required to access search results")
+
             # error
             try:
                 data = response.json()
@@ -1524,7 +1551,6 @@ class TwitterAPI():
     def _pagination_tweets(self, endpoint, variables,
                            path=None, stop_tweets=True, features=None):
         extr = self.extractor
-        variables.update(self.variables)
         original_retweets = (extr.retweets == "original")
         pinned_tweet = extr.pinned
 
@@ -1548,11 +1574,17 @@ class TwitterAPI():
                         instructions = instructions[key]
                 instructions = instructions["instructions"]
 
+                cursor = None
+                entries = None
                 for instr in instructions:
-                    if instr.get("type") == "TimelineAddEntries":
+                    instr_type = instr.get("type")
+                    if instr_type == "TimelineAddEntries":
                         entries = instr["entries"]
-                        break
-                else:
+                    elif instr_type == "TimelineReplaceEntry":
+                        entry = instr["entry"]
+                        if entry["entryId"].startswith("cursor-bottom-"):
+                            cursor = entry["content"]["value"]
+                if entries is None:
                     raise KeyError()
 
             except LookupError:
@@ -1581,7 +1613,7 @@ class TwitterAPI():
                     "Unable to retrieve Tweets from this timeline")
 
             tweets = []
-            tweet = cursor = None
+            tweet = None
 
             if pinned_tweet:
                 pinned_tweet = False
@@ -1687,7 +1719,6 @@ class TwitterAPI():
             variables["cursor"] = cursor
 
     def _pagination_users(self, endpoint, variables, path=None):
-        variables.update(self.variables)
         params = {"variables": None,
                   "features" : self._json_dumps(self.features_pagination)}
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index 1cebdf7..6dff01c 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -9,7 +9,10 @@
 """Extractors for https://vipergirls.to/"""
 
 from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
+from ..cache import cache
+
+from xml.etree import ElementTree
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?vipergirls\.to"
 
@@ -18,26 +21,50 @@ class VipergirlsExtractor(Extractor):
     """Base class for vipergirls extractors"""
     category = "vipergirls"
     root = "https://vipergirls.to"
+    request_interval = 0.5
+    request_interval_min = 0.2
+    cookiedomain = ".vipergirls.to"
+    cookienames = ("vg_userid", "vg_password")
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.session.headers["Referer"] = self.root
 
     def items(self):
-        for html in self.posts():
-
-            pos = html.find('<a href="')
-            if pos < 0:
-                continue
+        self.login()
 
-            title = text.extr(html, '<h2 class="title', '<')
-            data = {
-                "title": text.unescape(title.partition(">")[2].strip()),
-            }
+        for post in self.posts():
+            data = post.attrib
+            data["thread_id"] = self.thread_id
 
             yield Message.Directory, data
-            for href in text.extract_iter(html, '<a href="', '"', pos):
-                yield Message.Queue, href, data
+            for image in post:
+                yield Message.Queue, image.attrib["main_url"], data
+
+    def login(self):
+        if not self._check_cookies(self.cookienames):
+            username, password = self._get_auth_info()
+            if username:
+                self._update_cookies(self._login_impl(username, password))
+
+    @cache(maxage=90*24*3600, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        url = "{}/login.php?do=login".format(self.root)
+        data = {
+            "vb_login_username": username,
+            "vb_login_password": password,
+            "do"               : "login",
+            "cookieuser"       : "1",
+        }
+
+        response = self.request(url, method="POST", data=data)
+        if not response.cookies.get("vg_password"):
+            raise exception.AuthenticationError()
+
+        return {cookie.name: cookie.value
+                for cookie in response.cookies}
 
 
 class VipergirlsThreadExtractor(VipergirlsExtractor):
@@ -47,11 +74,11 @@ class VipergirlsThreadExtractor(VipergirlsExtractor):
     test = (
         (("https://vipergirls.to/threads/4328304"
           "-2011-05-28-Danica-Simply-Beautiful-x112-4500x3000"), {
-            "url": "b22feaa35a358bb36086c2b9353aee28989e1d7a",
-            "count": 227,
+            "url": "0d75cb42777f5bebc0d284d1d38cb90c750c61d9",
+            "count": 225,
         }),
         ("https://vipergirls.to/threads/6858916-Karina/page4", {
-            "count": 1294,
+            "count": 1279,
         }),
         ("https://vipergirls.to/threads/4328304"),
     )
 
     def __init__(self, match):
         VipergirlsExtractor.__init__(self, match)
         self.thread_id, self.page = match.groups()
 
     def posts(self):
-        url = "{}/threads/{}{}".format(
-            self.root, self.thread_id, self.page or "")
-
-        while True:
-            page = self.request(url).text
-            yield from text.extract_iter(
-                page, '<div class="postbody">', '</blockquote>')
-
-            url = text.extr(page, '<a rel="next" href="', '"')
-            if not url:
-                return
-            url = "{}/{}".format(self.root, url)
+        url = "{}/vr.php?t={}".format(self.root, self.thread_id)
+        root = ElementTree.fromstring(self.request(url).text)
+        posts = root.iter("post")
+
+        if self.page:
+            util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
+        return posts
 
 
 class VipergirlsPostExtractor(VipergirlsExtractor):
     """Extractor for vipergirls posts"""
     subcategory = "post"
     pattern = (BASE_PATTERN +
-               r"/threads/(\d+)(?:-[^/?#]+)?\?(p=\d+[^#]*)#post(\d+)")
+               r"/threads/(\d+)(?:-[^/?#]+)?\?p=\d+[^#]*#post(\d+)")
     test = (
         (("https://vipergirls.to/threads/4328304-2011-05-28-Danica-Simply-"
           "Beautiful-x112-4500x3000?p=116038081&viewfull=1#post116038081"), {
@@ -87,6 +109,10 @@ class VipergirlsPostExtractor(VipergirlsExtractor):
             "range": "2-113",
             "count": 112,
             "keyword": {
+                "id": "116038081",
+                "imagecount": "113",
+                "number": "116038081",
+                "thread_id": "4328304",
                 "title": "FemJoy Danica - Simply Beautiful (x112) 3000x4500",
             },
         }),
     )
 
     def __init__(self, match):
         VipergirlsExtractor.__init__(self, match)
-        self.thread_id, self.query, self.post_id = match.groups()
+        self.thread_id, self.post_id = match.groups()
 
     def posts(self):
-        url = "{}/threads/{}?{}".format(self.root, self.thread_id, self.query)
-        page = self.request(url).text
-
-        try:
-            pos = page.index('id="post_' + self.post_id + '"')
-            return (text.extract(
-                page, '<div class="postbody">', '</blockquote>', pos)[0],)
-        except Exception:
-            raise exception.NotFoundError("post")
+        url = "{}/vr.php?p={}".format(self.root, self.post_id)
+        root = ElementTree.fromstring(self.request(url).text)
+        return root.iter("post")
diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py
index 06f1aab..a0fba3c 100644
--- a/gallery_dl/extractor/wallhaven.py
+++ b/gallery_dl/extractor/wallhaven.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,15 +9,16 @@
 """Extractors for https://wallhaven.cc/"""
 
 from .common import Extractor, Message
-from .. import text
+from .. import text, exception
 
 
 class WallhavenExtractor(Extractor):
     """Base class for wallhaven extractors"""
     category = "wallhaven"
+    root = "https://wallhaven.cc"
     filename_fmt = "{category}_{id}_{resolution}.{extension}"
     archive_fmt = "{id}"
-    root = "https://wallhaven.cc"
+    request_interval = 1.4
 
     def __init__(self, match):
         Extractor.__init__(self, match)
@@ -246,8 +247,21 @@ class WallhavenAPI():
 
     def _call(self, endpoint, params=None):
         url = "https://wallhaven.cc/api" + endpoint
-        return self.extractor.request(
-            url, headers=self.headers, params=params).json()
+
+        while True:
+            response = self.extractor.request(
+                url, params=params, headers=self.headers, fatal=None)
+
+            if response.status_code < 400:
+                return response.json()
+            if response.status_code == 429:
+                self.extractor.wait(seconds=60)
+                continue
+
+            self.extractor.log.debug("Server response: %s", response.text)
+            raise exception.StopExtraction(
+                "API request failed (%s: %s)",
+                response.status_code, response.reason)
 
     def _pagination(self, endpoint, params=None, metadata=None):
         if params is None:
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 2cbfad6..805aa53 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -72,6 +72,8 @@ class WeiboExtractor(Extractor):
                 file["url"] = "https:" + file["url"][5:]
             if "filename" not in file:
                 text.nameext_from_url(file["url"], file)
+                if file["extension"] == "json":
+                    file["extension"] = "mp4"
             file["status"] = status
             file["num"] = num
             yield Message.Url, file["url"], file
@@ -123,7 +125,7 @@ class WeiboExtractor(Extractor):
                            key=lambda m: m["meta"]["quality_index"])
             except Exception:
                 return {"url": (info.get("stream_url_hd") or
-                                info["stream_url"])}
+                                info.get("stream_url") or "")}
             else:
                 return media["play_info"].copy()
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index 2ff48c3..500eaa1 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -437,6 +437,7 @@ _CONVERSIONS = {
     "T": util.datetime_to_timestamp_string,
     "d": text.parse_timestamp,
     "U": text.unescape,
+    "H": lambda s: text.unescape(text.remove_html(s)),
     "g": text.slugify,
     "S": util.to_string,
     "s": str,
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 3e0290c..09b8612 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.25.5"
+__version__ = "1.25.6"