diff options
Diffstat (limited to 'gallery_dl/extractor')
| -rw-r--r-- | gallery_dl/extractor/behance.py | 15 | ||||
| -rw-r--r-- | gallery_dl/extractor/bunkr.py | 10 | ||||
| -rw-r--r-- | gallery_dl/extractor/cien.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/deviantart.py | 17 | ||||
| -rw-r--r-- | gallery_dl/extractor/fanbox.py | 16 | ||||
| -rw-r--r-- | gallery_dl/extractor/furaffinity.py | 40 | ||||
| -rw-r--r-- | gallery_dl/extractor/hentaicosplays.py | 10 | ||||
| -rw-r--r-- | gallery_dl/extractor/hotleak.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/instagram.py | 8 | ||||
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 5 | ||||
| -rw-r--r-- | gallery_dl/extractor/zerochan.py | 8 |
11 files changed, 93 insertions, 40 deletions
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index f24059f..72f9195 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -49,7 +49,7 @@ class BehanceExtractor(Extractor): def _update(self, data): # compress data to simple lists - if data["fields"] and isinstance(data["fields"][0], dict): + if data.get("fields") and isinstance(data["fields"][0], dict): data["fields"] = [ field.get("name") or field.get("label") for field in data["fields"] @@ -165,6 +165,19 @@ class BehanceGalleryExtractor(BehanceExtractor): elif mtype == "video": try: + url = text.extr(module["embed"], 'src="', '"') + page = self.request(text.unescape(url)).text + + url = text.extr(page, '<source src="', '"') + if text.ext_from_url(url) == "m3u8": + url = "ytdl:" + url + module["extension"] = "mp4" + append((url, module)) + continue + except Exception as exc: + self.log.debug("%s: %s", exc.__class__.__name__, exc) + + try: renditions = module["videoData"]["renditions"] except Exception: self.log.warning("No download URLs for video %s", diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 77f0de6..240bbd3 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -13,7 +13,7 @@ from .. import text BASE_PATTERN = ( r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|fi|ru|la|is|to|ac|black|cat|media|red|site|ws))" + r"\.(?:s[kiu]|[cf]i|ru|la|is|to|ac|black|cat|media|red|site|ws|org))" ) LEGACY_DOMAINS = { @@ -55,6 +55,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "album_name" : text.unescape(info[0]), "album_size" : size[1:-1], "count" : len(urls), + "_http_validate": self._validate, } def _extract_files(self, urls): @@ -74,6 +75,12 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): text.rextract(page, ' href="', '"', page.rindex("Download"))[0] ) + def _validate(self, response): + if response.history and response.url.endswith("/maintenance-vid.mp4"): + self.log.warning("File server in maintenance mode") + return False + return True + class BunkrMediaExtractor(BunkrAlbumExtractor): """Extractor for bunkr.sk media links""" @@ -95,4 +102,5 @@ class BunkrMediaExtractor(BunkrAlbumExtractor): "album_size" : -1, "description": "", "count" : 1, + "_http_validate": self._validate, } diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py index bae86d0..378365e 100644 --- a/gallery_dl/extractor/cien.py +++ b/gallery_dl/extractor/cien.py @@ -59,7 +59,7 @@ class CienArticleExtractor(CienExtractor): post = util.json_loads(text.extr( page, '<script type="application/ld+json">', '</script>'))[0] - files = self._extract_files(post.get("articleBody") or page) + files = self._extract_files(page) post["post_url"] = url post["post_id"] = text.parse_int(self.groups[1]) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index a70710c..f3ea4e7 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -12,7 +12,6 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache, memcache import collections -import itertools import mimetypes import binascii import time @@ -246,7 +245,6 @@ class DeviantartExtractor(Extractor): deviation["username"] = deviation["author"]["username"] deviation["_username"] = deviation["username"].lower() - deviation["da_category"] = deviation["category"] deviation["published_time"] = text.parse_int( deviation["published_time"]) deviation["date"] = text.parse_timestamp( @@ -301,15 +299,6 @@ class DeviantartExtractor(Extractor): ) else: needle = '<div usr class="gr">' - catlist = deviation["category_path"].split("/") - categories = " / ".join( - ('<span class="crumb"><a href="{}/{}/"><span>{}</span></a>' - '</span>').format(self.root, cpath, cat.capitalize()) - for cat, cpath in zip( - catlist, - itertools.accumulate(catlist, lambda t, c: t + "/" + c) - ) - ) username = deviation["author"]["username"] urlname = deviation.get("username") or username.lower() header = HEADER_TEMPLATE.format( @@ -318,7 +307,6 @@ class DeviantartExtractor(Extractor): userurl="{}/{}/".format(self.root, urlname), username=username, date=deviation["date"], - categories=categories, ) if needle in html: @@ -624,7 +612,7 @@ class DeviantartAvatarExtractor(DeviantartExtractor): def _make_deviation(self, url, user, index, fmt): return { "author" : user, - "category" : "avatar", + "da_category" : "avatar", "index" : text.parse_int(index), "is_deleted" : False, "is_downloadable": False, @@ -1773,9 +1761,6 @@ HEADER_TEMPLATE = """<div usr class="gr"> <span class="user-symbol regular"></span></span></span>, <span>{date}</span> </li> - <li class="category"> - {categories} - </li> </ul> </div> """ diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index d81fd0b..d8337b6 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -309,8 +309,20 @@ class FanboxCreatorExtractor(FanboxExtractor): self.creator_id = match.group(1) or match.group(2) def posts(self): - url = "https://api.fanbox.cc/post.listCreator?creatorId={}&limit=10" - return self._pagination(url.format(self.creator_id)) + url = "https://api.fanbox.cc/post.paginateCreator?creatorId=" + return self._pagination_creator(url + self.creator_id) + + def _pagination_creator(self, url): + urls = self.request(url, headers=self.headers).json()["body"] + for url in urls: + url = text.ensure_http_scheme(url) + body = self.request(url, headers=self.headers).json()["body"] + for item in body: + try: + yield self._get_post_data(item["id"]) + except Exception as exc: + self.log.warning("Skipping post %s (%s: %s)", + item["id"], exc.__class__.__name__, exc) class FanboxPostExtractor(FanboxExtractor): diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index f48a984..3055426 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -78,14 +78,12 @@ class FuraffinityExtractor(Extractor): path = extr('href="//d', '"') if not path: - self.log.warning( - "Unable to download post %s (\"%s\")", - post_id, text.remove_html( - extr('System Message', '</section>') or - extr('System Message', '</table>') - ) - ) - return None + msg = text.remove_html( + extr('System Message', '</section>') or + extr('System Message', '</table>') + ).partition(" . Continue ")[0] + return self.log.warning( + "Unable to download post %s (\"%s\")", post_id, msg) pi = text.parse_int rh = text.remove_html @@ -335,3 +333,29 @@ class FuraffinityFollowingExtractor(FuraffinityExtractor): if url.endswith(path): return url = self.root + path + + +class FuraffinitySubmissionsExtractor(FuraffinityExtractor): + """Extractor for new furaffinity submissions""" + subcategory = "submissions" + pattern = BASE_PATTERN + r"(/msg/submissions(?:/[^/?#]+)?)" + example = "https://www.furaffinity.net/msg/submissions" + + def posts(self): + self.user = None + url = self.root + self.groups[0] + return self._pagination_submissions(url) + + def _pagination_submissions(self, url): + while True: + page = self.request(url).text + + for post_id in text.extract_iter(page, 'id="sid-', '"'): + yield post_id + + path = (text.extr(page, '<a class="button standard more" href="', '"') or # noqa 501 + text.extr(page, '<a class="more-half" href="', '"') or + text.extr(page, '<a class="more" href="', '"')) + if not path: + return + url = self.root + text.unescape(path) diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index d5ff8c8..fbbae16 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -4,7 +4,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://hentai-cosplays.com/ +"""Extractors for https://hentai-cosplay-xxx.com/ (also works for hentai-img.com and porn-images-xxx.com)""" from .common import GalleryExtractor @@ -13,19 +13,21 @@ from .. import text class HentaicosplaysGalleryExtractor(GalleryExtractor): """Extractor for image galleries from - hentai-cosplays.com, hentai-img.com, and porn-images-xxx.com""" + hentai-cosplay-xxx.com, hentai-img.com, and porn-images-xxx.com""" category = "hentaicosplays" directory_fmt = ("{site}", "{title}") filename_fmt = "{filename}.{extension}" archive_fmt = "{title}_{filename}" pattern = r"((?:https?://)?(?:\w{2}\.)?" \ - r"(hentai-cosplays|hentai-img|porn-images-xxx)\.com)/" \ + r"(hentai-cosplay(?:s|-xxx)|hentai-img|porn-images-xxx)\.com)/" \ r"(?:image|story)/([\w-]+)" - example = "https://hentai-cosplays.com/image/TITLE/" + example = "https://hentai-cosplay-xxx.com/image/TITLE/" def __init__(self, match): root, self.site, self.slug = match.groups() self.root = text.ensure_http_scheme(root) + if self.root == "https://hentai-cosplays.com": + self.root = "https://hentai-cosplay-xxx.com" url = "{}/story/{}/".format(self.root, self.slug) GalleryExtractor.__init__(self, match, url) diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index 34fbabd..ddfc54b 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -23,7 +23,7 @@ class HotleakExtractor(Extractor): def items(self): for post in self.posts(): - if self.type == "photo": + if not post["url"].startswith("ytdl:"): post["url"] = ( post["url"] .replace("/storage/storage/", "/storage/") diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index dbe2df3..c05fe72 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -936,23 +936,23 @@ class InstagramGraphqlAPI(): def tags_media(self, tag): query_hash = "9b498c08113f1e09617a1703c22b2f32" - variables = {"tag_name": text.unescape(tag), "first": 50} + variables = {"tag_name": text.unescape(tag), "first": 24} return self._pagination(query_hash, variables, "hashtag", "edge_hashtag_to_media") def user_clips(self, user_id): query_hash = "bc78b344a68ed16dd5d7f264681c4c76" - variables = {"id": user_id, "first": 50} + variables = {"id": user_id, "first": 24} return self._pagination(query_hash, variables) def user_feed(self, user_id): query_hash = "69cba40317214236af40e7efa697781d" - variables = {"id": user_id, "first": 50} + variables = {"id": user_id, "first": 24} return self._pagination(query_hash, variables) def user_tagged(self, user_id): query_hash = "be13233562af2d229b008d2976b998b5" - variables = {"id": user_id, "first": 50} + variables = {"id": user_id, "first": 24} return self._pagination(query_hash, variables) def _call(self, query_hash, variables): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 9fa5b3f..ea57d76 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1849,6 +1849,11 @@ def _login_impl(extr, username, password): url, params=params, headers=headers, json=data, method="POST", fatal=None) + # update 'x-csrf-token' header (#5945) + csrf_token = response.cookies.get("ct0") + if csrf_token: + headers["x-csrf-token"] = csrf_token + try: data = response.json() except ValueError: diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 126ef49..f9b1a7f 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -198,11 +198,15 @@ class ZerochanTagExtractor(ZerochanExtractor): while True: response = self.request(url, params=params, allow_redirects=False) + if response.status_code >= 300: url = text.urljoin(self.root, response.headers["location"]) - response = self.request(url, params=params) - data = response.json() + self.log.warning("HTTP redirect to %s", url) + if self.config("redirects"): + continue + raise exception.StopExtraction() + data = response.json() try: posts = data["items"] except Exception: |
