author     Unit 193 <unit193@unit193.net>  2024-08-12 02:42:36 -0400
committer  Unit 193 <unit193@unit193.net>  2024-08-12 02:42:36 -0400
commit     b5e56c51e491b41f9eb6a895459c185788a377e5 (patch)
tree       f933c7df043d8949e0dc39b560ab534a5d0dc60f  /gallery_dl/extractor
parent     032e5bed275a253e122ed9ac86dac7b8c4204172 (diff)
New upstream version 1.27.3 (upstream/1.27.3)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/behance.py         | 15
-rw-r--r--  gallery_dl/extractor/bunkr.py           | 10
-rw-r--r--  gallery_dl/extractor/cien.py            |  2
-rw-r--r--  gallery_dl/extractor/deviantart.py      | 17
-rw-r--r--  gallery_dl/extractor/fanbox.py          | 16
-rw-r--r--  gallery_dl/extractor/furaffinity.py     | 40
-rw-r--r--  gallery_dl/extractor/hentaicosplays.py  | 10
-rw-r--r--  gallery_dl/extractor/hotleak.py         |  2
-rw-r--r--  gallery_dl/extractor/instagram.py       |  8
-rw-r--r--  gallery_dl/extractor/twitter.py         |  5
-rw-r--r--  gallery_dl/extractor/zerochan.py        |  8
11 files changed, 93 insertions, 40 deletions
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index f24059f..72f9195 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -49,7 +49,7 @@ class BehanceExtractor(Extractor):
def _update(self, data):
# compress data to simple lists
- if data["fields"] and isinstance(data["fields"][0], dict):
+ if data.get("fields") and isinstance(data["fields"][0], dict):
data["fields"] = [
field.get("name") or field.get("label")
for field in data["fields"]
@@ -165,6 +165,19 @@ class BehanceGalleryExtractor(BehanceExtractor):
elif mtype == "video":
try:
+ url = text.extr(module["embed"], 'src="', '"')
+ page = self.request(text.unescape(url)).text
+
+ url = text.extr(page, '<source src="', '"')
+ if text.ext_from_url(url) == "m3u8":
+ url = "ytdl:" + url
+ module["extension"] = "mp4"
+ append((url, module))
+ continue
+ except Exception as exc:
+ self.log.debug("%s: %s", exc.__class__.__name__, exc)
+
+ try:
renditions = module["videoData"]["renditions"]
except Exception:
self.log.warning("No download URLs for video %s",
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 77f0de6..240bbd3 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -13,7 +13,7 @@ from .. import text
BASE_PATTERN = (
r"(?:https?://)?(?:app\.)?(bunkr+"
- r"\.(?:s[kiu]|fi|ru|la|is|to|ac|black|cat|media|red|site|ws))"
+ r"\.(?:s[kiu]|[cf]i|ru|la|is|to|ac|black|cat|media|red|site|ws|org))"
)
LEGACY_DOMAINS = {
@@ -55,6 +55,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
"album_name" : text.unescape(info[0]),
"album_size" : size[1:-1],
"count" : len(urls),
+ "_http_validate": self._validate,
}
def _extract_files(self, urls):
@@ -74,6 +75,12 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
text.rextract(page, ' href="', '"', page.rindex("Download"))[0]
)
+ def _validate(self, response):
+ if response.history and response.url.endswith("/maintenance-vid.mp4"):
+ self.log.warning("File server in maintenance mode")
+ return False
+ return True
+
class BunkrMediaExtractor(BunkrAlbumExtractor):
"""Extractor for bunkr.sk media links"""
@@ -95,4 +102,5 @@ class BunkrMediaExtractor(BunkrAlbumExtractor):
"album_size" : -1,
"description": "",
"count" : 1,
+ "_http_validate": self._validate,
}
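
Note: a quick standalone check of the widened BASE_PATTERN above (newly covered TLDs include .ci and .org); the sample URLs are made up and the pattern is copied verbatim from the hunk:

    import re

    BUNKR_PATTERN = (
        r"(?:https?://)?(?:app\.)?(bunkr+"
        r"\.(?:s[kiu]|[cf]i|ru|la|is|to|ac|black|cat|media|red|site|ws|org))"
    )

    for sample in ("https://bunkr.ci/a/xxxxxxxx",
                   "https://bunkr.org/a/xxxxxxxx",
                   "https://app.bunkr.ru/a/xxxxxxxx"):
        match = re.match(BUNKR_PATTERN, sample)
        print(sample, "->", match.group(1) if match else "no match")
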
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
index bae86d0..378365e 100644
--- a/gallery_dl/extractor/cien.py
+++ b/gallery_dl/extractor/cien.py
@@ -59,7 +59,7 @@ class CienArticleExtractor(CienExtractor):
post = util.json_loads(text.extr(
page, '<script type="application/ld+json">', '</script>'))[0]
- files = self._extract_files(post.get("articleBody") or page)
+ files = self._extract_files(page)
post["post_url"] = url
post["post_id"] = text.parse_int(self.groups[1])
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index a70710c..f3ea4e7 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -12,7 +12,6 @@ from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache, memcache
import collections
-import itertools
import mimetypes
import binascii
import time
@@ -246,7 +245,6 @@ class DeviantartExtractor(Extractor):
deviation["username"] = deviation["author"]["username"]
deviation["_username"] = deviation["username"].lower()
- deviation["da_category"] = deviation["category"]
deviation["published_time"] = text.parse_int(
deviation["published_time"])
deviation["date"] = text.parse_timestamp(
@@ -301,15 +299,6 @@ class DeviantartExtractor(Extractor):
)
else:
needle = '<div usr class="gr">'
- catlist = deviation["category_path"].split("/")
- categories = " / ".join(
- ('<span class="crumb"><a href="{}/{}/"><span>{}</span></a>'
- '</span>').format(self.root, cpath, cat.capitalize())
- for cat, cpath in zip(
- catlist,
- itertools.accumulate(catlist, lambda t, c: t + "/" + c)
- )
- )
username = deviation["author"]["username"]
urlname = deviation.get("username") or username.lower()
header = HEADER_TEMPLATE.format(
@@ -318,7 +307,6 @@ class DeviantartExtractor(Extractor):
userurl="{}/{}/".format(self.root, urlname),
username=username,
date=deviation["date"],
- categories=categories,
)
if needle in html:
@@ -624,7 +612,7 @@ class DeviantartAvatarExtractor(DeviantartExtractor):
def _make_deviation(self, url, user, index, fmt):
return {
"author" : user,
- "category" : "avatar",
+ "da_category" : "avatar",
"index" : text.parse_int(index),
"is_deleted" : False,
"is_downloadable": False,
@@ -1773,9 +1761,6 @@ HEADER_TEMPLATE = """<div usr class="gr">
<span class="user-symbol regular"></span></span></span>,
<span>{date}</span>
</li>
- <li class="category">
- {categories}
- </li>
</ul>
</div>
"""
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index d81fd0b..d8337b6 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -309,8 +309,20 @@ class FanboxCreatorExtractor(FanboxExtractor):
self.creator_id = match.group(1) or match.group(2)
def posts(self):
- url = "https://api.fanbox.cc/post.listCreator?creatorId={}&limit=10"
- return self._pagination(url.format(self.creator_id))
+ url = "https://api.fanbox.cc/post.paginateCreator?creatorId="
+ return self._pagination_creator(url + self.creator_id)
+
+ def _pagination_creator(self, url):
+ urls = self.request(url, headers=self.headers).json()["body"]
+ for url in urls:
+ url = text.ensure_http_scheme(url)
+ body = self.request(url, headers=self.headers).json()["body"]
+ for item in body:
+ try:
+ yield self._get_post_data(item["id"])
+ except Exception as exc:
+ self.log.warning("Skipping post %s (%s: %s)",
+ item["id"], exc.__class__.__name__, exc)
class FanboxPostExtractor(FanboxExtractor):
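
Note: creator pagination now goes through post.paginateCreator, which returns a list of page URLs whose bodies each contain post items. A rough sketch of that two-step flow with plain requests (creator ID and headers are placeholders; real requests also need the account's auth cookies):

    import requests

    creator_id = "CREATOR"  # hypothetical creator ID
    headers = {"Origin": "https://www.fanbox.cc"}

    # step 1: paginateCreator returns a list of page URLs
    pages = requests.get(
        "https://api.fanbox.cc/post.paginateCreator?creatorId=" + creator_id,
        headers=headers).json()["body"]

    # step 2: each page URL yields a batch of post items
    for page_url in pages:
        for item in requests.get(page_url, headers=headers).json()["body"]:
            print(item["id"])
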
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index f48a984..3055426 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -78,14 +78,12 @@ class FuraffinityExtractor(Extractor):
path = extr('href="//d', '"')
if not path:
- self.log.warning(
- "Unable to download post %s (\"%s\")",
- post_id, text.remove_html(
- extr('System Message', '</section>') or
- extr('System Message', '</table>')
- )
- )
- return None
+ msg = text.remove_html(
+ extr('System Message', '</section>') or
+ extr('System Message', '</table>')
+ ).partition(" . Continue ")[0]
+ return self.log.warning(
+ "Unable to download post %s (\"%s\")", post_id, msg)
pi = text.parse_int
rh = text.remove_html
@@ -335,3 +333,29 @@ class FuraffinityFollowingExtractor(FuraffinityExtractor):
if url.endswith(path):
return
url = self.root + path
+
+
+class FuraffinitySubmissionsExtractor(FuraffinityExtractor):
+ """Extractor for new furaffinity submissions"""
+ subcategory = "submissions"
+ pattern = BASE_PATTERN + r"(/msg/submissions(?:/[^/?#]+)?)"
+ example = "https://www.furaffinity.net/msg/submissions"
+
+ def posts(self):
+ self.user = None
+ url = self.root + self.groups[0]
+ return self._pagination_submissions(url)
+
+ def _pagination_submissions(self, url):
+ while True:
+ page = self.request(url).text
+
+ for post_id in text.extract_iter(page, 'id="sid-', '"'):
+ yield post_id
+
+ path = (text.extr(page, '<a class="button standard more" href="', '"') or # noqa 501
+ text.extr(page, '<a class="more-half" href="', '"') or
+ text.extr(page, '<a class="more" href="', '"'))
+ if not path:
+ return
+ url = self.root + text.unescape(path)
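
Note: a standalone sketch of the new submissions pagination, using re/requests instead of gallery_dl's text helpers; the selectors come from the hunk above, and a logged-in session with FurAffinity cookies is assumed:

    import re
    import requests

    def submission_ids(session, root="https://www.furaffinity.net",
                       path="/msg/submissions"):
        url = root + path
        while True:
            page = session.get(url).text
            # submission containers carry id="sid-<number>"
            yield from re.findall(r'id="sid-(\d+)"', page)
            # follow whichever "more" button variant the page uses
            more = (re.search(r'<a class="button standard more" href="([^"]+)"', page) or
                    re.search(r'<a class="more(?:-half)?" href="([^"]+)"', page))
            if not more:
                return
            url = root + more.group(1)
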
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
index d5ff8c8..fbbae16 100644
--- a/gallery_dl/extractor/hentaicosplays.py
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -4,7 +4,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://hentai-cosplays.com/
+"""Extractors for https://hentai-cosplay-xxx.com/
(also works for hentai-img.com and porn-images-xxx.com)"""
from .common import GalleryExtractor
@@ -13,19 +13,21 @@ from .. import text
class HentaicosplaysGalleryExtractor(GalleryExtractor):
"""Extractor for image galleries from
- hentai-cosplays.com, hentai-img.com, and porn-images-xxx.com"""
+ hentai-cosplay-xxx.com, hentai-img.com, and porn-images-xxx.com"""
category = "hentaicosplays"
directory_fmt = ("{site}", "{title}")
filename_fmt = "{filename}.{extension}"
archive_fmt = "{title}_{filename}"
pattern = r"((?:https?://)?(?:\w{2}\.)?" \
- r"(hentai-cosplays|hentai-img|porn-images-xxx)\.com)/" \
+ r"(hentai-cosplay(?:s|-xxx)|hentai-img|porn-images-xxx)\.com)/" \
r"(?:image|story)/([\w-]+)"
- example = "https://hentai-cosplays.com/image/TITLE/"
+ example = "https://hentai-cosplay-xxx.com/image/TITLE/"
def __init__(self, match):
root, self.site, self.slug = match.groups()
self.root = text.ensure_http_scheme(root)
+ if self.root == "https://hentai-cosplays.com":
+ self.root = "https://hentai-cosplay-xxx.com"
url = "{}/story/{}/".format(self.root, self.slug)
GalleryExtractor.__init__(self, match, url)
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index 34fbabd..ddfc54b 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -23,7 +23,7 @@ class HotleakExtractor(Extractor):
def items(self):
for post in self.posts():
- if self.type == "photo":
+ if not post["url"].startswith("ytdl:"):
post["url"] = (
post["url"]
.replace("/storage/storage/", "/storage/")
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index dbe2df3..c05fe72 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -936,23 +936,23 @@ class InstagramGraphqlAPI():
def tags_media(self, tag):
query_hash = "9b498c08113f1e09617a1703c22b2f32"
- variables = {"tag_name": text.unescape(tag), "first": 50}
+ variables = {"tag_name": text.unescape(tag), "first": 24}
return self._pagination(query_hash, variables,
"hashtag", "edge_hashtag_to_media")
def user_clips(self, user_id):
query_hash = "bc78b344a68ed16dd5d7f264681c4c76"
- variables = {"id": user_id, "first": 50}
+ variables = {"id": user_id, "first": 24}
return self._pagination(query_hash, variables)
def user_feed(self, user_id):
query_hash = "69cba40317214236af40e7efa697781d"
- variables = {"id": user_id, "first": 50}
+ variables = {"id": user_id, "first": 24}
return self._pagination(query_hash, variables)
def user_tagged(self, user_id):
query_hash = "be13233562af2d229b008d2976b998b5"
- variables = {"id": user_id, "first": 50}
+ variables = {"id": user_id, "first": 24}
return self._pagination(query_hash, variables)
def _call(self, query_hash, variables):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 9fa5b3f..ea57d76 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1849,6 +1849,11 @@ def _login_impl(extr, username, password):
url, params=params, headers=headers, json=data,
method="POST", fatal=None)
+ # update 'x-csrf-token' header (#5945)
+ csrf_token = response.cookies.get("ct0")
+ if csrf_token:
+ headers["x-csrf-token"] = csrf_token
+
try:
data = response.json()
except ValueError:
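
Note: the change above keeps the x-csrf-token header in step with the rotating ct0 cookie during the login flow (#5945). A minimal sketch of the same idea with requests; the endpoint and payload are placeholders, not the full onboarding task flow:

    import requests

    session = requests.Session()
    headers = {"x-csrf-token": session.cookies.get("ct0", "")}

    response = session.post("https://api.twitter.com/1.1/onboarding/task.json",
                            headers=headers, json={})

    # the server may rotate ct0 on any response; mirror it into the header
    csrf_token = response.cookies.get("ct0")
    if csrf_token:
        headers["x-csrf-token"] = csrf_token
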
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 126ef49..f9b1a7f 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -198,11 +198,15 @@ class ZerochanTagExtractor(ZerochanExtractor):
while True:
response = self.request(url, params=params, allow_redirects=False)
+
if response.status_code >= 300:
url = text.urljoin(self.root, response.headers["location"])
- response = self.request(url, params=params)
- data = response.json()
+ self.log.warning("HTTP redirect to %s", url)
+ if self.config("redirects"):
+ continue
+ raise exception.StopExtraction()
+ data = response.json()
try:
posts = data["items"]
except Exception: