author    Unit 193 <unit193@unit193.net>  2025-01-28 19:12:09 -0500
committer Unit 193 <unit193@unit193.net>  2025-01-28 19:12:09 -0500
commit    a26df18796ff4e506b16bf32fcec9336233b9e2e (patch)
tree      876512f59831cd670a90a0bc92bc85def6ea3d82 /gallery_dl/extractor
parent    0532a387ef5b7fcb4507a9b094dca37a5f635fe1 (diff)

New upstream version 1.28.5 (tag: upstream/1.28.5)
Diffstat (limited to 'gallery_dl/extractor')
 gallery_dl/extractor/4archive.py      |   2
 gallery_dl/extractor/__init__.py      |   4
 gallery_dl/extractor/adultempire.py   |   3
 gallery_dl/extractor/architizer.py    |  10
 gallery_dl/extractor/artstation.py    |  12
 gallery_dl/extractor/batoto.py        |   1
 gallery_dl/extractor/bunkr.py         |  10
 gallery_dl/extractor/cohost.py        | 250
 gallery_dl/extractor/danbooru.py      |   2
 gallery_dl/extractor/deviantart.py    |   2
 gallery_dl/extractor/e621.py          |  19
 gallery_dl/extractor/facebook.py      |  35
 gallery_dl/extractor/fanleaks.py      |  87
 gallery_dl/extractor/fapachi.py       |   3
 gallery_dl/extractor/hiperdex.py      |  12
 gallery_dl/extractor/imagehosts.py    |  28
 gallery_dl/extractor/issuu.py         |  32
 gallery_dl/extractor/kemonoparty.py   |  51
 gallery_dl/extractor/khinsider.py     |  26
 gallery_dl/extractor/komikcast.py     |  11
 gallery_dl/extractor/lofter.py        |   8
 gallery_dl/extractor/lolisafe.py      |   9
 gallery_dl/extractor/mangafox.py      |   6
 gallery_dl/extractor/mangahere.py     |   6
 gallery_dl/extractor/mangaread.py     |   6
 gallery_dl/extractor/nekohouse.py     | 122
 gallery_dl/extractor/pixiv.py         |  42
 gallery_dl/extractor/pornpics.py      |  22
 gallery_dl/extractor/rule34xyz.py     |   8
 gallery_dl/extractor/saint.py         |   1
 gallery_dl/extractor/shimmie2.py      |   4
 gallery_dl/extractor/szurubooru.py    |   8
 gallery_dl/extractor/toyhouse.py      |  10
 gallery_dl/extractor/twitter.py       |  44
 gallery_dl/extractor/urlgalleries.py  |  13
 gallery_dl/extractor/vsco.py          |   3
 gallery_dl/extractor/webtoons.py      |   4
 gallery_dl/extractor/weebcentral.py   |   6
 gallery_dl/extractor/xfolio.py        | 146
 gallery_dl/extractor/xhamster.py      |  74
 40 files changed, 606 insertions(+), 536 deletions(-)
diff --git a/gallery_dl/extractor/4archive.py b/gallery_dl/extractor/4archive.py
index 948a605..d198369 100644
--- a/gallery_dl/extractor/4archive.py
+++ b/gallery_dl/extractor/4archive.py
@@ -64,7 +64,7 @@ class _4archiveThreadExtractor(Extractor):
data = {
"name": extr('class="name">', "</span>"),
"date": text.parse_datetime(
- extr('class="dateTime postNum">', "<").strip(),
+ extr('class="dateTime postNum" >', "<").strip(),
"%Y-%m-%d %H:%M:%S"),
"no" : text.parse_int(extr('href="#p', '"')),
}
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index b582c99..fc8d7b2 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -39,7 +39,6 @@ modules = [
"chevereto",
"cien",
"civitai",
- "cohost",
"comicvine",
"cyberdrop",
"danbooru",
@@ -52,7 +51,6 @@ modules = [
"exhentai",
"facebook",
"fanbox",
- "fanleaks",
"fantia",
"fapello",
"fapachi",
@@ -116,6 +114,7 @@ modules = [
"myportfolio",
"naver",
"naverwebtoon",
+ "nekohouse",
"newgrounds",
"nhentai",
"nijie",
@@ -196,6 +195,7 @@ modules = [
"wikiart",
"wikifeet",
"wikimedia",
+ "xfolio",
"xhamster",
"xvideos",
"yiffverse",
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
index 1617414..c891b17 100644
--- a/gallery_dl/extractor/adultempire.py
+++ b/gallery_dl/extractor/adultempire.py
@@ -24,6 +24,9 @@ class AdultempireGalleryExtractor(GalleryExtractor):
GalleryExtractor.__init__(self, match)
self.gallery_id = match.group(2)
+ def _init(self):
+ self.cookies.set("ageConfirmed", "true", domain="www.adultempire.com")
+
def metadata(self, page):
extr = text.extract_from(page, page.index('<div id="content">'))
return {
diff --git a/gallery_dl/extractor/architizer.py b/gallery_dl/extractor/architizer.py
index 8064e78..0268224 100644
--- a/gallery_dl/extractor/architizer.py
+++ b/gallery_dl/extractor/architizer.py
@@ -32,10 +32,10 @@ class ArchitizerProjectExtractor(GalleryExtractor):
extr('id="Pages"', "")
return {
- "title" : extr('data-name="', '"'),
- "slug" : extr('data-slug="', '"'),
- "gid" : extr('data-gid="', '"').rpartition(".")[2],
- "firm" : extr('data-firm-leaders-str="', '"'),
+ "title" : extr("data-name='", "'"),
+ "slug" : extr("data-slug='", "'"),
+ "gid" : extr("data-gid='", "'").rpartition(".")[2],
+ "firm" : extr("data-firm-leaders-str='", "'"),
"location" : extr("<h2>", "<").strip(),
"type" : text.unescape(text.remove_html(extr(
'<div class="title">Type</div>', '<br'))),
@@ -54,7 +54,7 @@ class ArchitizerProjectExtractor(GalleryExtractor):
return [
(url, None)
for url in text.extract_iter(
- page, 'property="og:image:secure_url" content="', "?")
+ page, "property='og:image:secure_url' content='", "?")
]
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index ce1a78d..f448710 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -11,8 +11,6 @@
from .common import Extractor, Message
from .. import text, util, exception
import itertools
-import random
-import string
class ArtstationExtractor(Extractor):
@@ -29,6 +27,9 @@ class ArtstationExtractor(Extractor):
Extractor.__init__(self, match)
self.user = match.group(1) or match.group(2)
+ def _init(self):
+ self.session.headers["Cache-Control"] = "max-age=0"
+
def items(self):
videos = self.config("videos", True)
previews = self.config("previews", False)
@@ -172,7 +173,7 @@ class ArtstationExtractor(Extractor):
).json()["public_csrf_token"]
@staticmethod
- def _no_cache(url, alphabet=(string.digits + string.ascii_letters)):
+ def _no_cache(url):
"""Cause a cache miss to prevent Cloudflare 'optimizations'
Cloudflare's 'Polish' optimization strips image metadata and may even
@@ -184,10 +185,9 @@ class ArtstationExtractor(Extractor):
https://github.com/r888888888/danbooru/issues/3528
https://danbooru.donmai.us/forum_topics/14952
"""
- param = "gallerydl_no_cache=" + util.bencode(
- random.getrandbits(64), alphabet)
sep = "&" if "?" in url else "?"
- return url + sep + param
+ token = util.generate_token(8)
+ return url + sep + token[:4] + "=" + token[4:]
class ArtstationUserExtractor(ArtstationExtractor):
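
Note: the rewritten _no_cache() above drops the random/string-based parameter and builds it from util.generate_token(8) instead, split into a 4-character key and a value. A standalone sketch of the same cache-busting idea, with secrets.token_hex() standing in for util.generate_token() (that substitution is an assumption):

import secrets

def no_cache(url):
    """Append a random key=value query parameter to force a CDN cache miss."""
    token = secrets.token_hex(8)             # 16 random hex characters
    sep = "&" if "?" in url else "?"
    return url + sep + token[:4] + "=" + token[4:]

print(no_cache("https://cdn.example.com/image.jpg"))
# e.g. https://cdn.example.com/image.jpg?9f86=d081884c7d65
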
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index 77c40ef..4d192a4 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -67,6 +67,7 @@ class BatotoBase():
class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
"""Extractor for batoto manga chapters"""
+ archive_fmt = "{chapter_id}_{page}"
pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
example = "https://xbato.org/title/12345-MANGA/54321"
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index e1ee50d..25e9fd5 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -70,6 +70,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
self.root = "https://" + domain
def request(self, url, **kwargs):
+ kwargs["encoding"] = "utf-8"
kwargs["allow_redirects"] = False
while True:
@@ -114,8 +115,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
def fetch_album(self, album_id):
# album metadata
- page = self.request(
- self.root + "/a/" + album_id, encoding="utf-8").text
+ page = self.request(self.root + "/a/" + album_id).text
title = text.unescape(text.unescape(text.extr(
page, 'property="og:title" content="', '"')))
@@ -140,7 +140,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
file = self._extract_file(url)
info = text.split_html(item)
- file["name"] = info[-3]
+ if not file["name"]:
+ file["name"] = info[-3]
file["size"] = info[-2]
file["date"] = text.parse_datetime(
info[-1], "%H:%M:%S %d/%m/%Y")
@@ -157,6 +158,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
page = response.text
file_url = (text.extr(page, '<source src="', '"') or
text.extr(page, '<img src="', '"'))
+ file_name = (text.extr(page, 'property="og:title" content="', '"') or
+ text.extr(page, "<title>", " | Bunkr<"))
if not file_url:
webpage_url = text.unescape(text.rextract(
@@ -166,6 +169,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
return {
"file" : text.unescape(file_url),
+ "name" : text.unescape(file_name),
"_http_headers" : {"Referer": response.url},
"_http_validate": self._validate,
}
diff --git a/gallery_dl/extractor/cohost.py b/gallery_dl/extractor/cohost.py
deleted file mode 100644
index 6a43224..0000000
--- a/gallery_dl/extractor/cohost.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2024 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://cohost.org/"""
-
-from .common import Extractor, Message
-from .. import text, util
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?cohost\.org"
-
-
-class CohostExtractor(Extractor):
- """Base class for cohost extractors"""
- category = "cohost"
- root = "https://cohost.org"
- directory_fmt = ("{category}", "{postingProject[handle]}")
- filename_fmt = ("{postId}{headline:?_//[b:200]}{num:?_//}.{extension}")
- archive_fmt = "{postId}_{num}"
-
- def _init(self):
- self.replies = self.config("replies", True)
- self.pinned = self.config("pinned", False)
- self.shares = self.config("shares", False)
- self.asks = self.config("asks", True)
-
- self.avatar = self.config("avatar", False)
- if self.avatar:
- self._urls_avatar = {None, ""}
-
- self.background = self.config("background", False)
- if self.background:
- self._urls_background = {None, ""}
-
- def items(self):
- for post in self.posts():
- reason = post.get("limitedVisibilityReason")
- if reason and reason != "none":
- if reason == "log-in-first":
- reason = ("This page's posts are visible only to users "
- "who are logged in.")
- self.log.warning('%s: "%s"', post["postId"], reason)
-
- files = self._extract_files(post)
- post["count"] = len(files)
- post["date"] = text.parse_datetime(
- post["publishedAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
-
- yield Message.Directory, post
-
- project = post["postingProject"]
- if self.avatar:
- url = project.get("avatarURL")
- if url not in self._urls_avatar:
- self._urls_avatar.add(url)
- p = post.copy()
- p["postId"] = p["kind"] = "avatar"
- p["headline"] = p["num"] = ""
- yield Message.Url, url, text.nameext_from_url(url, p)
-
- if self.background:
- url = project.get("headerURL")
- if url not in self._urls_background:
- self._urls_background.add(url)
- p = post.copy()
- p["postId"] = p["kind"] = "background"
- p["headline"] = p["num"] = ""
- yield Message.Url, url, text.nameext_from_url(url, p)
-
- for post["num"], file in enumerate(files, 1):
- url = file["fileURL"]
- post.update(file)
- text.nameext_from_url(url, post)
- yield Message.Url, url, post
-
- def posts(self):
- return ()
-
- def _request_api(self, endpoint, input):
- url = "{}/api/v1/trpc/{}".format(self.root, endpoint)
- params = {"batch": "1", "input": util.json_dumps({"0": input})}
- headers = {"content-type": "application/json"}
-
- data = self.request(url, params=params, headers=headers).json()
- return data[0]["result"]["data"]
-
- def _extract_files(self, post):
- files = []
-
- self._extract_blocks(post, files)
- if self.shares and post.get("shareTree"):
- for share in post["shareTree"]:
- self._extract_blocks(share, files, share)
- del post["shareTree"]
-
- return files
-
- def _extract_blocks(self, post, files, shared=None):
- post["content"] = content = []
-
- for block in post.pop("blocks") or ():
- try:
- type = block["type"]
- if type == "attachment":
- file = block["attachment"].copy()
- file["shared"] = shared
- files.append(file)
- elif type == "attachment-row":
- for att in block["attachments"]:
- file = att["attachment"].copy()
- file["shared"] = shared
- files.append(file)
- elif type == "markdown":
- content.append(block["markdown"]["content"])
- elif type == "ask":
- post["ask"] = block["ask"]
- else:
- self.log.debug("%s: Unsupported block type '%s'",
- post["postId"], type)
- except Exception as exc:
- self.log.debug("%s: %s", exc.__class__.__name__, exc)
-
-
-class CohostUserExtractor(CohostExtractor):
- """Extractor for media from a cohost user"""
- subcategory = "user"
- pattern = BASE_PATTERN + r"/([^/?#]+)/?(?:$|\?|#)"
- example = "https://cohost.org/USER"
-
- def posts(self):
- empty = 0
- params = {
- "projectHandle": self.groups[0],
- "page": 0,
- "options": {
- "pinnedPostsAtTop" : True if self.pinned else False,
- "hideReplies" : not self.replies,
- "hideShares" : not self.shares,
- "hideAsks" : not self.asks,
- "viewingOnProjectPage": True,
- },
- }
-
- while True:
- data = self._request_api("posts.profilePosts", params)
-
- posts = data["posts"]
- if posts:
- empty = 0
- yield from posts
- else:
- empty += 1
-
- pagination = data["pagination"]
- if not pagination.get("morePagesForward"):
- return
- if empty >= 3:
- return self.log.debug("Empty API results")
- params["page"] = pagination["nextPage"]
-
-
-class CohostPostExtractor(CohostExtractor):
- """Extractor for media from a single cohost post"""
- subcategory = "post"
- pattern = BASE_PATTERN + r"/([^/?#]+)/post/(\d+)"
- example = "https://cohost.org/USER/post/12345"
-
- def posts(self):
- endpoint = "posts.singlePost"
- params = {
- "handle": self.groups[0],
- "postId": int(self.groups[1]),
- }
-
- data = self._request_api(endpoint, params)
- post = data["post"]
-
- try:
- post["comments"] = data["comments"][self.groups[1]]
- except LookupError:
- post["comments"] = ()
-
- return (post,)
-
-
-class CohostTagExtractor(CohostExtractor):
- """Extractor for tagged posts"""
- subcategory = "tag"
- pattern = BASE_PATTERN + r"/([^/?#]+)/tagged/([^/?#]+)(?:\?([^#]+))?"
- example = "https://cohost.org/USER/tagged/TAG"
-
- def posts(self):
- user, tag, query = self.groups
- url = "{}/{}/tagged/{}".format(self.root, user, tag)
- params = text.parse_query(query)
- post_feed_key = ("tagged-post-feed" if user == "rc" else
- "project-tagged-post-feed")
-
- while True:
- page = self.request(url, params=params).text
- data = util.json_loads(text.extr(
- page, 'id="__COHOST_LOADER_STATE__">', '</script>'))
-
- try:
- feed = data[post_feed_key]
- except KeyError:
- feed = data.popitem()[1]
-
- yield from feed["posts"]
-
- pagination = feed["paginationMode"]
- if not pagination.get("morePagesForward"):
- return
- params["refTimestamp"] = pagination["refTimestamp"]
- params["skipPosts"] = \
- pagination["currentSkip"] + pagination["idealPageStride"]
-
-
-class CohostLikesExtractor(CohostExtractor):
- """Extractor for liked posts"""
- subcategory = "likes"
- pattern = BASE_PATTERN + r"/rc/liked-posts"
- example = "https://cohost.org/rc/liked-posts"
-
- def posts(self):
- url = "{}/rc/liked-posts".format(self.root)
- params = {}
-
- while True:
- page = self.request(url, params=params).text
- data = util.json_loads(text.extr(
- page, 'id="__COHOST_LOADER_STATE__">', '</script>'))
-
- try:
- feed = data["liked-posts-feed"]
- except KeyError:
- feed = data.popitem()[1]
-
- yield from feed["posts"]
-
- pagination = feed["paginationMode"]
- if not pagination.get("morePagesForward"):
- return
- params["refTimestamp"] = pagination["refTimestamp"]
- params["skipPosts"] = \
- pagination["currentSkip"] + pagination["idealPageStride"]
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 37b6747..d0a9397 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -32,7 +32,7 @@ class DanbooruExtractor(BaseExtractor):
if isinstance(threshold, int):
self.threshold = 1 if threshold < 1 else threshold
else:
- self.threshold = self.per_page
+ self.threshold = self.per_page - 20
username, api_key = self._get_auth_info()
if username:
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 8172f62..59b2d6d 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -822,7 +822,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
username, folder["gallery_id"], public=False):
cache[dev["deviationid"]] = dev if has_access else None
- return cache[deviation["deviationid"]]
+ return cache.get(deviation["deviationid"])
def _unwatch_premium(self):
for username in self.unwatch:
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 33e6ba8..eddcb12 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -8,7 +8,7 @@
"""Extractors for https://e621.net/ and other e621 instances"""
-from .common import Message
+from .common import Extractor, Message
from . import danbooru
from ..cache import memcache
from .. import text, util
@@ -156,3 +156,20 @@ class E621FavoriteExtractor(E621Extractor):
def posts(self):
return self._pagination("/favorites.json", self.query)
+
+
+class E621FrontendExtractor(Extractor):
+ """Extractor for alternative e621 frontends"""
+ basecategory = "E621"
+ category = "e621"
+ subcategory = "frontend"
+ pattern = r"(?:https?://)?e621\.(?:cc/\?tags|anthro\.fr/\?q)=([^&#]*)"
+ example = "https://e621.cc/?tags=TAG"
+
+ def initialize(self):
+ pass
+
+ def items(self):
+ url = "https://e621.net/posts?tags=" + self.groups[0]
+ data = {"_extractor": E621TagExtractor}
+ yield Message.Queue, url, data
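
Note: E621FrontendExtractor never scrapes e621.cc or e621.anthro.fr itself; it rewrites the URL to the canonical e621.net form and queues it for E621TagExtractor. A minimal sketch of that rewrite, with the regex taken from the diff and the helper name purely illustrative:

import re

FRONTEND = re.compile(
    r"(?:https?://)?e621\.(?:cc/\?tags|anthro\.fr/\?q)=([^&#]*)")

def canonical_url(url):
    """Map an alternative-frontend URL to its e621.net equivalent."""
    match = FRONTEND.match(url)
    if match is None:
        raise ValueError("not a recognized e621 frontend URL")
    return "https://e621.net/posts?tags=" + match.group(1)

print(canonical_url("https://e621.cc/?tags=TAG"))
# https://e621.net/posts?tags=TAG
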
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index 2f3fdbf..1ec6adc 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -238,8 +238,9 @@ class FacebookExtractor(Extractor):
return res
- def extract_set(self, first_photo_id, set_id):
- all_photo_ids = [first_photo_id]
+ def extract_set(self, set_data):
+ set_id = set_data["set_id"]
+ all_photo_ids = [set_data["first_photo_id"]]
retries = 0
i = 0
@@ -252,7 +253,6 @@ class FacebookExtractor(Extractor):
photo_page = self.photo_page_request_wrapper(photo_url).text
photo = self.parse_photo_page(photo_page)
- photo["set_id"] = set_id
photo["num"] = i + 1
if self.author_followups:
@@ -281,9 +281,11 @@ class FacebookExtractor(Extractor):
retries = 0
else:
retries = 0
+ photo.update(set_data)
+ yield Message.Directory, photo
yield Message.Url, photo["url"], photo
- if photo["next_photo_id"] == "":
+ if not photo["next_photo_id"]:
self.log.debug(
"Can't find next image in the set. "
"Extraction is over."
@@ -322,15 +324,11 @@ class FacebookSetExtractor(FacebookExtractor):
set_url = self.set_url_fmt.format(set_id=set_id)
set_page = self.request(set_url).text
+ set_data = self.parse_set_page(set_page)
+ if self.groups[2]:
+ set_data["first_photo_id"] = self.groups[2]
- directory = self.parse_set_page(set_page)
-
- yield Message.Directory, directory
-
- yield from self.extract_set(
- self.groups[2] or directory["first_photo_id"],
- directory["set_id"]
- )
+ return self.extract_set(set_data)
class FacebookPhotoExtractor(FacebookExtractor):
@@ -436,13 +434,8 @@ class FacebookProfileExtractor(FacebookExtractor):
if set_id:
set_url = self.set_url_fmt.format(set_id=set_id)
set_page = self.request(set_url).text
+ set_data = self.parse_set_page(set_page)
+ return self.extract_set(set_data)
- directory = self.parse_set_page(set_page)
-
- yield Message.Directory, directory
-
- yield from self.extract_set(
- directory["first_photo_id"], directory["set_id"]
- )
- else:
- self.log.debug("Profile photos set ID not found.")
+ self.log.debug("Profile photos set ID not found.")
+ return iter(())
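
Note: extract_set() now takes the whole set_data dict and yields its own Message.Directory, so both call sites shrink to `return self.extract_set(set_data)`, and the no-set branch returns iter(()) to keep the return type an iterator either way. A sketch of that shape (names illustrative, the paging loop stubbed out):

def extract_set(set_data):
    for photo in set_data["photos"]:     # stands in for the real paging loop
        yield photo

def items(set_data):
    if set_data:
        return extract_set(set_data)     # hand back the generator itself
    return iter(())                      # empty iterator, same return type

print(list(items({"photos": ["a", "b"]})), list(items(None)))
# ['a', 'b'] []
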
diff --git a/gallery_dl/extractor/fanleaks.py b/gallery_dl/extractor/fanleaks.py
deleted file mode 100644
index 886e893..0000000
--- a/gallery_dl/extractor/fanleaks.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://fanleaks.club/"""
-
-from .common import Extractor, Message
-from .. import text
-
-
-class FanleaksExtractor(Extractor):
- """Base class for Fanleaks extractors"""
- category = "fanleaks"
- directory_fmt = ("{category}", "{model}")
- filename_fmt = "{model_id}_{id}.{extension}"
- archive_fmt = "{model_id}_{id}"
- root = "https://fanleaks.club"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.model_id = match.group(1)
-
- def extract_post(self, url):
- extr = text.extract_from(self.request(url, notfound="post").text)
- data = {
- "model_id": self.model_id,
- "model" : text.unescape(extr('text-lg">', "</a>")),
- "id" : text.parse_int(self.id),
- "type" : extr('type="', '"')[:5] or "photo",
- }
- url = extr('src="', '"')
- yield Message.Directory, data
- yield Message.Url, url, text.nameext_from_url(url, data)
-
-
-class FanleaksPostExtractor(FanleaksExtractor):
- """Extractor for individual posts on fanleaks.club"""
- subcategory = "post"
- pattern = r"(?:https?://)?(?:www\.)?fanleaks\.club/([^/?#]+)/(\d+)"
- example = "https://fanleaks.club/MODEL/12345"
-
- def __init__(self, match):
- FanleaksExtractor.__init__(self, match)
- self.id = match.group(2)
-
- def items(self):
- url = "{}/{}/{}".format(self.root, self.model_id, self.id)
- return self.extract_post(url)
-
-
-class FanleaksModelExtractor(FanleaksExtractor):
- """Extractor for all posts from a fanleaks model"""
- subcategory = "model"
- pattern = (r"(?:https?://)?(?:www\.)?fanleaks\.club"
- r"/(?!latest/?$)([^/?#]+)/?$")
- example = "https://fanleaks.club/MODEL"
-
- def items(self):
- page_num = 1
- page = self.request(
- self.root + "/" + self.model_id, notfound="model").text
- data = {
- "model_id": self.model_id,
- "model" : text.unescape(text.extr(page, 'mt-4">', "</h1>")),
- "type" : "photo",
- }
- page_url = text.extr(page, "url: '", "'")
- while True:
- page = self.request("{}{}".format(page_url, page_num)).text
- if not page:
- return
-
- for item in text.extract_iter(page, '<a href="/', "</a>"):
- self.id = id = text.extr(item, "/", '"')
- if "/icon-play.svg" in item:
- url = "{}/{}/{}".format(self.root, self.model_id, id)
- yield from self.extract_post(url)
- continue
-
- data["id"] = text.parse_int(id)
- url = text.extr(item, 'src="', '"').replace(
- "/thumbs/", "/", 1)
- yield Message.Directory, data
- yield Message.Url, url, text.nameext_from_url(url, data)
- page_num += 1
diff --git a/gallery_dl/extractor/fapachi.py b/gallery_dl/extractor/fapachi.py
index 80478ca..43627e2 100644
--- a/gallery_dl/extractor/fapachi.py
+++ b/gallery_dl/extractor/fapachi.py
@@ -33,7 +33,8 @@ class FapachiPostExtractor(Extractor):
}
page = self.request("{}/{}/media/{}".format(
self.root, self.user, self.id)).text
- url = self.root + text.extr(page, 'd-block" src="', '"')
+ url = self.root + text.extract(
+ page, 'data-src="', '"', page.index('class="media-img'))[0]
yield Message.Directory, data
yield Message.Url, url, text.nameext_from_url(url, data)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index c939a3c..f15aab7 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://hipertoon.com/"""
+"""Extractors for https://hiperdex.com/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
@@ -20,7 +20,7 @@ BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
class HiperdexBase():
"""Base class for hiperdex extractors"""
category = "hiperdex"
- root = "https://hipertoon.com"
+ root = "https://hiperdex.com"
@memcache(keyarg=1)
def manga_data(self, manga, page=None):
@@ -49,7 +49,7 @@ class HiperdexBase():
"status" : extr(
'class="summary-content">', '<').strip(),
"description": text.remove_html(text.unescape(extr(
- "Summary </h5>", "</div>"))),
+ '<div class="description-summary">', "</div>"))),
"language": "English",
"lang" : "en",
}
@@ -69,7 +69,7 @@ class HiperdexBase():
class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
"""Extractor for hiperdex manga chapters"""
pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))"
- example = "https://hipertoon.com/manga/MANGA/CHAPTER/"
+ example = "https://hiperdex.com/manga/MANGA/CHAPTER/"
def __init__(self, match):
root, path, self.manga, self.chapter = match.groups()
@@ -91,7 +91,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
"""Extractor for hiperdex manga"""
chapterclass = HiperdexChapterExtractor
pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$"
- example = "https://hipertoon.com/manga/MANGA/"
+ example = "https://hiperdex.com/manga/MANGA/"
def __init__(self, match):
root, path, self.manga = match.groups()
@@ -127,7 +127,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
chapterclass = HiperdexMangaExtractor
reverse = False
pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
- example = "https://hipertoon.com/manga-artist/NAME/"
+ example = "https://hiperdex.com/manga-artist/NAME/"
def __init__(self, match):
self.root = text.ensure_http_scheme(match.group(1))
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 5f1e0f4..d6b36cb 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -286,6 +286,34 @@ class TurboimagehostImageExtractor(ImagehostImageExtractor):
return url, url
+class TurboimagehostGalleryExtractor(ImagehostImageExtractor):
+ """Extractor for image galleries from turboimagehost.com"""
+ category = "turboimagehost"
+ subcategory = "gallery"
+ pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com"
+ r"/album/(\d+)/([^/?#]*))")
+ example = "https://www.turboimagehost.com/album/12345/GALLERY_NAME"
+
+ def items(self):
+ data = {"_extractor": TurboimagehostImageExtractor}
+ params = {"p": 1}
+
+ while True:
+ page = self.request(self.page_url, params=params).text
+
+ if params["p"] == 1 and \
+ "Requested gallery don`t exist on our website." in page:
+ raise exception.NotFoundError("gallery")
+
+ thumb_url = None
+ for thumb_url in text.extract_iter(page, '"><a href="', '"'):
+ yield Message.Queue, thumb_url, data
+ if thumb_url is None:
+ return
+
+ params["p"] += 1
+
+
class ViprImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from vipr.im"""
category = "vipr"
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index 54c6539..b900113 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -54,26 +54,30 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
class IssuuUserExtractor(IssuuBase, Extractor):
"""Extractor for all publications of a user/publisher"""
subcategory = "user"
- pattern = r"(?:https?://)?issuu\.com/([^/?#]+)/?$"
+ pattern = r"(?:https?://)?issuu\.com/([^/?#]+)(?:/(\d*))?$"
example = "https://issuu.com/USER"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.user = match.group(1)
-
def items(self):
- url = "{}/call/profile/v1/documents/{}".format(self.root, self.user)
- params = {"offset": 0, "limit": "25"}
+ user, pnum = self.groups
+ base = self.root + "/" + user
+ pnum = text.parse_int(pnum, 1)
while True:
- data = self.request(url, params=params).json()
+ url = base + "/" + str(pnum) if pnum > 1 else base
+ try:
+ html = self.request(url).text
+ data = util.json_loads(text.unescape(text.extr(
+ html, '</main></div><script data-json="', '" id="')))
+ docs = data["docs"]
+ except Exception as exc:
+ self.log.debug("", exc_info=exc)
+ return
- for publication in data["items"]:
- publication["url"] = "{}/{}/docs/{}".format(
- self.root, self.user, publication["uri"])
+ for publication in docs:
+ url = self.root + "/" + publication["uri"]
publication["_extractor"] = IssuuPublicationExtractor
- yield Message.Queue, publication["url"], publication
+ yield Message.Queue, url, publication
- if not data["hasMore"]:
+ if len(docs) < 48:
return
- params["offset"] += data["limit"]
+ pnum += 1
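
Note: IssuuUserExtractor now reads publications from entity-escaped JSON embedded in a data-json attribute instead of the profile API, walking /USER, /USER/2, ... and stopping once a page lists fewer than 48 documents. A sketch of the unescape-then-parse step, against a made-up page snippet:

import html, json

page = ('</main></div><script data-json="'
        '{&quot;docs&quot;:[{&quot;uri&quot;:&quot;user/docs/demo&quot;}]}'
        '" id="app">')

raw = page.partition('<script data-json="')[2].partition('" id="')[0]
data = json.loads(html.unescape(raw))
print(data["docs"][0]["uri"])   # user/docs/demo
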
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 66bbab5..788b5d9 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -54,26 +54,19 @@ class KemonopartyExtractor(Extractor):
sort_keys=True, separators=(",", ":")).encode
def items(self):
- service = self.groups[2]
- creator_id = self.groups[3]
-
find_hash = re.compile(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files"))
announcements = True if self.config("announcements") else None
comments = True if self.config("comments") else False
duplicates = True if self.config("duplicates") else False
dms = True if self.config("dms") else None
- profile = username = None
+ max_posts = self.config("max-posts")
+ creator_info = {} if self.config("metadata") else None
# prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"}
- if self.config("metadata"):
- profile = self.api.creator_profile(service, creator_id)
- username = profile["name"]
-
posts = self.posts()
- max_posts = self.config("max-posts")
if max_posts:
posts = itertools.islice(posts, max_posts)
if self.revisions:
@@ -85,10 +78,20 @@ class KemonopartyExtractor(Extractor):
post["_http_headers"] = headers
post["date"] = self._parse_datetime(
post.get("published") or post.get("added") or "")
+ service = post["service"]
+ creator_id = post["user"]
+
+ if creator_info is not None:
+ key = "{}_{}".format(service, creator_id)
+ if key not in creator_info:
+ creator = creator_info[key] = self.api.creator_profile(
+ service, creator_id)
+ else:
+ creator = creator_info[key]
+
+ post["user_profile"] = creator
+ post["username"] = creator["name"]
- if profile is not None:
- post["username"] = username
- post["user_profile"] = profile
if comments:
try:
post["comments"] = self.api.creator_post_comments(
@@ -171,7 +174,7 @@ class KemonopartyExtractor(Extractor):
try:
msg = '"' + response.json()["error"] + '"'
except Exception:
- msg = '"0/1 Username or password is incorrect"'
+ msg = '"Username or password is incorrect"'
raise exception.AuthenticationError(msg)
return {c.name: c.value for c in response.cookies}
@@ -296,8 +299,12 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
def posts(self):
_, _, service, creator_id, query = self.groups
params = text.parse_query(query)
- return self.api.creator_posts(
- service, creator_id, params.get("o"), params.get("q"))
+ if params.get("tag"):
+ return self.api.creator_tagged_posts(
+ service, creator_id, params.get("tag"), params.get("o"))
+ else:
+ return self.api.creator_posts(
+ service, creator_id, params.get("o"), params.get("q"))
class KemonopartyPostsExtractor(KemonopartyExtractor):
@@ -493,7 +500,7 @@ class KemonoAPI():
def posts(self, offset=0, query=None, tags=None):
endpoint = "/posts"
- params = {"q": query, "o": offset, "tags": tags}
+ params = {"q": query, "o": offset, "tag": tags}
return self._pagination(endpoint, params, 50, "posts")
def creator_posts(self, service, creator_id, offset=0, query=None):
@@ -501,6 +508,11 @@ class KemonoAPI():
params = {"q": query, "o": offset}
return self._pagination(endpoint, params, 50)
+ def creator_tagged_posts(self, service, creator_id, tags, offset=0):
+ endpoint = "/{}/user/{}/posts-legacy".format(service, creator_id)
+ params = {"o": offset, "tag": tags}
+ return self._pagination(endpoint, params, 50, "results")
+
def creator_announcements(self, service, creator_id):
endpoint = "/{}/user/{}/announcements".format(service, creator_id)
return self._call(endpoint)
@@ -565,9 +577,10 @@ class KemonoAPI():
data = self._call(endpoint, params)
if key:
- yield from data[key]
- else:
- yield from data
+ data = data.get(key)
+ if not data:
+ return
+ yield from data
if len(data) < batch:
return
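
Note: the _pagination() change covers endpoints that return either a bare list or a dict wrapping it under a key ("posts", "results"), and treats a missing or empty key as the end of pagination. A standalone sketch of that loop:

def paginate(call, batch=50, key=None):
    """Yield items from an offset-paginated endpoint until exhausted."""
    offset = 0
    while True:
        data = call(offset)
        if key:
            data = data.get(key)
            if not data:             # missing or empty key -> stop
                return
        yield from data
        if len(data) < batch:        # short page -> last page
            return
        offset += batch

pages = [{"results": ["p1", "p2"]}]
print(list(paginate(lambda o: pages[o // 50], key="results")))
# ['p1', 'p2']
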
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index d0c9c30..e779e97 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -36,22 +36,36 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
data = self.metadata(page)
yield Message.Directory, data
- for track in self.tracks(page):
+
+ if self.config("covers", False):
+ for num, url in enumerate(self._extract_covers(page), 1):
+ cover = text.nameext_from_url(
+ url, {"url": url, "num": num, "type": "cover"})
+ cover.update(data)
+ yield Message.Url, url, cover
+
+ for track in self._extract_tracks(page):
track.update(data)
+ track["type"] = "track"
yield Message.Url, track["url"], track
def metadata(self, page):
extr = text.extract_from(page)
return {"album": {
"name" : text.unescape(extr("<h2>", "<")),
- "platform": extr("Platforms: <a", "<").rpartition(">")[2],
+ "platform": text.split_html(extr("Platforms: ", "<br>"))[::2],
+ "year": extr("Year: <b>", "<"),
+ "catalog": extr("Catalog Number: <b>", "<"),
+ "developer": text.remove_html(extr(" Developed by: ", "</")),
+ "publisher": text.remove_html(extr(" Published by: ", "</")),
"count": text.parse_int(extr("Number of Files: <b>", "<")),
"size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
"date" : extr("Date Added: <b>", "<"),
"type" : text.remove_html(extr("Album type: <b>", "</b>")),
+ "uploader": text.remove_html(extr("Uploaded by: ", "</")),
}}
- def tracks(self, page):
+ def _extract_tracks(self, page):
fmt = self.config("format", ("mp3",))
if fmt and isinstance(fmt, str):
if fmt == "all":
@@ -75,3 +89,9 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
yield track
if first:
yield first
+
+ def _extract_covers(self, page):
+ return [
+ text.unescape(text.extr(cover, ' href="', '"'))
+ for cover in text.extract_iter(page, ' class="albumImage', '</')
+ ]
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index e39e272..89a1b5e 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -6,19 +6,20 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://komikcast.cz/"""
+"""Extractors for https://komikcast.la/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:cz|lol|site|mo?e|com)"
+BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
+ r"komikcast\.(?:la|cz|lol|site|mo?e|com)")
class KomikcastBase():
"""Base class for komikcast extractors"""
category = "komikcast"
- root = "https://komikcast.cz"
+ root = "https://komikcast.la"
@staticmethod
def parse_chapter_string(chapter_string, data=None):
@@ -48,7 +49,7 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
"""Extractor for komikcast manga chapters"""
pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
- example = "https://komikcast.cz/chapter/TITLE/"
+ example = "https://komikcast.la/chapter/TITLE/"
def metadata(self, page):
info = text.extr(page, "<title>", " - Komikcast<")
@@ -68,7 +69,7 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
"""Extractor for komikcast manga"""
chapterclass = KomikcastChapterExtractor
pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
- example = "https://komikcast.cz/komik/TITLE"
+ example = "https://komikcast.la/komik/TITLE"
def chapters(self, page):
results = []
diff --git a/gallery_dl/extractor/lofter.py b/gallery_dl/extractor/lofter.py
index 412b6b9..b92a6ff 100644
--- a/gallery_dl/extractor/lofter.py
+++ b/gallery_dl/extractor/lofter.py
@@ -23,6 +23,8 @@ class LofterExtractor(Extractor):
def items(self):
for post in self.posts():
+ if post is None:
+ continue
if "post" in post:
post = post["post"]
@@ -129,6 +131,9 @@ class LofterAPI():
url, method="POST", params=params, data=data)
info = response.json()
+ if info["meta"]["status"] == 4200:
+ raise exception.NotFoundError("blog")
+
if info["meta"]["status"] != 200:
self.extractor.log.debug("Server response: %s", info)
raise exception.StopExtraction("API request failed")
@@ -142,6 +147,9 @@ class LofterAPI():
yield from posts
+ if data["offset"] < 0:
+ break
+
if params["offset"] + len(posts) < data["offset"]:
break
params["offset"] = data["offset"]
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 295b9c4..6a9f633 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -53,7 +53,14 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
if "name" in file:
name = file["name"]
file["name"] = name.rpartition(".")[0] or name
- file["id"] = file["filename"].rpartition("-")[2]
+ _, sep, fid = file["filename"].rpartition("-")
+ if not sep or len(fid) == 12:
+ if "id" not in file:
+ file["id"] = ""
+ file["filename"] = file["name"]
+ else:
+ file["id"] = fid
+ file["filename"] = file["name"] + "-" + fid
elif "id" in file:
file["name"] = file["filename"]
file["filename"] = "{}-{}".format(file["name"], file["id"])
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index d590753..827756a 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -30,7 +30,7 @@ class MangafoxChapterExtractor(ChapterExtractor):
def metadata(self, page):
manga, pos = text.extract(page, "<title>", "</title>")
count, pos = text.extract(
- page, ">", "<", page.find("</select>", pos) - 20)
+ page, ">", "<", page.find("</select>", pos) - 40)
sid , pos = text.extract(page, "var series_id =", ";", pos)
cid , pos = text.extract(page, "var chapter_id =", ";", pos)
@@ -49,9 +49,9 @@ class MangafoxChapterExtractor(ChapterExtractor):
pnum = 1
while True:
url, pos = text.extract(page, '<img src="', '"')
- yield text.ensure_http_scheme(url), None
+ yield text.ensure_http_scheme(text.unescape(url)), None
url, pos = text.extract(page, ' src="', '"', pos)
- yield text.ensure_http_scheme(url), None
+ yield text.ensure_http_scheme(text.unescape(url)), None
pnum += 2
page = self.request("{}/{}.html".format(self.urlbase, pnum)).text
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
index e8ee861..8c94f04 100644
--- a/gallery_dl/extractor/mangahere.py
+++ b/gallery_dl/extractor/mangahere.py
@@ -37,7 +37,7 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
def metadata(self, page):
pos = page.index("</select>")
- count , pos = text.extract(page, ">", "<", pos - 20)
+ count , pos = text.extract(page, ">", "<", pos - 40)
manga_id , pos = text.extract(page, "series_id = ", ";", pos)
chapter_id, pos = text.extract(page, "chapter_id = ", ";", pos)
manga , pos = text.extract(page, '"name":"', '"', pos)
@@ -61,9 +61,9 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
while True:
url, pos = text.extract(page, '<img src="', '"')
- yield text.ensure_http_scheme(url), None
+ yield text.ensure_http_scheme(text.unescape(url)), None
url, pos = text.extract(page, ' src="', '"', pos)
- yield text.ensure_http_scheme(url), None
+ yield text.ensure_http_scheme(text.unescape(url)), None
pnum += 2
page = self.request(self.url_fmt.format(self.part, pnum)).text
diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index 4b017dc..6970b4f 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -92,9 +92,9 @@ class MangareadMangaExtractor(MangareadBase, MangaExtractor):
"genres" : list(text.extract_iter(
extr('class="genres-content">', "</div>"), '"tag">', "</a>")),
"type" : text.remove_html(
- extr("Type </h5>\n</div>", "</div>")),
+ extr(" Type ", "\n</div>")),
"release" : text.parse_int(text.remove_html(
- extr("Release </h5>\n</div>", "</div>"))),
+ extr(" Release ", "\n</div>"))),
"status" : text.remove_html(
- extr("Status </h5>\n</div>", "</div>")),
+ extr(" Status ", "\n</div>")),
}
diff --git a/gallery_dl/extractor/nekohouse.py b/gallery_dl/extractor/nekohouse.py
new file mode 100644
index 0000000..fe9d512
--- /dev/null
+++ b/gallery_dl/extractor/nekohouse.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nekohouse.su/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?nekohouse\.su"
+USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
+
+
+class NekohouseExtractor(Extractor):
+ """Base class for nekohouse extractors"""
+ category = "nekohouse"
+ root = "https://nekohouse.su"
+
+
+class NekohousePostExtractor(NekohouseExtractor):
+ subcategory = "post"
+ directory_fmt = ("{category}", "{service}", "{username} ({user_id})",
+ "{post_id} {date} {title[b:230]}")
+ filename_fmt = "{num:>02} {id|filename}.{extension}"
+ archive_fmt = "{service}_{user_id}_{post_id}_{hash}"
+ pattern = USER_PATTERN + r"/post/([^/?#]+)"
+ example = "https://nekohouse.su/SERVICE/user/12345/post/12345"
+
+ def items(self):
+ service, user_id, post_id = self.groups
+ url = "{}/{}/user/{}/post/{}".format(
+ self.root, service, user_id, post_id)
+ html = self.request(url).text
+
+ files = self._extract_files(html)
+ post = self._extract_post(html)
+ post["service"] = service
+ post["user_id"] = user_id
+ post["post_id"] = post_id
+ post["count"] = len(files)
+
+ yield Message.Directory, post
+ for post["num"], file in enumerate(files, 1):
+ url = file["url"]
+ text.nameext_from_url(url, file)
+ file["hash"] = file["filename"]
+ file.update(post)
+ if "name" in file:
+ text.nameext_from_url(file.pop("name"), file)
+ yield Message.Url, url, file
+
+ def _extract_post(self, html):
+ extr = text.extract_from(html)
+ return {
+ "username": text.unescape(extr(
+ 'class="scrape__user-name', '</').rpartition(">")[2].strip()),
+ "title" : text.unescape(extr(
+ 'class="scrape__title', '</').rpartition(">")[2]),
+ "date" : text.parse_datetime(extr(
+ 'datetime="', '"')[:19], "%Y-%m-%d %H:%M:%S"),
+ "content": text.unescape(extr(
+ 'class="scrape__content">', "</div>").strip()),
+ }
+
+ def _extract_files(self, html):
+ files = []
+
+ extr = text.extract_from(text.extr(
+ html, 'class="scrape__files"', "<footer"))
+ while True:
+ file_id = extr('<a href="/post/', '"')
+ if not file_id:
+ break
+ files.append({
+ "id" : file_id,
+ "url" : self.root + extr('href="', '"'),
+ "type": "file",
+ })
+
+ extr = text.extract_from(text.extr(
+ html, 'class="scrape__attachments"', "</ul>"))
+ while True:
+ url = extr('href="', '"')
+ if not url:
+ break
+ files.append({
+ "id" : "",
+ "url" : self.root + url,
+ "name": text.unescape(extr('download="', '"')),
+ "type": "attachment",
+ })
+
+ return files
+
+
+class NekohouseUserExtractor(NekohouseExtractor):
+ subcategory = "user"
+ pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
+ example = "https://nekohouse.su/SERVICE/user/12345"
+
+ def items(self):
+ service, user_id, _ = self.groups
+ creator_url = "{}/{}/user/{}".format(self.root, service, user_id)
+ params = {"o": 0}
+
+ data = {"_extractor": NekohousePostExtractor}
+ while True:
+ html = self.request(creator_url, params=params).text
+
+ cnt = 0
+ for post in text.extract_iter(html, "<article", "</article>"):
+ cnt += 1
+ post_url = self.root + text.extr(post, '<a href="', '"')
+ yield Message.Queue, post_url, data
+
+ if cnt < 50:
+ return
+ params["o"] += 50
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index d3e40ee..7fe8869 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -27,8 +27,10 @@ class PixivExtractor(Extractor):
filename_fmt = "{id}_p{num}.{extension}"
archive_fmt = "{id}{suffix}.{extension}"
cookies_domain = ".pixiv.net"
- sanity_url = "https://s.pximg.net/common/images/limit_sanity_level_360.png"
- mypixiv_url = "https://s.pximg.net/common/images/limit_mypixiv_360.png"
+ limit_url = "https://s.pximg.net/common/images/limit_"
+ # https://s.pximg.net/common/images/limit_sanity_level_360.png
+ # https://s.pximg.net/common/images/limit_unviewable_360.png
+ # https://s.pximg.net/common/images/limit_mypixiv_360.png
def _init(self):
self.api = PixivAppAPI(self)
@@ -117,16 +119,30 @@ class PixivExtractor(Extractor):
]
url = meta_single_page["original_image_url"]
- if url == self.sanity_url:
- work["_ajax"] = True
- self.log.warning("%s: 'limit_sanity_level' warning", work["id"])
- if self.sanity_workaround:
- body = self._request_ajax("/illust/" + str(work["id"]))
- return self._extract_ajax(work, body)
+ if url.startswith(self.limit_url):
+ work_id = work["id"]
+ self.log.debug("%s: %s", work_id, url)
+
+ limit_type = url.rpartition("/")[2]
+ if limit_type in (
+ "limit_", # for '_extend_sanity()' inserts
+ "limit_unviewable_360.png",
+ "limit_sanity_level_360.png",
+ ):
+ work["_ajax"] = True
+ self.log.warning("%s: 'limit_sanity_level' warning", work_id)
+ if self.sanity_workaround:
+ body = self._request_ajax("/illust/" + str(work_id))
+ return self._extract_ajax(work, body)
+
+ elif limit_type == "limit_mypixiv_360.png":
+ work["_mypixiv"] = True
+ self.log.warning("%s: 'My pixiv' locked", work_id)
- elif url == self.mypixiv_url:
- work["_mypixiv"] = True
- self.log.warning("%s: 'My pixiv' locked", work["id"])
+ else:
+ work["_mypixiv"] = True # stop further processing
+ self.log.error("%s: Unknown 'limit' URL type: %s",
+ work_id, limit_type)
elif work["type"] != "ugoira":
return ({"url": url, "_fallback": self._fallback_image(url)},)
@@ -430,7 +446,7 @@ class PixivArtworksExtractor(PixivExtractor):
elif ajax_id > work_id:
index -= 1
self.log.debug("Inserting work %s", ajax_id)
- yield self._make_work(ajax_id, self.sanity_url, user)
+ yield self._make_work(ajax_id, self.limit_url, user)
else: # ajax_id < work_id
break
@@ -440,7 +456,7 @@ class PixivArtworksExtractor(PixivExtractor):
while index >= 0:
ajax_id = ajax_ids[index]
self.log.debug("Inserting work %s", ajax_id)
- yield self._make_work(ajax_id, self.sanity_url, user)
+ yield self._make_work(ajax_id, self.limit_url, user)
index -= 1
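
Note: pixiv's placeholder handling now dispatches on the file name of the limit_* image instead of comparing whole URLs. The branch logic reduced to a pure function (the classification labels are mine):

LIMIT_URL = "https://s.pximg.net/common/images/limit_"

def classify(url):
    if not url.startswith(LIMIT_URL):
        return "ok"
    limit_type = url.rpartition("/")[2]
    if limit_type in ("limit_",                    # '_extend_sanity()' inserts
                      "limit_unviewable_360.png",
                      "limit_sanity_level_360.png"):
        return "sanity"
    if limit_type == "limit_mypixiv_360.png":
        return "mypixiv"
    return "unknown"

print(classify(LIMIT_URL + "sanity_level_360.png"))   # sanity
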
diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py
index 83f3064..863ef3b 100644
--- a/gallery_dl/extractor/pornpics.py
+++ b/gallery_dl/extractor/pornpics.py
@@ -20,10 +20,6 @@ class PornpicsExtractor(Extractor):
root = "https://www.pornpics.com"
request_interval = (0.5, 1.5)
- def __init__(self, match):
- super().__init__(match)
- self.item = match.group(1)
-
def items(self):
for gallery in self.galleries():
gallery["_extractor"] = PornpicsGalleryExtractor
@@ -34,9 +30,11 @@ class PornpicsExtractor(Extractor):
# fetch first 20 galleries from HTML
# since '"offset": 0' does not return a JSON response
page = self.request(url).text
- for path in text.extract_iter(
+ for href in text.extract_iter(
page, 'class="rel-link" href="', '"'):
- yield {"g_url": self.root + path}
+ if href[0] == "/":
+ href = self.root + href
+ yield {"g_url": href}
del page
params = {"offset": 20}
@@ -60,12 +58,12 @@ class PornpicsExtractor(Extractor):
class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
"""Extractor for pornpics galleries"""
- pattern = BASE_PATTERN + r"(/galleries/(?:[^/?#]+-)?(\d+))"
+ pattern = BASE_PATTERN + r"/galleries/((?:[^/?#]+-)?(\d+))"
example = "https://www.pornpics.com/galleries/TITLE-12345/"
def __init__(self, match):
- PornpicsExtractor.__init__(self, match)
- self.gallery_id = match.group(2)
+ url = "{}/galleries/{}/".format(self.root, match.group(1))
+ GalleryExtractor.__init__(self, match, url)
items = GalleryExtractor.items
@@ -73,7 +71,7 @@ class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
extr = text.extract_from(page)
return {
- "gallery_id": text.parse_int(self.gallery_id),
+ "gallery_id": text.parse_int(self.groups[1]),
"slug" : extr("/galleries/", "/").rpartition("-")[0],
"title" : text.unescape(extr("<h1>", "<")),
"channel" : text.split_html(extr(">Channel:&nbsp;", '</div>')),
@@ -100,7 +98,7 @@ class PornpicsTagExtractor(PornpicsExtractor):
example = "https://www.pornpics.com/tags/TAGS/"
def galleries(self):
- url = "{}/tags/{}/".format(self.root, self.item)
+ url = "{}/tags/{}/".format(self.root, self.groups[0])
return self._pagination(url)
@@ -113,7 +111,7 @@ class PornpicsSearchExtractor(PornpicsExtractor):
def galleries(self):
url = self.root + "/search/srch.php"
params = {
- "q" : self.item.replace("-", " "),
+ "q" : self.groups[0].replace("-", " "),
"lang" : "en",
"offset": 0,
}
diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py
index f1e7518..3b8d344 100644
--- a/gallery_dl/extractor/rule34xyz.py
+++ b/gallery_dl/extractor/rule34xyz.py
@@ -60,18 +60,22 @@ class Rule34xyzExtractor(BooruExtractor):
post.pop("filesPreview", None)
post.pop("tagsWithType", None)
post["date"] = text.parse_datetime(
- post["created"], "%Y-%m-%dT%H:%M:%S.%f")
+ post["created"][:19], "%Y-%m-%dT%H:%M:%S")
def _tags(self, post, _):
if post.get("tagsWithType") is None:
post.update(self._fetch_post(post["id"]))
tags = collections.defaultdict(list)
+ tagslist = []
for tag in post["tagsWithType"]:
- tags[tag["type"]].append(tag["value"])
+ value = tag["value"]
+ tagslist.append(value)
+ tags[tag["type"]].append(value)
types = self.TAG_TYPES
for type, values in tags.items():
post["tags_" + types[type]] = values
+ post["tags"] = tagslist
def _fetch_post(self, post_id):
url = "{}/api/post/{}".format(self.root, post_id)
diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py
index 1c62d75..5ec2443 100644
--- a/gallery_dl/extractor/saint.py
+++ b/gallery_dl/extractor/saint.py
@@ -81,6 +81,7 @@ class SaintMediaExtractor(SaintAlbumExtractor):
else: # /d/
file = {
"file" : text.unescape(extr('<a href="', '"')),
+ "id" : album_id,
"id_dl" : album_id,
"name" : album_id,
"filename" : album_id,
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index 97bad09..d15762d 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -74,10 +74,6 @@ BASE_PATTERN = Shimmie2Extractor.update({
"pattern": r"(?:sizechange|giantess)booru\.com",
"cookies": {"agreed": "true"},
},
- "tentaclerape": {
- "root": "https://tentaclerape.net",
- "pattern": r"tentaclerape\.net",
- },
"cavemanon": {
"root": "https://booru.cavemanon.xyz",
"pattern": r"booru\.cavemanon\.xyz",
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index b122f26..1713509 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -79,10 +79,6 @@ class SzurubooruExtractor(booru.BooruExtractor):
BASE_PATTERN = SzurubooruExtractor.update({
- "foalcon": {
- "root": "https://booru.foalcon.com",
- "pattern": r"booru\.foalcon\.com",
- },
"bcbnsfw": {
"root": "https://booru.bcbnsfw.space",
"pattern": r"booru\.bcbnsfw\.space",
@@ -104,7 +100,7 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}_{version}"
pattern = BASE_PATTERN + r"/posts(?:/query=([^/?#]*))?"
- example = "https://booru.foalcon.com/posts/query=TAG"
+ example = "https://booru.bcbnsfw.space/posts/query=TAG"
def __init__(self, match):
SzurubooruExtractor.__init__(self, match)
@@ -127,7 +123,7 @@ class SzurubooruPostExtractor(SzurubooruExtractor):
subcategory = "post"
archive_fmt = "{id}_{version}"
pattern = BASE_PATTERN + r"/post/(\d+)"
- example = "https://booru.foalcon.com/post/12345"
+ example = "https://booru.bcbnsfw.space/post/12345"
def posts(self):
return (self._api_request("/post/" + self.groups[-1]),)
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 44d87ee..cee0d9d 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -52,16 +52,18 @@ class ToyhouseExtractor(Extractor):
return {
"url": extr(needle, '"'),
"date": text.parse_datetime(extr(
- 'Credits\n</h2>\n<div class="mb-1">', '<'),
+ '</h2>\n <div class="mb-1">', '<'),
"%d %b %Y, %I:%M:%S %p"),
"artists": [
text.remove_html(artist)
for artist in extr(
- '<div class="artist-credit">', '</div>\n</div>').split(
- '<div class="artist-credit">')
+ '<div class="artist-credit">',
+ '</div>\n </div>').split(
+ '<div class="ar tist-credit">')
],
"characters": text.split_html(extr(
- '<div class="image-characters', '</div>\n</div>'))[2:],
+ '<div class="image-characters',
+ '<div class="image-comments">'))[2:],
}
def _pagination(self, path):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 090b11a..840e846 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -121,14 +121,7 @@ class TwitterExtractor(Extractor):
txt = data.get("full_text") or data.get("text") or ""
self.log.warning("'%s' (%s)", txt, data["id_str"])
- files = []
- if "extended_entities" in data:
- self._extract_media(
- data, data["extended_entities"]["media"], files)
- if "card" in tweet and self.cards:
- self._extract_card(tweet, files)
- if self.twitpic:
- self._extract_twitpic(data, files)
+ files = self._extract_files(data, tweet)
if not files and not self.textonly:
continue
@@ -143,6 +136,39 @@ class TwitterExtractor(Extractor):
text.nameext_from_url(url, file)
yield Message.Url, url, file
+ def _extract_files(self, data, tweet):
+ files = []
+
+ if "extended_entities" in data:
+ try:
+ self._extract_media(
+ data, data["extended_entities"]["media"], files)
+ except Exception as exc:
+ self.log.debug("", exc_info=exc)
+ self.log.warning(
+ "%s: Error while extracting media files (%s: %s)",
+ data["id_str"], exc.__class__.__name__, exc)
+
+ if self.cards and "card" in tweet:
+ try:
+ self._extract_card(tweet, files)
+ except Exception as exc:
+ self.log.debug("", exc_info=exc)
+ self.log.warning(
+ "%s: Error while extracting Card files (%s: %s)",
+ data["id_str"], exc.__class__.__name__, exc)
+
+ if self.twitpic:
+ try:
+ self._extract_twitpic(data, files)
+ except Exception as exc:
+ self.log.debug("", exc_info=exc)
+ self.log.warning(
+ "%s: Error while extracting TwitPic files (%s: %s)",
+ data["id_str"], exc.__class__.__name__, exc)
+
+ return files
+
def _extract_media(self, tweet, entities, files):
for media in entities:
@@ -1039,7 +1065,7 @@ class TwitterAPI():
else:
csrf_token = None
if not csrf_token:
- csrf_token = util.generate_token()
+ csrf_token = util.generate_token(80)
cookies.set("ct0", csrf_token, domain=cookies_domain)
auth_token = cookies.get("auth_token", domain=cookies_domain)
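
Note: the twitter refactor wraps each media source (entities, cards, TwitPic) in its own try/except so one malformed component logs a warning instead of aborting the whole tweet. The fault-isolation pattern in miniature:

import logging

log = logging.getLogger("sketch")

def collect(collectors):
    files = []
    for name, func in collectors:
        try:
            func(files)                  # each collector appends to files
        except Exception as exc:
            log.warning("Error while extracting %s files (%s: %s)",
                        name, exc.__class__.__name__, exc)
    return files

def media(files): files.append("m1")
def cards(files): raise KeyError("card")

print(collect([("media", media), ("Card", cards)]))   # ['m1']
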
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index bb80055..ebfeb9d 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -15,12 +15,15 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
category = "urlgalleries"
root = "https://urlgalleries.net"
request_interval = (0.5, 1.5)
- pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)"
- example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE"
+ pattern = (r"(?:https?://)()(?:(\w+)\.)?urlgalleries\.net"
+ r"/(?:b/([^/?#]+)/)?(?:[\w-]+-)?(\d+)")
+ example = "https://urlgalleries.net/b/BLOG/gallery-12345/TITLE"
def items(self):
- blog, self.gallery_id = self.groups
- url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format(
+ _, blog_alt, blog, self.gallery_id = self.groups
+ if not blog:
+ blog = blog_alt
+ url = "https://urlgalleries.net/b/{}/porn-gallery-{}/?a=10000".format(
blog, self.gallery_id)
with self.request(url, allow_redirects=False, fatal=...) as response:
@@ -35,7 +38,7 @@ class UrlgalleriesGalleryExtractor(GalleryExtractor):
data = self.metadata(page)
data["count"] = len(imgs)
- root = "https://{}.urlgalleries.net".format(blog)
+ root = "https://urlgalleries.net/b/" + blog
yield Message.Directory, data
for data["num"], img in enumerate(imgs, 1):
page = self.request(root + img).text
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 922a591..1c0c172 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -70,7 +70,8 @@ class VscoExtractor(Extractor):
def _extract_preload_state(self, url):
page = self.request(url, notfound=self.subcategory).text
- return util.json_loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
+ return util.json_loads(text.extr(page, "__PRELOADED_STATE__ = ", "<")
+ .replace('"prevPageToken":undefined,', ''))
def _pagination(self, url, params, token, key, extra=None):
headers = {
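
Note: __PRELOADED_STATE__ is a JavaScript object literal, not strict JSON, so the vsco fix strips the one known `undefined` member before parsing. An illustration with a made-up snippet:

import json

raw = '{"prevPageToken":undefined,"media":[1,2]}'
clean = raw.replace('"prevPageToken":undefined,', '')
print(json.loads(clean)["media"])   # [1, 2]
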
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 70ab259..008ae6e 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -102,8 +102,8 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
else:
episode = ""
- if extr('<div class="author_area"', '\n'):
- username = extr('/creator/', '"')
+ if extr('<span class="author"', '\n'):
+ username = extr('/u/', '"')
author_name = extr('<span>', '</span>')
else:
username = author_name = ""
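
Only the HTML markers changed here, presumably tracking a markup update on the site; the extraction idiom stays the same text.extract_from() cursor used across these extractors, where each call searches forward from the previous match. A sketch against invented markup in the new format:

    from gallery_dl import text

    page = ('<span class="author" data-x="1">\n'
            '<a href="https://www.webtoons.com/p/community/u/USERNAME">\n'
            '<span>AUTHOR NAME</span></a>\n')
    extr = text.extract_from(page)
    if extr('<span class="author"', '\n'):        # tag present -> truthy
        username = extr('/u/', '"')               # 'USERNAME'
        author_name = extr('<span>', '</span>')   # 'AUTHOR NAME'
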
diff --git a/gallery_dl/extractor/weebcentral.py b/gallery_dl/extractor/weebcentral.py
index 39f998a..fc1badb 100644
--- a/gallery_dl/extractor/weebcentral.py
+++ b/gallery_dl/extractor/weebcentral.py
@@ -80,12 +80,12 @@ class WeebcentralChapterExtractor(WeebcentralBase, ChapterExtractor):
results = []
while True:
- src = extr(' src="', '"')
+ src = extr('src="', '"')
if not src:
break
results.append((src, {
- "width" : text.parse_int(extr(' width="' , '"')),
- "height": text.parse_int(extr(' height="', '"')),
+ "width" : text.parse_int(extr('width="' , '"')),
+ "height": text.parse_int(extr('height="', '"')),
}))
return results
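
Dropping the leading space from the attribute markers lets the lookup still match when src= and friends follow a newline or tab rather than a single space, as in pretty-printed tags. For instance, with invented sample HTML:

    from gallery_dl import text

    html = '<img\n\tsrc="/img/01.png"\n\twidth="800"\n\theight="1200">'
    extr = text.extract_from(html)
    print(extr('src="', '"'))      # /img/01.png  (' src="' would not match)
    print(extr('width="', '"'))    # 800
    print(extr('height="', '"'))   # 1200
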
diff --git a/gallery_dl/extractor/xfolio.py b/gallery_dl/extractor/xfolio.py
new file mode 100644
index 0000000..a1a5be3
--- /dev/null
+++ b/gallery_dl/extractor/xfolio.py
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://xfolio.jp/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?xfolio\.jp(?:/[^/?#]+)?"
+
+
+class XfolioExtractor(Extractor):
+ """Base class for xfolio extractors"""
+ category = "xfolio"
+ root = "https://xfolio.jp"
+ cookies_domain = ".xfolio.jp"
+ directory_fmt = ("{category}", "{creator_slug}", "{work_id}")
+ filename_fmt = "{work_id}_{image_id}.{extension}"
+ archive_fmt = "{work_id}_{image_id}"
+ request_interval = (0.5, 1.5)
+
+ def _init(self):
+ XfolioExtractor._init = Extractor._init
+ if not self.cookies_check(("xfolio_session",)):
+ self.log.error("'xfolio_session' cookie required")
+
+ def items(self):
+ data = {"_extractor": XfolioWorkExtractor}
+ for work in self.works():
+ yield Message.Queue, work, data
+
+ def request(self, url, **kwargs):
+ response = Extractor.request(self, url, **kwargs)
+
+ if "/system/recaptcha" in response.url:
+ raise exception.StopExtraction("Bot check / CAPTCHA page")
+
+ return response
+
+
+class XfolioWorkExtractor(XfolioExtractor):
+ subcategory = "work"
+ pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/works/(\d+)"
+ example = "https://xfolio.jp/portfolio/USER/works/12345"
+    ref_fmt = "{}/fullscale_image?image_id={}&work_id={}"
+ url_fmt = ("{}/user_asset.php?id={}&work_id={}"
+ "&work_image_id={}&type=work_image")
+
+ def items(self):
+ creator, work_id = self.groups
+ url = "{}/portfolio/{}/works/{}".format(self.root, creator, work_id)
+ html = self.request(url).text
+
+ work = self._extract_data(html)
+ files = self._extract_files(html, work)
+ work["count"] = len(files)
+
+ yield Message.Directory, work
+ for work["num"], file in enumerate(files, 1):
+ file.update(work)
+ yield Message.Url, file["url"], file
+
+ def _extract_data(self, html):
+ creator, work_id = self.groups
+ extr = text.extract_from(html)
+ return {
+ "title" : text.unescape(extr(
+ 'property="og:title" content="', '"').rpartition(" - ")[0]),
+ "description" : text.unescape(extr(
+ 'property="og:description" content="', '"')),
+ "creator_id" : extr(' data-creator-id="', '"'),
+ "creator_userid" : extr(' data-creator-user-id="', '"'),
+ "creator_name" : extr(' data-creator-name="', '"'),
+ "creator_profile": text.unescape(extr(
+ ' data-creator-profile="', '"')),
+ "series_id" : extr("/series/", '"'),
+ "creator_slug" : creator,
+ "work_id" : work_id,
+ }
+
+ def _extract_files(self, html, work):
+ files = []
+
+ work_id = work["work_id"]
+ for img in text.extract_iter(
+ html, 'class="article__wrap_img', "</div>"):
+ image_id = text.extr(img, "/fullscale_image?image_id=", "&")
+ if not image_id:
+ self.log.warning(
+ "%s: 'fullscale_image' not available", work_id)
+ continue
+
+ files.append({
+ "image_id" : image_id,
+ "extension": "jpg",
+ "url": self.url_fmt.format(
+ self.root, image_id, work_id, image_id),
+ "_http_headers": {"Referer": self.ref_fmt.format(
+ self.root, image_id, work_id)},
+ })
+
+ return files
+
+
+class XfolioUserExtractor(XfolioExtractor):
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)"
+ example = "https://xfolio.jp/portfolio/USER"
+
+ def works(self):
+ url = "{}/portfolio/{}/works".format(self.root, self.groups[0])
+
+ while True:
+ html = self.request(url).text
+
+ for item in text.extract_iter(
+ html, '<div class="postItem', "</div>"):
+ yield text.extr(item, ' href="', '"')
+
+ pager = text.extr(html, ' class="pager__list_next', "</li>")
+ url = text.extr(pager, ' href="', '"')
+ if not url:
+ return
+ url = text.unescape(url)
+
+
+class XfolioSeriesExtractor(XfolioExtractor):
+ subcategory = "series"
+ pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/series/(\d+)"
+ example = "https://xfolio.jp/portfolio/USER/series/12345"
+
+ def works(self):
+ creator, series_id = self.groups
+ url = "{}/portfolio/{}/series/{}".format(self.root, creator, series_id)
+ html = self.request(url).text
+
+ return [
+ text.extr(item, ' href="', '"')
+ for item in text.extract_iter(
+ html, 'class="listWrap--title">', "</a>")
+ ]
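
For context on the new extractor: each file entry built by XfolioWorkExtractor pairs a direct user_asset.php URL with a matching fullscale_image Referer, passed per file through gallery-dl's _http_headers mechanism, which the server presumably requires. With invented IDs, the two format strings expand as follows:

    root, work_id, image_id = "https://xfolio.jp", "12345", "67890"

    url_fmt = ("{}/user_asset.php?id={}&work_id={}"
               "&work_image_id={}&type=work_image")
    ref_fmt = "{}/fullscale_image?image_id={}&work_id={}"

    print(url_fmt.format(root, image_id, work_id, image_id))
    # https://xfolio.jp/user_asset.php?id=67890&work_id=12345
    #     &work_image_id=67890&type=work_image
    print(ref_fmt.format(root, image_id, work_id))
    # https://xfolio.jp/fullscale_image?image_id=67890&work_id=12345
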
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 6dc9362..4d69d3d 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -20,8 +20,8 @@ class XhamsterExtractor(Extractor):
category = "xhamster"
def __init__(self, match):
- Extractor.__init__(self, match)
self.root = "https://" + match.group(1)
+ Extractor.__init__(self, match)
class XhamsterGalleryExtractor(XhamsterExtractor):
@@ -34,48 +34,48 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
pattern = BASE_PATTERN + r"(/photos/gallery/[^/?#]+)"
example = "https://xhamster.com/photos/gallery/12345"
- def __init__(self, match):
- XhamsterExtractor.__init__(self, match)
- self.path = match.group(2)
- self.data = None
-
def items(self):
data = self.metadata()
yield Message.Directory, data
for num, image in enumerate(self.images(), 1):
url = image["imageURL"]
image.update(data)
+ text.nameext_from_url(url, image)
image["num"] = num
- yield Message.Url, url, text.nameext_from_url(url, image)
+ image["extension"] = "webp"
+ del image["modelName"]
+ yield Message.Url, url, image
def metadata(self):
- self.data = self._data(self.root + self.path)
- user = self.data["authorModel"]
- imgs = self.data["photosGalleryModel"]
+ data = self.data = self._extract_data(self.root + self.groups[1])
+
+ gallery = data["galleryPage"]
+ info = gallery["infoProps"]
+ model = gallery["galleryModel"]
+ author = info["authorInfoProps"]
return {
"user":
{
- "id" : text.parse_int(user["id"]),
- "url" : user["pageURL"],
- "name" : user["name"],
- "retired" : user["retired"],
- "verified" : user["verified"],
- "subscribers": user["subscribers"],
+ "id" : text.parse_int(model["userId"]),
+ "url" : author["authorLink"],
+ "name" : author["authorName"],
+                "verified"   : bool(author.get("verified")),
+ "subscribers": info["subscribeButtonProps"]["subscribers"],
},
"gallery":
{
- "id" : text.parse_int(imgs["id"]),
- "tags" : [c["name"] for c in imgs["categories"]],
- "date" : text.parse_timestamp(imgs["created"]),
- "views" : text.parse_int(imgs["views"]),
- "likes" : text.parse_int(imgs["rating"]["likes"]),
- "dislikes" : text.parse_int(imgs["rating"]["dislikes"]),
- "title" : text.unescape(imgs["title"]),
- "description": text.unescape(imgs["description"]),
- "thumbnail" : imgs["thumbURL"],
+ "id" : text.parse_int(gallery["id"]),
+ "tags" : [t["label"] for t in info["categoriesTags"]],
+ "date" : text.parse_timestamp(model["created"]),
+ "views" : text.parse_int(model["views"]),
+ "likes" : text.parse_int(model["rating"]["likes"]),
+ "dislikes" : text.parse_int(model["rating"]["dislikes"]),
+ "title" : model["title"],
+ "description": model["description"],
+ "thumbnail" : model["thumbURL"],
},
- "count": text.parse_int(imgs["quantity"]),
+ "count": text.parse_int(gallery["photosCount"]),
}
def images(self):
@@ -83,17 +83,17 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
self.data = None
while True:
- for image in data["photosGalleryModel"]["photos"]:
- del image["modelName"]
- yield image
+ yield from data["photosGalleryModel"]["photos"]
- pgntn = data["pagination"]
- if pgntn["active"] == pgntn["maxPage"]:
+ pagination = data["galleryPage"]["paginationProps"]
+ if pagination["currentPageNumber"] >= pagination["lastPageNumber"]:
return
- url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"])
- data = self._data(url)
+ url = (pagination["pageLinkTemplate"][:-3] +
+ str(pagination["currentPageNumber"] + 1))
+
+ data = self._extract_data(url)
- def _data(self, url):
+ def _extract_data(self, url):
page = self.request(url).text
return util.json_loads(text.extr(
page, "window.initials=", "</script>").rstrip("\n\r;"))
@@ -105,12 +105,8 @@ class XhamsterUserExtractor(XhamsterExtractor):
pattern = BASE_PATTERN + r"/users/([^/?#]+)(?:/photos)?/?(?:$|[?#])"
example = "https://xhamster.com/users/USER/photos"
- def __init__(self, match):
- XhamsterExtractor.__init__(self, match)
- self.user = match.group(2)
-
def items(self):
- url = "{}/users/{}/photos".format(self.root, self.user)
+ url = "{}/users/{}/photos".format(self.root, self.groups[1])
data = {"_extractor": XhamsterGalleryExtractor}
while url: