author     Unit 193 <unit193@unit193.net>  2024-08-03 20:27:44 -0400
committer  Unit 193 <unit193@unit193.net>  2024-08-03 20:27:44 -0400
commit     032e5bed275a253e122ed9ac86dac7b8c4204172 (patch)
tree       b4eda52ebfe00c4d22e9d633b1ab2d158a9f0573 /gallery_dl/extractor
parent     80e39a8fc7de105510cbbdca8507f2a4b8c9e01d (diff)

New upstream version 1.27.2 (tag: upstream/1.27.2)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/8chan.py            |   3
-rw-r--r--  gallery_dl/extractor/__init__.py         |   4
-rw-r--r--  gallery_dl/extractor/agnph.py            | 113
-rw-r--r--  gallery_dl/extractor/aryion.py           |  23
-rw-r--r--  gallery_dl/extractor/behance.py          |  12
-rw-r--r--  gallery_dl/extractor/booru.py            |  18
-rw-r--r--  gallery_dl/extractor/bunkr.py            |   2
-rw-r--r--  gallery_dl/extractor/cien.py             | 199
-rw-r--r--  gallery_dl/extractor/common.py           |  20
-rw-r--r--  gallery_dl/extractor/deviantart.py       | 142
-rw-r--r--  gallery_dl/extractor/directlink.py       |   3
-rw-r--r--  gallery_dl/extractor/dynastyscans.py     |   2
-rw-r--r--  gallery_dl/extractor/erome.py            |   6
-rw-r--r--  gallery_dl/extractor/exhentai.py         |   3
-rw-r--r--  gallery_dl/extractor/fallenangels.py     |  84
-rw-r--r--  gallery_dl/extractor/furaffinity.py      |   9
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py     |  25
-rw-r--r--  gallery_dl/extractor/hentainexus.py      |  11
-rw-r--r--  gallery_dl/extractor/hotleak.py          |   6
-rw-r--r--  gallery_dl/extractor/imagefap.py         |   2
-rw-r--r--  gallery_dl/extractor/inkbunny.py         |   4
-rw-r--r--  gallery_dl/extractor/instagram.py        |  20
-rw-r--r--  gallery_dl/extractor/koharu.py           | 221
-rw-r--r--  gallery_dl/extractor/nijie.py            |   3
-rw-r--r--  gallery_dl/extractor/paheal.py           |   8
-rw-r--r--  gallery_dl/extractor/readcomiconline.py  |  26
-rw-r--r--  gallery_dl/extractor/redgifs.py          |   2
-rw-r--r--  gallery_dl/extractor/sankaku.py          |  26
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py   |  14
-rw-r--r--  gallery_dl/extractor/subscribestar.py    |   2
-rw-r--r--  gallery_dl/extractor/toyhouse.py         |  28
-rw-r--r--  gallery_dl/extractor/tumblr.py           |  47
-rw-r--r--  gallery_dl/extractor/twitter.py          | 157
-rw-r--r--  gallery_dl/extractor/vipergirls.py       |   3
-rw-r--r--  gallery_dl/extractor/vsco.py             |  23
-rw-r--r--  gallery_dl/extractor/wallpapercave.py    |  11
-rw-r--r--  gallery_dl/extractor/warosu.py           |   4
-rw-r--r--  gallery_dl/extractor/zerochan.py         |  41

38 files changed, 1019 insertions(+), 308 deletions(-)
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index a4b0997..a5e8b27 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -27,7 +27,8 @@ class _8chanExtractor(Extractor):
Extractor.__init__(self, match)
def _init(self):
- self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2])
+ self.cookies.set(
+ "TOS20240718", "1", domain=self.root.rpartition("/")[2])
@memcache()
def cookies_prepare(self):
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 6aff1f3..e103cb1 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -22,6 +22,7 @@ modules = [
"8chan",
"8muses",
"adultempire",
+ "agnph",
"architizer",
"artstation",
"aryion",
@@ -33,6 +34,7 @@ modules = [
"bunkr",
"catbox",
"chevereto",
+ "cien",
"comicvine",
"cyberdrop",
"danbooru",
@@ -42,7 +44,6 @@ modules = [
"e621",
"erome",
"exhentai",
- "fallenangels",
"fanbox",
"fanleaks",
"fantia",
@@ -84,6 +85,7 @@ modules = [
"keenspot",
"kemonoparty",
"khinsider",
+ "koharu",
"komikcast",
"lensdump",
"lexica",
diff --git a/gallery_dl/extractor/agnph.py b/gallery_dl/extractor/agnph.py
new file mode 100644
index 0000000..653b73f
--- /dev/null
+++ b/gallery_dl/extractor/agnph.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://agn.ph/"""
+
+from . import booru
+from .. import text
+
+from xml.etree import ElementTree
+import collections
+import re
+
+BASE_PATTERN = r"(?:https?://)?agn\.ph"
+
+
+class AgnphExtractor(booru.BooruExtractor):
+ category = "agnph"
+ root = "https://agn.ph"
+ page_start = 1
+ per_page = 45
+
+ TAG_TYPES = {
+ "a": "artist",
+ "b": "copyright",
+ "c": "character",
+ "d": "species",
+ "m": "general",
+ }
+
+ def _init(self):
+ self.cookies.set("confirmed_age", "true", domain="agn.ph")
+
+ def _prepare(self, post):
+ post["date"] = text.parse_timestamp(post["created_at"])
+ post["status"] = post["status"].strip()
+ post["has_children"] = ("true" in post["has_children"])
+
+ def _xml_to_dict(self, xml):
+ return {element.tag: element.text for element in xml}
+
+ def _pagination(self, url, params):
+ params["api"] = "xml"
+ if "page" in params:
+ params["page"] = \
+ self.page_start + text.parse_int(params["page"]) - 1
+ else:
+ params["page"] = self.page_start
+
+ while True:
+ data = self.request(url, params=params).text
+ root = ElementTree.fromstring(data)
+
+ yield from map(self._xml_to_dict, root)
+
+ attrib = root.attrib
+ if int(attrib["offset"]) + len(root) >= int(attrib["count"]):
+ return
+
+ params["page"] += 1
+
+ def _html(self, post):
+ url = "{}/gallery/post/show/{}/".format(self.root, post["id"])
+ return self.request(url).text
+
+ def _tags(self, post, page):
+ tag_container = text.extr(
+ page, '<ul class="taglist">', '<h3>Statistics</h3>')
+ if not tag_container:
+ return
+
+ tags = collections.defaultdict(list)
+ pattern = re.compile(r'class="(.)typetag">([^<]+)')
+ for tag_type, tag_name in pattern.findall(tag_container):
+ tags[tag_type].append(text.unquote(tag_name).replace(" ", "_"))
+ for key, value in tags.items():
+ post["tags_" + self.TAG_TYPES[key]] = " ".join(value)
+
+
+class AgnphTagExtractor(AgnphExtractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/gallery/post/(?:\?([^#]+))?$"
+ example = "https://agn.ph/gallery/post/?search=TAG"
+
+ def __init__(self, match):
+ AgnphExtractor.__init__(self, match)
+ self.params = text.parse_query(self.groups[0])
+
+ def metadata(self):
+ return {"search_tags": self.params.get("search") or ""}
+
+ def posts(self):
+ url = self.root + "/gallery/post/"
+ return self._pagination(url, self.params.copy())
+
+
+class AgnphPostExtractor(AgnphExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/gallery/post/show/(\d+)"
+ example = "https://agn.ph/gallery/post/show/12345/"
+
+ def posts(self):
+ url = "{}/gallery/post/show/{}/?api=xml".format(
+ self.root, self.groups[0])
+ post = ElementTree.fromstring(self.request(url).text)
+ return (self._xml_to_dict(post),)
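The new agnph extractor drives agn.ph's XML API ("?api=xml") and flattens each <post> element into a plain dict before handing it to the shared booru pipeline. A minimal standalone sketch of that conversion and of the stop condition used by _pagination() (the sample XML is illustrative, not real agn.ph output):

    from xml.etree import ElementTree

    SAMPLE = """<posts count="2" offset="0">
      <post><id>1</id><created_at>1700000000</created_at></post>
      <post><id>2</id><created_at>1700000060</created_at></post>
    </posts>"""

    root = ElementTree.fromstring(SAMPLE)
    posts = [{element.tag: element.text for element in post} for post in root]
    print(posts[0]["id"])  # "1" - every value is still a string at this point

    # pagination ends once offset + returned posts reaches the advertised count
    done = int(root.attrib["offset"]) + len(root) >= int(root.attrib["count"])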
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index ec86263..17b780e 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -79,18 +79,20 @@ class AryionExtractor(Extractor):
def metadata(self):
"""Return general metadata"""
- def _pagination_params(self, url, params=None):
+ def _pagination_params(self, url, params=None, needle=None):
if params is None:
params = {"p": 1}
else:
params["p"] = text.parse_int(params.get("p"), 1)
+ if needle is None:
+ needle = "class='gallery-item' id='"
+
while True:
page = self.request(url, params=params).text
cnt = 0
- for post_id in text.extract_iter(
- page, "class='gallery-item' id='", "'"):
+ for post_id in text.extract_iter(page, needle, "'"):
cnt += 1
yield post_id
@@ -200,6 +202,21 @@ class AryionGalleryExtractor(AryionExtractor):
return util.advance(self._pagination_next(url), self.offset)
+class AryionFavoriteExtractor(AryionExtractor):
+ """Extractor for a user's favorites gallery"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "{user!l}", "favorites")
+ archive_fmt = "f_{user}_{id}"
+ categorytransfer = True
+ pattern = BASE_PATTERN + r"/favorites/([^/?#]+)"
+ example = "https://aryion.com/g4/favorites/USER"
+
+ def posts(self):
+ url = "{}/g4/favorites/{}".format(self.root, self.user)
+ return self._pagination_params(
+ url, None, "class='gallery-item favorite' id='")
+
+
class AryionTagExtractor(AryionExtractor):
"""Extractor for tag searches on eka's portal"""
subcategory = "tag"
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index ad0caf9..f24059f 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -152,8 +152,16 @@ class BehanceGalleryExtractor(BehanceExtractor):
continue
if mtype == "image":
- url = module["imageSizes"]["size_original"]["url"]
- append((url, module))
+ sizes = {
+ size["url"].rsplit("/", 2)[1]: size
+ for size in module["imageSizes"]["allAvailable"]
+ }
+ size = (sizes.get("source") or
+ sizes.get("max_3840") or
+ sizes.get("fs") or
+ sizes.get("hd") or
+ sizes.get("disp"))
+ append((size["url"], module))
elif mtype == "video":
try:
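Behance no longer trusts "size_original" to exist; every available rendition is indexed by the variant name embedded in its URL path, and a fixed preference chain picks the best one. A sketch of that selection, assuming imageSizes["allAvailable"] entries shaped like {"url": ...}:

    def best_size(all_available):
        # key each rendition by the variant name in its URL path
        sizes = {size["url"].rsplit("/", 2)[1]: size for size in all_available}
        for name in ("source", "max_3840", "fs", "hd", "disp"):
            if name in sizes:
                return sizes[name]

    renditions = [
        {"url": "https://mir-cdn.behance.net/projects/hd/abc.jpg"},
        {"url": "https://mir-cdn.behance.net/projects/source/abc.jpg"},
    ]
    print(best_size(renditions)["url"])  # picks the 'source' rendition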
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index cbd0e07..7e26f38 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -29,16 +29,21 @@ class BooruExtractor(BaseExtractor):
url_key = self.config("url")
if url_key:
- self._file_url = operator.itemgetter(url_key)
+ if isinstance(url_key, (list, tuple)):
+ self._file_url = self._file_url_list
+ self._file_url_keys = url_key
+ else:
+ self._file_url = operator.itemgetter(url_key)
for post in self.posts():
try:
url = self._file_url(post)
if url[0] == "/":
url = self.root + url
- except (KeyError, TypeError):
- self.log.debug("Unable to fetch download URL for post %s "
- "(md5: %s)", post.get("id"), post.get("md5"))
+ except Exception as exc:
+ self.log.debug("%s: %s", exc.__class__.__name__, exc)
+ self.log.warning("Unable to fetch download URL for post %s "
+ "(md5: %s)", post.get("id"), post.get("md5"))
continue
if fetch_html:
@@ -73,6 +78,11 @@ class BooruExtractor(BaseExtractor):
_file_url = operator.itemgetter("file_url")
+ def _file_url_list(self, post):
+ urls = (post[key] for key in self._file_url_keys if post.get(key))
+ post["_fallback"] = it = iter(urls)
+ return next(it)
+
def _prepare(self, post):
"""Prepare a 'post's metadata"""
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index a093347..77f0de6 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -13,7 +13,7 @@ from .. import text
BASE_PATTERN = (
r"(?:https?://)?(?:app\.)?(bunkr+"
- r"\.(?:s[kiu]|ru|la|is|to|ac|black|cat|media|red|site|ws))"
+ r"\.(?:s[kiu]|fi|ru|la|is|to|ac|black|cat|media|red|site|ws))"
)
LEGACY_DOMAINS = {
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py
new file mode 100644
index 0000000..bae86d0
--- /dev/null
+++ b/gallery_dl/extractor/cien.py
@@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://ci-en.net/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)"
+
+
+class CienExtractor(Extractor):
+ category = "cien"
+ root = "https://ci-en.net"
+ request_interval = (1.0, 2.0)
+
+ def __init__(self, match):
+ self.root = text.root_from_url(match.group(0))
+ Extractor.__init__(self, match)
+
+ def _init(self):
+ self.cookies.set("accepted_rating", "r18g", domain="ci-en.dlsite.com")
+
+ def _pagination_articles(self, url, params):
+ data = {"_extractor": CienArticleExtractor}
+ params["page"] = text.parse_int(params.get("page"), 1)
+
+ while True:
+ page = self.request(url, params=params).text
+
+ for card in text.extract_iter(
+ page, ' class="c-cardCase-item', '</div>'):
+ article_url = text.extr(card, ' href="', '"')
+ yield Message.Queue, article_url, data
+
+ if ' rel="next"' not in page:
+ return
+ params["page"] += 1
+
+
+class CienArticleExtractor(CienExtractor):
+ subcategory = "article"
+ filename_fmt = "{num:>02} {filename}.{extension}"
+ directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}")
+ archive_fmt = "{post_id}_{num}"
+ pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)"
+ example = "https://ci-en.net/creator/123/article/12345"
+
+ def items(self):
+ url = "{}/creator/{}/article/{}".format(
+ self.root, self.groups[0], self.groups[1])
+ page = self.request(url, notfound="article").text
+
+ post = util.json_loads(text.extr(
+ page, '<script type="application/ld+json">', '</script>'))[0]
+
+ files = self._extract_files(post.get("articleBody") or page)
+
+ post["post_url"] = url
+ post["post_id"] = text.parse_int(self.groups[1])
+ post["count"] = len(files)
+ post["date"] = text.parse_datetime(post["datePublished"])
+
+ try:
+ del post["publisher"]
+ del post["sameAs"]
+ except Exception:
+ pass
+
+ yield Message.Directory, post
+ for post["num"], file in enumerate(files, 1):
+ post.update(file)
+ if "extension" not in file:
+ text.nameext_from_url(file["url"], post)
+ yield Message.Url, file["url"], post
+
+ def _extract_files(self, page):
+ files = []
+
+ filetypes = self.config("files")
+ if filetypes is None:
+ self._extract_files_image(page, files)
+ self._extract_files_video(page, files)
+ self._extract_files_download(page, files)
+ self._extract_files_gallery(page, files)
+ else:
+ generators = {
+ "image" : self._extract_files_image,
+ "video" : self._extract_files_video,
+ "download": self._extract_files_download,
+ "gallery" : self._extract_files_gallery,
+ "gallerie": self._extract_files_gallery,
+ }
+ if isinstance(filetypes, str):
+ filetypes = filetypes.split(",")
+ for ft in filetypes:
+ generators[ft.rstrip("s")](page, files)
+
+ return files
+
+ def _extract_files_image(self, page, files):
+ for image in text.extract_iter(
+ page, 'class="file-player-image"', "</figure>"):
+ size = text.extr(image, ' data-size="', '"')
+ w, _, h = size.partition("x")
+
+ files.append({
+ "url" : text.extr(image, ' data-raw="', '"'),
+ "width" : text.parse_int(w),
+ "height": text.parse_int(h),
+ "type" : "image",
+ })
+
+ def _extract_files_video(self, page, files):
+ for video in text.extract_iter(
+ page, "<vue-file-player", "</vue-file-player>"):
+ path = text.extr(video, ' base-path="', '"')
+ name = text.extr(video, ' file-name="', '"')
+ auth = text.extr(video, ' auth-key="', '"')
+
+ file = text.nameext_from_url(name)
+ file["url"] = "{}video-web.mp4?{}".format(path, auth)
+ file["type"] = "video"
+ files.append(file)
+
+ def _extract_files_download(self, page, files):
+ for download in text.extract_iter(
+ page, 'class="downloadBlock', "</div>"):
+ name = text.extr(download, "<p>", "<")
+
+ file = text.nameext_from_url(name.rpartition(" ")[0])
+ file["url"] = text.extr(download, ' href="', '"')
+ file["type"] = "download"
+ files.append(file)
+
+ def _extract_files_gallery(self, page, files):
+ for gallery in text.extract_iter(
+ page, "<vue-image-gallery", "</vue-image-gallery>"):
+
+ url = self.root + "/api/creator/gallery/images"
+ params = {
+ "hash" : text.extr(gallery, ' hash="', '"'),
+ "gallery_id": text.extr(gallery, ' gallery-id="', '"'),
+ "time" : text.extr(gallery, ' time="', '"'),
+ }
+ data = self.request(url, params=params).json()
+ url = self.root + "/api/creator/gallery/imagePath"
+
+ for params["page"], params["file_id"] in enumerate(
+ data["imgList"]):
+ path = self.request(url, params=params).json()["path"]
+
+ file = params.copy()
+ file["url"] = path
+ files.append(file)
+
+
+class CienCreatorExtractor(CienExtractor):
+ subcategory = "creator"
+ pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$"
+ example = "https://ci-en.net/creator/123"
+
+ def items(self):
+ url = "{}/creator/{}/article".format(self.root, self.groups[0])
+ params = text.parse_query(self.groups[1])
+ params["mode"] = "list"
+ return self._pagination_articles(url, params)
+
+
+class CienRecentExtractor(CienExtractor):
+ subcategory = "recent"
+ pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?"
+ example = "https://ci-en.net/mypage/recent"
+
+ def items(self):
+ url = self.root + "/mypage/recent"
+ params = text.parse_query(self.groups[0])
+ return self._pagination_articles(url, params)
+
+
+class CienFollowingExtractor(CienExtractor):
+ subcategory = "following"
+ pattern = BASE_PATTERN + r"/mypage/subscription(/following)?"
+ example = "https://ci-en.net/mypage/subscription"
+
+ def items(self):
+ url = self.root + "/mypage/subscription" + (self.groups[0] or "")
+ page = self.request(url).text
+ data = {"_extractor": CienCreatorExtractor}
+
+ for subscription in text.extract_iter(
+ page, 'class="c-grid-subscriptionInfo', '</figure>'):
+ url = text.extr(subscription, ' href="', '"')
+ yield Message.Queue, url, data
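The ci-en "files" option controls which embed types ("image", "video", "download", "gallery") get extracted. It accepts either a list or a comma-separated string, and a trailing "s" is stripped so plural forms work too, which is why the dispatch table needs a "gallerie" key ("galleries".rstrip("s") leaves "gallerie"). The normalization on its own:

    def normalize_filetypes(filetypes):
        # "images,galleries" and ["images", "galleries"] are both accepted
        if isinstance(filetypes, str):
            filetypes = filetypes.split(",")
        return [ft.rstrip("s") for ft in filetypes]

    print(normalize_filetypes("images,galleries"))  # ['image', 'gallerie']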
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index d7a41bc..df70571 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -378,7 +378,7 @@ class Extractor():
useragent = self.config("user-agent")
if useragent is None:
useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
- "rv:109.0) Gecko/20100101 Firefox/115.0")
+ "rv:128.0) Gecko/20100101 Firefox/128.0")
elif useragent == "browser":
useragent = _browser_useragent()
headers["User-Agent"] = useragent
@@ -390,6 +390,8 @@ class Extractor():
headers["Accept-Encoding"] = "gzip, deflate, br"
else:
headers["Accept-Encoding"] = "gzip, deflate"
+ if ZSTD:
+ headers["Accept-Encoding"] += ", zstd"
referer = self.config("referer", self.referer)
if referer:
@@ -789,10 +791,11 @@ class BaseExtractor(Extractor):
instances = ()
def __init__(self, match):
- Extractor.__init__(self, match)
if not self.category:
+ self.groups = match.groups()
+ self.match = match
self._init_category()
- self._cfgpath = ("extractor", self.category, self.subcategory)
+ Extractor.__init__(self, match)
def _init_category(self):
for index, group in enumerate(self.groups):
@@ -911,13 +914,12 @@ _browser_cookies = {}
HTTP_HEADERS = {
"firefox": (
("User-Agent", "Mozilla/5.0 ({}; "
- "rv:109.0) Gecko/20100101 Firefox/115.0"),
+ "rv:128.0) Gecko/20100101 Firefox/128.0"),
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
- "image/avif,image/webp,*/*;q=0.8"),
+ "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"),
("Accept-Language", "en-US,en;q=0.5"),
("Accept-Encoding", None),
("Referer", None),
- ("DNT", "1"),
("Connection", "keep-alive"),
("Upgrade-Insecure-Requests", "1"),
("Cookie", None),
@@ -991,6 +993,12 @@ try:
except AttributeError:
BROTLI = False
+# detect zstandard support
+try:
+ ZSTD = urllib3.response.HAS_ZSTD
+except AttributeError:
+ ZSTD = False
+
# set (urllib3) warnings filter
action = config.get((), "warnings", "default")
if action:
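Like the existing brotli check, zstd is only advertised when the installed urllib3 can actually decompress it; older urllib3 versions lack the HAS_ZSTD attribute, hence the AttributeError fallback. The resulting header logic, condensed:

    import urllib3

    try:
        ZSTD = urllib3.response.HAS_ZSTD  # True when the zstandard module is usable
    except AttributeError:
        ZSTD = False  # urllib3 predates zstd support

    headers = {"Accept-Encoding": "gzip, deflate"}
    if ZSTD:
        headers["Accept-Encoding"] += ", zstd"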
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 2199cc8..a70710c 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -846,55 +846,6 @@ class DeviantartStatusExtractor(DeviantartExtractor):
)
-class DeviantartPopularExtractor(DeviantartExtractor):
- """Extractor for popular deviations"""
- subcategory = "popular"
- directory_fmt = ("{category}", "Popular",
- "{popular[range]}", "{popular[search]}")
- archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}"
- pattern = (r"(?:https?://)?www\.deviantart\.com/(?:"
- r"(?:deviations/?)?\?order=(popular-[^/?#]+)"
- r"|((?:[\w-]+/)*)(popular-[^/?#]+)"
- r")/?(?:\?([^#]*))?")
- example = "https://www.deviantart.com/popular-24-hours/"
-
- def __init__(self, match):
- DeviantartExtractor.__init__(self, match)
- self.user = ""
-
- trange1, path, trange2, query = match.groups()
- query = text.parse_query(query)
- self.search_term = query.get("q")
-
- trange = trange1 or trange2 or query.get("order", "")
- if trange.startswith("popular-"):
- trange = trange[8:]
- self.time_range = {
- "newest" : "now",
- "most-recent" : "now",
- "this-week" : "1week",
- "this-month" : "1month",
- "this-century": "alltime",
- "all-time" : "alltime",
- }.get(trange, "alltime")
-
- self.popular = {
- "search": self.search_term or "",
- "range" : trange or "all-time",
- "path" : path.strip("/") if path else "",
- }
-
- def deviations(self):
- if self.time_range == "now":
- return self.api.browse_newest(self.search_term, self.offset)
- return self.api.browse_popular(
- self.search_term, self.time_range, self.offset)
-
- def prepare(self, deviation):
- DeviantartExtractor.prepare(self, deviation)
- deviation["popular"] = self.popular
-
-
class DeviantartTagExtractor(DeviantartExtractor):
"""Extractor for deviations from tag searches"""
subcategory = "tag"
@@ -1077,14 +1028,14 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor):
class DeviantartFollowingExtractor(DeviantartExtractor):
"""Extractor for user's watched users"""
subcategory = "following"
- pattern = BASE_PATTERN + "/about#watching$"
+ pattern = BASE_PATTERN + "/(?:about#)?watching"
example = "https://www.deviantart.com/USER/about#watching"
def items(self):
- eclipse_api = DeviantartEclipseAPI(self)
+ api = DeviantartOAuthAPI(self)
- for user in eclipse_api.user_watching(self.user, self.offset):
- url = "{}/{}".format(self.root, user["username"])
+ for user in api.user_friends(self.user):
+ url = "{}/{}".format(self.root, user["user"]["username"])
user["_extractor"] = DeviantartUserExtractor
yield Message.Queue, url, user
@@ -1095,7 +1046,7 @@ class DeviantartFollowingExtractor(DeviantartExtractor):
class DeviantartOAuthAPI():
"""Interface for the DeviantArt OAuth API
- Ref: https://www.deviantart.com/developers/http/v1/20160316
+ https://www.deviantart.com/developers/http/v1/20160316
"""
CLIENT_ID = "5388"
CLIENT_SECRET = "76b08c69cfb27f26d6161f9ab6d061a1"
@@ -1188,29 +1139,6 @@ class DeviantartOAuthAPI():
"mature_content": self.mature}
return self._pagination(endpoint, params, public=False, unpack=True)
- def browse_newest(self, query=None, offset=0):
- """Browse newest deviations"""
- endpoint = "/browse/newest"
- params = {
- "q" : query,
- "limit" : 120,
- "offset" : offset,
- "mature_content": self.mature,
- }
- return self._pagination(endpoint, params)
-
- def browse_popular(self, query=None, timerange=None, offset=0):
- """Yield popular deviations"""
- endpoint = "/browse/popular"
- params = {
- "q" : query,
- "limit" : 120,
- "timerange" : timerange,
- "offset" : offset,
- "mature_content": self.mature,
- }
- return self._pagination(endpoint, params)
-
def browse_tags(self, tag, offset=0):
""" Browse a tag """
endpoint = "/browse/tags"
@@ -1223,11 +1151,12 @@ class DeviantartOAuthAPI():
return self._pagination(endpoint, params)
def browse_user_journals(self, username, offset=0):
- """Yield all journal entries of a specific user"""
- endpoint = "/browse/user/journals"
- params = {"username": username, "offset": offset, "limit": 50,
- "mature_content": self.mature, "featured": "false"}
- return self._pagination(endpoint, params)
+ journals = filter(
+ lambda post: "/journal/" in post["url"],
+ self.user_profile_posts(username))
+ if offset:
+ journals = util.advance(journals, offset)
+ return journals
def collections(self, username, folder_id, offset=0):
"""Yield all Deviation-objects contained in a collection folder"""
@@ -1339,16 +1268,10 @@ class DeviantartOAuthAPI():
"mature_content": self.mature}
return self._pagination_list(endpoint, params)
- @memcache(keyarg=1)
- def user_profile(self, username):
- """Get user profile information"""
- endpoint = "/user/profile/" + username
- return self._call(endpoint, fatal=False)
-
- def user_statuses(self, username, offset=0):
- """Yield status updates of a specific user"""
- endpoint = "/user/statuses/"
- params = {"username": username, "offset": offset, "limit": 50}
+ def user_friends(self, username, offset=0):
+ """Get the users list of friends"""
+ endpoint = "/user/friends/" + username
+ params = {"limit": 50, "offset": offset, "mature_content": self.mature}
return self._pagination(endpoint, params)
def user_friends_watch(self, username):
@@ -1376,6 +1299,27 @@ class DeviantartOAuthAPI():
endpoint, method="POST", public=False, fatal=False,
).get("success")
+ @memcache(keyarg=1)
+ def user_profile(self, username):
+ """Get user profile information"""
+ endpoint = "/user/profile/" + username
+ return self._call(endpoint, fatal=False)
+
+ def user_profile_posts(self, username):
+ endpoint = "/user/profile/posts"
+ params = {"username": username, "limit": 50,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
+ def user_statuses(self, username, offset=0):
+ """Yield status updates of a specific user"""
+ statuses = filter(
+ lambda post: "/status-update/" in post["url"],
+ self.user_profile_posts(username))
+ if offset:
+ statuses = util.advance(statuses, offset)
+ return statuses
+
def authenticate(self, refresh_token_key):
"""Authenticate the application by requesting an access token"""
self.headers["Authorization"] = \
@@ -1464,7 +1408,7 @@ class DeviantartOAuthAPI():
self.log.error(msg)
return data
- def _switch_tokens(self, results, params):
+ def _should_switch_tokens(self, results, params):
if len(results) < params["limit"]:
return True
@@ -1496,7 +1440,7 @@ class DeviantartOAuthAPI():
results = [item["journal"] for item in results
if "journal" in item]
if extend:
- if public and self._switch_tokens(results, params):
+ if public and self._should_switch_tokens(results, params):
if self.refresh_token_key:
self.log.debug("Switching to private access token")
public = False
@@ -1540,6 +1484,11 @@ class DeviantartOAuthAPI():
return
params["offset"] = int(params["offset"]) + len(results)
+ def _pagination_list(self, endpoint, params, key="results"):
+ result = []
+ result.extend(self._pagination(endpoint, params, False, key=key))
+ return result
+
@staticmethod
def _shared_content(results):
"""Return an iterable of shared deviations in 'results'"""
@@ -1548,11 +1497,6 @@ class DeviantartOAuthAPI():
if "deviation" in item:
yield item["deviation"]
- def _pagination_list(self, endpoint, params, key="results"):
- result = []
- result.extend(self._pagination(endpoint, params, False, key=key))
- return result
-
def _metadata(self, deviations):
"""Add extended metadata to each deviation object"""
if len(deviations) <= self.limit:
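With /browse/user/journals and /user/statuses gone, journals and status updates are both derived from the combined /user/profile/posts stream by filtering on the post URL; offsets are honored by skipping items client-side (util.advance in gallery-dl, itertools.islice in this stubbed illustration):

    import itertools

    def profile_posts():  # stand-in for user_profile_posts(username)
        yield {"url": "https://www.deviantart.com/u/journal/entry-1"}
        yield {"url": "https://www.deviantart.com/u/art/pic-2"}
        yield {"url": "https://www.deviantart.com/u/status-update/3"}

    journals = filter(lambda post: "/journal/" in post["url"], profile_posts())
    statuses = filter(lambda post: "/status-update/" in post["url"], profile_posts())

    offset = 0
    journals = itertools.islice(journals, offset, None)  # client-side offset
    print(sum(1 for _ in journals), sum(1 for _ in statuses))  # 1 1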
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 26f2184..2f0230a 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -18,7 +18,8 @@ class DirectlinkExtractor(Extractor):
filename_fmt = "{domain}/{path}/{filename}.{extension}"
archive_fmt = filename_fmt
pattern = (r"(?i)https?://(?P<domain>[^/?#]+)/(?P<path>[^?#]+\."
- r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
+ r"(?:jpe?g|jpe|png|gif|bmp|svg|web[mp]|avif|heic|psd"
+ r"|mp4|m4v|mov|mkv|og[gmv]|wav|mp3|opus|zip|rar|7z|pdf|swf))"
r"(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?$")
example = "https://en.wikipedia.org/static/images/project-logos/enwiki.png"
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 733d0d8..583869f 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -66,6 +66,8 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
text.extr(group, ' alt="', '"')),
"date" : text.parse_datetime(extr(
'"icon-calendar"></i> ', '<'), "%b %d, %Y"),
+ "tags" : text.split_html(extr(
+ "class='tags'>", "<div id='chapter-actions'")),
"lang" : "en",
"language": "English",
}
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 8c9da2f..e6d136f 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -46,18 +46,24 @@ class EromeExtractor(Extractor):
page, 'href="https://www.erome.com/', '"', pos)
urls = []
+ date = None
groups = page.split('<div class="media-group"')
for group in util.advance(groups, 1):
url = (text.extr(group, '<source src="', '"') or
text.extr(group, 'data-src="', '"'))
if url:
urls.append(url)
+ if not date:
+ ts = text.extr(group, '?v=', '"')
+ if len(ts) > 1:
+ date = text.parse_timestamp(ts)
data = {
"album_id" : album_id,
"title" : text.unescape(title),
"user" : text.unquote(user),
"count" : len(urls),
+ "date" : date,
"_http_headers": {"Referer": url},
}
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 1805403..1b4f995 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -394,6 +394,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.original = False
return self.data["_url_1280"]
+ if " temporarily banned " in page:
+ raise exception.AuthorizationError("Temporarily Banned")
+
self._report_limits()
return True
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
deleted file mode 100644
index 650a707..0000000
--- a/gallery_dl/extractor/fallenangels.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.fascans.com/"""
-
-from .common import ChapterExtractor, MangaExtractor
-from .. import text, util
-
-
-class FallenangelsChapterExtractor(ChapterExtractor):
- """Extractor for manga chapters from fascans.com"""
- category = "fallenangels"
- pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com"
- r"/manga/([^/?#]+)/([^/?#]+)")
- example = "https://manga.fascans.com/manga/NAME/CHAPTER/"
-
- def __init__(self, match):
- self.version, self.manga, self.chapter = match.groups()
- url = "https://{}.fascans.com/manga/{}/{}/1".format(
- self.version, self.manga, self.chapter)
- ChapterExtractor.__init__(self, match, url)
-
- def metadata(self, page):
- extr = text.extract_from(page)
- lang = "vi" if self.version == "truyen" else "en"
- chapter, sep, minor = self.chapter.partition(".")
- return {
- "manga" : extr('name="description" content="', ' Chapter '),
- "title" : extr(': ', ' - Page 1'),
- "chapter" : chapter,
- "chapter_minor": sep + minor,
- "lang" : lang,
- "language": util.code_to_language(lang),
- }
-
- @staticmethod
- def images(page):
- return [
- (img["page_image"], None)
- for img in util.json_loads(
- text.extr(page, "var pages = ", ";")
- )
- ]
-
-
-class FallenangelsMangaExtractor(MangaExtractor):
- """Extractor for manga from fascans.com"""
- chapterclass = FallenangelsChapterExtractor
- category = "fallenangels"
- pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$"
- example = "https://manga.fascans.com/manga/NAME"
-
- def __init__(self, match):
- url = "https://" + match.group(1)
- self.lang = "vi" if match.group(2) == "truyen" else "en"
- MangaExtractor.__init__(self, match, url)
-
- def chapters(self, page):
- extr = text.extract_from(page)
- results = []
- language = util.code_to_language(self.lang)
- while extr('<li style="', '"'):
- vol = extr('class="volume-', '"')
- url = extr('href="', '"')
- cha = extr('>', '<')
- title = extr('<em>', '</em>')
-
- manga, _, chapter = cha.rpartition(" ")
- chapter, dot, minor = chapter.partition(".")
- results.append((url, {
- "manga" : manga,
- "title" : text.unescape(title),
- "volume" : text.parse_int(vol),
- "chapter" : text.parse_int(chapter),
- "chapter_minor": dot + minor,
- "lang" : self.lang,
- "language": language,
- }))
- return results
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 6040187..f48a984 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -113,6 +113,12 @@ class FuraffinityExtractor(Extractor):
data["gender"] = rh(extr('>Gender</strong>', '</div>'))
data["width"] = pi(extr("<span>", "x"))
data["height"] = pi(extr("", "p"))
+ data["folders"] = folders = []
+ for folder in extr(
+ "<h3>Listed in Folders</h3>", "</section>").split("</a>"):
+ folder = rh(folder)
+ if folder:
+ folders.append(folder)
else:
# old site layout
data["title"] = text.unescape(extr("<h2>", "</h2>"))
@@ -132,11 +138,14 @@ class FuraffinityExtractor(Extractor):
data["_description"] = extr(
'<td valign="top" align="left" width="70%" class="alt1" '
'style="padding:8px">', ' </td>')
+ data["folders"] = () # folders not present in old layout
data["artist_url"] = data["artist"].replace("_", "").lower()
data["user"] = self.user or data["artist_url"]
data["date"] = text.parse_timestamp(data["filename"].partition(".")[0])
data["description"] = self._process_description(data["_description"])
+ data["thumbnail"] = "https://t.furaffinity.net/{}@600-{}.jpg".format(
+ post_id, path.rsplit("/", 2)[1])
return data
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 8d8b8ad..fbbd26c 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -36,7 +36,9 @@ class GelbooruV02Extractor(booru.BooruExtractor):
params["pid"] = self.page_start
params["limit"] = self.per_page
- post = None
+ post = total = None
+ count = 0
+
while True:
try:
root = self._api_request(params)
@@ -50,12 +52,29 @@ class GelbooruV02Extractor(booru.BooruExtractor):
params["pid"] = 0
continue
+ if total is None:
+ try:
+ total = int(root.attrib["count"])
+ self.log.debug("%s posts in total", total)
+ except Exception as exc:
+ total = 0
+ self.log.debug(
+ "Failed to get total number of posts (%s: %s)",
+ exc.__class__.__name__, exc)
+
post = None
for post in root:
yield post.attrib
- if len(root) < self.per_page:
- return
+ num = len(root)
+ count += num
+ if num < self.per_page:
+ if not total or count >= total:
+ return
+ if not num:
+ self.log.debug("Empty response - Retrying")
+ continue
+
params["pid"] += 1
def _pagination_html(self, params):
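Pagination now records the post total advertised in the API's root element, so a short page only terminates the loop once the running count has caught up with it; an empty page with posts still outstanding is treated as transient and re-requested. The control flow reduced to its core (fetch() is a stand-in returning (posts, advertised_total)):

    def paginate(fetch, per_page):
        pid = count = 0
        total = None
        while True:
            posts, advertised = fetch(pid)
            if total is None:
                total = advertised  # 0 when the attribute is missing or unparsable
            yield from posts
            count += len(posts)
            if len(posts) < per_page:
                if not total or count >= total:
                    return
                if not posts:
                    continue  # empty response but more expected: retry same page
            pid += 1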
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index 97b7844..286ee38 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -70,10 +70,13 @@ class HentainexusGalleryExtractor(GalleryExtractor):
for img in imgs:
img["_http_headers"] = headers
- return [
- (img["image"], img)
- for img in imgs
- ]
+ results = []
+ for img in imgs:
+ try:
+ results.append((img["image"], img))
+ except KeyError:
+ pass
+ return results
@staticmethod
def _decode(data):
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index a2b51be..34fbabd 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -23,6 +23,12 @@ class HotleakExtractor(Extractor):
def items(self):
for post in self.posts():
+ if self.type == "photo":
+ post["url"] = (
+ post["url"]
+ .replace("/storage/storage/", "/storage/")
+ .replace("_thumb.", ".")
+ )
post["_http_expected_status"] = (404,)
yield Message.Directory, post
yield Message.Url, post["url"], post
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 85446c0..345f51d 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -19,7 +19,7 @@ class ImagefapExtractor(Extractor):
category = "imagefap"
root = "https://www.imagefap.com"
directory_fmt = ("{category}", "{gallery_id} {title}")
- filename_fmt = "{category}_{gallery_id}_{filename}.{extension}"
+ filename_fmt = "{category}_{gallery_id}_{num:04}_{filename}.{extension}"
archive_fmt = "{gallery_id}_{image_id}"
request_interval = (2.0, 4.0)
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 2ae8cbe..f3098f1 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -246,14 +246,12 @@ class InkbunnyFollowingExtractor(InkbunnyExtractor):
data = {"_extractor": InkbunnyUserExtractor}
while True:
- cnt = 0
for user in text.extract_iter(
page, '<a class="widget_userNameSmall" href="', '"',
page.index('id="changethumboriginal_form"')):
- cnt += 1
yield Message.Queue, self.root + user, data
- if cnt < 20:
+ if "<a title='next page' " not in page:
return
params["page"] += 1
page = self.request(url, params=params).text
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index f7a5cc7..dbe2df3 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -596,6 +596,22 @@ class InstagramTagExtractor(InstagramExtractor):
return self.api.tags_media(self.item)
+class InstagramInfoExtractor(InstagramExtractor):
+ """Extractor for an Instagram user's profile data"""
+ subcategory = "info"
+ pattern = USER_PATTERN + r"/info"
+ example = "https://www.instagram.com/USER/info/"
+
+ def items(self):
+ screen_name = self.item
+ if screen_name.startswith("id:"):
+ user = self.api.user_by_id(screen_name[3:])
+ else:
+ user = self.api.user_by_name(screen_name)
+
+ return iter(((Message.Directory, user),))
+
+
class InstagramAvatarExtractor(InstagramExtractor):
"""Extractor for an Instagram user's avatar"""
subcategory = "avatar"
@@ -975,9 +991,9 @@ class InstagramGraphqlAPI():
if not info["has_next_page"]:
return extr._update_cursor(None)
elif not data["edges"]:
- s = "" if self.item.endswith("s") else "s"
+ s = "" if self.extractor.item.endswith("s") else "s"
raise exception.StopExtraction(
- "%s'%s posts are private", self.item, s)
+ "%s'%s posts are private", self.extractor.item, s)
variables["after"] = extr._update_cursor(info["end_cursor"])
diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py
new file mode 100644
index 0000000..979b1a2
--- /dev/null
+++ b/gallery_dl/extractor/koharu.py
@@ -0,0 +1,221 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://koharu.to/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+BASE_PATTERN = r"(?i)(?:https?://)?(?:koharu|anchira)\.to"
+
+
+class KoharuExtractor(Extractor):
+ """Base class for koharu extractors"""
+ category = "koharu"
+ root = "https://koharu.to"
+ root_api = "https://api.koharu.to"
+ request_interval = (0.5, 1.5)
+
+ def _init(self):
+ self.headers = {
+ "Accept" : "*/*",
+ "Referer": self.root + "/",
+ "Origin" : self.root,
+ }
+
+ def _pagination(self, endpoint, params):
+ url_api = self.root_api + endpoint
+
+ while True:
+ data = self.request(
+ url_api, params=params, headers=self.headers).json()
+
+ try:
+ entries = data["entries"]
+ except KeyError:
+ return
+
+ for entry in entries:
+ url = "{}/g/{}/{}".format(
+ self.root, entry["id"], entry["public_key"])
+ entry["_extractor"] = KoharuGalleryExtractor
+ yield Message.Queue, url, entry
+
+ try:
+ if data["limit"] * data["page"] >= data["total"]:
+ return
+ except Exception:
+ pass
+ params["page"] += 1
+
+
+class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
+ """Extractor for koharu galleries"""
+ filename_fmt = "{num:>03}.{extension}"
+ directory_fmt = ("{category}", "{id} {title}")
+ archive_fmt = "{id}_{num}"
+ request_interval = 0.0
+ pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)"
+ example = "https://koharu.to/g/12345/67890abcde/"
+
+ TAG_TYPES = {
+ 0 : "general",
+ 1 : "artist",
+ 2 : "circle",
+ 3 : "parody",
+ 4 : "magazine",
+ 5 : "character",
+ 6 : "",
+ 7 : "uploader",
+ 8 : "male",
+ 9 : "female",
+ 10: "mixed",
+ 11: "language",
+ 12: "other",
+ }
+
+ def __init__(self, match):
+ GalleryExtractor.__init__(self, match)
+ self.gallery_url = None
+
+ def _init(self):
+ self.headers = {
+ "Accept" : "*/*",
+ "Referer": self.root + "/",
+ "Origin" : self.root,
+ }
+
+ self.fmt = self.config("format")
+ self.cbz = self.config("cbz", True)
+
+ if self.cbz:
+ self.filename_fmt = "{id} {title}.{extension}"
+ self.directory_fmt = ("{category}",)
+
+ def metadata(self, _):
+ url = "{}/books/detail/{}/{}".format(
+ self.root_api, self.groups[0], self.groups[1])
+ self.data = data = self.request(url, headers=self.headers).json()
+
+ tags = []
+ for tag in data["tags"]:
+ name = tag["name"]
+ namespace = tag.get("namespace", 0)
+ tags.append(self.TAG_TYPES[namespace] + ":" + name)
+ data["tags"] = tags
+ data["date"] = text.parse_timestamp(data["created_at"] // 1000)
+
+ try:
+ if self.cbz:
+ data["count"] = len(data["thumbnails"]["entries"])
+ del data["thumbnails"]
+ del data["rels"]
+ except Exception:
+ pass
+
+ return data
+
+ def images(self, _):
+ data = self.data
+ fmt = self._select_format(data["data"])
+
+ url = "{}/books/data/{}/{}/{}/{}".format(
+ self.root_api,
+ data["id"], data["public_key"],
+ fmt["id"], fmt["public_key"],
+ )
+ params = {
+ "v": data["updated_at"],
+ "w": fmt["w"],
+ }
+
+ if self.cbz:
+ params["action"] = "dl"
+ base = self.request(
+ url, method="POST", params=params, headers=self.headers,
+ ).json()["base"]
+ url = "{}?v={}&w={}".format(base, data["updated_at"], fmt["w"])
+ info = text.nameext_from_url(base)
+ if not info["extension"]:
+ info["extension"] = "cbz"
+ return ((url, info),)
+
+ data = self.request(url, params=params, headers=self.headers).json()
+ base = data["base"]
+
+ results = []
+ for entry in data["entries"]:
+ dimensions = entry["dimensions"]
+ info = {
+ "w": dimensions[0],
+ "h": dimensions[1],
+ "_http_headers": self.headers,
+ }
+ results.append((base + entry["path"], info))
+ return results
+
+ def _select_format(self, formats):
+ if not self.fmt or self.fmt == "original":
+ fmtid = "0"
+ else:
+ fmtid = str(self.fmt)
+
+ try:
+ fmt = formats[fmtid]
+ except KeyError:
+ raise exception.NotFoundError("format")
+
+ fmt["w"] = fmtid
+ return fmt
+
+
+class KoharuSearchExtractor(KoharuExtractor):
+ """Extractor for koharu search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/\?([^#]*)"
+ example = "https://koharu.to/?s=QUERY"
+
+ def items(self):
+ params = text.parse_query(self.groups[0])
+ params["page"] = text.parse_int(params.get("page"), 1)
+ return self._pagination("/books", params)
+
+
+class KoharuFavoriteExtractor(KoharuExtractor):
+ """Extractor for koharu favorites"""
+ subcategory = "favorite"
+ pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
+ example = "https://koharu.to/favorites"
+
+ def items(self):
+ self.login()
+
+ params = text.parse_query(self.groups[0])
+ params["page"] = text.parse_int(params.get("page"), 1)
+ return self._pagination("/favorites", params)
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self.headers["Authorization"] = \
+ "Bearer " + self._login_impl(username, password)
+ return
+
+ raise exception.AuthenticationError("Username and password required")
+
+ @cache(maxage=86400, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ url = "https://auth.koharu.to/login"
+ data = {"uname": username, "passwd": password}
+ response = self.request(
+ url, method="POST", headers=self.headers, data=data)
+
+ return response.json()["session"]
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 60cca22..b01c591 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -120,7 +120,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
]
else:
pos = page.find('id="view-center"') + 1
- return (text.extr(page, 'itemprop="image" src="', '"', pos),)
+ # do NOT use text.extr() here, as it doesn't support a pos argument
+ return (text.extract(page, 'itemprop="image" src="', '"', pos)[0],)
@staticmethod
def _extract_user_name(page):
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index b21e1eb..2330b08 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -77,6 +77,7 @@ class PahealTagExtractor(PahealExtractor):
pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
r"/post/list/([^/?#]+)")
example = "https://rule34.paheal.net/post/list/TAG/1"
+ page_start = 1
per_page = 70
def __init__(self, match):
@@ -87,11 +88,16 @@ class PahealTagExtractor(PahealExtractor):
if self.config("metadata"):
self._extract_data = self._extract_data_ex
+ def skip(self, num):
+ pages = num // self.per_page
+ self.page_start += pages
+ return pages * self.per_page
+
def get_metadata(self):
return {"search_tags": self.tags}
def get_posts(self):
- pnum = 1
+ pnum = self.page_start
while True:
url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
page = self.request(url).text
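skip() can now fast-forward whole result pages instead of fetching and discarding posts. Only multiples of per_page are skippable through the page number, so the remainder is reported back and handled downstream. Worked through:

    per_page = 70
    page_start = 1

    def skip(num):
        global page_start
        pages = num // per_page       # whole pages we can jump over via the URL
        page_start += pages
        return pages * per_page       # how many posts were actually skipped

    print(skip(150), page_start)  # 140 3 -> resume at /post/list/TAG/3;
                                  # the remaining 10 posts are skipped one by one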
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index 115de9a..271fa50 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -78,12 +78,16 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
}
def images(self, page):
- return [
- (beau(url), None)
- for url in text.extract_iter(
- page, "lstImages.push('", "'",
- )
- ]
+ results = []
+
+ for block in page.split(" pth = '")[1:]:
+ pth = text.extr(block, "", "'")
+ for needle, repl in re.findall(
+ r"pth = pth\.replace\(/([^/]+)/g, [\"']([^\"']*)", block):
+ pth = pth.replace(needle, repl)
+ results.append((beau(pth), None))
+
+ return results
class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
@@ -116,9 +120,9 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
def beau(url):
- """https://readcomiconline.li/Scripts/rguard.min.js"""
- url = url.replace("_x236", "d")
- url = url.replace("_x945", "g")
+ """https://readcomiconline.li/Scripts/rguard.min.js?v=1.5.1"""
+ url = url.replace("pw_.g28x", "b")
+ url = url.replace("d2pr.x_27", "h")
if url.startswith("https"):
return url
@@ -126,8 +130,8 @@ def beau(url):
url, sep, rest = url.partition("?")
containsS0 = "=s0" in url
url = url[:-3 if containsS0 else -6]
- url = url[4:22] + url[25:]
- url = url[0:-6] + url[-2:]
+ url = url[15:33] + url[50:]
+ url = url[0:-11] + url[-2:]
url = binascii.a2b_base64(url).decode()
url = url[0:13] + url[17:]
url = url[0:-2] + ("=s0" if containsS0 else "=s1600")
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 327bcd1..506f6ac 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -190,7 +190,7 @@ class RedgifsImageExtractor(RedgifsExtractor):
r"(?:\w+\.)?redgifs\.com/(?:watch|ifr)|"
r"(?:\w+\.)?gfycat\.com(?:/gifs/detail|/\w+)?|"
r"(?:www\.)?gifdeliverynetwork\.com|"
- r"i\.redgifs\.com/i)/([A-Za-z]+)")
+ r"i\.redgifs\.com/i)/([A-Za-z0-9]+)")
example = "https://redgifs.com/watch/ID"
def gifs(self):
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index caf3e16..ad3efa7 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -16,7 +16,7 @@ import collections
import re
BASE_PATTERN = r"(?:https?://)?" \
- r"(?:(?:chan|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \
+ r"(?:(?:chan|www|beta|black|white)\.sankakucomplex\.com|sankaku\.app)" \
r"(?:/[a-z]{2})?"
@@ -45,6 +45,9 @@ class SankakuExtractor(BooruExtractor):
def skip(self, num):
return 0
+ def _init(self):
+ self.api = SankakuAPI(self)
+
def _file_url(self, post):
url = post["file_url"]
if not url:
@@ -81,6 +84,15 @@ class SankakuExtractor(BooruExtractor):
post["tags_" + key] = value
post["tag_string_" + key] = " ".join(value)
+ def _notes(self, post, page):
+ if post.get("has_notes"):
+ post["notes"] = self.api.notes(post["id"])
+ for note in post["notes"]:
+ note["created_at"] = note["created_at"]["s"]
+ note["updated_at"] = note["updated_at"]["s"]
+ else:
+ post["notes"] = ()
+
class SankakuTagExtractor(SankakuExtractor):
"""Extractor for images from sankaku.app by search-tags"""
@@ -109,7 +121,7 @@ class SankakuTagExtractor(SankakuExtractor):
def posts(self):
params = {"tags": self.tags}
- return SankakuAPI(self).posts_keyset(params)
+ return self.api.posts_keyset(params)
class SankakuPoolExtractor(SankakuExtractor):
@@ -125,7 +137,7 @@ class SankakuPoolExtractor(SankakuExtractor):
self.pool_id = match.group(1)
def metadata(self):
- pool = SankakuAPI(self).pools(self.pool_id)
+ pool = self.api.pools(self.pool_id)
pool["tags"] = [tag["name"] for tag in pool["tags"]]
pool["artist_tags"] = [tag["name"] for tag in pool["artist_tags"]]
@@ -151,7 +163,7 @@ class SankakuPostExtractor(SankakuExtractor):
self.post_id = match.group(1)
def posts(self):
- return SankakuAPI(self).posts(self.post_id)
+ return self.api.posts(self.post_id)
class SankakuBooksExtractor(SankakuExtractor):
@@ -167,7 +179,7 @@ class SankakuBooksExtractor(SankakuExtractor):
def items(self):
params = {"tags": self.tags, "pool_type": "0"}
- for pool in SankakuAPI(self).pools_keyset(params):
+ for pool in self.api.pools_keyset(params):
pool["_extractor"] = SankakuPoolExtractor
url = "https://sankaku.app/books/{}".format(pool["id"])
yield Message.Queue, url, pool
@@ -192,6 +204,10 @@ class SankakuAPI():
if not self.username:
self.authenticate = util.noop
+ def notes(self, post_id):
+ params = {"lang": "en"}
+ return self._call("/posts/{}/notes".format(post_id), params)
+
def pools(self, pool_id):
params = {"lang": "en"}
return self._call("/pools/" + pool_id, params)
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index e1d4153..50c21e3 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://www.sankakucomplex.com/"""
+"""Extractors for https://news.sankakucomplex.com/"""
from .common import Extractor, Message
from .. import text, util
@@ -16,7 +16,7 @@ import re
class SankakucomplexExtractor(Extractor):
"""Base class for sankakucomplex extractors"""
category = "sankakucomplex"
- root = "https://www.sankakucomplex.com"
+ root = "https://news.sankakucomplex.com"
def __init__(self, match):
Extractor.__init__(self, match)
@@ -24,14 +24,14 @@ class SankakucomplexExtractor(Extractor):
class SankakucomplexArticleExtractor(SankakucomplexExtractor):
- """Extractor for articles on www.sankakucomplex.com"""
+ """Extractor for articles on news.sankakucomplex.com"""
subcategory = "article"
directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}")
filename_fmt = "{filename}.{extension}"
archive_fmt = "{date:%Y%m%d}_{filename}"
- pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ pattern = (r"(?:https?://)?(?:news|www)\.sankakucomplex\.com"
r"/(\d\d\d\d/\d\d/\d\d/[^/?#]+)")
- example = "https://www.sankakucomplex.com/1970/01/01/TITLE"
+ example = "https://news.sankakucomplex.com/1970/01/01/TITLE"
def items(self):
url = "{}/{}/?pg=X".format(self.root, self.path)
@@ -87,9 +87,9 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
class SankakucomplexTagExtractor(SankakucomplexExtractor):
"""Extractor for sankakucomplex blog articles by tag or author"""
subcategory = "tag"
- pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ pattern = (r"(?:https?://)?(?:news|www)\.sankakucomplex\.com"
r"/((?:tag|category|author)/[^/?#]+)")
- example = "https://www.sankakucomplex.com/tag/TAG/"
+ example = "https://news.sankakucomplex.com/tag/TAG/"
def items(self):
pnum = 1
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 0abb3ab..7c760ac 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -127,6 +127,8 @@ class SubscribestarExtractor(Extractor):
}
def _parse_datetime(self, dt):
+ if dt.startswith("Updated on "):
+ dt = dt[11:]
date = text.parse_datetime(dt, "%b %d, %Y %I:%M %p")
if date is dt:
date = text.parse_datetime(dt, "%B %d, %Y %I:%M %p")
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 78ff265..64fa951 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -77,23 +77,27 @@ class ToyhouseExtractor(Extractor):
cnt += 1
yield self._parse_post(post)
- if cnt == 0 and params["page"] == 1:
- token, pos = text.extract(
- page, '<input name="_token" type="hidden" value="', '"')
- if not token:
- return
- data = {
- "_token": token,
- "user" : text.extract(page, 'value="', '"', pos)[0],
- }
- self.request(self.root + "/~account/warnings/accept",
- method="POST", data=data, allow_redirects=False)
- continue
+ if not cnt and params["page"] == 1:
+ if self._accept_content_warning(page):
+ continue
+ return
if cnt < 18:
return
params["page"] += 1
+ def _accept_content_warning(self, page):
+ pos = page.find(' name="_token"') + 1
+ token, pos = text.extract(page, ' value="', '"', pos)
+ user , pos = text.extract(page, ' value="', '"', pos)
+ if not token or not user:
+ return False
+
+ data = {"_token": token, "user": user}
+ self.request(self.root + "/~account/warnings/accept",
+ method="POST", data=data, allow_redirects=False)
+ return True
+
class ToyhouseArtExtractor(ToyhouseExtractor):
"""Extractor for artworks of a toyhouse user"""
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index c34910f..ff29c04 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -386,7 +386,7 @@ class TumblrAPI(oauth.OAuth1API):
def posts(self, blog, params):
"""Retrieve published posts"""
params["offset"] = self.extractor.config("offset")
- params["limit"] = "50"
+ params["limit"] = 50
params["reblog_info"] = "true"
params["type"] = self.posts_type
params["before"] = self.before
@@ -398,8 +398,14 @@ class TumblrAPI(oauth.OAuth1API):
def likes(self, blog):
"""Retrieve liked posts"""
+ endpoint = "/v2/blog/{}/likes".format(blog)
params = {"limit": "50", "before": self.before}
- return self._pagination(blog, "/likes", params, key="liked_posts")
+ while True:
+ posts = self._call(endpoint, params)["liked_posts"]
+ if not posts:
+ return
+ yield from posts
+ params["before"] = posts[-1]["liked_timestamp"]
def _call(self, endpoint, params, **kwargs):
url = self.ROOT + endpoint
@@ -474,6 +480,7 @@ class TumblrAPI(oauth.OAuth1API):
if self.api_key:
params["api_key"] = self.api_key
+ strategy = self.extractor.config("pagination")
while True:
data = self._call(endpoint, params)
@@ -481,13 +488,31 @@ class TumblrAPI(oauth.OAuth1API):
self.BLOG_CACHE[blog] = data["blog"]
cache = False
- yield from data[key]
-
- try:
- endpoint = data["_links"]["next"]["href"]
- except KeyError:
- return
+ posts = data[key]
+ yield from posts
- params = None
- if self.api_key:
- endpoint += "&api_key=" + self.api_key
+ if strategy == "api":
+ try:
+ endpoint = data["_links"]["next"]["href"]
+ except KeyError:
+ return
+
+ params = None
+ if self.api_key:
+ endpoint += "&api_key=" + self.api_key
+
+ elif strategy == "before":
+ if not posts:
+ return
+ timestamp = posts[-1]["timestamp"] + 1
+ if params["before"] and timestamp >= params["before"]:
+ return
+ params["before"] = timestamp
+ params["offset"] = None
+
+ else: # offset
+ params["offset"] = \
+ text.parse_int(params["offset"]) + params["limit"]
+ params["before"] = None
+ if params["offset"] >= data["total_posts"]:
+ return
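The new "pagination" option selects one of three advance strategies: "api" follows the _links.next href supplied by Tumblr, "before" walks the timestamp of the last returned post, and the default steps a plain offset bounded by the reported post total. A simplified single-function rendering of the branching above (the "api" branch is reduced to a boolean check; the real code swaps the endpoint):

    def advance(strategy, params, posts, data):
        """Mutate params for the next request; False means stop."""
        if strategy == "api":
            return "next" in data.get("_links", {})
        if strategy == "before":
            if not posts:
                return False
            timestamp = posts[-1]["timestamp"] + 1
            if params["before"] and timestamp >= params["before"]:
                return False
            params["before"], params["offset"] = timestamp, None
            return True
        # default: offset stepping
        params["offset"] = int(params.get("offset") or 0) + params["limit"]
        params["before"] = None
        return params["offset"] < data["total_posts"]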
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ec098aa..9fa5b3f 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -51,6 +51,8 @@ class TwitterExtractor(Extractor):
if not self.config("transform", True):
self._transform_user = util.identity
self._transform_tweet = util.identity
+
+ self._cursor = None
self._user = None
self._user_obj = None
self._user_cache = {}
@@ -321,8 +323,17 @@ class TwitterExtractor(Extractor):
"quote_count" : tget("quote_count"),
"reply_count" : tget("reply_count"),
"retweet_count" : tget("retweet_count"),
+ "bookmark_count": tget("bookmark_count"),
}
+ if "views" in tweet:
+ try:
+ tdata["view_count"] = int(tweet["views"]["count"])
+ except Exception:
+ tdata["view_count"] = 0
+ else:
+ tdata["view_count"] = 0
+
if "note_tweet" in tweet:
note = tweet["note_tweet"]["note_tweet_results"]["result"]
content = note["text"]
@@ -492,6 +503,14 @@ class TwitterExtractor(Extractor):
},
}
+ def _init_cursor(self):
+ return self.config("cursor") or None
+
+ def _update_cursor(self, cursor):
+ self.log.debug("Cursor: %s", cursor)
+ self._cursor = cursor
+ return cursor
+
def metadata(self):
"""Return general metadata"""
return {}
@@ -499,6 +518,11 @@ class TwitterExtractor(Extractor):
def tweets(self):
"""Yield all relevant tweet objects"""
+ def finalize(self):
+ if self._cursor:
+ self.log.info("Use '-o cursor=%s' to continue downloading "
+ "from the current position", self._cursor)
+
def login(self):
if self.cookies_check(self.cookies_names):
return
@@ -530,6 +554,9 @@ class TwitterUserExtractor(TwitterExtractor):
def initialize(self):
pass
+ def finalize(self):
+ pass
+
def items(self):
base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors((
@@ -549,30 +576,73 @@ class TwitterTimelineExtractor(TwitterExtractor):
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
example = "https://x.com/USER/timeline"
+ def _init_cursor(self):
+ if self._cursor:
+ return self._cursor.partition("/")[2] or None
+ return None
+
+ def _update_cursor(self, cursor):
+ if cursor:
+ self._cursor = self._cursor_prefix + cursor
+ self.log.debug("Cursor: %s", self._cursor)
+ else:
+ self._cursor = None
+ return cursor
+
def tweets(self):
- # yield initial batch of (media) tweets
- tweet = None
- for tweet in self._select_tweet_source()(self.user):
- yield tweet
- if tweet is None:
- return
+ self._cursor = cursor = self.config("cursor") or None
+ reset = False

- # build search query
- query = "from:{} max_id:{}".format(
- self._user["name"], tweet["rest_id"])
- if self.retweets:
- query += " include:retweets include:nativeretweets"
+ if cursor:
+ state = cursor.partition("/")[0]
+ state, _, tweet_id = state.partition("_")
+ state = text.parse_int(state, 1)
+ else:
+ state = 1
+
+ if state <= 1:
+ self._cursor_prefix = "1/"

- if not self.textonly:
- # try to search for media-only tweets
+ # yield initial batch of (media) tweets
tweet = None
- for tweet in self.api.search_timeline(query + " filter:links"):
+ for tweet in self._select_tweet_source()(self.user):
yield tweet
- if tweet is not None:
+ if tweet is None and not cursor:
return
+ tweet_id = tweet["rest_id"]
+
+ state = reset = 2
+ else:
+ self.api._user_id_by_screen_name(self.user)
+
+ # build search query
+ query = "from:{} max_id:{}".format(self._user["name"], tweet_id)
+ if self.retweets:
+ query += " include:retweets include:nativeretweets"

- # yield unfiltered search results
- yield from self.api.search_timeline(query)
+ if state <= 2:
+ self._cursor_prefix = "2_{}/".format(tweet_id)
+ if reset:
+ self._cursor = self._cursor_prefix
+
+ if not self.textonly:
+ # try to search for media-only tweets
+ tweet = None
+ for tweet in self.api.search_timeline(query + " filter:links"):
+ yield tweet
+ if tweet is not None:
+ return self._update_cursor(None)
+
+ state = reset = 3
+
+ if state <= 3:
+ # yield unfiltered search results
+ self._cursor_prefix = "3_{}/".format(tweet_id)
+ if reset:
+ self._cursor = self._cursor_prefix
+
+ yield from self.api.search_timeline(query)
+ return self._update_cursor(None)
def _select_tweet_source(self):
strategy = self.config("strategy")
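[twitter] The rewritten tweets() above is a three-stage state machine (1: timeline source, 2: media-only search, 3: unfiltered search) and encodes its position as STATE[_TWEETID]/INNER_CURSOR so a resumed run can skip straight to the right stage. A sketch of how such a composite value decomposes, mirroring the partition() calls in the hunk (function name is illustrative):

def split_timeline_cursor(cursor):
    """'2_1234567890/DAABCgAB' -> (2, '1234567890', 'DAABCgAB')"""
    head, _, inner = cursor.partition("/")
    state, _, tweet_id = head.partition("_")
    return int(state or 1), tweet_id, inner or None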
@@ -854,6 +924,24 @@ class TwitterQuotesExtractor(TwitterExtractor):
yield Message.Queue, url, data
+class TwitterInfoExtractor(TwitterExtractor):
+ """Extractor for a user's profile data"""
+ subcategory = "info"
+ pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/info"
+ example = "https://x.com/USER/info"
+
+ def items(self):
+ api = TwitterAPI(self)
+
+ screen_name = self.user
+ if screen_name.startswith("id:"):
+ user = api.user_by_rest_id(screen_name[3:])
+ else:
+ user = api.user_by_screen_name(screen_name)
+
+ return iter(((Message.Directory, self._transform_user(user)),))
+
+
class TwitterAvatarExtractor(TwitterExtractor):
subcategory = "avatar"
filename_fmt = "avatar {date}.{extension}"
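[twitter] The new info extractor emits a single directory message carrying the transformed user record, which should make it a cheap way to inspect profile metadata, e.g. (USER is a placeholder):

    gallery-dl --dump-json "https://x.com/USER/info"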
@@ -1388,7 +1476,11 @@ class TwitterAPI():
"%s %s (%s)", response.status_code, response.reason, errors)
def _pagination_legacy(self, endpoint, params):
- original_retweets = (self.extractor.retweets == "original")
+ extr = self.extractor
+ cursor = extr._init_cursor()
+ if cursor:
+ params["cursor"] = cursor
+ original_retweets = (extr.retweets == "original")
bottom = ("cursor-bottom-", "sq-cursor-bottom")
while True:
@@ -1396,7 +1488,7 @@ class TwitterAPI():
instructions = data["timeline"]["instructions"]
if not instructions:
- return
+ return extr._update_cursor(None)
tweets = data["globalObjects"]["tweets"]
users = data["globalObjects"]["users"]
@@ -1477,8 +1569,8 @@ class TwitterAPI():
# stop on empty response
if not cursor or (not tweets and not tweet_id):
- return
- params["cursor"] = cursor
+ return extr._update_cursor(None)
+ params["cursor"] = extr._update_cursor(cursor)
def _pagination_tweets(self, endpoint, variables,
path=None, stop_tweets=True, features=None):
@@ -1487,6 +1579,9 @@ class TwitterAPI():
pinned_tweet = extr.pinned
params = {"variables": None}
+ cursor = extr._init_cursor()
+ if cursor:
+ variables["cursor"] = cursor
if features is None:
features = self.features_pagination
if features:
@@ -1523,7 +1618,7 @@ class TwitterAPI():
cursor = entry["content"]["value"]
if entries is None:
if not cursor:
- return
+ return extr._update_cursor(None)
entries = ()
except LookupError:
@@ -1672,12 +1767,16 @@ class TwitterAPI():
continue
if stop_tweets and not tweet:
- return
+ return extr._update_cursor(None)
if not cursor or cursor == variables.get("cursor"):
- return
- variables["cursor"] = cursor
+ return extr._update_cursor(None)
+ variables["cursor"] = extr._update_cursor(cursor)
def _pagination_users(self, endpoint, variables, path=None):
+ extr = self.extractor
+ cursor = extr._init_cursor()
+ if cursor:
+ variables["cursor"] = cursor
params = {
"variables": None,
"features" : self._json_dumps(self.features_pagination),
@@ -1697,7 +1796,7 @@ class TwitterAPI():
data = data[key]
instructions = data["instructions"]
except KeyError:
- return
+ return extr._update_cursor(None)
for instr in instructions:
if instr["type"] == "TimelineAddEntries":
@@ -1715,8 +1814,8 @@ class TwitterAPI():
cursor = entry["content"]["value"]
if not cursor or cursor.startswith(("-1|", "0|")) or not entry:
- return
- variables["cursor"] = cursor
+ return extr._update_cursor(None)
+ variables["cursor"] = extr._update_cursor(cursor)
def _handle_ratelimit(self, response):
rl = self.extractor.config("ratelimit")
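[twitter] The login hunk below renames the alternate-identifier option from username_alt to username-alt, matching gallery-dl's hyphenated option names. In a configuration file that would look like (value is a placeholder):

    {
        "extractor": {
            "twitter": {
                "username-alt": "user@example.org"
            }
        }
    }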
@@ -1864,7 +1963,7 @@ def _login_impl(extr, username, password):
},
}
elif subtask == "LoginEnterAlternateIdentifierSubtask":
- alt = extr.config("username_alt") or extr.input(
+ alt = extr.config("username-alt") or extr.input(
"Alternate Identifier (username, email, phone number): ")
data = {
"enter_text": {
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index 6dfb23c..5cde0d6 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -101,7 +101,8 @@ class VipergirlsExtractor(Extractor):
class VipergirlsThreadExtractor(VipergirlsExtractor):
"""Extractor for vipergirls threads"""
subcategory = "thread"
- pattern = BASE_PATTERN + r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?$"
+ pattern = (BASE_PATTERN +
+ r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))")
example = "https://vipergirls.to/threads/12345-TITLE"
def __init__(self, match):
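[vipergirls] The anchored thread pattern still matches plain thread URLs, /pageN suffixes, and fragment links, but no longer claims URLs whose query string addresses a single post via ?p=. A quick self-check of that intent, with BASE_PATTERN expanded to the main domain (an assumption of this sketch):

import re

pattern = (r"(?:https?://)?(?:www\.)?vipergirls\.to"
           r"/threads/(\d+)(?:-[^/?#]+)?(/page\d+)?(?:$|#|\?(?!p=))")

assert re.match(pattern, "https://vipergirls.to/threads/12345-TITLE")
assert re.match(pattern, "https://vipergirls.to/threads/12345-TITLE/page2")
assert not re.match(pattern, "https://vipergirls.to/threads/12345-TITLE?p=678")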
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index c112f4a..922a591 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -115,9 +115,28 @@ class VscoExtractor(Extractor):
class VscoUserExtractor(VscoExtractor):
- """Extractor for images from a user on vsco.co"""
+ """Extractor for a vsco user profile"""
subcategory = "user"
- pattern = USER_PATTERN + r"(?:/gallery|/images(?:/\d+)?)?/?(?:$|[?#])"
+ pattern = USER_PATTERN + r"/?$"
+ example = "https://vsco.co/USER"
+
+ def initialize(self):
+ pass
+
+ def items(self):
+ base = "{}/{}/".format(self.root, self.user)
+ return self._dispatch_extractors((
+ (VscoAvatarExtractor , base + "avatar"),
+ (VscoGalleryExtractor , base + "gallery"),
+ (VscoSpacesExtractor , base + "spaces"),
+ (VscoCollectionExtractor, base + "collection"),
+ ), ("gallery",))
+
+
+class VscoGalleryExtractor(VscoExtractor):
+ """Extractor for a vsco user's gallery"""
+ subcategory = "gallery"
+ pattern = USER_PATTERN + r"/(?:gallery|images)"
example = "https://vsco.co/USER/gallery"
def images(self):
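[vsco] Bare profile URLs are now handled by a dispatcher that queues the avatar, gallery, spaces, and collection extractors (gallery by default), while /gallery and /images URLs go straight to the new gallery subcategory. A rough self-check, assuming USER_PATTERN expands to something like the pattern below:

import re

USER_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/?#]+)"

assert re.match(USER_PATTERN + r"/?$", "https://vsco.co/USER")  # user (dispatch)
assert re.match(USER_PATTERN + r"/(?:gallery|images)", "https://vsco.co/USER/gallery")
assert re.match(USER_PATTERN + r"/(?:gallery|images)", "https://vsco.co/USER/images/1")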
diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py
index faf3b0d..796f3f8 100644
--- a/gallery_dl/extractor/wallpapercave.py
+++ b/gallery_dl/extractor/wallpapercave.py
@@ -18,7 +18,7 @@ class WallpapercaveImageExtractor(Extractor):
category = "wallpapercave"
subcategory = "image"
root = "https://wallpapercave.com"
- pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com"
+ pattern = r"(?:https?://)?(?:www\.)?wallpapercave\.com/"
example = "https://wallpapercave.com/w/wp12345"
def items(self):
@@ -40,3 +40,12 @@ class WallpapercaveImageExtractor(Extractor):
image = text.nameext_from_url(path)
yield Message.Directory, image
yield Message.Url, self.root + path, image
+
+ if path is None:
+ for wp in text.extract_iter(
+ page, 'class="wallpaper" id="wp', '</picture>'):
+ path = text.rextract(wp, ' src="', '"')[0]
+ if path:
+ image = text.nameext_from_url(path)
+ yield Message.Directory, image
+ yield Message.Url, self.root + path, image
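[wallpapercave] The appended block is a fallback for pages where the primary loop found no download path (path is still None afterwards): it scans each wallpaper <picture> block and takes its last src attribute. The same scraping step in isolation, run against a canned HTML snippet (the snippet is illustrative):

from gallery_dl import text

page = '''<div class="wallpaper" id="wpc-123">
<picture><source srcset="/uwp/abc.webp"><img src="/wp/wp12345.jpg"></picture>
</div>'''

for wp in text.extract_iter(page, 'class="wallpaper" id="wp', '</picture>'):
    path = text.rextract(wp, ' src="', '"')[0]  # last src= inside the block
    if path:
        image = text.nameext_from_url(path)
        print(path, image["filename"], image["extension"])
        # -> /wp/wp12345.jpg wp12345 jpg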
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index e91f45f..61a36d5 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -64,7 +64,7 @@ class WarosuThreadExtractor(Extractor):
def parse(self, post):
"""Build post object by extracting data from an HTML post"""
data = self._extract_post(post)
- if "<span> File:" in post and self._extract_image(post, data):
+ if "<span class=fileinfo>" in post and self._extract_image(post, data):
part = data["image"].rpartition("/")[2]
data["tim"], _, data["extension"] = part.partition(".")
data["ext"] = "." + data["extension"]
@@ -83,7 +83,7 @@ class WarosuThreadExtractor(Extractor):
def _extract_image(self, post, data):
extr = text.extract_from(post)
- data["fsize"] = extr("<span> File: ", ", ")
+ data["fsize"] = extr("<span class=fileinfo> File: ", ", ")
data["w"] = extr("", "x")
data["h"] = extr("", ", ")
data["filename"] = text.unquote(extr(
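[warosu] Warosu now wraps file metadata in <span class=fileinfo>, so both the cheap containment check and the extraction markers were updated. text.extract_from reads fields sequentially from the current position; for example, against a canned post snippet (illustrative):

from gallery_dl import text

post = '<span class=fileinfo> File: 1.23 MB, 1920x1080, image.jpg</span>'
extr = text.extract_from(post)
fsize  = extr("<span class=fileinfo> File: ", ", ")  # '1.23 MB'
width  = extr("", "x")                               # '1920'
height = extr("", ", ")                              # '1080'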
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index fc61dff..126ef49 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -11,6 +11,8 @@
from .booru import BooruExtractor
from ..cache import cache
from .. import text, util, exception
+import collections
+import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
@@ -76,22 +78,29 @@ class ZerochanExtractor(BooruExtractor):
'class="breadcrumbs', '</nav>'))[2:],
"uploader": extr('href="/user/', '"'),
"tags" : extr('<ul id="tags"', '</ul>'),
- "source" : extr('<h2>Source</h2>', '</p><h2>').rpartition(
- ">")[2] or None,
+ "source" : text.unescape(text.extr(
+ extr('id="source-url"', '</a>'), 'href="', '"')),
}
html = data["tags"]
tags = data["tags"] = []
for tag in html.split("<li class=")[1:]:
- category = text.extr(tag, 'data-type="', '"')
+ category = text.extr(tag, '"', '"')
name = text.extr(tag, 'data-tag="', '"')
- tags.append(category.capitalize() + ":" + name)
+ tags.append(category.partition(" ")[0].capitalize() + ":" + name)
return data
def _parse_entry_api(self, entry_id):
url = "{}/{}?json".format(self.root, entry_id)
- item = self.request(url).json()
+ text = self.request(url).text
+ try:
+ item = util.json_loads(text)
+ except ValueError as exc:
+ if " control character " not in str(exc):
+ raise
+ text = re.sub(r"[\x00-\x1f\x7f]", "", text)
+ item = util.json_loads(text)
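[zerochan] The ?json endpoint can embed raw control characters that a strict JSON parser rejects; the retry above strips them and parses again, and only for that specific error, so genuine syntax errors still propagate. The failure mode in isolation, with the stdlib parser standing in for util.json_loads:

import json
import re

raw = '{"source": "page one\x0cpage two"}'  # form feed inside a string
try:
    item = json.loads(raw)
except ValueError as exc:
    assert " control character " in str(exc)  # "Invalid control character at ..."
    item = json.loads(re.sub(r"[\x00-\x1f\x7f]", "", raw))
print(item)  # {'source': 'page onepage two'}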
data = {
"id" : item["id"],
@@ -109,6 +118,14 @@ class ZerochanExtractor(BooruExtractor):
return data
+ def _tags(self, post, page):
+ tags = collections.defaultdict(list)
+ for tag in post["tags"]:
+ category, _, name = tag.partition(":")
+ tags[category].append(name)
+ for key, value in tags.items():
+ post["tags_" + key.lower()] = value
+
class ZerochanTagExtractor(ZerochanExtractor):
subcategory = "tag"
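[zerochan] The new _tags hook regroups the "Category:name" strings gathered from the HTML into one list per category, exposing them as tags_<category> metadata fields. In isolation:

import collections

post = {"tags": ["Character:Hatsune Miku", "Series:Vocaloid",
                 "Character:Kagamine Rin"]}

tags = collections.defaultdict(list)
for tag in post["tags"]:
    category, _, name = tag.partition(":")
    tags[category].append(name)
for key, value in tags.items():
    post["tags_" + key.lower()] = value

# post["tags_character"] == ["Hatsune Miku", "Kagamine Rin"]
# post["tags_series"]    == ["Vocaloid"]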
@@ -180,10 +197,16 @@ class ZerochanTagExtractor(ZerochanExtractor):
static = "https://static.zerochan.net/.full."
while True:
- data = self.request(url, params=params).json()
+ response = self.request(url, params=params, allow_redirects=False)
+ if response.status_code >= 300:
+ url = text.urljoin(self.root, response.headers["location"])
+ response = self.request(url, params=params)
+ data = response.json()
+
try:
posts = data["items"]
- except ValueError:
+ except Exception:
+ self.log.debug("Server response: %s", data)
return
if metadata:
@@ -191,13 +214,13 @@ class ZerochanTagExtractor(ZerochanExtractor):
post_id = post["id"]
post.update(self._parse_entry_html(post_id))
post.update(self._parse_entry_api(post_id))
+ yield post
else:
for post in posts:
base = static + str(post["id"])
post["file_url"] = base + ".jpg"
post["_fallback"] = (base + ".png",)
-
- yield from posts
+ yield post
if not data.get("next"):
return
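[zerochan] The tag pagination above now disables automatic redirects and follows a single 3xx manually, rebuilding the URL from the Location header so the JSON query parameters survive tag renames. The same pattern with plain requests (a sketch, not the extractor's code):

import requests

def get_json_following_redirect(session, url, params):
    # keep control of params across the redirect instead of
    # letting requests re-issue the request on its own
    response = session.get(url, params=params, allow_redirects=False)
    if response.status_code >= 300:
        url = requests.compat.urljoin(url, response.headers["location"])
        response = session.get(url, params=params)
    return response.json()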