Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/exception.py            15
-rw-r--r--  gallery_dl/extractor/__init__.py    3
-rw-r--r--  gallery_dl/extractor/ao3.py         5
-rw-r--r--  gallery_dl/extractor/bellazon.py  165
-rw-r--r--  gallery_dl/extractor/boosty.py      2
-rw-r--r--  gallery_dl/extractor/comick.py     32
-rw-r--r--  gallery_dl/extractor/common.py     19
-rw-r--r--  gallery_dl/extractor/cyberfile.py 125
-rw-r--r--  gallery_dl/extractor/danbooru.py    5
-rw-r--r--  gallery_dl/extractor/facebook.py   56
-rw-r--r--  gallery_dl/extractor/fansly.py    188
-rw-r--r--  gallery_dl/extractor/imgbb.py     253
-rw-r--r--  gallery_dl/extractor/simpcity.py  145
-rw-r--r--  gallery_dl/extractor/tiktok.py      3
-rw-r--r--  gallery_dl/extractor/tungsten.py   11
-rw-r--r--  gallery_dl/extractor/twitter.py    65
-rw-r--r--  gallery_dl/formatter.py             4
-rw-r--r--  gallery_dl/util.py                 15
-rw-r--r--  gallery_dl/version.py               2
19 files changed, 864 insertions, 249 deletions
diff --git a/gallery_dl/exception.py b/gallery_dl/exception.py
index 6adda0d..559fdd1 100644
--- a/gallery_dl/exception.py
+++ b/gallery_dl/exception.py
@@ -104,13 +104,16 @@ class AuthRequired(AuthorizationError):
if auth:
if not isinstance(auth, str):
auth = " or ".join(auth)
- if " " not in resource:
- resource = "this " + resource
- if message is None:
- message = (f"{auth} needed to access {resource}")
+
+ if resource:
+ if " " not in resource:
+ resource = f"this {resource}"
+ resource = f" to access {resource}"
else:
- message = (f"{auth} needed to access {resource} "
- f"('{message}')")
+ resource = ""
+
+ message = f" ('{message}')" if message else ""
+ message = f"{auth} needed{resource}{message}"
AuthorizationError.__init__(self, message)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 574d1e2..b32fcd1 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -31,6 +31,7 @@ modules = [
"batoto",
"bbc",
"behance",
+ "bellazon",
"bilibili",
"blogger",
"bluesky",
@@ -44,6 +45,7 @@ modules = [
"comick",
"comicvine",
"cyberdrop",
+ "cyberfile",
"danbooru",
"dankefuerslesen",
"desktopography",
@@ -170,6 +172,7 @@ modules = [
"senmanga",
"sexcom",
"shimmie2",
+ "simpcity",
"simplyhentai",
"sizebooru",
"skeb",
diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
index 2652acb..60380c4 100644
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -102,8 +102,11 @@ class Ao3Extractor(Extractor):
def _pagination(self, path, needle='<li id="work_'):
while True:
page = self.request(self.root + path).text
+
yield from text.extract_iter(page, needle, '"')
- path = text.extr(page, '<a rel="next" href="', '"')
+
+ path = (text.extr(page, '<a rel="next" href="', '"') or
+ text.extr(page, '<li class="next"><a href="', '"'))
if not path:
return
path = text.unescape(path)
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py
new file mode 100644
index 0000000..5c9b9cd
--- /dev/null
+++ b/gallery_dl/extractor/bellazon.py
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.bellazon.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main"
+
+
+class BellazonExtractor(Extractor):
+ """Base class for bellazon extractors"""
+ category = "bellazon"
+ root = "https://www.bellazon.com/main"
+ directory_fmt = ("{category}", "{thread[section]}",
+ "{thread[title]} ({thread[id]})")
+ filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}"
+ archive_fmt = "{post[id]}/{filename}"
+
+ def items(self):
+ extract_urls = text.re(r'<a ([^>]*?href="([^"]+)".*?)</a>').findall
+ native = f"{self.root}/"
+
+ for post in self.posts():
+ urls = extract_urls(post["content"])
+ data = {"post": post}
+ post["count"] = data["count"] = len(urls)
+
+ yield Message.Directory, data
+ for data["num"], (info, url) in enumerate(urls, 1):
+ url = text.unescape(url)
+ if url.startswith(native):
+ if not (alt := text.extr(info, ' alt="', '"')) or (
+ alt.startswith("post-") and "_thumb." in alt):
+ name = url
+ else:
+ name = text.unescape(alt)
+ dc = text.nameext_from_url(name, data.copy())
+ dc["id"] = text.extr(info, 'data-fileid="', '"')
+ if ext := text.extr(info, 'data-fileext="', '"'):
+ dc["extension"] = ext
+ yield Message.Url, url, dc
+ else:
+ yield Message.Queue, url, data
+
+ def _pagination(self, base, pnum=None):
+ base = f"{self.root}{base}"
+
+ if pnum is None:
+ url = f"{base}/"
+ pnum = 1
+ else:
+ url = f"{base}/page/{pnum}/"
+ pnum = None
+
+ while True:
+ page = self.request(url).text
+
+ yield page
+
+ if pnum is None or ' rel="next" ' not in page or text.extr(
+ page, " rel=\"next\" data-page='", "'") == str(pnum):
+ return
+ pnum += 1
+ url = f"{base}/page/{pnum}/"
+
+ def _parse_thread(self, page):
+ schema = self._extract_jsonld(page)
+ author = schema["author"]
+ stats = schema["interactionStatistic"]
+ url_t = schema["url"]
+ url_a = author["url"]
+
+ path = text.split_html(text.extr(
+ page, '<nav class="ipsBreadcrumb', "</nav>"))[2:-1]
+
+ thread = {
+ "url" : url_t,
+ "path" : path,
+ "title": schema["headline"],
+ "views": stats[0]["userInteractionCount"],
+ "posts": stats[1]["userInteractionCount"],
+ "date" : text.parse_datetime(schema["datePublished"]),
+ "date_updated": text.parse_datetime(schema["dateModified"]),
+ "description" : text.unescape(schema["text"]),
+ "section" : path[-2],
+ "author" : author["name"],
+ "author_url" : url_a,
+ }
+
+ thread["id"], _, thread["slug"] = \
+ url_t.rsplit("/", 2)[1].partition("-")
+ thread["author_id"], _, thread["author_slug"] = \
+ url_a.rsplit("/", 2)[1].partition("-")
+
+ return thread
+
+ def _parse_post(self, html):
+ extr = text.extract_from(html)
+
+ post = {
+ "id": extr('id="elComment_', '"'),
+ "author_url": extr(" href='", "'"),
+ "date": text.parse_datetime(extr("datetime='", "'")),
+ "content": extr("<!-- Post content -->", "\n\t\t</div>"),
+ }
+
+ if (pos := post["content"].find(">")) >= 0:
+ post["content"] = post["content"][pos+1:].strip()
+
+ post["author_id"], _, post["author_slug"] = \
+ post["author_url"].rsplit("/", 2)[1].partition("-")
+
+ return post
+
+
+class BellazonPostExtractor(BellazonExtractor):
+ subcategory = "post"
+ pattern = (rf"{BASE_PATTERN}(/topic/\d+-[\w-]+(?:/page/\d+)?)"
+ rf"/?#findComment-(\d+)")
+ example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345"
+
+ def posts(self):
+ path, post_id = self.groups
+ page = self.request(f"{self.root}{path}").text
+
+ pos = page.find(f'id="elComment_{post_id}')
+ if pos < 0:
+ raise exception.NotFoundError("post")
+ html = text.extract(page, "<article ", "</article>", pos-100)[0]
+
+ self.kwdict["thread"] = self._parse_thread(page)
+ return (self._parse_post(html),)
+
+
+class BellazonThreadExtractor(BellazonExtractor):
+ subcategory = "thread"
+ pattern = rf"{BASE_PATTERN}(/topic/\d+-[\w-]+)(?:/page/(\d+))?"
+ example = "https://www.bellazon.com/main/topic/123-SLUG/"
+
+ def posts(self):
+ for page in self._pagination(*self.groups):
+ if "thread" not in self.kwdict:
+ self.kwdict["thread"] = self._parse_thread(page)
+ for html in text.extract_iter(page, "<article ", "</article>"):
+ yield self._parse_post(html)
+
+
+class BellazonForumExtractor(BellazonExtractor):
+ subcategory = "forum"
+ pattern = rf"{BASE_PATTERN}(/forum/\d+-[\w-]+)(?:/page/(\d+))?"
+ example = "https://www.bellazon.com/main/forum/123-SLUG/"
+
+ def items(self):
+ data = {"_extractor": BellazonThreadExtractor}
+ for page in self._pagination(*self.groups):
+ for row in text.extract_iter(
+ page, '<li data-ips-hook="topicRow"', "</"):
+ yield Message.Queue, text.extr(row, 'href="', '"'), data
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
index e0383bf..22f3259 100644
--- a/gallery_dl/extractor/boosty.py
+++ b/gallery_dl/extractor/boosty.py
@@ -281,7 +281,7 @@ class BoostyAPI():
if not access_token:
if auth := self.extractor.cookies.get("auth", domain=".boosty.to"):
access_token = text.extr(
- auth, "%22accessToken%22%3A%22", "%22")
+ text.unquote(auth), '"accessToken":"', '"')
if access_token:
self.headers["Authorization"] = "Bearer " + access_token
diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py
index a6aec38..c76694c 100644
--- a/gallery_dl/extractor/comick.py
+++ b/gallery_dl/extractor/comick.py
@@ -9,7 +9,7 @@
"""Extractors for https://comick.io/"""
from .common import GalleryExtractor, ChapterExtractor, MangaExtractor, Message
-from .. import text
+from .. import text, exception
from ..cache import memcache
BASE_PATTERN = r"(?:https?://)?(?:www\.)?comick\.io"
@@ -67,9 +67,35 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor):
def metadata(self, page):
slug, chstr = self.groups
manga = _manga_info(self, slug)
- props = _chapter_info(self, manga, chstr)
- ch = props["chapter"]
+ while True:
+ try:
+ props = _chapter_info(self, manga, chstr)
+ except exception.HttpError as exc:
+ if exc.response.status_code != 404:
+ raise
+ if exc.response.headers.get(
+ "Content-Type", "").startswith("text/html"):
+ if locals().get("_retry_buildid"):
+ raise
+ self.log.debug("Updating Next.js build ID")
+ _retry_buildid = True
+ _manga_info.cache.clear()
+ manga = _manga_info(self, slug)
+ continue
+ if b'"notFound":true' in exc.response.content:
+ raise exception.NotFoundError("chapter")
+ raise
+
+ if "__N_REDIRECT" in props:
+ path = props["__N_REDIRECT"]
+ self.log.debug("Following redirect to %s", path)
+ _, slug, chstr = path.rsplit("/", 2)
+ continue
+
+ ch = props["chapter"]
+ break
+
self._images = ch["md_images"]
if chapter := ch["chap"]:
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 568f435..01965f3 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -354,12 +354,11 @@ class Extractor():
raise exception.AbortExtraction(
f"User input required ({prompt.strip(' :')})")
- def _get_auth_info(self):
+ def _get_auth_info(self, password=None):
"""Return authentication information as (username, password) tuple"""
username = self.config("username")
- password = None
- if username:
+ if username or password:
password = self.config("password")
if not password:
self._check_input_allowed("password")
@@ -667,12 +666,18 @@ class Extractor():
return False
def _extract_jsonld(self, page):
- return util.json_loads(text.extr(
- page, '<script type="application/ld+json">', "</script>"))
+ return util.json_loads(
+ text.extr(page, '<script type="application/ld+json">',
+ "</script>") or
+ text.extr(page, "<script type='application/ld+json'>",
+ "</script>"))
def _extract_nextdata(self, page):
- return util.json_loads(text.extr(
- page, ' id="__NEXT_DATA__" type="application/json">', "</script>"))
+ return util.json_loads(
+ text.extr(page, ' id="__NEXT_DATA__" type="application/json">',
+ "</script>") or
+ text.extr(page, " id='__NEXT_DATA__' type='application/json'>",
+ "</script>"))
def _cache(self, func, maxage, keyarg=None):
# return cache.DatabaseCacheDecorator(func, maxage, keyarg)
diff --git a/gallery_dl/extractor/cyberfile.py b/gallery_dl/extractor/cyberfile.py
new file mode 100644
index 0000000..2ea81d6
--- /dev/null
+++ b/gallery_dl/extractor/cyberfile.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://cyberfile.me/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberfile\.me"
+
+
+class CyberfileExtractor(Extractor):
+ """Base class for cyberfile extractors"""
+ category = "cyberfile"
+ root = "https://cyberfile.me"
+
+ def request_api(self, endpoint, data):
+ url = f"{self.root}{endpoint}"
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ "Origin": self.root,
+ }
+ resp = self.request_json(
+ url, method="POST", headers=headers, data=data)
+
+ if "albumPasswordModel" in resp.get("javascript", ""):
+ url_pw = f"{self.root}/ajax/folder_password_process"
+ data_pw = {
+ "folderPassword": self._get_auth_info(password=True)[1],
+ "folderId": text.extr(
+ resp["html"], '<input type="hidden" value="', '"'),
+ "submitme": "1",
+ }
+ resp = self.request_json(
+ url_pw, method="POST", headers=headers, data=data_pw)
+ if not resp.get("success"):
+ raise exception.AuthorizationError(f"'{resp.get('msg')}'")
+ resp = self.request_json(
+ url, method="POST", headers=headers, data=data)
+
+ return resp
+
+
+class CyberfileFolderExtractor(CyberfileExtractor):
+ subcategory = "folder"
+ pattern = rf"{BASE_PATTERN}/folder/([0-9a-f]+)"
+ example = "https://cyberfile.me/folder/0123456789abcdef/NAME"
+
+ def items(self):
+ folder_hash = self.groups[0]
+ url = f"{self.root}/folder/{folder_hash}"
+ folder_num = text.extr(self.request(url).text, "ages('folder', '", "'")
+
+ extract_urls = text.re(r'dtfullurl="([^"]+)').findall
+ perpage = 600
+
+ data = {
+ "pageType" : "folder",
+ "nodeId" : folder_num,
+ "pageStart": 1,
+ "perPage" : perpage,
+ "filterOrderBy": "",
+ }
+ resp = self.request_api("/account/ajax/load_files", data)
+
+ folder = {
+ "_extractor" : CyberfileFileExtractor,
+ "folder_hash": folder_hash,
+ "folder_num" : text.parse_int(folder_num),
+ "folder" : resp["page_title"],
+ }
+
+ while True:
+ urls = extract_urls(resp["html"])
+ for url in urls:
+ yield Message.Queue, url, folder
+
+ if len(urls) < perpage:
+ return
+ data["pageStart"] += 1
+ resp = self.request_api("/account/ajax/load_files", data)
+
+
+class CyberfileFileExtractor(CyberfileExtractor):
+ subcategory = "file"
+ directory_fmt = ("{category}", "{uploader}", "{folder}")
+ pattern = rf"{BASE_PATTERN}/([a-zA-Z0-9]+)"
+ example = "https://cyberfile.me/AbCdE"
+
+ def items(self):
+ file_id = self.groups[0]
+ url = f"{self.root}/{file_id}"
+ file_num = text.extr(self.request(url).text, "owFileInformation(", ")")
+
+ data = {"u": file_num}
+ resp = self.request_api("/account/ajax/file_details", data)
+ extr = text.extract_from(resp["html"])
+ info = text.split_html(extr('class="text-section">', "</span>"))
+ folder = info[0] if len(info) > 1 else ""
+
+ file = {
+ "file_id" : file_id,
+ "file_num": text.parse_int(file_num),
+ "name" : resp["page_title"],
+ "folder" : folder,
+ "uploader": info[-1][2:].strip(),
+ "size" : text.parse_bytes(text.remove_html(extr(
+ "Filesize:", "</tr>"))[:-1]),
+ "tags" : text.split_html(extr(
+ "Keywords:", "</tr>")),
+ "date" : text.parse_datetime(text.remove_html(extr(
+ "Uploaded:", "</tr>")), "%d/%m/%Y %H:%M:%S"),
+ "permissions": text.remove_html(extr(
+ "Permissions:", "</tr>")).split(" &amp; "),
+ }
+
+ file["file_url"] = url = extr("openUrl('", "'")
+ text.nameext_from_url(file["name"] or url, file)
+ yield Message.Directory, file
+ yield Message.Url, url, file
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 019410c..f8ad07a 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -102,7 +102,10 @@ class DanbooruExtractor(BaseExtractor):
post["extension"] = "webm"
if url[0] == "/":
- url = self.root + url
+ if url[1] == "/":
+ url = "https:" + url
+ else:
+ url = self.root + url
post.update(data)
yield Message.Directory, post
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index f9ed1ab..bf24941 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -376,34 +376,6 @@ class FacebookExtractor(Extractor):
return user
-class FacebookSetExtractor(FacebookExtractor):
- """Base class for Facebook Set extractors"""
- subcategory = "set"
- pattern = (
- BASE_PATTERN +
- r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
- r"[^/?#]*(?<!&setextract)$"
- r"|([^/?#]+/posts/[^/?#]+)"
- r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)"
- )
- example = "https://www.facebook.com/media/set/?set=SET_ID"
-
- def items(self):
- set_id = self.groups[0] or self.groups[3]
- if path := self.groups[1]:
- post_url = self.root + "/" + path
- post_page = self.request(post_url).text
- set_id = self.parse_post_page(post_page)["set_id"]
-
- set_url = f"{self.root}/media/set/?set={set_id}"
- set_page = self.request(set_url).text
- set_data = self.parse_set_page(set_page)
- if self.groups[2]:
- set_data["first_photo_id"] = self.groups[2]
-
- return self.extract_set(set_data)
-
-
class FacebookPhotoExtractor(FacebookExtractor):
"""Base class for Facebook Photo extractors"""
subcategory = "photo"
@@ -441,6 +413,34 @@ class FacebookPhotoExtractor(FacebookExtractor):
yield Message.Url, comment_photo["url"], comment_photo
+class FacebookSetExtractor(FacebookExtractor):
+ """Base class for Facebook Set extractors"""
+ subcategory = "set"
+ pattern = (
+ BASE_PATTERN +
+ r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
+ r"[^/?#]*(?<!&setextract)$"
+ r"|([^/?#]+/posts/[^/?#]+)"
+ r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)"
+ )
+ example = "https://www.facebook.com/media/set/?set=SET_ID"
+
+ def items(self):
+ set_id = self.groups[0] or self.groups[3]
+ if path := self.groups[1]:
+ post_url = self.root + "/" + path
+ post_page = self.request(post_url).text
+ set_id = self.parse_post_page(post_page)["set_id"]
+
+ set_url = f"{self.root}/media/set/?set={set_id}"
+ set_page = self.request(set_url).text
+ set_data = self.parse_set_page(set_page)
+ if self.groups[2]:
+ set_data["first_photo_id"] = self.groups[2]
+
+ return self.extract_set(set_data)
+
+
class FacebookVideoExtractor(FacebookExtractor):
"""Base class for Facebook Video extractors"""
subcategory = "video"
diff --git a/gallery_dl/extractor/fansly.py b/gallery_dl/extractor/fansly.py
index 31d242f..8a6dbef 100644
--- a/gallery_dl/extractor/fansly.py
+++ b/gallery_dl/extractor/fansly.py
@@ -25,7 +25,11 @@ class FanslyExtractor(Extractor):
def _init(self):
self.api = FanslyAPI(self)
- self.formats = self.config("format") or (303, 302, 1, 2, 4)
+
+ if fmts := self.config("formats"):
+ self.formats = set(fmts)
+ else:
+ self.formats = {1, 2, 3, 4, 302, 303}
def items(self):
for post in self.posts():
@@ -41,6 +45,19 @@ class FanslyExtractor(Extractor):
def _extract_files(self, post):
files = []
+
+ if "_extra" in post:
+ extra = post.pop("_extra", ())
+ media = {
+ media["id"]: media
+ for media in self.api.account_media(extra)
+ }
+ post["attachments"].extend(
+ media[mid]
+ for mid in extra
+ if mid in media
+ )
+
for attachment in post.pop("attachments"):
try:
self._extract_attachment(files, post, attachment)
@@ -54,19 +71,23 @@ class FanslyExtractor(Extractor):
def _extract_attachment(self, files, post, attachment):
media = attachment["media"]
- variants = {
- variant["type"]: variant
- for variant in media.pop("variants", ())
- }
- variants[media["type"]] = media
- for fmt in self.formats:
- if fmt in variants and (variant := variants[fmt]).get("locations"):
- break
- else:
- return self.log.warning(
- "%s/%s: Requested format not available",
- post["id"], attachment["id"])
+ variants = media.pop("variants") or []
+ if media.get("locations"):
+ variants.append(media)
+
+ formats = [
+ (type > 256, variant["width"], type, variant)
+ for variant in variants
+ if variant.get("locations") and
+ (type := variant["type"]) in self.formats
+ ]
+
+ try:
+ variant = max(formats)[-1]
+ except Exception:
+ return self.log.warning("%s/%s: No format available",
+ post["id"], attachment["id"])
mime = variant["mimetype"]
location = variant.pop("locations")[0]
@@ -78,7 +99,7 @@ class FanslyExtractor(Extractor):
file = {
**variant,
- "format": fmt,
+ "format": variant["type"],
"date": text.parse_timestamp(media["createdAt"]),
"date_updated": text.parse_timestamp(media["updatedAt"]),
}
@@ -86,12 +107,17 @@ class FanslyExtractor(Extractor):
if "metadata" in location:
# manifest
meta = location["metadata"]
-
file["type"] = "video"
+
+ try:
+ fallback = (media["locations"][0]["location"],)
+ except Exception:
+ fallback = ()
+
files.append({
"file": file,
"url": f"ytdl:{location['location']}",
- # "_fallback": (media["locations"][0]["location"],),
+ "_fallback": fallback,
"_ytdl_manifest":
"dash" if mime == "application/dash+xml" else "hls",
"_ytdl_manifest_cookies": (
@@ -161,17 +187,26 @@ class FanslyListsExtractor(FanslyExtractor):
class FanslyCreatorPostsExtractor(FanslyExtractor):
subcategory = "creator-posts"
- pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts(?:/wall/(\d+))?"
example = "https://fansly.com/CREATOR/posts"
def posts(self):
- creator = self.groups[0]
- if creator.startswith("id:"):
- account = self.api.account_by_id(creator[3:])
- else:
- account = self.api.account(creator)
- wall_id = account["walls"][0]["id"]
- return self.api.timeline_new(account["id"], wall_id)
+ creator, wall_id = self.groups
+ account = self.api.account(creator)
+ return self.api.timeline_new(
+ account["id"], wall_id or account["walls"][0]["id"])
+
+
+class FanslyCreatorMediaExtractor(FanslyExtractor):
+ subcategory = "creator-media"
+ pattern = rf"{BASE_PATTERN}/([^/?#]+)/media(?:/wall/(\d+))?"
+ example = "https://fansly.com/CREATOR/media"
+
+ def posts(self):
+ creator, wall_id = self.groups
+ account = self.api.account(creator)
+ return self.api.mediaoffers_location(
+ account["id"], wall_id or account["walls"][0]["id"])
class FanslyAPI():
@@ -179,18 +214,24 @@ class FanslyAPI():
def __init__(self, extractor):
self.extractor = extractor
-
- token = extractor.config("token")
- if not token:
- self.extractor.log.warning("No 'token' provided")
-
self.headers = {
"fansly-client-ts": None,
"Origin" : extractor.root,
- "authorization" : token,
}
- def account(self, username):
+ if token := extractor.config("token"):
+ self.headers["authorization"] = token
+ self.extractor.log.debug(
+ "Using authorization 'token' %.5s...", token)
+ else:
+ self.extractor.log.warning("No 'token' provided")
+
+ def account(self, creator):
+ if creator.startswith("id:"):
+ return self.account_by_id(creator[3:])
+ return self.account_by_username(creator)
+
+ def account_by_username(self, username):
endpoint = "/v1/account"
params = {"usernames": username}
return self._call(endpoint, params)[0]
@@ -205,6 +246,11 @@ class FanslyAPI():
params = {"ids": ",".join(map(str, account_ids))}
return self._call(endpoint, params)
+ def account_media(self, media_ids):
+ endpoint = "/v1/account/media"
+ params = {"ids": ",".join(map(str, media_ids))}
+ return self._call(endpoint, params)
+
def lists_account(self):
endpoint = "/v1/lists/account"
params = {"itemId": ""}
@@ -218,7 +264,21 @@ class FanslyAPI():
"after" : None,
"sortMode": sort,
}
- return self._pagination(endpoint, params)
+ return self._pagination_list(endpoint, params)
+
+ def mediaoffers_location(self, account_id, wall_id):
+ endpoint = "/v1/mediaoffers/location"
+ params = {
+ "locationId": wall_id,
+ "locationType": "1002",
+ "accountId": account_id,
+ "mediaType": "",
+ "before": "",
+ "after" : "0",
+ "limit" : "30",
+ "offset": "0",
+ }
+ return self._pagination_media(endpoint, params)
def post(self, post_id):
endpoint = "/v1/post"
@@ -262,6 +322,7 @@ class FanslyAPI():
for post in posts:
post["account"] = accounts[post.pop("accountId")]
+ extra = None
attachments = []
for attachment in post["attachments"]:
cid = attachment["contentId"]
@@ -270,18 +331,35 @@ class FanslyAPI():
elif cid in bundles:
bundle = bundles[cid]["bundleContent"]
bundle.sort(key=lambda c: c["pos"])
- attachments.extend(
- media[m["accountMediaId"]]
- for m in bundle
- if m["accountMediaId"] in media
- )
+ for c in bundle:
+ mid = c["accountMediaId"]
+ if mid in media:
+ attachments.append(media[mid])
+ else:
+ if extra is None:
+ post["_extra"] = extra = []
+ extra.append(mid)
else:
self.extractor.log.warning(
"%s: Unhandled 'contentId' %s",
post["id"], cid)
post["attachments"] = attachments
+
return posts
+ def _update_media(self, items, response):
+ posts = {
+ post["id"]: post
+ for post in response["posts"]
+ }
+
+ response["posts"] = [
+ posts[item["correlationId"]]
+ for item in items
+ ]
+
+ return self._update_posts(response)
+
def _update_items(self, items):
ids = [item["id"] for item in items]
accounts = {
@@ -304,15 +382,27 @@ class FanslyAPI():
while True:
response = self._call(endpoint, params)
- if isinstance(response, list):
- if not response:
- return
- yield from self._update_items(response)
- params["after"] = response[-1]["sortId"]
-
- else:
- if not response.get("posts"):
- return
- posts = self._update_posts(response)
- yield from posts
- params["before"] = min(p["id"] for p in posts)
+ if not response.get("posts"):
+ return
+ posts = self._update_posts(response)
+ yield from posts
+ params["before"] = min(p["id"] for p in posts)
+
+ def _pagination_list(self, endpoint, params):
+ while True:
+ response = self._call(endpoint, params)
+
+ if not response:
+ return
+ yield from self._update_items(response)
+ params["after"] = response[-1]["sortId"]
+
+ def _pagination_media(self, endpoint, params):
+ while True:
+ response = self._call(endpoint, params)
+
+ data = response["data"]
+ if not data:
+ return
+ yield from self._update_media(data, response["aggregationData"])
+ params["before"] = data[-1]["id"]
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index e6abdeb..d9a63c7 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -16,63 +16,42 @@ from ..cache import cache
class ImgbbExtractor(Extractor):
"""Base class for imgbb extractors"""
category = "imgbb"
- directory_fmt = ("{category}", "{user}")
- filename_fmt = "{title} {id}.{extension}"
- archive_fmt = "{id}"
- root = "https://imgbb.com"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.page_url = self.sort = None
+ directory_fmt = ("{category}", "{user[name]:?//}{user[id]:? (/)/}",
+ "{album[title]} ({album[id]})")
+ filename_fmt = "{title} ({id}).{extension}"
+ archive_fmt = "{user[id]} {id}"
+ cookies_domain = ".imgbb.com"
+ cookies_names = ("PHPSESSID", "LID")
+ root = "https://ibb.co"
def items(self):
self.login()
- url = self.page_url
- params = {"sort": self.sort}
- while True:
- response = self.request(url, params=params, allow_redirects=False)
- if response.status_code < 300:
- break
- url = response.headers["location"]
- if url.startswith(self.root):
- raise exception.NotFoundError(self.subcategory)
-
- page = response.text
- data = self.metadata(page)
- first = True
-
- for img in self.images(page):
- image = {
- "id" : img["url_viewer"].rpartition("/")[2],
- "user" : img["user"]["username"] if "user" in img else "",
- "title" : text.unescape(img["title"]),
- "url" : img["image"]["url"],
- "extension": img["image"]["extension"],
- "size" : text.parse_int(img["image"]["size"]),
- "width" : text.parse_int(img["width"]),
- "height" : text.parse_int(img["height"]),
- }
- image.update(data)
- if first:
- first = False
- yield Message.Directory, data
- yield Message.Url, image["url"], image
+ for image in self.posts():
+ url = image["url"]
+ text.nameext_from_url(url, image)
+ yield Message.Directory, image
+ yield Message.Url, url, image
def login(self):
+ if self.cookies_check(self.cookies_names):
+ return
+
username, password = self._get_auth_info()
if username:
- self.cookies_update(self._login_impl(username, password))
+ return self.cookies_update(self._login_impl(username, password))
@cache(maxage=365*86400, keyarg=1)
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- url = self.root + "/login"
+ url = "https://imgbb.com/login"
page = self.request(url).text
- token = text.extr(page, 'PF.obj.config.auth_token="', '"')
+ token = text.extr(page, 'name="auth_token" value="', '"')
- headers = {"Referer": url}
+ headers = {
+ "Referer": url,
+ }
data = {
"auth_token" : token,
"login-subject": username,
@@ -84,27 +63,26 @@ class ImgbbExtractor(Extractor):
raise exception.AuthenticationError()
return self.cookies
- def _extract_resource(self, page):
- return util.json_loads(text.extr(
- page, "CHV.obj.resource=", "};") + "}")
-
- def _extract_user(self, page):
- return self._extract_resource(page).get("user") or {}
-
- def _pagination(self, page, endpoint, params):
- data = None
+ def _pagination(self, page, url, params):
seek, pos = text.extract(page, 'data-seek="', '"')
tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
- params["action"] = "list"
- params["list"] = "images"
- params["sort"] = self.sort
- params["seek"] = seek
- params["page"] = 2
- params["auth_token"] = tokn
+ resc, pos = text.extract(page, "CHV.obj.resource=", "};", pos)
+ self.kwdict["user"] = util.json_loads(resc + "}").get("user")
+ data = None
while True:
- for img in text.extract_iter(page, "data-object='", "'"):
- yield util.json_loads(text.unquote(img))
+ for obj in text.extract_iter(page, "data-object='", "'"):
+ post = util.json_loads(text.unquote(obj))
+ image = post["image"]
+ image["filename"], image["name"] = \
+ image["name"], image["filename"]
+ image["id"] = post["id_encoded"]
+ image["title"] = post["title"]
+ image["width"] = text.parse_int(post["width"])
+ image["height"] = text.parse_int(post["height"])
+ image["size"] = text.parse_int(image["size"])
+ yield image
+
if data:
if not data["seekEnd"] or params["seek"] == data["seekEnd"]:
return
@@ -112,105 +90,114 @@ class ImgbbExtractor(Extractor):
params["page"] += 1
elif not seek or 'class="pagination-next"' not in page:
return
- data = self.request_json(endpoint, method="POST", data=params)
+ else:
+ params["action"] = "list"
+ params["page"] = 2
+ params["seek"] = seek
+ params["auth_token"] = tokn
+
+ headers = {
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ "X-Requested-With": "XMLHttpRequest",
+ "Origin": self.root,
+ "Sec-Fetch-Dest": "empty",
+ "Sec-Fetch-Mode": "cors",
+ "Sec-Fetch-Site": "same-origin",
+ }
+
+ data = self.request_json(
+ url, method="POST", headers=headers, data=params)
page = data["html"]
class ImgbbAlbumExtractor(ImgbbExtractor):
- """Extractor for albums on imgbb.com"""
+ """Extractor for imgbb albums"""
subcategory = "album"
- directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
pattern = r"(?:https?://)?ibb\.co/album/([^/?#]+)/?(?:\?([^#]+))?"
example = "https://ibb.co/album/ID"
- def __init__(self, match):
- ImgbbExtractor.__init__(self, match)
- self.album_name = None
- self.album_id = match[1]
- self.sort = text.parse_query(match[2]).get("sort", "date_desc")
- self.page_url = "https://ibb.co/album/" + self.album_id
-
- def metadata(self, page):
- album = text.extr(page, '"og:title" content="', '"')
- user = self._extract_user(page)
- return {
- "album_id" : self.album_id,
- "album_name" : text.unescape(album),
- "user" : user.get("username") or "",
- "user_id" : user.get("id") or "",
- "displayname": user.get("name") or "",
- }
-
- def images(self, page):
- url = text.extr(page, '"og:url" content="', '"')
- album_id = url.rpartition("/")[2].partition("?")[0]
-
- return self._pagination(page, "https://ibb.co/json", {
- "from" : "album",
- "albumid" : album_id,
- "params_hidden[list]" : "images",
- "params_hidden[from]" : "album",
- "params_hidden[albumid]": album_id,
- })
-
-
-class ImgbbUserExtractor(ImgbbExtractor):
- """Extractor for user profiles in imgbb.com"""
- subcategory = "user"
- pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"
- example = "https://USER.imgbb.com"
+ def posts(self):
+ album_id, qs = self.groups
+ url = f"{self.root}/album/{album_id}"
+ params = text.parse_query(qs)
+ page = self.request(url, params=params).text
+ extr = text.extract_from(page)
- def __init__(self, match):
- ImgbbExtractor.__init__(self, match)
- self.user = match[1]
- self.sort = text.parse_query(match[2]).get("sort", "date_desc")
- self.page_url = f"https://{self.user}.imgbb.com/"
-
- def metadata(self, page):
- user = self._extract_user(page)
- return {
- "user" : user.get("username") or self.user,
- "user_id" : user.get("id") or "",
- "displayname": user.get("name") or "",
+ self.kwdict["album"] = album = {
+ "url": extr(
+ 'property="og:url" content="', '"'),
+ "title": text.unescape(extr(
+ 'property="og:title" content="', '"')),
+ "description": text.unescape(extr(
+ 'property="og:description" content="', '"')),
+ "id": extr(
+ 'data-text="album-name" href="https://ibb.co/album/', '"'),
+ "count": text.parse_int(extr(
+ 'data-text="image-count">', "<")),
}
- def images(self, page):
- user = text.extr(page, '.obj.resource={"id":"', '"')
- return self._pagination(page, self.page_url + "json", {
- "from" : "user",
- "userid" : user,
- "params_hidden[userid]": user,
- "params_hidden[from]" : "user",
- })
+ url = f"{self.root}/json"
+ params["pathname"] = f"/album/{album['id']}"
+ return self._pagination(page, url, params)
class ImgbbImageExtractor(ImgbbExtractor):
subcategory = "image"
- pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?#]+)"
+ pattern = r"(?:https?://)?ibb\.co/([^/?#]+)"
example = "https://ibb.co/ID"
- def __init__(self, match):
- ImgbbExtractor.__init__(self, match)
- self.image_id = match[1]
-
- def items(self):
- url = "https://ibb.co/" + self.image_id
+ def posts(self):
+ url = f"{self.root}/{self.groups[0]}"
page = self.request(url).text
extr = text.extract_from(page)
- user = self._extract_user(page)
image = {
- "id" : self.image_id,
+ "id" : extr('property="og:url" content="https://ibb.co/', '"'),
"title" : text.unescape(extr(
'"og:title" content="', ' hosted at ImgBB"')),
"url" : extr('"og:image" content="', '"'),
"width" : text.parse_int(extr('"og:image:width" content="', '"')),
"height": text.parse_int(extr('"og:image:height" content="', '"')),
- "user" : user.get("username") or "",
- "user_id" : user.get("id") or "",
- "displayname": user.get("name") or "",
+ "album" : extr("Added to <a", "</a>"),
+ "date" : text.parse_datetime(extr(
+ '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
+ "user" : util.json_loads(extr(
+ "CHV.obj.resource=", "};") + "}").get("user"),
}
- image["extension"] = text.ext_from_url(image["url"])
- yield Message.Directory, image
- yield Message.Url, image["url"], image
+ if album := image["album"]:
+ image["album"] = {
+ "id" : text.extr(album, "/album/", '"'),
+ "title": text.unescape(album.rpartition(">")[2]),
+ }
+ else:
+ image["album"] = None
+
+ return (image,)
+
+
+class ImgbbUserExtractor(ImgbbExtractor):
+ """Extractor for imgbb user profiles"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "{user[name]} ({user[id]})")
+ pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?"
+ example = "https://USER.imgbb.com"
+
+ def posts(self):
+ user, qs = self.groups
+ url = f"https://{user}.imgbb.com/"
+ params = text.parse_query(qs)
+ response = self.request(url, params=params, allow_redirects=False)
+
+ if response.status_code < 300:
+ params["pathname"] = "/"
+ return self._pagination(response.text, f"{url}json", params)
+
+ if response.status_code == 301:
+ raise exception.NotFoundError("user")
+ redirect = f"HTTP redirect to {response.headers.get('Location')}"
+ if response.status_code == 302:
+ raise exception.AuthRequired(
+ ("username & password", "authenticated cookies"),
+ "profile", redirect)
+ raise exception.AbortExtraction(redirect)
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
new file mode 100644
index 0000000..8cc7e38
--- /dev/null
+++ b/gallery_dl/extractor/simpcity.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://simpcity.cr/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
+
+
+class SimpcityExtractor(Extractor):
+ """Base class for simpcity extractors"""
+ category = "simpcity"
+ root = "https://simpcity.cr"
+
+ def items(self):
+ extract_urls = text.re(r' href="([^"]+)').findall
+
+ for post in self.posts():
+ urls = extract_urls(post["content"])
+ data = {"post": post}
+ post["count"] = data["count"] = len(urls)
+ for data["num"], url in enumerate(urls, 1):
+ yield Message.Queue, url, data
+
+ def request_page(self, url):
+ try:
+ return self.request(url).text
+ except exception.HttpError as exc:
+ if exc.status == 403 and b">Log in<" in exc.response.content:
+ msg = text.extr(exc.response.text, "blockMessage--error", "</")
+ raise exception.AuthRequired(
+ "'authenticated cookies'", None,
+ msg.rpartition(">")[2].strip())
+ raise
+
+ def _pagination(self, base, pnum=None):
+ base = f"{self.root}{base}"
+
+ if pnum is None:
+ url = base
+ pnum = 1
+ else:
+ url = f"{base}/page-{pnum}"
+ pnum = None
+
+ while True:
+ page = self.request_page(url)
+
+ yield page
+
+ if pnum is None or "pageNav-jump--next" not in page:
+ return
+ pnum += 1
+ url = f"{base}/page-{pnum}"
+
+ def _parse_thread(self, page):
+ schema = self._extract_jsonld(page)["mainEntity"]
+ author = schema["author"]
+ stats = schema["interactionStatistic"]
+ url_t = schema["url"]
+ url_a = author["url"]
+
+ thread = {
+ "id" : url_t[url_t.rfind(".")+1:-1],
+ "url" : url_t,
+ "title": schema["headline"],
+ "date" : text.parse_datetime(schema["datePublished"]),
+ "views": stats[0]["userInteractionCount"],
+ "posts": stats[1]["userInteractionCount"],
+ "tags" : (schema["keywords"].split(", ")
+ if "keywords" in schema else ()),
+ "section" : schema["articleSection"],
+ "author" : author["name"],
+ "author_id" : url_a[url_a.rfind(".")+1:-1],
+ "author_url": url_a,
+ }
+
+ return thread
+
+ def _parse_post(self, html):
+ extr = text.extract_from(html)
+
+ post = {
+ "author": extr('data-author="', '"'),
+ "id": extr('data-content="post-', '"'),
+ "author_url": extr('itemprop="url" content="', '"'),
+ "date": text.parse_datetime(extr('datetime="', '"')),
+ "content": extr('<div itemprop="text">', "\t\t</div>").strip(),
+ }
+
+ url_a = post["author_url"]
+ post["author_id"] = url_a[url_a.rfind(".")+1:-1]
+
+ return post
+
+
+class SimpcityPostExtractor(SimpcityExtractor):
+ subcategory = "post"
+ pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
+ example = "https://simpcity.cr/threads/TITLE.12345/post-54321"
+
+ def posts(self):
+ post_id = self.groups[0]
+ url = f"{self.root}/posts/{post_id}/"
+ page = self.request_page(url)
+
+ pos = page.find(f'data-content="post-{post_id}"')
+ if pos < 0:
+ raise exception.NotFoundError("post")
+ html = text.extract(page, "<article ", "</article>", pos-200)[0]
+
+ self.kwdict["thread"] = self._parse_thread(page)
+ return (self._parse_post(html),)
+
+
+class SimpcityThreadExtractor(SimpcityExtractor):
+ subcategory = "thread"
+ pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
+ example = "https://simpcity.cr/threads/TITLE.12345/"
+
+ def posts(self):
+ for page in self._pagination(*self.groups):
+ if "thread" not in self.kwdict:
+ self.kwdict["thread"] = self._parse_thread(page)
+ for html in text.extract_iter(page, "<article ", "</article>"):
+ yield self._parse_post(html)
+
+
+class SimpcityForumExtractor(SimpcityExtractor):
+ subcategory = "forum"
+ pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
+ example = "https://simpcity.cr/forums/TITLE.123/"
+
+ def items(self):
+ data = {"_extractor": SimpcityThreadExtractor}
+ for page in self._pagination(*self.groups):
+ for path in text.extract_iter(page, ' uix-href="', '"'):
+ yield Message.Queue, f"{self.root}{text.unquote(path)}", data
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 973bd22..f450806 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -42,8 +42,7 @@ class TiktokExtractor(Extractor):
continue
post = video_detail["itemInfo"]["itemStruct"]
- author = post["author"]
- post["user"] = author["uniqueId"]
+ post["user"] = (a := post.get("author")) and a["uniqueId"] or ""
post["date"] = text.parse_timestamp(post["createTime"])
original_title = title = post["desc"]
diff --git a/gallery_dl/extractor/tungsten.py b/gallery_dl/extractor/tungsten.py
index 20d5a59..45836a9 100644
--- a/gallery_dl/extractor/tungsten.py
+++ b/gallery_dl/extractor/tungsten.py
@@ -87,14 +87,17 @@ class TungstenModelExtractor(TungstenExtractor):
class TungstenUserExtractor(TungstenExtractor):
subcategory = "user"
- pattern = rf"{BASE_PATTERN}/user/([^/?#]+)"
- example = "https://tungsten.run/user/USER/posts"
+ pattern = rf"{BASE_PATTERN}/user/([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?"
+ example = "https://tungsten.run/user/USER"
def posts(self):
- url = f"{self.root}/user/{self.groups[0]}"
+ user, qs = self.groups
+ url = f"{self.root}/user/{user}"
page = self.request(url).text
uuid_user = text.extr(page, '"user":{"uuid":"', '"')
url = f"https://api.tungsten.run/v1/users/{uuid_user}/posts"
- params = {"sort": "top_all_time"}
+ params = text.parse_query(qs)
+ params.setdefault("sort", "top_all_time")
+ self.kwdict["search_tags"] = params.get("tag", "")
return self._pagination(url, params)
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index c919cb8..ed3cfae 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1447,20 +1447,33 @@ class TwitterAPI():
"includePromotedContent": False,
}
return self._pagination_tweets(
- endpoint, variables, ("bookmark_timeline_v2", "timeline"), False)
+ endpoint, variables, ("bookmark_timeline_v2", "timeline"),
+ stop_tweets=128)
def search_timeline(self, query, product="Latest"):
endpoint = "/graphql/4fpceYZ6-YQCx_JSl_Cn_A/SearchTimeline"
variables = {
"rawQuery": query,
- "count": 100,
+ "count": self.extractor.config("search-limit", 20),
"querySource": "typed_query",
"product": product,
"withGrokTranslatedBio": False,
}
+
+ if self.extractor.config("search-pagination") in (
+ "max_id", "maxid", "id"):
+ update_variables = self._update_variables_search
+ else:
+ update_variables = None
+
+ stop_tweets = self.extractor.config("search-stop")
+ if stop_tweets is None or stop_tweets == "auto":
+ stop_tweets = 3 if update_variables is None else 0
+
return self._pagination_tweets(
endpoint, variables,
- ("search_by_raw_query", "search_timeline", "timeline"))
+ ("search_by_raw_query", "search_timeline", "timeline"),
+ stop_tweets=stop_tweets, update_variables=update_variables)
def community_query(self, community_id):
endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery"
@@ -1870,11 +1883,12 @@ class TwitterAPI():
params["cursor"] = extr._update_cursor(cursor)
def _pagination_tweets(self, endpoint, variables,
- path=None, stop_tweets=True,
+ path=None, stop_tweets=0, update_variables=None,
features=None, field_toggles=None):
extr = self.extractor
original_retweets = (extr.retweets == "original")
pinned_tweet = extr.pinned
+ stop_tweets_max = stop_tweets
params = {"variables": None}
if cursor := extr._init_cursor():
@@ -2067,11 +2081,24 @@ class TwitterAPI():
tweet.get("rest_id"))
continue
- if stop_tweets and not tweet:
- return extr._update_cursor(None)
+ if tweet:
+ stop_tweets = stop_tweets_max
+ last_tweet = tweet
+ else:
+ if stop_tweets <= 0:
+ return extr._update_cursor(None)
+ self.log.debug(
+ "No Tweet results (%s/%s)",
+ stop_tweets_max - stop_tweets + 1, stop_tweets_max)
+ stop_tweets -= 1
+
if not cursor or cursor == variables.get("cursor"):
return extr._update_cursor(None)
- variables["cursor"] = extr._update_cursor(cursor)
+
+ if update_variables is None:
+ variables["cursor"] = extr._update_cursor(cursor)
+ else:
+ variables = update_variables(variables, cursor, last_tweet)
def _pagination_users(self, endpoint, variables, path=None):
extr = self.extractor
@@ -2140,6 +2167,30 @@ class TwitterAPI():
self.log.debug("Skipping %s ('%s')", tweet_id, text)
+ def _update_variables_search(self, variables, cursor, tweet):
+ try:
+ tweet_id = tweet.get("id_str") or tweet["legacy"]["id_str"]
+ max_id = f"max_id:{int(tweet_id)-1}"
+
+ query, n = text.re(r"\bmax_id:\d+").subn(
+ max_id, variables["rawQuery"])
+ if n:
+ variables["rawQuery"] = query
+ else:
+ variables["rawQuery"] = f"{query} {max_id}"
+
+ if prefix := self.extractor._cursor_prefix:
+ self.extractor._cursor_prefix = \
+ f"{prefix.partition('_')[0]}_{tweet_id}/"
+ variables["cursor"] = None
+ except Exception as exc:
+ self.extractor.log.debug(
+ "Failed to update 'max_id' search query (%s: %s). Falling "
+ "back to 'cursor' pagination", exc.__class__.__name__, exc)
+ variables["cursor"] = self.extractor._update_cursor(cursor)
+
+ return variables
+
@cache(maxage=365*86400, keyarg=1)
def _login_impl(extr, username, password):
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index b09203f..cc9af11 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -301,6 +301,8 @@ def parse_field_name(field_name):
key = _slice(key[1:])
else:
key = _slice(key)
+ elif key[0] == "-":
+ key = int(key)
else:
key = key.strip("\"'")
except TypeError:
@@ -565,7 +567,7 @@ _CONVERSIONS = {
"U": text.unescape,
"H": lambda s: text.unescape(text.remove_html(s)),
"g": text.slugify,
- "R": text.re(r"https?://[^\s\"']+").findall,
+ "R": text.re(r"https?://[^\s\"'<>\\]+").findall,
"W": text.sanitize_whitespace,
"S": util.to_string,
"s": str,
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 45ffc9c..7b9ce99 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -987,16 +987,21 @@ def build_proxy_map(proxies, log=None):
if isinstance(proxies, str):
if "://" not in proxies:
proxies = "http://" + proxies.lstrip("/")
- return {"http": proxies, "https": proxies}
-
- if isinstance(proxies, dict):
+ proxies = {"http": proxies, "https": proxies}
+ elif isinstance(proxies, dict):
for scheme, proxy in proxies.items():
if "://" not in proxy:
proxies[scheme] = "http://" + proxy.lstrip("/")
- return proxies
+ else:
+ proxies = None
if log is not None:
- log.warning("invalid proxy specifier: %s", proxies)
+ if proxies is None:
+ log.warning("Invalid proxy specifier: %r", proxies)
+ else:
+ log.debug("Proxy Map: %s", proxies)
+
+ return proxies
def build_predicate(predicates):
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 187ef92..277d679 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.30.6"
+__version__ = "1.30.7"
__variant__ = None