| field     | value |
|-----------|-------|
| author    | 2025-09-16 02:12:49 -0400 |
| committer | 2025-09-16 02:12:49 -0400 |
| commit    | 3b7f8716690b7aa1994a9cb387bbc7215e01a4ed (patch) |
| tree      | 1009e66478f4f0a64324acd92e0cc8709eb5f90f /gallery_dl/extractor |
| parent    | 243b2597edb922fe7e0b0d887e80bb7ebbe72ab7 (diff) |

New upstream version 1.30.7 (upstream/1.30.7)
Diffstat (limited to 'gallery_dl/extractor')
| mode | file | lines changed |
|------|------|---------------|
| -rw-r--r-- | gallery_dl/extractor/__init__.py | 3 |
| -rw-r--r-- | gallery_dl/extractor/ao3.py | 5 |
| -rw-r--r-- | gallery_dl/extractor/bellazon.py | 165 |
| -rw-r--r-- | gallery_dl/extractor/boosty.py | 2 |
| -rw-r--r-- | gallery_dl/extractor/comick.py | 32 |
| -rw-r--r-- | gallery_dl/extractor/common.py | 19 |
| -rw-r--r-- | gallery_dl/extractor/cyberfile.py | 125 |
| -rw-r--r-- | gallery_dl/extractor/danbooru.py | 5 |
| -rw-r--r-- | gallery_dl/extractor/facebook.py | 56 |
| -rw-r--r-- | gallery_dl/extractor/fansly.py | 188 |
| -rw-r--r-- | gallery_dl/extractor/imgbb.py | 253 |
| -rw-r--r-- | gallery_dl/extractor/simpcity.py | 145 |
| -rw-r--r-- | gallery_dl/extractor/tiktok.py | 3 |
| -rw-r--r-- | gallery_dl/extractor/tungsten.py | 11 |
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 65 |
15 files changed, 841 insertions, 236 deletions
```diff
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 574d1e2..b32fcd1 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -31,6 +31,7 @@ modules = [
     "batoto",
     "bbc",
     "behance",
+    "bellazon",
     "bilibili",
     "blogger",
     "bluesky",
@@ -44,6 +45,7 @@ modules = [
     "comick",
     "comicvine",
     "cyberdrop",
+    "cyberfile",
     "danbooru",
     "dankefuerslesen",
     "desktopography",
@@ -170,6 +172,7 @@ modules = [
     "senmanga",
     "sexcom",
     "shimmie2",
+    "simpcity",
     "simplyhentai",
     "sizebooru",
     "skeb",
```

```diff
diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
index 2652acb..60380c4 100644
--- a/gallery_dl/extractor/ao3.py
+++ b/gallery_dl/extractor/ao3.py
@@ -102,8 +102,11 @@ class Ao3Extractor(Extractor):
     def _pagination(self, path, needle='<li id="work_'):
         while True:
             page = self.request(self.root + path).text
+
             yield from text.extract_iter(page, needle, '"')
-            path = text.extr(page, '<a rel="next" href="', '"')
+
+            path = (text.extr(page, '<a rel="next" href="', '"') or
+                    text.extr(page, '<li class="next"><a href="', '"'))
             if not path:
                 return
             path = text.unescape(path)
```

```diff
diff --git a/gallery_dl/extractor/bellazon.py b/gallery_dl/extractor/bellazon.py
new file mode 100644
index 0000000..5c9b9cd
--- /dev/null
+++ b/gallery_dl/extractor/bellazon.py
@@ -0,0 +1,165 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.bellazon.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?bellazon\.com/main"
+
+
+class BellazonExtractor(Extractor):
+    """Base class for bellazon extractors"""
+    category = "bellazon"
+    root = "https://www.bellazon.com/main"
+    directory_fmt = ("{category}", "{thread[section]}",
+                     "{thread[title]} ({thread[id]})")
+    filename_fmt = "{post[id]}_{num:>02}_{id}.{extension}"
+    archive_fmt = "{post[id]}/{filename}"
+
+    def items(self):
+        extract_urls = text.re(r'<a ([^>]*?href="([^"]+)".*?)</a>').findall
+        native = f"{self.root}/"
+
+        for post in self.posts():
+            urls = extract_urls(post["content"])
+            data = {"post": post}
+            post["count"] = data["count"] = len(urls)
+
+            yield Message.Directory, data
+            for data["num"], (info, url) in enumerate(urls, 1):
+                url = text.unescape(url)
+                if url.startswith(native):
+                    if not (alt := text.extr(info, ' alt="', '"')) or (
+                            alt.startswith("post-") and "_thumb." in alt):
+                        name = url
+                    else:
+                        name = text.unescape(alt)
+                    dc = text.nameext_from_url(name, data.copy())
+                    dc["id"] = text.extr(info, 'data-fileid="', '"')
+                    if ext := text.extr(info, 'data-fileext="', '"'):
+                        dc["extension"] = ext
+                    yield Message.Url, url, dc
+                else:
+                    yield Message.Queue, url, data
+
+    def _pagination(self, base, pnum=None):
+        base = f"{self.root}{base}"
+
+        if pnum is None:
+            url = f"{base}/"
+            pnum = 1
+        else:
+            url = f"{base}/page/{pnum}/"
+            pnum = None
+
+        while True:
+            page = self.request(url).text
+
+            yield page
+
+            if pnum is None or ' rel="next" ' not in page or text.extr(
+                    page, " rel=\"next\" data-page='", "'") == str(pnum):
+                return
+            pnum += 1
+            url = f"{base}/page/{pnum}/"
+
+    def _parse_thread(self, page):
+        schema = self._extract_jsonld(page)
+        author = schema["author"]
+        stats = schema["interactionStatistic"]
+        url_t = schema["url"]
+        url_a = author["url"]
+
+        path = text.split_html(text.extr(
+            page, '<nav class="ipsBreadcrumb', "</nav>"))[2:-1]
+
+        thread = {
+            "url"  : url_t,
+            "path" : path,
+            "title": schema["headline"],
+            "views": stats[0]["userInteractionCount"],
+            "posts": stats[1]["userInteractionCount"],
+            "date" : text.parse_datetime(schema["datePublished"]),
+            "date_updated": text.parse_datetime(schema["dateModified"]),
+            "description" : text.unescape(schema["text"]),
+            "section"     : path[-2],
+            "author"      : author["name"],
+            "author_url"  : url_a,
+        }
+
+        thread["id"], _, thread["slug"] = \
+            url_t.rsplit("/", 2)[1].partition("-")
+        thread["author_id"], _, thread["author_slug"] = \
+            url_a.rsplit("/", 2)[1].partition("-")
+
+        return thread
+
+    def _parse_post(self, html):
+        extr = text.extract_from(html)
+
+        post = {
+            "id": extr('id="elComment_', '"'),
+            "author_url": extr(" href='", "'"),
+            "date": text.parse_datetime(extr("datetime='", "'")),
+            "content": extr("<!-- Post content -->", "\n\t\t</div>"),
+        }
+
+        if (pos := post["content"].find(">")) >= 0:
+            post["content"] = post["content"][pos+1:].strip()
+
+        post["author_id"], _, post["author_slug"] = \
+            post["author_url"].rsplit("/", 2)[1].partition("-")
+
+        return post
+
+
+class BellazonPostExtractor(BellazonExtractor):
+    subcategory = "post"
+    pattern = (rf"{BASE_PATTERN}(/topic/\d+-[\w-]+(?:/page/\d+)?)"
+               rf"/?#findComment-(\d+)")
+    example = "https://www.bellazon.com/main/topic/123-SLUG/#findComment-12345"
+
+    def posts(self):
+        path, post_id = self.groups
+        page = self.request(f"{self.root}{path}").text
+
+        pos = page.find(f'id="elComment_{post_id}')
+        if pos < 0:
+            raise exception.NotFoundError("post")
+        html = text.extract(page, "<article ", "</article>", pos-100)[0]
+
+        self.kwdict["thread"] = self._parse_thread(page)
+        return (self._parse_post(html),)
+
+
+class BellazonThreadExtractor(BellazonExtractor):
+    subcategory = "thread"
+    pattern = rf"{BASE_PATTERN}(/topic/\d+-[\w-]+)(?:/page/(\d+))?"
+    example = "https://www.bellazon.com/main/topic/123-SLUG/"
+
+    def posts(self):
+        for page in self._pagination(*self.groups):
+            if "thread" not in self.kwdict:
+                self.kwdict["thread"] = self._parse_thread(page)
+            for html in text.extract_iter(page, "<article ", "</article>"):
+                yield self._parse_post(html)
+
+
+class BellazonForumExtractor(BellazonExtractor):
+    subcategory = "forum"
+    pattern = rf"{BASE_PATTERN}(/forum/\d+-[\w-]+)(?:/page/(\d+))?"
+    example = "https://www.bellazon.com/main/forum/123-SLUG/"
+
+    def items(self):
+        data = {"_extractor": BellazonThreadExtractor}
+        for page in self._pagination(*self.groups):
+            for row in text.extract_iter(
+                    page, '<li data-ips-hook="topicRow"', "</"):
+                yield Message.Queue, text.extr(row, 'href="', '"'), data
```
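For orientation (not part of the upstream diff): the thread pattern above captures the `/topic/…` path plus an optional page number, and `posts()` passes those two groups straight to `_pagination()`. A standalone check with Python's `re`, with the patterns inlined:

```python
import re

# BellazonThreadExtractor's pattern, inlined so this runs on its own
BASE = r"(?:https?://)?(?:www\.)?bellazon\.com/main"
THREAD = re.compile(rf"{BASE}(/topic/\d+-[\w-]+)(?:/page/(\d+))?")

for url in (
    "https://www.bellazon.com/main/topic/123-some-slug/",
    "https://www.bellazon.com/main/topic/123-some-slug/page/4/",
):
    print(THREAD.match(url).groups())
# ('/topic/123-some-slug', None)
# ('/topic/123-some-slug', '4')
```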
```diff
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
index e0383bf..22f3259 100644
--- a/gallery_dl/extractor/boosty.py
+++ b/gallery_dl/extractor/boosty.py
@@ -281,7 +281,7 @@ class BoostyAPI():
         if not access_token:
             if auth := self.extractor.cookies.get("auth", domain=".boosty.to"):
                 access_token = text.extr(
-                    auth, "%22accessToken%22%3A%22", "%22")
+                    text.unquote(auth), '"accessToken":"', '"')
 
         if access_token:
             self.headers["Authorization"] = "Bearer " + access_token
```

```diff
diff --git a/gallery_dl/extractor/comick.py b/gallery_dl/extractor/comick.py
index a6aec38..c76694c 100644
--- a/gallery_dl/extractor/comick.py
+++ b/gallery_dl/extractor/comick.py
@@ -9,7 +9,7 @@
 """Extractors for https://comick.io/"""
 
 from .common import GalleryExtractor, ChapterExtractor, MangaExtractor, Message
-from .. import text
+from .. import text, exception
 from ..cache import memcache
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?comick\.io"
@@ -67,9 +67,35 @@ class ComickChapterExtractor(ComickBase, ChapterExtractor):
     def metadata(self, page):
         slug, chstr = self.groups
         manga = _manga_info(self, slug)
-        props = _chapter_info(self, manga, chstr)
-        ch = props["chapter"]
 
+        while True:
+            try:
+                props = _chapter_info(self, manga, chstr)
+            except exception.HttpError as exc:
+                if exc.response.status_code != 404:
+                    raise
+                if exc.response.headers.get(
+                        "Content-Type", "").startswith("text/html"):
+                    if locals().get("_retry_buildid"):
+                        raise
+                    self.log.debug("Updating Next.js build ID")
+                    _retry_buildid = True
+                    _manga_info.cache.clear()
+                    manga = _manga_info(self, slug)
+                    continue
+                if b'"notFound":true' in exc.response.content:
+                    raise exception.NotFoundError("chapter")
+                raise
+
+            if "__N_REDIRECT" in props:
+                path = props["__N_REDIRECT"]
+                self.log.debug("Following redirect to %s", path)
+                _, slug, chstr = path.rsplit("/", 2)
+                continue
+
+            ch = props["chapter"]
+            break
+
         self._images = ch["md_images"]
         if chapter := ch["chap"]:
```

```diff
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 568f435..01965f3 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -354,12 +354,11 @@ class Extractor():
             raise exception.AbortExtraction(
                 f"User input required ({prompt.strip(' :')})")
 
-    def _get_auth_info(self):
+    def _get_auth_info(self, password=None):
         """Return authentication information as (username, password) tuple"""
         username = self.config("username")
-        password = None
 
-        if username:
+        if username or password:
             password = self.config("password")
             if not password:
                 self._check_input_allowed("password")
@@ -667,12 +666,18 @@ class Extractor():
         return False
 
     def _extract_jsonld(self, page):
-        return util.json_loads(text.extr(
-            page, '<script type="application/ld+json">', "</script>"))
+        return util.json_loads(
+            text.extr(page, '<script type="application/ld+json">',
+                      "</script>") or
+            text.extr(page, "<script type='application/ld+json'>",
+                      "</script>"))
 
     def _extract_nextdata(self, page):
-        return util.json_loads(text.extr(
-            page, ' id="__NEXT_DATA__" type="application/json">', "</script>"))
+        return util.json_loads(
+            text.extr(page, ' id="__NEXT_DATA__" type="application/json">',
+                      "</script>") or
+            text.extr(page, " id='__NEXT_DATA__' type='application/json'>",
+                      "</script>"))
 
     def _cache(self, func, maxage, keyarg=None):
         # return cache.DatabaseCacheDecorator(func, maxage, keyarg)
```
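The `_extract_jsonld()` change above simply tries the double-quoted `<script>` attribute form first and falls back to the single-quoted one. A minimal standalone equivalent, with a toy stand-in for gallery_dl's `text.extr` (text between two markers, empty string on failure):

```python
import json

def extr(txt, begin, end, default=""):
    # toy stand-in for gallery_dl's text.extr
    try:
        first = txt.index(begin) + len(begin)
        return txt[first:txt.index(end, first)]
    except ValueError:
        return default

def extract_jsonld(page):
    return json.loads(
        extr(page, '<script type="application/ld+json">', "</script>") or
        extr(page, "<script type='application/ld+json'>", "</script>"))

page = "<script type='application/ld+json'>{\"headline\": \"T\"}</script>"
print(extract_jsonld(page))  # {'headline': 'T'}
```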
```diff
diff --git a/gallery_dl/extractor/cyberfile.py b/gallery_dl/extractor/cyberfile.py
new file mode 100644
index 0000000..2ea81d6
--- /dev/null
+++ b/gallery_dl/extractor/cyberfile.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://cyberfile.me/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?cyberfile\.me"
+
+
+class CyberfileExtractor(Extractor):
+    """Base class for cyberfile extractors"""
+    category = "cyberfile"
+    root = "https://cyberfile.me"
+
+    def request_api(self, endpoint, data):
+        url = f"{self.root}{endpoint}"
+        headers = {
+            "X-Requested-With": "XMLHttpRequest",
+            "Origin": self.root,
+        }
+        resp = self.request_json(
+            url, method="POST", headers=headers, data=data)
+
+        if "albumPasswordModel" in resp.get("javascript", ""):
+            url_pw = f"{self.root}/ajax/folder_password_process"
+            data_pw = {
+                "folderPassword": self._get_auth_info(password=True)[1],
+                "folderId": text.extr(
+                    resp["html"], '<input type="hidden" value="', '"'),
+                "submitme": "1",
+            }
+            resp = self.request_json(
+                url_pw, method="POST", headers=headers, data=data_pw)
+            if not resp.get("success"):
+                raise exception.AuthorizationError(f"'{resp.get('msg')}'")
+            resp = self.request_json(
+                url, method="POST", headers=headers, data=data)
+
+        return resp
+
+
+class CyberfileFolderExtractor(CyberfileExtractor):
+    subcategory = "folder"
+    pattern = rf"{BASE_PATTERN}/folder/([0-9a-f]+)"
+    example = "https://cyberfile.me/folder/0123456789abcdef/NAME"
+
+    def items(self):
+        folder_hash = self.groups[0]
+        url = f"{self.root}/folder/{folder_hash}"
+        folder_num = text.extr(self.request(url).text, "ages('folder', '", "'")
+
+        extract_urls = text.re(r'dtfullurl="([^"]+)').findall
+        perpage = 600
+
+        data = {
+            "pageType" : "folder",
+            "nodeId"   : folder_num,
+            "pageStart": 1,
+            "perPage"  : perpage,
+            "filterOrderBy": "",
+        }
+        resp = self.request_api("/account/ajax/load_files", data)
+
+        folder = {
+            "_extractor" : CyberfileFileExtractor,
+            "folder_hash": folder_hash,
+            "folder_num" : text.parse_int(folder_num),
+            "folder"     : resp["page_title"],
+        }
+
+        while True:
+            urls = extract_urls(resp["html"])
+            for url in urls:
+                yield Message.Queue, url, folder
+
+            if len(urls) < perpage:
+                return
+            data["pageStart"] += 1
+            resp = self.request_api("/account/ajax/load_files", data)
+
+
+class CyberfileFileExtractor(CyberfileExtractor):
+    subcategory = "file"
+    directory_fmt = ("{category}", "{uploader}", "{folder}")
+    pattern = rf"{BASE_PATTERN}/([a-zA-Z0-9]+)"
+    example = "https://cyberfile.me/AbCdE"
+
+    def items(self):
+        file_id = self.groups[0]
+        url = f"{self.root}/{file_id}"
+        file_num = text.extr(self.request(url).text, "owFileInformation(", ")")
+
+        data = {"u": file_num}
+        resp = self.request_api("/account/ajax/file_details", data)
+        extr = text.extract_from(resp["html"])
+        info = text.split_html(extr('class="text-section">', "</span>"))
+        folder = info[0] if len(info) > 1 else ""
+
+        file = {
+            "file_id" : file_id,
+            "file_num": text.parse_int(file_num),
+            "name"    : resp["page_title"],
+            "folder"  : folder,
+            "uploader": info[-1][2:].strip(),
+            "size"    : text.parse_bytes(text.remove_html(extr(
+                "Filesize:", "</tr>"))[:-1]),
+            "tags"    : text.split_html(extr(
+                "Keywords:", "</tr>")),
+            "date"    : text.parse_datetime(text.remove_html(extr(
+                "Uploaded:", "</tr>")), "%d/%m/%Y %H:%M:%S"),
+            "permissions": text.remove_html(extr(
+                "Permissions:", "</tr>")).split(" & "),
+        }
+
+        file["file_url"] = url = extr("openUrl('", "'")
+        text.nameext_from_url(file["name"] or url, file)
+        yield Message.Directory, file
+        yield Message.Url, url, file
```
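A sketch of the folder-password handshake `request_api()` performs, reduced to plain `requests`. The endpoints and field names are taken from the new file; the session, password, and folder id values are illustrative only (the real code pulls the password from gallery_dl's auth config and the folder id out of `resp["html"]`):

```python
import requests

ROOT = "https://cyberfile.me"
HEADERS = {"X-Requested-With": "XMLHttpRequest", "Origin": ROOT}

def request_api(session, endpoint, data, password="..."):
    url = ROOT + endpoint
    resp = session.post(url, headers=HEADERS, data=data).json()

    if "albumPasswordModel" in resp.get("javascript", ""):
        # folder is password-protected: unlock it once,
        # then repeat the original request
        session.post(ROOT + "/ajax/folder_password_process", headers=HEADERS,
                     data={"folderPassword": password,
                           "folderId": "...",  # parsed from resp["html"]
                           "submitme": "1"})
        resp = session.post(url, headers=HEADERS, data=data).json()
    return resp
```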
```diff
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 019410c..f8ad07a 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -102,7 +102,10 @@ class DanbooruExtractor(BaseExtractor):
                 post["extension"] = "webm"
 
             if url[0] == "/":
-                url = self.root + url
+                if url[1] == "/":
+                    url = "https:" + url
+                else:
+                    url = self.root + url
 
             post.update(data)
             yield Message.Directory, post
```

```diff
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
index f9ed1ab..bf24941 100644
--- a/gallery_dl/extractor/facebook.py
+++ b/gallery_dl/extractor/facebook.py
@@ -376,34 +376,6 @@ class FacebookExtractor(Extractor):
         return user
 
 
-class FacebookSetExtractor(FacebookExtractor):
-    """Base class for Facebook Set extractors"""
-    subcategory = "set"
-    pattern = (
-        BASE_PATTERN +
-        r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
-        r"[^/?#]*(?<!&setextract)$"
-        r"|([^/?#]+/posts/[^/?#]+)"
-        r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)"
-    )
-    example = "https://www.facebook.com/media/set/?set=SET_ID"
-
-    def items(self):
-        set_id = self.groups[0] or self.groups[3]
-        if path := self.groups[1]:
-            post_url = self.root + "/" + path
-            post_page = self.request(post_url).text
-            set_id = self.parse_post_page(post_page)["set_id"]
-
-        set_url = f"{self.root}/media/set/?set={set_id}"
-        set_page = self.request(set_url).text
-        set_data = self.parse_set_page(set_page)
-        if self.groups[2]:
-            set_data["first_photo_id"] = self.groups[2]
-
-        return self.extract_set(set_data)
-
-
 class FacebookPhotoExtractor(FacebookExtractor):
     """Base class for Facebook Photo extractors"""
     subcategory = "photo"
@@ -441,6 +413,34 @@ class FacebookPhotoExtractor(FacebookExtractor):
             yield Message.Url, comment_photo["url"], comment_photo
 
 
+class FacebookSetExtractor(FacebookExtractor):
+    """Base class for Facebook Set extractors"""
+    subcategory = "set"
+    pattern = (
+        BASE_PATTERN +
+        r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
+        r"[^/?#]*(?<!&setextract)$"
+        r"|([^/?#]+/posts/[^/?#]+)"
+        r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)"
+    )
+    example = "https://www.facebook.com/media/set/?set=SET_ID"
+
+    def items(self):
+        set_id = self.groups[0] or self.groups[3]
+        if path := self.groups[1]:
+            post_url = self.root + "/" + path
+            post_page = self.request(post_url).text
+            set_id = self.parse_post_page(post_page)["set_id"]
+
+        set_url = f"{self.root}/media/set/?set={set_id}"
+        set_page = self.request(set_url).text
+        set_data = self.parse_set_page(set_page)
+        if self.groups[2]:
+            set_data["first_photo_id"] = self.groups[2]
+
+        return self.extract_set(set_data)
+
+
 class FacebookVideoExtractor(FacebookExtractor):
     """Base class for Facebook Video extractors"""
     subcategory = "video"
```
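The danbooru change distinguishes protocol-relative URLs (`//host/path`) from root-relative ones (`/path`); a tiny standalone version of that branch:

```python
def absolutize(url, root="https://danbooru.donmai.us"):
    if url.startswith("//"):   # protocol-relative: add the scheme only
        return "https:" + url
    if url.startswith("/"):    # root-relative: prepend the site root
        return root + url
    return url

print(absolutize("//cdn.donmai.us/original/d3/4e/file.webm"))
print(absolutize("/data/sample.jpg"))
```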
```diff
diff --git a/gallery_dl/extractor/fansly.py b/gallery_dl/extractor/fansly.py
index 31d242f..8a6dbef 100644
--- a/gallery_dl/extractor/fansly.py
+++ b/gallery_dl/extractor/fansly.py
@@ -25,7 +25,11 @@ class FanslyExtractor(Extractor):
 
     def _init(self):
         self.api = FanslyAPI(self)
-        self.formats = self.config("format") or (303, 302, 1, 2, 4)
+
+        if fmts := self.config("formats"):
+            self.formats = set(fmts)
+        else:
+            self.formats = {1, 2, 3, 4, 302, 303}
 
     def items(self):
         for post in self.posts():
@@ -41,6 +45,19 @@ class FanslyExtractor(Extractor):
 
     def _extract_files(self, post):
         files = []
+
+        if "_extra" in post:
+            extra = post.pop("_extra", ())
+            media = {
+                media["id"]: media
+                for media in self.api.account_media(extra)
+            }
+            post["attachments"].extend(
+                media[mid]
+                for mid in extra
+                if mid in media
+            )
+
         for attachment in post.pop("attachments"):
             try:
                 self._extract_attachment(files, post, attachment)
@@ -54,19 +71,23 @@ class FanslyExtractor(Extractor):
 
     def _extract_attachment(self, files, post, attachment):
         media = attachment["media"]
-        variants = {
-            variant["type"]: variant
-            for variant in media.pop("variants", ())
-        }
-        variants[media["type"]] = media
 
-        for fmt in self.formats:
-            if fmt in variants and (variant := variants[fmt]).get("locations"):
-                break
-        else:
-            return self.log.warning(
-                "%s/%s: Requested format not available",
-                post["id"], attachment["id"])
+        variants = media.pop("variants") or []
+        if media.get("locations"):
+            variants.append(media)
+
+        formats = [
+            (type > 256, variant["width"], type, variant)
+            for variant in variants
+            if variant.get("locations") and
+            (type := variant["type"]) in self.formats
+        ]
+
+        try:
+            variant = max(formats)[-1]
+        except Exception:
+            return self.log.warning("%s/%s: No format available",
+                                    post["id"], attachment["id"])
 
         mime = variant["mimetype"]
         location = variant.pop("locations")[0]
@@ -78,7 +99,7 @@ class FanslyExtractor(Extractor):
 
         file = {
             **variant,
-            "format": fmt,
+            "format": variant["type"],
             "date": text.parse_timestamp(media["createdAt"]),
             "date_updated": text.parse_timestamp(media["updatedAt"]),
         }
@@ -86,12 +107,17 @@ class FanslyExtractor(Extractor):
 
         if "metadata" in location:  # manifest
             meta = location["metadata"]
-            file["type"] = "video"
+
+            try:
+                fallback = (media["locations"][0]["location"],)
+            except Exception:
+                fallback = ()
 
             files.append({
                 "file": file,
                 "url": f"ytdl:{location['location']}",
-                # "_fallback": (media["locations"][0]["location"],),
+                "_fallback": fallback,
                 "_ytdl_manifest":
                     "dash" if mime == "application/dash+xml" else "hls",
                 "_ytdl_manifest_cookies": (
@@ -161,17 +187,26 @@ class FanslyListsExtractor(FanslyExtractor):
 
 class FanslyCreatorPostsExtractor(FanslyExtractor):
     subcategory = "creator-posts"
-    pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts"
+    pattern = rf"{BASE_PATTERN}/([^/?#]+)/posts(?:/wall/(\d+))?"
     example = "https://fansly.com/CREATOR/posts"
 
     def posts(self):
-        creator = self.groups[0]
-        if creator.startswith("id:"):
-            account = self.api.account_by_id(creator[3:])
-        else:
-            account = self.api.account(creator)
-        wall_id = account["walls"][0]["id"]
-        return self.api.timeline_new(account["id"], wall_id)
+        creator, wall_id = self.groups
+        account = self.api.account(creator)
+        return self.api.timeline_new(
+            account["id"], wall_id or account["walls"][0]["id"])
+
+
+class FanslyCreatorMediaExtractor(FanslyExtractor):
+    subcategory = "creator-media"
+    pattern = rf"{BASE_PATTERN}/([^/?#]+)/media(?:/wall/(\d+))?"
+    example = "https://fansly.com/CREATOR/media"
+
+    def posts(self):
+        creator, wall_id = self.groups
+        account = self.api.account(creator)
+        return self.api.mediaoffers_location(
+            account["id"], wall_id or account["walls"][0]["id"])
 
 
 class FanslyAPI():
@@ -179,18 +214,24 @@ class FanslyAPI():
 
     def __init__(self, extractor):
         self.extractor = extractor
-
-        token = extractor.config("token")
-        if not token:
-            self.extractor.log.warning("No 'token' provided")
-
         self.headers = {
             "fansly-client-ts": None,
             "Origin"          : extractor.root,
-            "authorization"   : token,
         }
 
-    def account(self, username):
+        if token := extractor.config("token"):
+            self.headers["authorization"] = token
+            self.extractor.log.debug(
+                "Using authorization 'token' %.5s...", token)
+        else:
+            self.extractor.log.warning("No 'token' provided")
+
+    def account(self, creator):
+        if creator.startswith("id:"):
+            return self.account_by_id(creator[3:])
+        return self.account_by_username(creator)
+
+    def account_by_username(self, username):
         endpoint = "/v1/account"
         params = {"usernames": username}
         return self._call(endpoint, params)[0]
@@ -205,6 +246,11 @@ class FanslyAPI():
         params = {"ids": ",".join(map(str, account_ids))}
         return self._call(endpoint, params)
 
+    def account_media(self, media_ids):
+        endpoint = "/v1/account/media"
+        params = {"ids": ",".join(map(str, media_ids))}
+        return self._call(endpoint, params)
+
     def lists_account(self):
         endpoint = "/v1/lists/account"
         params = {"itemId": ""}
@@ -218,7 +264,21 @@ class FanslyAPI():
             "after"   : None,
             "sortMode": sort,
         }
-        return self._pagination(endpoint, params)
+        return self._pagination_list(endpoint, params)
+
+    def mediaoffers_location(self, account_id, wall_id):
+        endpoint = "/v1/mediaoffers/location"
+        params = {
+            "locationId": wall_id,
+            "locationType": "1002",
+            "accountId": account_id,
+            "mediaType": "",
+            "before": "",
+            "after" : "0",
+            "limit" : "30",
+            "offset": "0",
+        }
+        return self._pagination_media(endpoint, params)
 
     def post(self, post_id):
         endpoint = "/v1/post"
@@ -262,6 +322,7 @@ class FanslyAPI():
         for post in posts:
             post["account"] = accounts[post.pop("accountId")]
+            extra = None
 
             attachments = []
             for attachment in post["attachments"]:
                 cid = attachment["contentId"]
@@ -270,18 +331,35 @@ class FanslyAPI():
                 elif cid in bundles:
                     bundle = bundles[cid]["bundleContent"]
                     bundle.sort(key=lambda c: c["pos"])
-                    attachments.extend(
-                        media[m["accountMediaId"]]
-                        for m in bundle
-                        if m["accountMediaId"] in media
-                    )
+                    for c in bundle:
+                        mid = c["accountMediaId"]
+                        if mid in media:
+                            attachments.append(media[mid])
+                        else:
+                            if extra is None:
+                                post["_extra"] = extra = []
+                            extra.append(mid)
                 else:
                     self.extractor.log.warning(
                         "%s: Unhandled 'contentId' %s",
                         post["id"], cid)
             post["attachments"] = attachments
+
         return posts
 
+    def _update_media(self, items, response):
+        posts = {
+            post["id"]: post
+            for post in response["posts"]
+        }
+
+        response["posts"] = [
+            posts[item["correlationId"]]
+            for item in items
+        ]
+
+        return self._update_posts(response)
+
     def _update_items(self, items):
         ids = [item["id"] for item in items]
         accounts = {
@@ -304,15 +382,27 @@ class FanslyAPI():
         while True:
             response = self._call(endpoint, params)
 
-            if isinstance(response, list):
-                if not response:
-                    return
-                yield from self._update_items(response)
-                params["after"] = response[-1]["sortId"]
-
-            else:
-                if not response.get("posts"):
-                    return
-                posts = self._update_posts(response)
-                yield from posts
-                params["before"] = min(p["id"] for p in posts)
+            if not response.get("posts"):
+                return
+            posts = self._update_posts(response)
+            yield from posts
+            params["before"] = min(p["id"] for p in posts)
+
+    def _pagination_list(self, endpoint, params):
+        while True:
+            response = self._call(endpoint, params)
+
+            if not response:
+                return
+            yield from self._update_items(response)
+            params["after"] = response[-1]["sortId"]
+
+    def _pagination_media(self, endpoint, params):
+        while True:
+            response = self._call(endpoint, params)
+
+            data = response["data"]
+            if not data:
+                return
+            yield from self._update_media(data, response["aggregationData"])
+            params["before"] = data[-1]["id"]
```
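Two details of the fansly rewrite worth spelling out: the option is now read from `formats` (plural) and treated as a set, and the best variant is chosen by ordering tuples, so manifest formats (type > 256) beat static ones and larger widths break ties within each group. A toy run of that selection logic with made-up variant dicts:

```python
variants = [
    {"type": 1,   "width": 640,  "locations": ["https://example.org/a"]},
    {"type": 2,   "width": 1280, "locations": ["https://example.org/b"]},
    {"type": 303, "width": 720,  "locations": ["https://example.org/m3u8"]},
    {"type": 4,   "width": 1920, "locations": []},  # no location: skipped
]
wanted = {1, 2, 3, 4, 302, 303}

candidates = [
    (v["type"] > 256, v["width"], v["type"], v)
    for v in variants
    if v.get("locations") and v["type"] in wanted
]
best = max(candidates)[-1]  # manifests first, then width, then type
print(best["type"])         # 303
```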
```diff
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index e6abdeb..d9a63c7 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -16,63 +16,42 @@ from ..cache import cache
 class ImgbbExtractor(Extractor):
     """Base class for imgbb extractors"""
     category = "imgbb"
-    directory_fmt = ("{category}", "{user}")
-    filename_fmt = "{title} {id}.{extension}"
-    archive_fmt = "{id}"
-    root = "https://imgbb.com"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.page_url = self.sort = None
+    directory_fmt = ("{category}", "{user[name]:?//}{user[id]:? (/)/}",
+                     "{album[title]} ({album[id]})")
+    filename_fmt = "{title} ({id}).{extension}"
+    archive_fmt = "{user[id]} {id}"
+    cookies_domain = ".imgbb.com"
+    cookies_names = ("PHPSESSID", "LID")
+    root = "https://ibb.co"
 
     def items(self):
         self.login()
-        url = self.page_url
-        params = {"sort": self.sort}
-        while True:
-            response = self.request(url, params=params, allow_redirects=False)
-            if response.status_code < 300:
-                break
-            url = response.headers["location"]
-            if url.startswith(self.root):
-                raise exception.NotFoundError(self.subcategory)
-
-        page = response.text
-        data = self.metadata(page)
-        first = True
-
-        for img in self.images(page):
-            image = {
-                "id"       : img["url_viewer"].rpartition("/")[2],
-                "user"     : img["user"]["username"] if "user" in img else "",
-                "title"    : text.unescape(img["title"]),
-                "url"      : img["image"]["url"],
-                "extension": img["image"]["extension"],
-                "size"     : text.parse_int(img["image"]["size"]),
-                "width"    : text.parse_int(img["width"]),
-                "height"   : text.parse_int(img["height"]),
-            }
-            image.update(data)
-            if first:
-                first = False
-                yield Message.Directory, data
-            yield Message.Url, image["url"], image
+
+        for image in self.posts():
+            url = image["url"]
+            text.nameext_from_url(url, image)
+            yield Message.Directory, image
+            yield Message.Url, url, image
 
     def login(self):
+        if self.cookies_check(self.cookies_names):
+            return
+
         username, password = self._get_auth_info()
         if username:
-            self.cookies_update(self._login_impl(username, password))
+            return self.cookies_update(self._login_impl(username, password))
 
     @cache(maxage=365*86400, keyarg=1)
     def _login_impl(self, username, password):
         self.log.info("Logging in as %s", username)
 
-        url = self.root + "/login"
+        url = "https://imgbb.com/login"
         page = self.request(url).text
-        token = text.extr(page, 'PF.obj.config.auth_token="', '"')
+        token = text.extr(page, 'name="auth_token" value="', '"')
 
-        headers = {"Referer": url}
+        headers = {
+            "Referer": url,
+        }
         data = {
             "auth_token"   : token,
             "login-subject": username,
@@ -84,27 +63,26 @@ class ImgbbExtractor(Extractor):
             raise exception.AuthenticationError()
         return self.cookies
 
-    def _extract_resource(self, page):
-        return util.json_loads(text.extr(
-            page, "CHV.obj.resource=", "};") + "}")
-
-    def _extract_user(self, page):
-        return self._extract_resource(page).get("user") or {}
-
-    def _pagination(self, page, endpoint, params):
-        data = None
+    def _pagination(self, page, url, params):
         seek, pos = text.extract(page, 'data-seek="', '"')
         tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
-
-        params["action"] = "list"
-        params["list"] = "images"
-        params["sort"] = self.sort
-        params["seek"] = seek
-        params["page"] = 2
-        params["auth_token"] = tokn
+        resc, pos = text.extract(page, "CHV.obj.resource=", "};", pos)
+        self.kwdict["user"] = util.json_loads(resc + "}").get("user")
 
+        data = None
         while True:
-            for img in text.extract_iter(page, "data-object='", "'"):
-                yield util.json_loads(text.unquote(img))
+            for obj in text.extract_iter(page, "data-object='", "'"):
+                post = util.json_loads(text.unquote(obj))
+                image = post["image"]
+                image["filename"], image["name"] = \
+                    image["name"], image["filename"]
+                image["id"] = post["id_encoded"]
+                image["title"] = post["title"]
+                image["width"] = text.parse_int(post["width"])
+                image["height"] = text.parse_int(post["height"])
+                image["size"] = text.parse_int(image["size"])
+                yield image
+
             if data:
                 if not data["seekEnd"] or params["seek"] == data["seekEnd"]:
                     return
@@ -112,105 +90,114 @@ class ImgbbExtractor(Extractor):
                 params["page"] += 1
             elif not seek or 'class="pagination-next"' not in page:
                 return
-            data = self.request_json(endpoint, method="POST", data=params)
+            else:
+                params["action"] = "list"
+                params["page"] = 2
+                params["seek"] = seek
+                params["auth_token"] = tokn
+
+            headers = {
+                "Accept": "application/json, text/javascript, */*; q=0.01",
+                "X-Requested-With": "XMLHttpRequest",
+                "Origin": self.root,
+                "Sec-Fetch-Dest": "empty",
+                "Sec-Fetch-Mode": "cors",
+                "Sec-Fetch-Site": "same-origin",
+            }
+
+            data = self.request_json(
+                url, method="POST", headers=headers, data=params)
             page = data["html"]
 
 
 class ImgbbAlbumExtractor(ImgbbExtractor):
-    """Extractor for albums on imgbb.com"""
+    """Extractor for imgbb albums"""
     subcategory = "album"
-    directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
     pattern = r"(?:https?://)?ibb\.co/album/([^/?#]+)/?(?:\?([^#]+))?"
     example = "https://ibb.co/album/ID"
 
-    def __init__(self, match):
-        ImgbbExtractor.__init__(self, match)
-        self.album_name = None
-        self.album_id = match[1]
-        self.sort = text.parse_query(match[2]).get("sort", "date_desc")
-        self.page_url = "https://ibb.co/album/" + self.album_id
-
-    def metadata(self, page):
-        album = text.extr(page, '"og:title" content="', '"')
-        user = self._extract_user(page)
-        return {
-            "album_id"   : self.album_id,
-            "album_name" : text.unescape(album),
-            "user"       : user.get("username") or "",
-            "user_id"    : user.get("id") or "",
-            "displayname": user.get("name") or "",
-        }
-
-    def images(self, page):
-        url = text.extr(page, '"og:url" content="', '"')
-        album_id = url.rpartition("/")[2].partition("?")[0]
-
-        return self._pagination(page, "https://ibb.co/json", {
-            "from"                  : "album",
-            "albumid"               : album_id,
-            "params_hidden[list]"   : "images",
-            "params_hidden[from]"   : "album",
-            "params_hidden[albumid]": album_id,
-        })
-
-
-class ImgbbUserExtractor(ImgbbExtractor):
-    """Extractor for user profiles in imgbb.com"""
-    subcategory = "user"
-    pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"
-    example = "https://USER.imgbb.com"
+    def posts(self):
+        album_id, qs = self.groups
+        url = f"{self.root}/album/{album_id}"
+        params = text.parse_query(qs)
+        page = self.request(url, params=params).text
+        extr = text.extract_from(page)
 
-    def __init__(self, match):
-        ImgbbExtractor.__init__(self, match)
-        self.user = match[1]
-        self.sort = text.parse_query(match[2]).get("sort", "date_desc")
-        self.page_url = f"https://{self.user}.imgbb.com/"
-
-    def metadata(self, page):
-        user = self._extract_user(page)
-        return {
-            "user"       : user.get("username") or self.user,
-            "user_id"    : user.get("id") or "",
-            "displayname": user.get("name") or "",
+        self.kwdict["album"] = album = {
+            "url": extr(
+                'property="og:url" content="', '"'),
+            "title": text.unescape(extr(
+                'property="og:title" content="', '"')),
+            "description": text.unescape(extr(
+                'property="og:description" content="', '"')),
+            "id": extr(
+                'data-text="album-name" href="https://ibb.co/album/', '"'),
+            "count": text.parse_int(extr(
+                'data-text="image-count">', "<")),
         }
 
-    def images(self, page):
-        user = text.extr(page, '.obj.resource={"id":"', '"')
-        return self._pagination(page, self.page_url + "json", {
-            "from"                 : "user",
-            "userid"               : user,
-            "params_hidden[userid]": user,
-            "params_hidden[from]"  : "user",
-        })
+        url = f"{self.root}/json"
+        params["pathname"] = f"/album/{album['id']}"
+        return self._pagination(page, url, params)
 
 
 class ImgbbImageExtractor(ImgbbExtractor):
     subcategory = "image"
-    pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?#]+)"
+    pattern = r"(?:https?://)?ibb\.co/([^/?#]+)"
     example = "https://ibb.co/ID"
 
-    def __init__(self, match):
-        ImgbbExtractor.__init__(self, match)
-        self.image_id = match[1]
-
-    def items(self):
-        url = "https://ibb.co/" + self.image_id
+    def posts(self):
+        url = f"{self.root}/{self.groups[0]}"
         page = self.request(url).text
         extr = text.extract_from(page)
-        user = self._extract_user(page)
 
         image = {
-            "id"    : self.image_id,
+            "id"    : extr('property="og:url" content="https://ibb.co/', '"'),
            "title" : text.unescape(extr(
                 '"og:title" content="', ' hosted at ImgBB"')),
             "url"   : extr('"og:image" content="', '"'),
             "width" : text.parse_int(extr('"og:image:width" content="', '"')),
             "height": text.parse_int(extr('"og:image:height" content="', '"')),
-            "user"       : user.get("username") or "",
-            "user_id"    : user.get("id") or "",
-            "displayname": user.get("name") or "",
+            "album" : extr("Added to <a", "</a>"),
+            "date"  : text.parse_datetime(extr(
+                '<span title="', '"'), "%Y-%m-%d %H:%M:%S"),
+            "user"  : util.json_loads(extr(
+                "CHV.obj.resource=", "};") + "}").get("user"),
         }
-        image["extension"] = text.ext_from_url(image["url"])
 
-        yield Message.Directory, image
-        yield Message.Url, image["url"], image
+        if album := image["album"]:
+            image["album"] = {
+                "id"   : text.extr(album, "/album/", '"'),
+                "title": text.unescape(album.rpartition(">")[2]),
+            }
+        else:
+            image["album"] = None
+
+        return (image,)
+
+
+class ImgbbUserExtractor(ImgbbExtractor):
+    """Extractor for imgbb user profiles"""
+    subcategory = "user"
+    directory_fmt = ("{category}", "{user[name]} ({user[id]})")
+    pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?"
+    example = "https://USER.imgbb.com"
+
+    def posts(self):
+        user, qs = self.groups
+        url = f"https://{user}.imgbb.com/"
+        params = text.parse_query(qs)
+        response = self.request(url, params=params, allow_redirects=False)
+
+        if response.status_code < 300:
+            params["pathname"] = "/"
+            return self._pagination(response.text, f"{url}json", params)
+
+        if response.status_code == 301:
+            raise exception.NotFoundError("user")
+        redirect = f"HTTP redirect to {response.headers.get('Location')}"
+        if response.status_code == 302:
+            raise exception.AuthRequired(
+                ("username & password", "authenticated cookies"),
+                "profile", redirect)
+        raise exception.AbortExtraction(redirect)
```
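If I read the new `directory_fmt` right, the `:?prefix/suffix/` specifier makes each user segment optional, so directories degrade gracefully when user metadata is absent. Assuming gallery_dl's `formatter.parse()` API (not part of this diff) behaves as documented, this can be checked directly:

```python
from gallery_dl import formatter  # assumes gallery_dl is installed

fmt = formatter.parse("{user[name]:?//}{user[id]:? (/)/}")
print(fmt.format_map({"user": {"name": "alice", "id": "42"}}))  # alice (42)
print(fmt.format_map({"user": {"name": "", "id": ""}}))         # (empty)
```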
```diff
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
new file mode 100644
index 0000000..8cc7e38
--- /dev/null
+++ b/gallery_dl/extractor/simpcity.py
@@ -0,0 +1,145 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2025 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://simpcity.cr/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
+
+
+class SimpcityExtractor(Extractor):
+    """Base class for simpcity extractors"""
+    category = "simpcity"
+    root = "https://simpcity.cr"
+
+    def items(self):
+        extract_urls = text.re(r' href="([^"]+)').findall
+
+        for post in self.posts():
+            urls = extract_urls(post["content"])
+            data = {"post": post}
+            post["count"] = data["count"] = len(urls)
+            for data["num"], url in enumerate(urls, 1):
+                yield Message.Queue, url, data
+
+    def request_page(self, url):
+        try:
+            return self.request(url).text
+        except exception.HttpError as exc:
+            if exc.status == 403 and b">Log in<" in exc.response.content:
+                msg = text.extr(exc.response.text, "blockMessage--error", "</")
+                raise exception.AuthRequired(
+                    "'authenticated cookies'", None,
+                    msg.rpartition(">")[2].strip())
+            raise
+
+    def _pagination(self, base, pnum=None):
+        base = f"{self.root}{base}"
+
+        if pnum is None:
+            url = base
+            pnum = 1
+        else:
+            url = f"{base}/page-{pnum}"
+            pnum = None
+
+        while True:
+            page = self.request_page(url)
+
+            yield page
+
+            if pnum is None or "pageNav-jump--next" not in page:
+                return
+            pnum += 1
+            url = f"{base}/page-{pnum}"
+
+    def _parse_thread(self, page):
+        schema = self._extract_jsonld(page)["mainEntity"]
+        author = schema["author"]
+        stats = schema["interactionStatistic"]
+        url_t = schema["url"]
+        url_a = author["url"]
+
+        thread = {
+            "id"   : url_t[url_t.rfind(".")+1:-1],
+            "url"  : url_t,
+            "title": schema["headline"],
+            "date" : text.parse_datetime(schema["datePublished"]),
+            "views": stats[0]["userInteractionCount"],
+            "posts": stats[1]["userInteractionCount"],
+            "tags" : (schema["keywords"].split(", ")
+                      if "keywords" in schema else ()),
+            "section"   : schema["articleSection"],
+            "author"    : author["name"],
+            "author_id" : url_a[url_a.rfind(".")+1:-1],
+            "author_url": url_a,
+        }
+
+        return thread
+
+    def _parse_post(self, html):
+        extr = text.extract_from(html)
+
+        post = {
+            "author": extr('data-author="', '"'),
+            "id": extr('data-content="post-', '"'),
+            "author_url": extr('itemprop="url" content="', '"'),
+            "date": text.parse_datetime(extr('datetime="', '"')),
+            "content": extr('<div itemprop="text">', "\t\t</div>").strip(),
+        }
+
+        url_a = post["author_url"]
+        post["author_id"] = url_a[url_a.rfind(".")+1:-1]
+
+        return post
+
+
+class SimpcityPostExtractor(SimpcityExtractor):
+    subcategory = "post"
+    pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
+    example = "https://simpcity.cr/threads/TITLE.12345/post-54321"
+
+    def posts(self):
+        post_id = self.groups[0]
+        url = f"{self.root}/posts/{post_id}/"
+        page = self.request_page(url)
+
+        pos = page.find(f'data-content="post-{post_id}"')
+        if pos < 0:
+            raise exception.NotFoundError("post")
+        html = text.extract(page, "<article ", "</article>", pos-200)[0]
+
+        self.kwdict["thread"] = self._parse_thread(page)
+        return (self._parse_post(html),)
+
+
+class SimpcityThreadExtractor(SimpcityExtractor):
+    subcategory = "thread"
+    pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
+    example = "https://simpcity.cr/threads/TITLE.12345/"
+
+    def posts(self):
+        for page in self._pagination(*self.groups):
+            if "thread" not in self.kwdict:
+                self.kwdict["thread"] = self._parse_thread(page)
+            for html in text.extract_iter(page, "<article ", "</article>"):
+                yield self._parse_post(html)
+
+
+class SimpcityForumExtractor(SimpcityExtractor):
+    subcategory = "forum"
+    pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
+    example = "https://simpcity.cr/forums/TITLE.123/"
+
+    def items(self):
+        data = {"_extractor": SimpcityThreadExtractor}
+        for page in self._pagination(*self.groups):
+            for path in text.extract_iter(page, ' uix-href="', '"'):
+                yield Message.Queue, f"{self.root}{text.unquote(path)}", data
```

```diff
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 973bd22..f450806 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -42,8 +42,7 @@ class TiktokExtractor(Extractor):
                 continue
 
             post = video_detail["itemInfo"]["itemStruct"]
-            author = post["author"]
-            post["user"] = author["uniqueId"]
+            post["user"] = (a := post.get("author")) and a["uniqueId"] or ""
             post["date"] = text.parse_timestamp(post["createTime"])
 
             original_title = title = post["desc"]
```

```diff
diff --git a/gallery_dl/extractor/tungsten.py b/gallery_dl/extractor/tungsten.py
index 20d5a59..45836a9 100644
--- a/gallery_dl/extractor/tungsten.py
+++ b/gallery_dl/extractor/tungsten.py
@@ -87,14 +87,17 @@ class TungstenModelExtractor(TungstenExtractor):
 
 class TungstenUserExtractor(TungstenExtractor):
     subcategory = "user"
-    pattern = rf"{BASE_PATTERN}/user/([^/?#]+)"
-    example = "https://tungsten.run/user/USER/posts"
+    pattern = rf"{BASE_PATTERN}/user/([^/?#]+)(?:/posts)?/?(?:\?([^#]+))?"
+    example = "https://tungsten.run/user/USER"
 
     def posts(self):
-        url = f"{self.root}/user/{self.groups[0]}"
+        user, qs = self.groups
+        url = f"{self.root}/user/{user}"
         page = self.request(url).text
         uuid_user = text.extr(page, '"user":{"uuid":"', '"')
 
         url = f"https://api.tungsten.run/v1/users/{uuid_user}/posts"
-        params = {"sort": "top_all_time"}
+        params = text.parse_query(qs)
+        params.setdefault("sort", "top_all_time")
+        self.kwdict["search_tags"] = params.get("tag", "")
         return self._pagination(url, params)
```
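The TikTok one-liner above guards against posts that lack an `author` object; outside gallery_dl it behaves like this (note the `and`/`or` chain would also yield `""` for a falsy `uniqueId`, which is the intended fallback anyway):

```python
posts = [
    {"author": {"uniqueId": "someuser"}},
    {},  # e.g. a detail response without author metadata
]
for post in posts:
    user = (a := post.get("author")) and a["uniqueId"] or ""
    print(repr(user))
# 'someuser'
# ''
```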
```diff
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index c919cb8..ed3cfae 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1447,20 +1447,33 @@ class TwitterAPI():
             "includePromotedContent": False,
         }
         return self._pagination_tweets(
-            endpoint, variables, ("bookmark_timeline_v2", "timeline"), False)
+            endpoint, variables, ("bookmark_timeline_v2", "timeline"),
+            stop_tweets=128)
 
     def search_timeline(self, query, product="Latest"):
         endpoint = "/graphql/4fpceYZ6-YQCx_JSl_Cn_A/SearchTimeline"
         variables = {
             "rawQuery": query,
-            "count": 100,
+            "count": self.extractor.config("search-limit", 20),
             "querySource": "typed_query",
             "product": product,
             "withGrokTranslatedBio": False,
         }
+
+        if self.extractor.config("search-pagination") in (
+                "max_id", "maxid", "id"):
+            update_variables = self._update_variables_search
+        else:
+            update_variables = None
+
+        stop_tweets = self.extractor.config("search-stop")
+        if stop_tweets is None or stop_tweets == "auto":
+            stop_tweets = 3 if update_variables is None else 0
+
         return self._pagination_tweets(
             endpoint, variables,
-            ("search_by_raw_query", "search_timeline", "timeline"))
+            ("search_by_raw_query", "search_timeline", "timeline"),
+            stop_tweets=stop_tweets, update_variables=update_variables)
 
     def community_query(self, community_id):
         endpoint = "/graphql/2W09l7nD7ZbxGQHXvfB22w/CommunityQuery"
@@ -1870,11 +1883,12 @@ class TwitterAPI():
                 params["cursor"] = extr._update_cursor(cursor)
 
     def _pagination_tweets(self, endpoint, variables,
-                           path=None, stop_tweets=True,
+                           path=None, stop_tweets=0, update_variables=None,
                            features=None, field_toggles=None):
         extr = self.extractor
         original_retweets = (extr.retweets == "original")
         pinned_tweet = extr.pinned
+        stop_tweets_max = stop_tweets
 
         params = {"variables": None}
         if cursor := extr._init_cursor():
@@ -2067,11 +2081,24 @@ class TwitterAPI():
                                   tweet.get("rest_id"))
                     continue
 
-            if stop_tweets and not tweet:
-                return extr._update_cursor(None)
+            if tweet:
+                stop_tweets = stop_tweets_max
+                last_tweet = tweet
+            else:
+                if stop_tweets <= 0:
+                    return extr._update_cursor(None)
+                self.log.debug(
+                    "No Tweet results (%s/%s)",
+                    stop_tweets_max - stop_tweets + 1, stop_tweets_max)
+                stop_tweets -= 1
+
             if not cursor or cursor == variables.get("cursor"):
                 return extr._update_cursor(None)
-            variables["cursor"] = extr._update_cursor(cursor)
+
+            if update_variables is None:
+                variables["cursor"] = extr._update_cursor(cursor)
+            else:
+                variables = update_variables(variables, cursor, last_tweet)
 
     def _pagination_users(self, endpoint, variables, path=None):
         extr = self.extractor
@@ -2140,6 +2167,30 @@ class TwitterAPI():
 
                 self.log.debug("Skipping %s ('%s')", tweet_id, text)
 
+    def _update_variables_search(self, variables, cursor, tweet):
+        try:
+            tweet_id = tweet.get("id_str") or tweet["legacy"]["id_str"]
+            max_id = f"max_id:{int(tweet_id)-1}"
+
+            query, n = text.re(r"\bmax_id:\d+").subn(
+                max_id, variables["rawQuery"])
+            if n:
+                variables["rawQuery"] = query
+            else:
+                variables["rawQuery"] = f"{query} {max_id}"
+
+            if prefix := self.extractor._cursor_prefix:
+                self.extractor._cursor_prefix = \
+                    f"{prefix.partition('_')[0]}_{tweet_id}/"
+            variables["cursor"] = None
+        except Exception as exc:
+            self.extractor.log.debug(
+                "Failed to update 'max_id' search query (%s: %s). Falling "
+                "back to 'cursor' pagination", exc.__class__.__name__, exc)
+            variables["cursor"] = self.extractor._update_cursor(cursor)
+
+        return variables
+
 
 @cache(maxage=365*86400, keyarg=1)
 def _login_impl(extr, username, password):
```
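The twitter changes add three config options (`search-limit`, `search-pagination`, `search-stop`), and the new `_update_variables_search()` paginates by rewriting the search query itself rather than following cursors: it appends or replaces a `max_id:` operator so the next request's window starts just below the oldest Tweet seen. A standalone sketch of that query rewriting (cursor bookkeeping omitted):

```python
import re

def advance_query(raw_query, last_tweet_id):
    # move the search window past the oldest Tweet seen so far
    max_id = f"max_id:{int(last_tweet_id) - 1}"
    query, n = re.subn(r"\bmax_id:\d+", max_id, raw_query)
    return query if n else f"{query} {max_id}"

q = "from:user filter:media"
q = advance_query(q, "1700000000000000000")
print(q)  # from:user filter:media max_id:1699999999999999999
q = advance_query(q, "1690000000000000000")
print(q)  # from:user filter:media max_id:1689999999999999999
```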
