| author | 2025-04-15 05:25:37 -0400 |
|---|---|
| committer | 2025-04-15 05:25:37 -0400 |
| commit | b830dc03b3b7c9dd119648e1be9c1145d56e096c (patch) |
| tree | e9d03b6b4ab93990243c0038c20ada2464fa4072 /gallery_dl |
| parent | 662e5ac868a5c1a3e7bc95b37054b3a0ca4db74f (diff) |
New upstream version 1.29.4 (upstream/1.29.4)
Diffstat (limited to 'gallery_dl')
24 files changed, 213 insertions, 141 deletions
```diff
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 9d653b3..7a20dc2 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -200,6 +200,7 @@ class YoutubeDLDownloader(DownloaderBase):
             return None
 
         info_dict = {
+            "extractor": "",
             "id"       : video_id,
             "title"    : video_id,
             "formats"  : fmts,
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index c9ccb7d..600d231 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -18,19 +18,23 @@ class CheveretoExtractor(BaseExtractor):
     directory_fmt = ("{category}", "{user}", "{album}",)
     archive_fmt = "{id}"
 
-    def __init__(self, match):
-        BaseExtractor.__init__(self, match)
-        self.path = match.group(match.lastindex)
+    def _init(self):
+        self.path = self.groups[-1]
 
     def _pagination(self, url):
-        while url:
+        while True:
             page = self.request(url).text
 
             for item in text.extract_iter(
                     page, '<div class="list-item-image ', 'image-container'):
-                yield text.extr(item, '<a href="', '"')
+                yield text.urljoin(self.root, text.extr(
+                    item, '<a href="', '"'))
 
-            url = text.extr(page, '<a data-pagination="next" href="', '" ><')
+            url = text.extr(page, 'data-pagination="next" href="', '"')
+            if not url:
+                return
+            if url[0] == "/":
+                url = self.root + url
 
 
 BASE_PATTERN = CheveretoExtractor.update({
@@ -42,6 +46,10 @@ BASE_PATTERN = CheveretoExtractor.update({
         "root": "https://img.kiwi",
         "pattern": r"img\.kiwi",
     },
+    "imagepond": {
+        "root": "https://imagepond.net",
+        "pattern": r"imagepond\.net",
+    },
 })
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 741800c..06c31b9 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -282,10 +282,11 @@ class DanbooruPoolExtractor(DanbooruExtractor):
     example = "https://danbooru.donmai.us/pools/12345"
 
     def metadata(self):
-        return self._collection_metadata(self.groups[-1], "pool")
+        self.pool_id = self.groups[-1]
+        return self._collection_metadata(self.pool_id, "pool")
 
     def posts(self):
-        return self._collection_posts(self.groups[-1], "pool")
+        return self._collection_posts(self.pool_id, "pool")
 
 
 class DanbooruFavgroupExtractor(DanbooruExtractor):
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 3a862c1..378c7ec 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -687,7 +687,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
         for folder in folders:
             if match(folder["name"]):
                 return folder
-            elif folder["has_subfolders"]:
+            elif folder.get("has_subfolders"):
                 for subfolder in folder["subfolders"]:
                     if match(subfolder["name"]):
                         return subfolder
@@ -695,7 +695,7 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
         for folder in folders:
             if folder["folderid"] == uuid:
                 return folder
-            elif folder["has_subfolders"]:
+            elif folder.get("has_subfolders"):
                 for subfolder in folder["subfolders"]:
                     if subfolder["folderid"] == uuid:
                         return subfolder
diff --git a/gallery_dl/extractor/discord.py b/gallery_dl/extractor/discord.py
index 6a5fcc9..ac21fec 100644
--- a/gallery_dl/extractor/discord.py
+++ b/gallery_dl/extractor/discord.py
@@ -49,7 +49,10 @@ class DiscordExtractor(Extractor):
                     text_content.append(field.get("name", ""))
                     text_content.append(field.get("value", ""))
 
-            text_content.append(embed.get("footer", {}).get("text", ""))
+            try:
+                text_content.append(embed["footer"]["text"])
+            except Exception:
+                pass
 
         if message.get("poll"):
             text_content.append(message["poll"]["question"]["text"])
@@ -224,10 +227,12 @@ class DiscordExtractor(Extractor):
         return self.server_metadata
 
     def build_server_and_channels(self, server_id):
-        server = self.api.get_server(server_id)
-        self.parse_server(server)
+        self.parse_server(self.api.get_server(server_id))
 
-        for channel in self.api.get_server_channels(server_id):
+        for channel in sorted(
+            self.api.get_server_channels(server_id),
+            key=lambda ch: ch["type"] != 4
+        ):
             self.parse_channel(channel)
@@ -353,7 +358,8 @@ class DiscordAPI():
                 "limit": MESSAGES_BATCH,
                 "before": before
             })
-            before = messages[-1]["id"]
+            if messages:
+                before = messages[-1]["id"]
             return messages
 
         return self._pagination(_method, MESSAGES_BATCH)
diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py
index 94444ff..e41f6f6 100644
--- a/gallery_dl/extractor/everia.py
+++ b/gallery_dl/extractor/everia.py
@@ -52,7 +52,7 @@ class EveriaPostExtractor(EveriaExtractor):
     def items(self):
         url = self.root + self.groups[0]
         page = self.request(url).text
-        content = text.extr(page, 'itemprop="text">', "</div>")
+        content = text.extr(page, 'itemprop="text">', "<h3")
         urls = re.findall(r'img.*?src="([^"]+)', content)
 
         data = {
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 37c776e..eb07739 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -114,11 +114,12 @@ class GelbooruBase():
             md5 = post["md5"]
             path = "/images/{}/{}/{}.webm".format(md5[0:2], md5[2:4], md5)
             post["_fallback"] = GelbooruBase._video_fallback(path)
-            url = "https://img3.gelbooru.com" + path
+            url = "https://img4.gelbooru.com" + path
 
         return url
 
     @staticmethod
     def _video_fallback(path):
+        yield "https://img3.gelbooru.com" + path
         yield "https://img2.gelbooru.com" + path
         yield "https://img1.gelbooru.com" + path
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
index 9ab1411..1317ce9 100644
--- a/gallery_dl/extractor/hentai2read.py
+++ b/gallery_dl/extractor/hentai2read.py
@@ -25,26 +25,30 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
     pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?#]+/([^/?#]+))"
     example = "https://hentai2read.com/TITLE/1/"
 
-    def __init__(self, match):
-        self.chapter = match.group(2)
-        ChapterExtractor.__init__(self, match)
-
     def metadata(self, page):
         title, pos = text.extract(page, "<title>", "</title>")
         manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
         chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
-        chapter, sep, minor = self.chapter.partition(".")
-        match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
+        chapter, sep, minor = self.groups[1].partition(".")
+
+        match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.*))? - "
                          r"([^:]+): (.+) . Page 1 ", title)
+        if match:
+            manga, type, author, _, title = match.groups()
+        else:
+            self.log.warning("Failed to extract 'manga', 'type', 'author', "
+                             "and 'title' metadata")
+            manga = type = author = title = ""
+
         return {
-            "manga": match.group(1),
+            "manga": manga,
             "manga_id": text.parse_int(manga_id),
             "chapter": text.parse_int(chapter),
             "chapter_minor": sep + minor,
             "chapter_id": text.parse_int(chapter_id),
-            "type": match.group(2),
-            "author": match.group(3),
-            "title": match.group(5),
+            "type": type,
+            "author": author,
+            "title": title,
             "lang": "en",
             "language": "English",
         }
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index aa26408..432a7ad 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -29,6 +29,7 @@ class InstagramExtractor(Extractor):
     root = "https://www.instagram.com"
     cookies_domain = ".instagram.com"
     cookies_names = ("sessionid",)
+    useragent = util.USERAGENT_CHROME
     request_interval = (6.0, 12.0)
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index 65717b4..abbdfd5 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -29,9 +29,11 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
     example = "https://issuu.com/issuu/docs/TITLE/"
 
     def metadata(self, page):
-        pos = page.rindex('id="initial-data"')
-        data = util.json_loads(text.unescape(text.rextract(
-            page, '<script data-json="', '"', pos)[0]))
+
+        data = text.extr(
+            page, '{\\"documentTextVersion\\":', ']\\n"])</script>')
+        data = util.json_loads(text.unescape(
+            '{"":' + data.replace('\\"', '"')))
 
         doc = data["initialDocumentData"]["document"]
         doc["date"] = text.parse_datetime(
@@ -39,7 +41,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
 
         self._cnt = text.parse_int(doc["pageCount"])
         self._tpl = "https://{}/{}-{}/jpg/page_{{}}.jpg".format(
-            data["config"]["hosts"]["image"],
+            "image.isu.pub",  # data["config"]["hosts"]["image"],
             doc["revisionId"], doc["publicationId"],
         )
@@ -66,9 +68,8 @@ class IssuuUserExtractor(IssuuBase, Extractor):
             url = base + "/" + str(pnum) if pnum > 1 else base
 
             try:
                 html = self.request(url).text
-                data = util.json_loads(text.unescape(text.extr(
-                    html, '</main></div><script data-json="', '" id="')))
-                docs = data["docs"]
+                data = text.extr(html, '\\"docs\\":', '}]\\n"]')
+                docs = util.json_loads(data.replace('\\"', '"'))
             except Exception as exc:
                 self.log.debug("", exc_info=exc)
                 return
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 860e771..de7d040 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -123,6 +123,9 @@ class KemonopartyExtractor(Extractor):
                        g(post) for g in generators):
                 url = file["path"]
 
+                if "\\" in url:
+                    file["path"] = url = url.replace("\\", "/")
+
                 match = find_hash(url)
                 if match:
                     file["hash"] = hash = match.group(1)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8a4905d..e8050b3 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -15,7 +15,7 @@ from datetime import datetime, timedelta
 import itertools
 import hashlib
 
-BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net"
+BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?ph?ixiv\.net"
 USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)"
 
@@ -531,7 +531,7 @@ class PixivMeExtractor(PixivExtractor):
 class PixivWorkExtractor(PixivExtractor):
     """Extractor for a single pixiv work/illustration"""
     subcategory = "work"
-    pattern = (r"(?:https?://)?(?:(?:www\.|touch\.)?pixiv\.net"
+    pattern = (r"(?:https?://)?(?:(?:www\.|touch\.)?ph?ixiv\.net"
                r"/(?:(?:en/)?artworks/"
                r"|member_illust\.php\?(?:[^&]+&)*illust_id=)(\d+)"
                r"|(?:i(?:\d+\.pixiv|\.pximg)\.net"
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index c0374eb..2f2daca 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -85,7 +85,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
         replacements = re.findall(
             r"l = l\.replace\(/([^/]+)/g, [\"']([^\"']*)", page)
 
-        for block in page.split(" pth = '")[1:]:
+        for block in page.split("\t\tpht = '")[1:]:
             pth = text.extr(block, "", "'")
 
             for needle, repl in re.findall(
@@ -129,7 +129,7 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
 
 
 def baeu(url, root="", root_blogspot="https://2.bp.blogspot.com"):
-    """https://readcomiconline.li/Scripts/rguard.min.js"""
+    """https://readcomiconline.li/Scripts/rguard.min.js?v=1.5.4"""
     if not root:
         root = root_blogspot
diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py
index 3b8d344..411a71a 100644
--- a/gallery_dl/extractor/rule34xyz.py
+++ b/gallery_dl/extractor/rule34xyz.py
@@ -23,10 +23,18 @@ class Rule34xyzExtractor(BooruExtractor):
     per_page = 60
 
     TAG_TYPES = {
-        0: "general",
-        1: "copyright",
-        2: "character",
-        3: "artist",
+        None: "general",
+        0   : "general",
+        1   : "general",
+        2   : "copyright",
+        4   : "character",
+        8   : "artist",
+    }
+    FORMATS = {
+        "10" : "pic.jpg",
+        "100": "mov.mp4",
+        "101": "mov720.mp4",
+        "102": "mov480.mp4",
     }
 
     def _init(self):
@@ -36,49 +44,49 @@ class Rule34xyzExtractor(BooruExtractor):
             formats = formats.split(",")
             self.formats = formats
         else:
-            self.formats = ("10", "40", "41", "2")
+            self.formats = ("100", "101", "102", "10")
 
     def _file_url(self, post):
-        post["files"] = files = {
-            str(link["type"]): link["url"]
-            for link in post.pop("imageLinks")
-        }
+        files = post["files"]
 
         for fmt in self.formats:
             if fmt in files:
+                extension = self.FORMATS.get(fmt)
                 break
         else:
-            fmt = "2"
             self.log.warning("%s: Requested format not available",
                              post["id"])
+            fmt = next(iter(files))
 
-        post["file_url"] = url = files[fmt]
+        post_id = post["id"]
+        root = self.root_cdn if files[fmt][0] else self.root
+        post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
+            root, post_id // 1000, post_id, post_id, extension)
         post["format_id"] = fmt
-        post["format"] = url.rsplit(".", 2)[1]
+        post["format"] = extension.partition(".")[0]
+
         return url
 
     def _prepare(self, post):
-        post.pop("filesPreview", None)
-        post.pop("tagsWithType", None)
+        post.pop("files", None)
         post["date"] = text.parse_datetime(
-            post["created"][:19], "%Y-%m-%dT%H:%M:%S")
+            post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+        post["filename"], _, post["format"] = post["filename"].rpartition(".")
+        if "tags" in post:
+            post["tags"] = [t["value"] for t in post["tags"]]
 
     def _tags(self, post, _):
-        if post.get("tagsWithType") is None:
+        if "tags" not in post:
             post.update(self._fetch_post(post["id"]))
 
         tags = collections.defaultdict(list)
-        tagslist = []
-        for tag in post["tagsWithType"]:
-            value = tag["value"]
-            tagslist.append(value)
-            tags[tag["type"]].append(value)
+        for tag in post["tags"]:
+            tags[tag["type"]].append(tag["value"])
 
         types = self.TAG_TYPES
         for type, values in tags.items():
             post["tags_" + types[type]] = values
-        post["tags"] = tagslist
 
     def _fetch_post(self, post_id):
-        url = "{}/api/post/{}".format(self.root, post_id)
+        url = "{}/api/v2/post/{}".format(self.root, post_id)
         return self.request(url).json()
 
     def _pagination(self, endpoint, params=None):
@@ -86,22 +94,22 @@ class Rule34xyzExtractor(BooruExtractor):
         if params is None:
             params = {}
 
-        params["IncludeLinks"] = "true"
-        params["IncludeTags"] = "true"
-        params["OrderBy"] = "0"
         params["Skip"] = self.page_start * self.per_page
-        params["Take"] = self.per_page
-        params["DisableTotal"] = "true"
+        params["take"] = self.per_page
+        params["CountTotal"] = False
+        params["IncludeLinks"] = True
+        params["OrderBy"] = 0
 
         threshold = self.per_page
 
         while True:
-            data = self.request(url, params=params).json()
+            data = self.request(url, method="POST", json=params).json()
 
             yield from data["items"]
 
             if len(data["items"]) < threshold:
                 return
-            params["Skip"] += params["Take"]
+            params["Skip"] += self.per_page
+            params["cursor"] = data["cursor"]
 
 
 class Rule34xyzPostExtractor(Rule34xyzExtractor):
@@ -125,9 +133,8 @@ class Rule34xyzPlaylistExtractor(Rule34xyzExtractor):
         return {"playlist_id": self.groups[0]}
 
     def posts(self):
-        endpoint = "/playlist-item"
-        params = {"PlaylistId": self.groups[0]}
-        return self._pagination(endpoint, params)
+        endpoint = "/v2/post/search/playlist/" + self.groups[0]
+        return self._pagination(endpoint)
 
 
 class Rule34xyzTagExtractor(Rule34xyzExtractor):
@@ -138,10 +145,11 @@ class Rule34xyzTagExtractor(Rule34xyzExtractor):
     example = "https://rule34.xyz/TAG"
 
     def metadata(self):
-        self.tags = text.unquote(self.groups[0]).replace("_", " ")
-        return {"search_tags": self.tags}
+        self.tags = text.unquote(text.unquote(
+            self.groups[0]).replace("_", " ")).split("|")
+        return {"search_tags": ", ".join(self.tags)}
 
     def posts(self):
-        endpoint = "/post/search"
-        params = {"Tag": self.tags}
+        endpoint = "/v2/post/search/root"
+        params = {"includeTags": self.tags}
         return self._pagination(endpoint, params)
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 8d1fcde..6f2114e 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -17,7 +17,7 @@ import re
 BASE_PATTERN = (
     r"(?:tumblr:(?:https?://)?([^/]+)|"
     r"(?:https?://)?"
-    r"(?:www\.tumblr\.com/(?:blog/(?:view/)?)?([\w-]+)|"
+    r"(?:(?:www\.)?tumblr\.com/(?:blog/(?:view/)?)?([\w-]+)|"
    r"([\w-]+\.tumblr\.com)))"
 )
 
@@ -357,7 +357,7 @@ class TumblrLikesExtractor(TumblrExtractor):
 class TumblrSearchExtractor(TumblrExtractor):
     """Extractor for a Tumblr search"""
     subcategory = "search"
-    pattern = (BASE_PATTERN + r"/search/([^/?#]+)"
+    pattern = (r"(?:https?://)?(?:www\.)?tumblr\.com/search/([^/?#]+)"
               r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")
     example = "https://www.tumblr.com/search/QUERY"
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 008ae6e..8ff32af 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -12,13 +12,15 @@ from .common import GalleryExtractor, Extractor, Message
 from .. import exception, text, util
 
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/(([^/?#]+)"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com"
+LANG_PATTERN = BASE_PATTERN + r"/(([^/?#]+)"
 
 
 class WebtoonsBase():
     category = "webtoons"
     root = "https://www.webtoons.com"
     cookies_domain = ".webtoons.com"
+    request_interval = (0.5, 1.5)
 
     def setup_agegate_cookies(self):
         self.cookies_update({
@@ -34,7 +36,7 @@ class WebtoonsBase():
         response = Extractor.request(self, url, **kwargs)
         if response.history and "/ageGate" in response.url:
             raise exception.StopExtraction(
-                "HTTP redirect to age gate check ('%s')", response.request.url)
+                "HTTP redirect to age gate check ('%s')", response.url)
         return response
 
@@ -44,47 +46,19 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
     directory_fmt = ("{category}", "{comic}")
     filename_fmt = "{episode_no}-{num:>02}.{extension}"
     archive_fmt = "{title_no}_{episode_no}_{num}"
-    pattern = (BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)/(?:[^/?#]+))"
-               r"/viewer(?:\?([^#'\"]+))")
+    pattern = (LANG_PATTERN + r"/([^/?#]+)/([^/?#]+)/[^/?#]+)"
+               r"/viewer\?([^#'\"]+)")
     example = ("https://www.webtoons.com/en/GENRE/TITLE/NAME/viewer"
                "?title_no=123&episode_no=12345")
-    test = (
-        (("https://www.webtoons.com/en/comedy/safely-endangered"
-          "/ep-572-earth/viewer?title_no=352&episode_no=572"), {
-            "url": "55bec5d7c42aba19e3d0d56db25fdf0b0b13be38",
-            "content": ("1748c7e82b6db910fa179f6dc7c4281b0f680fa7",
-                        "42055e44659f6ffc410b3fb6557346dfbb993df3",
-                        "49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9"),
-            "count": 5,
-        }),
-        (("https://www.webtoons.com/en/challenge/punderworld"
-          "/happy-earth-day-/viewer?title_no=312584&episode_no=40"), {
-            "exception": exception.NotFoundError,
-            "keyword": {
-                "comic": "punderworld",
-                "description": str,
-                "episode": "36",
-                "episode_no": "40",
-                "genre": "challenge",
-                "title": r"re:^Punderworld - .+",
-                "title_no": "312584",
-            },
-        }),
-    )
-
-    def __init__(self, match):
-        self.path, self.lang, self.genre, self.comic, self.query = \
-            match.groups()
-
-        url = "{}/{}/viewer?{}".format(self.root, self.path, self.query)
-        GalleryExtractor.__init__(self, match, url)
 
     def _init(self):
         self.setup_agegate_cookies()
 
-        params = text.parse_query(self.query)
+        path, self.lang, self.genre, self.comic, query = self.groups
+        params = text.parse_query(query)
         self.title_no = params.get("title_no")
         self.episode_no = params.get("episode_no")
+        self.gallery_url = "{}/{}/viewer?{}".format(self.root, path, query)
 
     def metadata(self, page):
         extr = text.extract_from(page)
@@ -124,32 +98,49 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
             "language" : util.code_to_language(self.lang),
         }
 
-    @staticmethod
-    def images(page):
-        return [
-            (url.replace("://webtoon-phinf.", "://swebtoon-phinf."), None)
-            for url in text.extract_iter(
-                page, 'class="_images" data-url="', '"')
-        ]
+    def images(self, page):
+        quality = self.config("quality")
+        if quality is None or quality == "original":
+            quality = {"jpg": False, "jpeg": False, "webp": False}
+        elif not quality:
+            quality = None
+        elif isinstance(quality, str):
+            quality = {"jpg": quality, "jpeg": quality}
+        elif isinstance(quality, int):
+            quality = "q" + str(quality)
+            quality = {"jpg": quality, "jpeg": quality}
+        elif not isinstance(quality, dict):
+            quality = None
+
+        results = []
+        for url in text.extract_iter(
+                page, 'class="_images" data-url="', '"'):
+
+            if quality is not None:
+                path, _, query = url.rpartition("?")
+                type = quality.get(path.rpartition(".")[2].lower())
+                if type is False:
+                    url = path
+                elif type:
+                    url = "{}?type={}".format(path, type)
+
+            url = url.replace("://webtoon-phinf.", "://swebtoon-phinf.")
+            results.append((url, None))
+        return results
 
 
 class WebtoonsComicExtractor(WebtoonsBase, Extractor):
     """Extractor for an entire comic on webtoons.com"""
     subcategory = "comic"
     categorytransfer = True
-    pattern = (BASE_PATTERN + r"/([^/?#]+)/([^/?#]+))"
-               r"/list(?:\?([^#]+))")
+    pattern = LANG_PATTERN + r"/([^/?#]+)/([^/?#]+))/list\?([^#]+)"
     example = "https://www.webtoons.com/en/GENRE/TITLE/list?title_no=123"
 
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.path, self.lang, self.genre, self.comic, self.query = \
-            match.groups()
-
     def _init(self):
         self.setup_agegate_cookies()
 
-        params = text.parse_query(self.query)
+        self.path, self.lang, self.genre, self.comic, query = self.groups
+        params = text.parse_query(query)
         self.title_no = params.get("title_no")
         self.page_no = text.parse_int(params.get("page"), 1)
 
@@ -164,7 +155,7 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
             path = "/{}/list?title_no={}&page={}".format(
                 self.path, self.title_no, self.page_no)
 
-            if page and path not in page:
+            if page is not None and path not in page:
                 return
 
             response = self.request(self.root + path)
@@ -182,11 +173,47 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
 
             self.page_no += 1
 
-    @staticmethod
-    def get_episode_urls(page):
+    def get_episode_urls(self, page):
         """Extract and return all episode urls in 'page'"""
         page = text.extr(page, 'id="_listUl"', '</ul>')
         return [
             match.group(0)
             for match in WebtoonsEpisodeExtractor.pattern.finditer(page)
         ]
+
+
+class WebtoonsArtistExtractor(WebtoonsBase, Extractor):
+    """Extractor for webtoons.com artists"""
+    subcategory = "artist"
+    pattern = BASE_PATTERN + r"/p/community/([^/?#]+)/u/([^/?#]+)"
+    example = "https://www.webtoons.com/p/community/LANG/u/ARTIST"
+
+    def items(self):
+        self.setup_agegate_cookies()
+
+        for comic in self.comics():
+            comic["_extractor"] = WebtoonsComicExtractor
+            comic_url = self.root + comic["extra"]["episodeListPath"]
+            yield Message.Queue, comic_url, comic
+
+    def comics(self):
+        lang, artist = self.groups
+        language = util.code_to_language(lang).upper()
+
+        url = "{}/p/community/{}/u/{}".format(
+            self.root, lang, artist)
+        page = self.request(url).text
+        creator_id = text.extr(page, '\\"creatorId\\":\\"', '\\')
+
+        url = "{}/p/community/api/v1/creator/{}/titles".format(
+            self.root, creator_id)
+        params = {
+            "language": language,
+            "nextSize": "50",
+        }
+        headers = {
+            "language": language,
+        }
+        data = self.request(url, params=params, headers=headers).json()
+
+        return data["result"]["titles"]
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index ac1400e..0ad73c0 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -74,7 +74,6 @@ class ZerochanExtractor(BooruExtractor):
         extr = text.extract_from(page)
         data = {
             "id"      : text.parse_int(entry_id),
-            "author"  : jsonld["author"]["name"],
             "file_url": jsonld["contentUrl"],
             "date"    : text.parse_datetime(jsonld["datePublished"]),
             "width"   : text.parse_int(jsonld["width"][:-3]),
@@ -88,6 +87,11 @@ class ZerochanExtractor(BooruExtractor):
                 'id="source-url"', '</p>').rpartition("</s>")[2])),
         }
 
+        try:
+            data["author"] = jsonld["author"]["name"]
+        except Exception:
+            data["author"] = ""
+
         html = data["tags"]
         tags = data["tags"] = []
         for tag in html.split("<li class=")[1:]:
diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py
index 05b12b4..20454b4 100644
--- a/gallery_dl/extractor/zzup.py
+++ b/gallery_dl/extractor/zzup.py
@@ -16,7 +16,7 @@ class ZzupGalleryExtractor(GalleryExtractor):
     filename_fmt = "{num:>03}.{extension}"
     archive_fmt = "{slug}_{num}"
     root = "https://zzup.com"
-    pattern = (r"(?:https?://)?(up\.|www\.)?zzup\.com(/(?:viewalbum|content)"
+    pattern = (r"(?:https?://)?(up\.|w+\.)?zzup\.com(/(?:viewalbum|content)"
               r"/[\w=]+/([^/?#]+)/[\w=]+)/(?:index|page-\d+)\.html")
     example = "https://zzup.com/content/xyz=/12345_TITLE/123=/index.html"
diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py
index e662c34..6affc3e 100644
--- a/gallery_dl/formatter.py
+++ b/gallery_dl/formatter.py
@@ -495,6 +495,8 @@ _CONVERSIONS = {
     "s": str,
     "r": repr,
     "a": ascii,
+    "i": int,
+    "f": float,
 }
 _FORMAT_SPECIFIERS = {
     "?": _parse_optional,
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 21e1aa0..54cf126 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -269,7 +269,7 @@ class PathFormat():
         try:
             for fmt in self.directory_formatters:
                 segment = fmt(kwdict).strip()
-                if strip and segment != "..":
+                if strip and segment not in {".", ".."}:
                     # remove trailing dots and spaces (#647)
                     segment = segment.rstrip(strip)
                 if segment:
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index 3ef9fbc..fbb3fb8 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -108,6 +108,7 @@ class MetadataPP(PostProcessor):
         self.omode = options.get("open", omode)
         self.encoding = options.get("encoding", "utf-8")
         self.skip = options.get("skip", False)
+        self.meta_path = options.get("metadata-path")
 
     def run(self, pathfmt):
         archive = self.archive
@@ -120,6 +121,9 @@ class MetadataPP(PostProcessor):
         directory = self._directory(pathfmt)
         path = directory + self._filename(pathfmt)
 
+        if self.meta_path is not None:
+            pathfmt.kwdict[self.meta_path] = path
+
         if self.skip and os.path.exists(path):
             return
 
@@ -180,7 +184,10 @@ class MetadataPP(PostProcessor):
             pathfmt.directory_formatters = self._directory_formatters
             pathfmt.directory_conditions = ()
             segments = pathfmt.build_directory(pathfmt.kwdict)
-            directory = pathfmt.clean_path(os.sep.join(segments) + os.sep)
+            if segments:
+                directory = pathfmt.clean_path(os.sep.join(segments) + os.sep)
+            else:
+                directory = "." + os.sep
             return os.path.join(self._base(pathfmt), directory)
         finally:
             pathfmt.directory_conditions = conditions
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 3a32b39..c1bfc20 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -156,12 +156,7 @@ class UgoiraPP(PostProcessor):
             return self.log.debug("", exc_info=exc)
 
         if self.convert(pathfmt, tempdir):
-            if self.delete:
-                pathfmt.delete = True
-            elif pathfmt.extension != "zip":
-                self.log.info(pathfmt.filename)
-                pathfmt.set_extension("zip")
-                pathfmt.build_path()
+            pathfmt.delete = self.delete
 
     def convert_from_files(self, pathfmt):
         if not self._convert_files:
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 76e6517..eabd4ab 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -700,6 +700,9 @@ EXECUTABLE = getattr(sys, "frozen", False)
 USERAGENT = "gallery-dl/" + version.__version__
 USERAGENT_FIREFOX = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:{}.0) "
                      "Gecko/20100101 Firefox/{}.0").format(_ff_ver, _ff_ver)
+USERAGENT_CHROME = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 "
+                    "Safari/537.36")
 SPECIAL_EXTRACTORS = {"oauth", "recursive", "generic"}
 GLOBALS = {
     "contains" : contains,
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 43b234d..87169e2 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,5 +6,5 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.29.3"
+__version__ = "1.29.4"
 __variant__ = None
```
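
The `discord.py` change to `build_server_and_channels` relies on a small sorting trick: the key function maps category channels (Discord channel type 4) to `False` and everything else to `True`. Since `False < True` and Python's sort is stable, categories are processed first while the relative order inside each group is preserved. A minimal sketch with made-up channel data:

```python
# Sketch of the sort key used in the discord.py hunk; the channel dicts
# below are illustrative, not real API responses.
channels = [
    {"id": "100", "type": 0},  # text channel
    {"id": "200", "type": 4},  # category
    {"id": "300", "type": 2},  # voice channel
    {"id": "400", "type": 4},  # category
]

# type 4 maps to False and sorts first; the stable sort keeps the
# original order within each group.
ordered = sorted(channels, key=lambda ch: ch["type"] != 4)
print([ch["id"] for ch in ordered])  # ['200', '400', '100', '300']
```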
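
In the `gelbooru.py` hunk, `img4.gelbooru.com` becomes the primary video host and `_video_fallback` now also yields `img3` ahead of the older mirrors. Because `post["_fallback"]` is a generator, the alternative hosts are only produced if the primary URL fails. A rough sketch of the consumption pattern; the `try_download` loop is an illustrative stand-in for gallery-dl's downloader:

```python
def _video_fallback(path):
    # Mirrors are yielded lazily, newest first, as in the hunk above.
    yield "https://img3.gelbooru.com" + path
    yield "https://img2.gelbooru.com" + path
    yield "https://img1.gelbooru.com" + path

def try_download(url, fallback):
    # Illustrative stand-in: attempt the primary URL, then each mirror.
    for candidate in (url, *fallback):
        print("trying", candidate)

path = "/images/ab/cd/abcd1234.webm"  # placeholder md5-based path
try_download("https://img4.gelbooru.com" + path, _video_fallback(path))
```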
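
The `issuu.py` rewrite stops looking for a `data-json` script tag and instead pulls a fragment of backslash-escaped JSON straight out of the page; prepending `'{"":'` turns the extracted tail (which starts just after a key) back into a parseable object. A self-contained sketch of the unescaping trick, with an illustrative fragment in place of real page content:

```python
import json

# The page embeds JSON with escaped quotes; extraction starts right after
# the '{\"documentTextVersion\":' marker, so '{"":' re-wraps the remainder
# into a valid object. The fragment below is made up for demonstration.
fragment = ('\\"abc\\",\\"initialDocumentData\\":'
            '{\\"document\\":{\\"pageCount\\":\\"12\\"}}}')
data = json.loads('{"":' + fragment.replace('\\"', '"'))
print(data["initialDocumentData"]["document"]["pageCount"])  # 12
```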
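
The rewritten `_file_url` in `rule34xyz.py` no longer takes file URLs from the API response; it rebuilds them from the post id, sharded by `post_id // 1000`, with the new `FORMATS` table mapping a format id to a file suffix. A worked example with an illustrative post id:

```python
# Worked example of the URL layout from the rule34xyz.py hunk.
FORMATS = {
    "10" : "pic.jpg",
    "100": "mov.mp4",
    "101": "mov720.mp4",
    "102": "mov480.mp4",
}

root = "https://rule34.xyz"  # or the CDN root, depending on files[fmt][0]
post_id = 123456             # illustrative
extension = FORMATS["100"]

url = "{}/posts/{}/{}/{}.{}".format(
    root, post_id // 1000, post_id, post_id, extension)
print(url)  # https://rule34.xyz/posts/123/123456/123456.mov.mp4

# "format" becomes the part of the suffix before its first dot:
print(extension.partition(".")[0])  # mov
```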
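
The new `images()` in `webtoons.py` first normalizes the `quality` option into a per-extension mapping: `False` strips the `?type=...` query entirely, a string or int applies a single `type` value to jpg/jpeg, and a dict passes through unchanged. Pulled out as a standalone function for clarity, with the same branch order as the hunk:

```python
def normalize_quality(quality):
    # Mirrors the branch order of the images() hunk above.
    if quality is None or quality == "original":
        return {"jpg": False, "jpeg": False, "webp": False}  # strip ?type=...
    if not quality:
        return None                          # leave URLs unmodified
    if isinstance(quality, str):
        return {"jpg": quality, "jpeg": quality}
    if isinstance(quality, int):
        q = "q" + str(quality)
        return {"jpg": q, "jpeg": q}
    if isinstance(quality, dict):
        return quality
    return None

print(normalize_quality(90))          # {'jpg': 'q90', 'jpeg': 'q90'}
print(normalize_quality("original"))  # {'jpg': False, 'jpeg': False, 'webp': False}
```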
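
`formatter.py` gains two conversion characters, `i` and `f`. Conversions in gallery-dl format strings are applied with `!` after a field name, so something like `{episode_no!i}` should now coerce a string value to an integer before formatting. A minimal sketch of the lookup; the `convert` helper is illustrative, not gallery-dl's internal API:

```python
_CONVERSIONS = {
    "s": str,
    "r": repr,
    "a": ascii,
    "i": int,    # new: "{value!i}" -> int(value)
    "f": float,  # new: "{value!f}" -> float(value)
}

def convert(value, conversion):
    # Illustrative helper: look up the conversion character and apply it.
    return _CONVERSIONS[conversion](value)

print(convert("0042", "i"))  # 42
print(convert("2.50", "f"))  # 2.5
```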
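
The `path.py` fix widens an existing guard: directory segments are normally stripped of trailing dots and spaces (#647), but a segment that is exactly `.` would be stripped down to an empty string and silently vanish from the built path, just as `..` would have before the earlier fix. A quick demonstration:

```python
strip = ". "  # trailing characters gallery-dl removes from segments (#647)

for segment in ("photos.", ".", ".."):
    if strip and segment not in {".", ".."}:  # the widened guard
        segment = segment.rstrip(strip)
    print(repr(segment))
# 'photos'  - trailing dot removed, as before
# '.'       - kept; the old code reduced it to ''
# '..'      - kept; already special-cased previously
```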
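
The metadata post processor's new `metadata-path` option stores the path of the written metadata file in the metadata dict under a key of your choosing, so later steps can reference it in their own format strings. A sketch of the effect reduced to plain dicts; the key name `json_path` and the paths are made up:

```python
# What the metadata.py hunk does, with stand-ins for gallery-dl internals.
options = {"metadata-path": "json_path"}   # post processor configuration
meta_path = options.get("metadata-path")   # -> self.meta_path

kwdict = {"title": "example"}              # stands in for pathfmt.kwdict
path = "/downloads/example.json"           # metadata file about to be written

if meta_path is not None:
    kwdict[meta_path] = path               # usable later as {json_path}

print(kwdict["json_path"])  # /downloads/example.json
```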
