diff options
Diffstat (limited to 'gallery_dl')
28 files changed, 764 insertions, 215 deletions
diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index ee00bf7..f18cc47 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -980,6 +980,7 @@ def _is_path(value): def _parse_browser_specification( browser, profile=None, keyring=None, container=None): + browser = browser.lower() if browser not in SUPPORTED_BROWSERS: raise ValueError("unsupported browser '{}'".format(browser)) if keyring and keyring not in SUPPORTED_KEYRINGS: diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 444075c..f26f6a9 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -38,6 +38,7 @@ modules = [ "exhentai", "fallenangels", "fanbox", + "fanleaks", "fantia", "fapello", "fapachi", @@ -135,6 +136,7 @@ modules = [ "speakerdeck", "subscribestar", "tapas", + "tcbscans", "telegraph", "toyhouse", "tsumino", diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index cf332ac..6da6175 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.behance.net/""" +"""Extractors for https://www.behance.net/""" from .common import Extractor, Message from .. import text @@ -17,6 +17,7 @@ class BehanceExtractor(Extractor): """Base class for behance extractors""" category = "behance" root = "https://www.behance.net" + request_interval = (2.0, 4.0) def items(self): for gallery in self.galleries(): diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 882c2b3..8283fbc 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -56,8 +56,12 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): files = album["files"] except Exception as exc: self.log.debug("%s: %s", exc.__class__.__name__, exc) + self.log.debug("Falling back to lolisafe API") self.root = root.replace("://", "://app.", 1) files, data = LolisafeAlbumExtractor.fetch_album(self, album_id) + # fix file URLs (bunkr..ru -> bunkr.ru) (#3481) + for file in files: + file["file"] = file["file"].replace("bunkr..", "bunkr.", 1) else: for file in files: file["file"] = file["cdn"] + "/" + file["name"] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 4352aa7..ad766da 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -327,6 +327,7 @@ class Extractor(): except Exception as exc: self.log.warning("cookies: %s", exc) else: + self.log.debug("Loading cookies from '%s'", cookies) self._cookiefile = cookiefile elif isinstance(cookies, (list, tuple)): diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index ef17176..4c93604 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -101,8 +101,8 @@ class DanbooruExtractor(BaseExtractor): if self.extended_metadata: template = ( - "{}/posts/{}.json" - "?only=artist_commentary,children,notes,parent" + "{}/posts/{}.json?only=artist_commentary,children,notes," + "parent,uploader" ) resp = self.request(template.format(self.root, post["id"])) post.update(resp.json()) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index aa78cfb..aeb2d0a 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -987,13 +987,9 @@ class DeviantartScrapsExtractor(DeviantartExtractor): _warning = True def deviations(self): - eclipse_api = DeviantartEclipseAPI(self) - if self._warning: - DeviantartScrapsExtractor._warning = False - if not self._check_cookies(self.cookienames): - self.log.warning( - "No session cookies set: Unable to fetch mature scraps.") + self.login() + eclipse_api = DeviantartEclipseAPI(self) for obj in eclipse_api.gallery_scraps(self.user, self.offset): deviation = obj["deviation"] deviation_uuid = eclipse_api.deviation_extended_fetch( @@ -1004,6 +1000,17 @@ class DeviantartScrapsExtractor(DeviantartExtractor): yield self.api.deviation(deviation_uuid) + def login(self): + """Login and obtain session cookies""" + if not self._check_cookies(self.cookienames): + username, password = self._get_auth_info() + if username: + self._update_cookies(_login_impl(self, username, password)) + elif self._warning: + self.log.warning( + "No session cookies set: Unable to fetch mature scraps.") + DeviantartScrapsExtractor._warning = False + class DeviantartFollowingExtractor(DeviantartExtractor): """Extractor for user's watched users""" @@ -1513,13 +1520,47 @@ class DeviantartEclipseAPI(): return token -@cache(maxage=100*365*24*3600, keyarg=0) +@cache(maxage=100*365*86400, keyarg=0) def _refresh_token_cache(token): if token and token[0] == "#": return None return token +@cache(maxage=28*86400, keyarg=1) +def _login_impl(extr, username, password): + extr.log.info("Logging in as %s", username) + + url = "https://www.deviantart.com/users/login" + page = extr.request(url).text + + data = {} + for item in text.extract_iter(page, '<input type="hidden" name="', '"/>'): + name, _, value = item.partition('" value="') + data[name] = value + + challenge = data.get("challenge") + if challenge and challenge != "0": + extr.log.warning("Login requires solving a CAPTCHA") + extr.log.debug(challenge) + + data["username"] = username + data["password"] = password + data["remember"] = "on" + + extr.sleep(2.0, "login") + url = "https://www.deviantart.com/_sisu/do/signin" + response = extr.request(url, method="POST", data=data) + + if not response.history: + raise exception.AuthenticationError() + + return { + cookie.name: cookie.value + for cookie in extr.session.cookies + } + + ############################################################################### # Journal Formats ############################################################# diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index f692a90..41431dc 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -69,14 +69,28 @@ class FanboxExtractor(Extractor): if post["type"] == "article": post["articleBody"] = content_body.copy() if "blocks" in content_body: - content = [] + content = [] # text content + images = [] # image IDs in 'body' order + append = content.append + append_img = images.append for block in content_body["blocks"]: if "text" in block: append(block["text"]) if "links" in block: for link in block["links"]: append(link["url"]) + if "imageId" in block: + append_img(block["imageId"]) + + if images and "imageMap" in content_body: + # reorder 'imageMap' (#2718) + image_map = content_body["imageMap"] + content_body["imageMap"] = { + image_id: image_map[image_id] + for image_id in images + } + post["content"] = "\n".join(content) post["date"] = text.parse_datetime(post["publishedDatetime"]) @@ -294,6 +308,10 @@ class FanboxPostExtractor(FanboxExtractor): r"Thank you for your continued support of FANBOX.$", }, }), + # imageMap file order (#2718) + ("https://mochirong.fanbox.cc/posts/3746116", { + "url": "c92ddd06f2efc4a5fe30ec67e21544f79a5c4062", + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/fanleaks.py b/gallery_dl/extractor/fanleaks.py new file mode 100644 index 0000000..466bb8c --- /dev/null +++ b/gallery_dl/extractor/fanleaks.py @@ -0,0 +1,127 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://fanleaks.club/""" + +from .common import Extractor, Message +from .. import text, exception + + +class FanleaksExtractor(Extractor): + """Base class for Fanleaks extractors""" + category = "fanleaks" + directory_fmt = ("{category}", "{model}") + filename_fmt = "{model_id}_{id}.{extension}" + archive_fmt = "{model_id}_{id}" + root = "https://fanleaks.club" + + def __init__(self, match): + Extractor.__init__(self, match) + self.model_id = match.group(1) + + def extract_post(self, url): + extr = text.extract_from(self.request(url, notfound="post").text) + data = { + "model_id": self.model_id, + "model" : text.unescape(extr('text-lg">', "</a>")), + "id" : text.parse_int(self.id), + "type" : extr('type="', '"')[:5] or "photo", + } + url = extr('src="', '"') + yield Message.Directory, data + yield Message.Url, url, text.nameext_from_url(url, data) + + +class FanleaksPostExtractor(FanleaksExtractor): + """Extractor for individual posts on fanleak.club""" + subcategory = "post" + pattern = r"(?:https?://)?(?:www\.)?fanleaks\.club/([^/?#]+)/(\d+)" + test = ( + ("https://fanleaks.club/selti/880", { + "pattern": (r"https://fanleaks\.club//models" + r"/selti/images/selti_0880\.jpg"), + "keyword": { + "model_id": "selti", + "model" : "Selti", + "id" : 880, + "type" : "photo", + }, + }), + ("https://fanleaks.club/daisy-keech/1038", { + "pattern": (r"https://fanleaks\.club//models" + r"/daisy-keech/videos/daisy-keech_1038\.mp4"), + "keyword": { + "model_id": "daisy-keech", + "model" : "Daisy Keech", + "id" : 1038, + "type" : "video", + }, + }), + ("https://fanleaks.club/hannahowo/000", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + FanleaksExtractor.__init__(self, match) + self.id = match.group(2) + + def items(self): + url = "{}/{}/{}".format(self.root, self.model_id, self.id) + return self.extract_post(url) + + +class FanleaksModelExtractor(FanleaksExtractor): + """Extractor for all posts from a fanleaks model""" + subcategory = "model" + pattern = (r"(?:https?://)?(?:www\.)?fanleaks\.club" + r"/(?!latest/?$)([^/?#]+)/?$") + test = ( + ("https://fanleaks.club/hannahowo", { + "pattern": (r"https://fanleaks\.club//models" + r"/hannahowo/(images|videos)/hannahowo_\d+\.\w+"), + "range" : "1-100", + "count" : 100, + }), + ("https://fanleaks.club/belle-delphine", { + "pattern": (r"https://fanleaks\.club//models" + r"/belle-delphine/(images|videos)" + r"/belle-delphine_\d+\.\w+"), + "range" : "1-100", + "count" : 100, + }), + ("https://fanleaks.club/daisy-keech"), + ) + + def items(self): + page_num = 1 + page = self.request( + self.root + "/" + self.model_id, notfound="model").text + data = { + "model_id": self.model_id, + "model" : text.unescape( + text.extr(page, 'mt-4">', "</h1>")), + "type" : "photo", + } + page_url = text.extr(page, "url: '", "'") + while True: + page = self.request("{}{}".format(page_url, page_num)).text + if not page: + return + + for item in text.extract_iter(page, '<a href="/', "</a>"): + self.id = id = text.extr(item, "/", '"') + if "/icon-play.svg" in item: + url = "{}/{}/{}".format(self.root, self.model_id, id) + yield from self.extract_post(url) + continue + + data["id"] = text.parse_int(id) + url = text.extr(item, 'src="', '"').replace( + "/thumbs/", "/", 1) + yield Message.Directory, data + yield Message.Url, url, text.nameext_from_url(url, data) + page_num += 1 diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index d8109e1..8d73949 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2022 Mike Fährmann +# Copyright 2014-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -195,7 +195,7 @@ class GelbooruPostExtractor(GelbooruBase, # notes ("https://gelbooru.com/index.php?page=post&s=view&id=5997331", { "options": (("notes", True),), - "keywords": { + "keyword": { "notes": [ { "body": "Look over this way when you talk~", diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index 56bd048..1efbbf0 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,25 +9,37 @@ """Extractors for https://www.imagefap.com/""" from .common import Extractor, Message -from .. import text +from .. import text, exception import json - BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com" class ImagefapExtractor(Extractor): """Base class for imagefap extractors""" category = "imagefap" + root = "https://www.imagefap.com" directory_fmt = ("{category}", "{gallery_id} {title}") filename_fmt = "{category}_{gallery_id}_{filename}.{extension}" archive_fmt = "{gallery_id}_{image_id}" - root = "https://www.imagefap.com" + request_interval = (2.0, 4.0) def __init__(self, match): Extractor.__init__(self, match) self.session.headers["Referer"] = self.root + def request(self, url, **kwargs): + response = Extractor.request(self, url, **kwargs) + + if response.history and response.url.endswith("/human-verification"): + msg = text.extr(response.text, '<div class="mt-4', '<') + if msg: + msg = " ".join(msg.partition(">")[2].split()) + raise exception.StopExtraction("'%s'", msg) + self.log.warning("HTTP redirect to %s", response.url) + + return response + class ImagefapGalleryExtractor(ImagefapExtractor): """Extractor for image galleries from imagefap.com""" @@ -41,12 +53,20 @@ class ImagefapGalleryExtractor(ImagefapExtractor): "keyword": "2ba96e84c2952c4750e9fa94a3f2b1f965cec2f3", "content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab", }), - ("https://www.imagefap.com/gallery/5486966", { + ("https://www.imagefap.com/gallery/7876223", { "pattern": r"https://cdnh?\.imagefap\.com" r"/images/full/\d+/\d+/\d+\.jpg", - "keyword": "8d2e562df7a0bc9e8eecb9d1bb68d32b4086bf98", - "archive": False, - "count": 62, + "keyword": { + "count": 44, + "gallery_id": 7876223, + "image_id": int, + "num": int, + "tags": ["big ass", "panties", "horny", + "pussy", "exposed", "outdoor"], + "title": "Kelsi Monroe in lingerie", + "uploader": "BdRachel", + }, + "count": 44, }), ("https://www.imagefap.com/gallery.php?gid=7102714"), ("https://beta.imagefap.com/gallery.php?gid=7102714"), @@ -118,12 +138,20 @@ class ImagefapImageExtractor(ImagefapExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/photo/(\d+)" test = ( - ("https://www.imagefap.com/photo/1369341772/", { + ("https://www.imagefap.com/photo/1962981893", { "pattern": r"https://cdnh?\.imagefap\.com" - r"/images/full/\d+/\d+/\d+\.jpg", - "keyword": "8894e45f7262020d8d66ce59917315def1fc475b", + r"/images/full/65/196/1962981893\.jpg", + "keyword": { + "date": "21/08/2014", + "gallery_id": 7876223, + "height": 1600, + "image_id": 1962981893, + "title": "Kelsi Monroe in lingerie", + "uploader": "BdRachel", + "width": 1066, + }, }), - ("https://beta.imagefap.com/photo/1369341772/"), + ("https://beta.imagefap.com/photo/1962981893"), ) def __init__(self, match): @@ -159,61 +187,70 @@ class ImagefapImageExtractor(ImagefapExtractor): }) -class ImagefapUserExtractor(ImagefapExtractor): - """Extractor for all galleries from a user at imagefap.com""" - subcategory = "user" - categorytransfer = True - pattern = (BASE_PATTERN + - r"/(?:profile(?:\.php\?user=|/)([^/?#]+)" - r"|usergallery\.php\?userid=(\d+))") +class ImagefapFolderExtractor(ImagefapExtractor): + """Extractor for imagefap user folders""" + subcategory = "folder" + pattern = (BASE_PATTERN + r"/(?:organizer/|" + r"(?:usergallery\.php\?user(id)?=([^&#]+)&" + r"|profile/([^/?#]+)/galleries\?)folderid=)(\d+|-1)") test = ( - ("https://www.imagefap.com/profile/LucyRae/galleries", { - "url": "822cb6cbb6f474ca2d0f58d1d6d253bc2338937a", + ("https://www.imagefap.com/organizer/409758", { + "pattern": r"https://www\.imagefap\.com/gallery/7876223", + "url": "37822523e6e4a56feb9dea35653760c86b44ff89", + "count": 1, }), - ("https://www.imagefap.com/usergallery.php?userid=1862791", { - "url": "822cb6cbb6f474ca2d0f58d1d6d253bc2338937a", + (("https://www.imagefap.com/usergallery.php" + "?userid=1981976&folderid=409758"), { + "url": "37822523e6e4a56feb9dea35653760c86b44ff89", + }), + (("https://www.imagefap.com/usergallery.php" + "?user=BdRachel&folderid=409758"), { + "url": "37822523e6e4a56feb9dea35653760c86b44ff89", + }), + ("https://www.imagefap.com/profile/BdRachel/galleries?folderid=-1", { + "pattern": ImagefapGalleryExtractor.pattern, + "range": "1-40", + }), + (("https://www.imagefap.com/usergallery.php" + "?userid=1981976&folderid=-1"), { + "pattern": ImagefapGalleryExtractor.pattern, + "range": "1-40", + }), + (("https://www.imagefap.com/usergallery.php" + "?user=BdRachel&folderid=-1"), { + "pattern": ImagefapGalleryExtractor.pattern, + "range": "1-40", }), - ("https://www.imagefap.com/profile.php?user=LucyRae"), - ("https://beta.imagefap.com/profile.php?user=LucyRae"), ) def __init__(self, match): ImagefapExtractor.__init__(self, match) - self.user, self.user_id = match.groups() + self._id, user, profile, self.folder_id = match.groups() + self.user = user or profile def items(self): - for folder_id in self.folders(): - for gallery_id, name in self.galleries(folder_id): - url = "{}/gallery/{}".format(self.root, gallery_id) - data = { - "gallery_id": text.parse_int(gallery_id), - "title" : text.unescape(name), - "_extractor": ImagefapGalleryExtractor, - } - yield Message.Queue, url, data - - def folders(self): - """Return a list of folder_ids of a specific user""" - if self.user: - url = "{}/profile/{}/galleries".format(self.root, self.user) - else: - url = "{}/usergallery.php?userid={}".format( - self.root, self.user_id) - - response = self.request(url) - self.user = response.url.split("/")[-2] - folders = text.extr(response.text, ' id="tgl_all" value="', '"') - return folders.rstrip("|").split("|") + for gallery_id, name in self.galleries(self.folder_id): + url = "{}/gallery/{}".format(self.root, gallery_id) + data = { + "gallery_id": gallery_id, + "title" : text.unescape(name), + "_extractor": ImagefapGalleryExtractor, + } + yield Message.Queue, url, data def galleries(self, folder_id): - """Yield gallery_ids of a folder""" + """Yield gallery IDs and titles of a folder""" if folder_id == "-1": - url = "{}/profile/{}/galleries?folderid=-1".format( - self.root, self.user) + if self._id: + url = "{}/usergallery.php?userid={}&folderid=-1".format( + self.root, self.user) + else: + url = "{}/profile/{}/galleries?folderid=-1".format( + self.root, self.user) else: url = "{}/organizer/{}/".format(self.root, folder_id) - params = {"page": 0} + params = {"page": 0} while True: extr = text.extract_from(self.request(url, params=params).text) cnt = 0 @@ -228,3 +265,53 @@ class ImagefapUserExtractor(ImagefapExtractor): if cnt < 25: break params["page"] += 1 + + +class ImagefapUserExtractor(ImagefapExtractor): + """Extractor for an imagefap user profile""" + subcategory = "user" + pattern = (BASE_PATTERN + + r"/(?:profile(?:\.php\?user=|/)([^/?#]+)(?:/galleries)?" + r"|usergallery\.php\?userid=(\d+))(?:$|#)") + test = ( + ("https://www.imagefap.com/profile/BdRachel", { + "pattern": ImagefapFolderExtractor.pattern, + "count": ">= 18", + }), + ("https://www.imagefap.com/usergallery.php?userid=1862791", { + "pattern": r"https://www\.imagefap\.com" + r"/profile/LucyRae/galleries\?folderid=-1", + "count": 1, + }), + ("https://www.imagefap.com/profile/BdRachel/galleries"), + ("https://www.imagefap.com/profile.php?user=BdRachel"), + ("https://beta.imagefap.com/profile.php?user=BdRachel"), + ) + + def __init__(self, match): + ImagefapExtractor.__init__(self, match) + self.user, self.user_id = match.groups() + + def items(self): + data = {"_extractor": ImagefapFolderExtractor} + + for folder_id in self.folders(): + if folder_id == "-1": + url = "{}/profile/{}/galleries?folderid=-1".format( + self.root, self.user) + else: + url = "{}/organizer/{}/".format(self.root, folder_id) + yield Message.Queue, url, data + + def folders(self): + """Return a list of folder IDs of a user""" + if self.user: + url = "{}/profile/{}/galleries".format(self.root, self.user) + else: + url = "{}/usergallery.php?userid={}".format( + self.root, self.user_id) + + response = self.request(url) + self.user = response.url.split("/")[-2] + folders = text.extr(response.text, ' id="tgl_all" value="', '"') + return folders.rstrip("|").split("|") diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 8a61728..541e427 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -67,6 +67,7 @@ class KemonopartyExtractor(Extractor): headers["Referer"] = "{}/{}/user/{}/post/{}".format( self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers + post["_http_validate"] = _validate post["date"] = text.parse_datetime( post["published"] or post["added"], "%a, %d %b %Y %H:%M:%S %Z") @@ -197,6 +198,11 @@ class KemonopartyExtractor(Extractor): return dms +def _validate(response): + return (response.headers["content-length"] != "9" and + response.content != b"not found") + + class KemonopartyUserExtractor(KemonopartyExtractor): """Extractor for all posts from a kemono.party user listing""" subcategory = "user" @@ -309,6 +315,12 @@ class KemonopartyPostExtractor(KemonopartyExtractor): "pattern": r"https://coomer\.party/data/7d/3f/7d3fd9804583dc224968" r"c0591163ec91794552b04f00a6c2f42a15b68231d5a8\.jpg", }), + # invalid file (#3510) + ("https://kemono.party/patreon/user/19623797/post/29035449", { + "pattern": r"907ba78b4545338d3539683e63ecb51c" + r"f51c10adc9dabd86e92bd52339f298b9\.txt", + "content": "da39a3ee5e6b4b0d3255bfef95601890afd80709", + }), ("https://kemono.party/subscribestar/user/alcorart/post/184330"), ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"), ("https://beta.kemono.party/subscribestar/user/alcorart/post/184330"), diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py index bbcf9c0..85e8bb1 100644 --- a/gallery_dl/extractor/lynxchan.py +++ b/gallery_dl/extractor/lynxchan.py @@ -17,9 +17,13 @@ class LynxchanExtractor(BaseExtractor): BASE_PATTERN = LynxchanExtractor.update({ + "bbw-chan": { + "root": "https://bbw-chan.nl", + "pattern": r"bbw-chan\.nl", + }, "kohlchan": { "root": "https://kohlchan.net", - "pattern": r"kohlchan\.net" + "pattern": r"kohlchan\.net", }, "endchan": { "root": None, @@ -37,6 +41,11 @@ class LynxchanThreadExtractor(LynxchanExtractor): archive_fmt = "{boardUri}_{postId}_{num}" pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)" test = ( + ("https://bbw-chan.nl/bbwdraw/res/499.html", { + "pattern": r"https://bbw-chan\.nl/\.media/[0-9a-f]{64}(\.\w+)?$", + "count": ">= 352", + }), + ("https://bbw-chan.nl/bbwdraw/res/489.html"), ("https://kohlchan.net/a/res/4594.html", { "pattern": r"https://kohlchan\.net/\.media/[0-9a-f]{64}(\.\w+)?$", "count": ">= 80", @@ -78,6 +87,11 @@ class LynxchanBoardExtractor(LynxchanExtractor): subcategory = "board" pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)" test = ( + ("https://bbw-chan.nl/bbwdraw/", { + "pattern": LynxchanThreadExtractor.pattern, + "count": ">= 148", + }), + ("https://bbw-chan.nl/bbwdraw/2.html"), ("https://kohlchan.net/a/", { "pattern": LynxchanThreadExtractor.pattern, "count": ">= 100", diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py index 3dbd5fc..5dc4cb6 100644 --- a/gallery_dl/extractor/myhentaigallery.py +++ b/gallery_dl/extractor/myhentaigallery.py @@ -44,7 +44,10 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor): extr = text.extract_from(page) split = text.split_html - title = extr('<div class="comic-description">\n<h1>', '</h1>') + title = extr('<div class="comic-description">\n', '</h1>').lstrip() + if title.startswith("<h1>"): + title = title[len("<h1>"):] + if not title: raise exception.NotFoundError("gallery") diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index dfe78ae..f9c6abf 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -451,7 +451,7 @@ class NitterTweetExtractor(NitterExtractor): }), # age-restricted (#2354) ("https://nitter.unixfox.eu/mightbecurse/status/1492954264909479936", { - "keywords": {"date": "dt:2022-02-13 20:10:09"}, + "keyword": {"date": "dt:2022-02-13 20:10:00"}, "count": 1, }), ) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index f786be6..63b16ce 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -26,6 +26,13 @@ class PinterestExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) + + domain = self.config("domain") + if not domain or domain == "auto" : + self.root = text.root_from_url(match.group(0)) + else: + self.root = text.ensure_http_scheme(domain) + self.api = PinterestAPI(self) def items(self): @@ -142,7 +149,7 @@ class PinterestBoardExtractor(PinterestExtractor): directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") archive_fmt = "{board[id]}_{id}" pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#&]+)" - "/(?!_saved|_created)([^/?#&]+)/?$") + "/(?!_saved|_created|pins/)([^/?#&]+)/?$") test = ( ("https://www.pinterest.com/g1952849/test-/", { "pattern": r"https://i\.pinimg\.com/originals/", @@ -151,7 +158,7 @@ class PinterestBoardExtractor(PinterestExtractor): # board with sections (#835) ("https://www.pinterest.com/g1952849/stuff/", { "options": (("sections", True),), - "count": 5, + "count": 4, }), # secret board (#1055) ("https://www.pinterest.de/g1952849/secret/", { @@ -194,11 +201,11 @@ class PinterestUserExtractor(PinterestExtractor): subcategory = "user" pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)(?:/_saved)?/?$" test = ( - ("https://www.pinterest.de/g1952849/", { + ("https://www.pinterest.com/g1952849/", { "pattern": PinterestBoardExtractor.pattern, "count": ">= 2", }), - ("https://www.pinterest.de/g1952849/_saved/"), + ("https://www.pinterest.com/g1952849/_saved/"), ) def __init__(self, match): @@ -213,15 +220,38 @@ class PinterestUserExtractor(PinterestExtractor): yield Message.Queue, self.root + url, board +class PinterestAllpinsExtractor(PinterestExtractor): + """Extractor for a user's 'All Pins' feed""" + subcategory = "allpins" + directory_fmt = ("{category}", "{user}") + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/pins/?$" + test = ("https://www.pinterest.com/g1952849/pins/", { + "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" + r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w{3}", + "count": 7, + }) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.user = text.unquote(match.group(1)) + + def metadata(self): + return {"user": self.user} + + def pins(self): + return self.api.user_pins(self.user) + + class PinterestCreatedExtractor(PinterestExtractor): """Extractor for a user's created pins""" subcategory = "created" directory_fmt = ("{category}", "{user}") pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/_created/?$" - test = ("https://www.pinterest.com/amazon/_created", { + test = ("https://www.pinterest.de/digitalmomblog/_created/", { "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.jpg", "count": 10, + "range": "1-10", }) def __init__(self, match): @@ -272,7 +302,7 @@ class PinterestSearchExtractor(PinterestExtractor): subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") pattern = BASE_PATTERN + r"/search/pins/?\?q=([^&#]+)" - test = ("https://www.pinterest.de/search/pins/?q=nature", { + test = ("https://www.pinterest.com/search/pins/?q=nature", { "range": "1-50", "count": ">= 50", }) @@ -357,26 +387,23 @@ class PinterestAPI(): - https://github.com/seregazhuk/php-pinterest-bot """ - BASE_URL = "https://www.pinterest.com" - HEADERS = { - "Accept" : "application/json, text/javascript, " - "*/*, q=0.01", - "Accept-Language" : "en-US,en;q=0.5", - "Referer" : BASE_URL + "/", - "X-Requested-With" : "XMLHttpRequest", - "X-APP-VERSION" : "31461e0", - "X-CSRFToken" : None, - "X-Pinterest-AppState": "active", - "Origin" : BASE_URL, - } - def __init__(self, extractor): - self.extractor = extractor - csrf_token = util.generate_token() - self.headers = self.HEADERS.copy() - self.headers["X-CSRFToken"] = csrf_token + + self.extractor = extractor + self.root = extractor.root self.cookies = {"csrftoken": csrf_token} + self.headers = { + "Accept" : "application/json, text/javascript, " + "*/*, q=0.01", + "Accept-Language" : "en-US,en;q=0.5", + "Referer" : self.root + "/", + "X-Requested-With" : "XMLHttpRequest", + "X-APP-VERSION" : "0c4af40", + "X-CSRFToken" : csrf_token, + "X-Pinterest-AppState": "active", + "Origin" : self.root, + } def pin(self, pin_id): """Query information about a pin""" @@ -437,6 +464,16 @@ class PinterestAPI(): options = {"board_id": board_id, "add_vase": True} return self._pagination("BoardRelatedPixieFeed", options) + def user_pins(self, user): + """Yield all pins from 'user'""" + options = { + "is_own_profile_pins": False, + "username" : user, + "field_set_key" : "grid_item", + "pin_filter" : None, + } + return self._pagination("UserPins", options) + def user_activity_pins(self, user): """Yield pins created by 'user'""" options = { @@ -462,7 +499,7 @@ class PinterestAPI(): def _login_impl(self, username, password): self.extractor.log.info("Logging in as %s", username) - url = self.BASE_URL + "/resource/UserSessionResource/create/" + url = self.root + "/resource/UserSessionResource/create/" options = { "username_or_email": username, "password" : password, @@ -485,7 +522,7 @@ class PinterestAPI(): } def _call(self, resource, options): - url = "{}/resource/{}Resource/get/".format(self.BASE_URL, resource) + url = "{}/resource/{}Resource/get/".format(self.root, resource) params = {"data": json.dumps({"options": options}), "source_url": ""} response = self.extractor.request( @@ -497,10 +534,11 @@ class PinterestAPI(): except ValueError: data = {} - if response.status_code < 400 and not response.history: + if response.history: + self.root = text.root_from_url(response.url) + if response.status_code < 400: return data - - if response.status_code == 404 or response.history: + if response.status_code == 404: resource = self.extractor.subcategory.rpartition("-")[2] raise exception.NotFoundError(resource) self.extractor.log.debug("Server response: %s", response.text) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 134361d..a17518f 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2022 Mike Fährmann +# Copyright 2014-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -45,7 +45,8 @@ class PixivExtractor(Extractor): work["tags"] = [tag["name"] for tag in work["tags"]] ratings = {0: "General", 1: "R-18", 2: "R-18G"} - userdata = self.config("metadata") + meta_user = self.config("metadata") + meta_bookmark = self.config("metadata-bookmark") metadata = self.metadata() works = self.works() @@ -61,8 +62,12 @@ class PixivExtractor(Extractor): del work["image_urls"] del work["meta_pages"] - if userdata: + if meta_user: work.update(self.api.user_detail(work["user"]["id"])) + if meta_bookmark and work["is_bookmarked"]: + detail = self.api.illust_bookmark_detail(work["id"]) + work["tags_bookmark"] = [tag["name"] for tag in detail["tags"] + if tag["is_registered"]] if transform_tags: transform_tags(work) work["num"] = 0 @@ -398,6 +403,8 @@ class PixivFavoriteExtractor(PixivExtractor): # own bookmarks ("https://www.pixiv.net/bookmark.php", { "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba", + "keyword": {"tags_bookmark": ["47", "hitman"]}, + "options": (("metadata-bookmark", True),), }), # own bookmarks with tag (#596) ("https://www.pixiv.net/bookmark.php?tag=foobar", { @@ -880,6 +887,11 @@ class PixivAppAPI(): params = {"illust_id": illust_id} return self._call("/v1/illust/detail", params)["illust"] + def illust_bookmark_detail(self, illust_id): + params = {"illust_id": illust_id} + return self._call( + "/v2/illust/bookmark/detail", params)["bookmark_detail"] + def illust_follow(self, restrict="all"): params = {"restrict": restrict} return self._pagination("/v2/illust/follow", params) @@ -900,9 +912,16 @@ class PixivAppAPI(): return self._pagination("/v1/search/illust", params) def user_bookmarks_illust(self, user_id, tag=None, restrict="public"): + """Return illusts bookmarked by a user""" params = {"user_id": user_id, "tag": tag, "restrict": restrict} return self._pagination("/v1/user/bookmarks/illust", params) + def user_bookmark_tags_illust(self, user_id, restrict="public"): + """Return bookmark tags defined by a user""" + params = {"user_id": user_id, "restrict": restrict} + return self._pagination( + "/v1/user/bookmark-tags/illust", params, "bookmark_tags") + @memcache(keyarg=1) def user_detail(self, user_id): params = {"user_id": user_id} diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index 4283081..c35ee74 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -59,7 +59,7 @@ class PoipikuExtractor(Extractor): "//img.", "//img-org.", 1) yield Message.Url, url, text.nameext_from_url(url, post) - if not extr('> show all', '<'): + if not extr(' show all(+', '<'): continue url = self.root + "/f/ShowAppendFileF.jsp" @@ -79,6 +79,9 @@ class PoipikuExtractor(Extractor): page = self.request( url, method="POST", headers=headers, data=data).json()["html"] + if page.startswith("You need to"): + self.log.warning("'%s'", page) + for thumb in text.extract_iter( page, 'class="IllustItemThumbImg" src="', '"'): post["num"] += 1 @@ -162,6 +165,21 @@ class PoipikuPostExtractor(PoipikuExtractor): "user_name": "wadahito", }, }), + # different warning button style + ("https://poipiku.com/3572553/5776587.html", { + "pattern": r"https://img-org\.poipiku.com/user_img\d+/003572553" + r"/005776587_(\d+_)?\w+\.jpeg$", + "count": 3, + "keyword": { + "count": "3", + "description": "ORANGE OASISボスネタバレ", + "num": int, + "post_category": "SPOILER", + "post_id": "5776587", + "user_id": "3572553", + "user_name": "nagakun", + }, + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py new file mode 100644 index 0000000..cac5a54 --- /dev/null +++ b/gallery_dl/extractor/tcbscans.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://onepiecechapters.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text + + +class TcbscansChapterExtractor(ChapterExtractor): + category = "tcbscans" + pattern = (r"(?:https?://)?onepiecechapters\.com" + r"(/chapters/\d+/[^/?#]+)") + root = "https://onepiecechapters.com" + test = ( + (("https://onepiecechapters.com" + "/chapters/4708/chainsaw-man-chapter-108"), { + "pattern": (r"https://cdn\.[^/]+" + r"/(file|attachments/[^/]+)/[^/]+/[^.]+\.\w+"), + "count" : 17, + "keyword": { + "manga": "Chainsaw Man", + "chapter": 108, + "chapter_minor": "", + "lang": "en", + "language": "English", + }, + }), + ("https://onepiecechapters.com/chapters/4716/one-piece-chapter-1065", { + "pattern": (r"https://cdn\.[^/]+" + r"/(file|attachments/[^/]+)/[^/]+/[^.]+\.\w+"), + "count" : 18, + "keyword": { + "manga": "One Piece", + "chapter": 1065, + "chapter_minor": "", + "lang": "en", + "language": "English", + }, + }), + (("https://onepiecechapters.com/" + "chapters/44/ace-novel-manga-adaptation-chapter-1")), + ) + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter( + page, '<img class="fixed-ratio-content" src="', '"') + ] + + def metadata(self, page): + manga, _, chapter = text.extr( + page, 'font-bold mt-8">', "</h1>").rpartition(" - Chapter ") + chapter, sep, minor = chapter.partition(".") + return { + "manga": text.unescape(manga), + "chapter": text.parse_int(chapter), + "chapter_minor": sep + minor, + "lang": "en", "language": "English", + } + + +class TcbscansMangaExtractor(MangaExtractor): + category = "tcbscans" + chapterclass = TcbscansChapterExtractor + pattern = (r"(?:https?://)?onepiecechapters\.com" + r"(/mangas/\d+/[^/?#]+)") + root = "https://onepiecechapters.com" + test = ( + ("https://onepiecechapters.com/mangas/13/chainsaw-man", { + "pattern": TcbscansChapterExtractor.pattern, + "range" : "1-50", + "count" : 50, + }), + ("https://onepiecechapters.com/mangas/4/jujutsu-kaisen", { + "pattern": TcbscansChapterExtractor.pattern, + "range" : "1-50", + "count" : 50, + }), + ("https://onepiecechapters.com/mangas/15/hunter-x-hunter"), + ) + + def chapters(self, page): + data = { + "manga": text.unescape(text.extr( + page, 'class="my-3 font-bold text-3xl">', "</h1>")), + "lang": "en", "language": "English", + } + + results = [] + page = text.extr(page, 'class="col-span-2"', 'class="order-1') + for chapter in text.extract_iter(page, "<a", "</a>"): + url = text.extr(chapter, 'href="', '"') + data["title"] = text.unescape(text.extr( + chapter, 'text-gray-500">', "</div>")) + chapter = text.extr( + chapter, 'font-bold">', "</div>").rpartition(" Chapter ")[2] + chapter, sep, minor = chapter.partition(".") + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = sep + minor + results.append((self.root + url, data.copy())) + return results diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py index 8e9bf2c..5996268 100644 --- a/gallery_dl/extractor/telegraph.py +++ b/gallery_dl/extractor/telegraph.py @@ -12,7 +12,6 @@ from .. import text class TelegraphGalleryExtractor(GalleryExtractor): """Extractor for articles from telegra.ph""" - category = "telegraph" root = "https://telegra.ph" directory_fmt = ("{category}", "{slug}") @@ -52,6 +51,23 @@ class TelegraphGalleryExtractor(GalleryExtractor): "url": "https://telegra.ph/file/3ea79d23b0dd0889f215a.jpg", }, }), + ("https://telegra.ph/Vsyo-o-druzyah-moej-sestricy-05-27", { + "url": "c1f3048e5d94bee53af30a8c27f70b0d3b15438e", + "pattern": r"^https://pith1\.ru/uploads" + r"/posts/2019-12/\d+_\d+\.jpg$", + "keyword": { + "author": "Shotacon - заходи сюда", + "caption": "", + "count": 19, + "date": "dt:2022-05-27 16:17:27", + "description": "", + "num_formatted": r"re:^\d{2}$", + "post_url": "https://telegra.ph" + "/Vsyo-o-druzyah-moej-sestricy-05-27", + "slug": "Vsyo-o-druzyah-moej-sestricy-05-27", + "title": "Всё о друзьях моей сестрицы", + }, + }), ) def metadata(self, page): @@ -79,11 +95,12 @@ class TelegraphGalleryExtractor(GalleryExtractor): result = [] for figure in figures: - src, pos = text.extract(figure, 'src="', '"') - if src.startswith("/embed/"): + url, pos = text.extract(figure, 'src="', '"') + if url.startswith("/embed/"): continue + elif url.startswith("/"): + url = self.root + url caption, pos = text.extract(figure, "<figcaption>", "<", pos) - url = self.root + src num += 1 result.append((url, { diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 22aa78e..c2d8247 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -39,6 +39,7 @@ class TwitterExtractor(Extractor): self.videos = self.config("videos", True) self.cards = self.config("cards", False) self.cards_blacklist = self.config("cards-blacklist") + self.syndication = self.config("syndication") self._user = self._user_obj = None self._user_cache = {} self._init_sizes() @@ -75,11 +76,6 @@ class TwitterExtractor(Extractor): else: data = tweet - if seen_tweets is not None: - if data["id_str"] in seen_tweets: - continue - seen_tweets.add(data["id_str"]) - if not self.retweets and "retweeted_status_id_str" in data: self.log.debug("Skipping %s (retweet)", data["id_str"]) continue @@ -97,6 +93,13 @@ class TwitterExtractor(Extractor): self.log.debug("Skipping %s (reply)", data["id_str"]) continue + if seen_tweets is not None: + if data["id_str"] in seen_tweets: + self.log.debug( + "Skipping %s (previously seen)", data["id_str"]) + continue + seen_tweets.add(data["id_str"]) + files = [] if "extended_entities" in data: self._extract_media( @@ -220,14 +223,16 @@ class TwitterExtractor(Extractor): def _extract_twitpic(self, tweet, files): for url in tweet["entities"].get("urls", ()): url = url["expanded_url"] - if "//twitpic.com/" in url and "/photos/" not in url: - response = self.request(url, fatal=False) - if response.status_code >= 400: - continue - url = text.extr( - response.text, 'name="twitter:image" value="', '"') - if url: - files.append({"url": url}) + if "//twitpic.com/" not in url or "/photos/" in url: + continue + if url.startswith("http:"): + url = "https" + url[4:] + response = self.request(url, fatal=False) + if response.status_code >= 400: + continue + url = text.extr(response.text, 'name="twitter:image" value="', '"') + if url: + files.append({"url": url}) def _transform_tweet(self, tweet): if "author" in tweet: @@ -299,6 +304,9 @@ class TwitterExtractor(Extractor): if "legacy" in user: user = user["legacy"] + elif "statuses_count" not in user and self.syndication == "extended": + # try to fetch extended user data + user = self.api.user_by_screen_name(user["screen_name"])["legacy"] uget = user.get entities = user["entities"] @@ -361,18 +369,22 @@ class TwitterExtractor(Extractor): def _expand_tweets(self, tweets): seen = set() for tweet in tweets: - - if "legacy" in tweet: - cid = tweet["legacy"]["conversation_id_str"] - else: - cid = tweet["conversation_id_str"] - - if cid not in seen: - seen.add(cid) - try: - yield from self.api.tweet_detail(cid) - except Exception: - yield tweet + obj = tweet["legacy"] if "legacy" in tweet else tweet + cid = obj.get("conversation_id_str") + if not cid: + tid = obj["id_str"] + self.log.warning( + "Unable to expand %s (no 'conversation_id')", tid) + continue + if cid in seen: + self.log.debug( + "Skipping expansion of %s (previously seen)", cid) + continue + seen.add(cid) + try: + yield from self.api.tweet_detail(cid) + except Exception: + yield tweet def _make_tweet(self, user, id_str, url, timestamp): return { @@ -772,7 +784,7 @@ class TwitterTweetExtractor(TwitterExtractor): # age-restricted (#2354) ("https://twitter.com/mightbecursed/status/1492954264909479936", { "options": (("syndication", True),), - "keywords": {"date": "dt:2022-02-13 20:10:09"}, + "keyword": {"date": "dt:2022-02-13 20:10:09"}, "count": 1, }), # media alt texts / descriptions (#2617) @@ -991,7 +1003,7 @@ class TwitterAPI(): } self._nsfw_warning = True - self._syndication = extractor.config("syndication") + self._syndication = self.extractor.syndication self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode cookies = extractor.session.cookies @@ -1516,6 +1528,12 @@ class TwitterAPI(): else: retweet_id = None + # assume 'conversation_id' is the same as 'id' when the tweet + # is not a reply + if "conversation_id_str" not in tweet and \ + "in_reply_to_status_id_str" not in tweet: + tweet["conversation_id_str"] = tweet["id_str"] + tweet["created_at"] = text.parse_datetime( tweet["created_at"], "%Y-%m-%dT%H:%M:%S.%fZ").strftime( "%a %b %d %H:%M:%S +0000 %Y") diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 9b6831b..5692452 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -110,7 +110,7 @@ class VkPhotosExtractor(VkExtractor): "pattern": r"https://sun\d+-\d+\.userapi\.com/s/v1/if1" r"/[\w-]+\.jpg\?size=\d+x\d+&quality=96&type=album", "count": ">= 35", - "keywords": { + "keyword": { "id": r"re:\d+", "user": { "id": "398982326", @@ -122,12 +122,11 @@ class VkPhotosExtractor(VkExtractor): }), ("https://vk.com/cosplayinrussia", { "range": "15-25", - "keywords": { + "keyword": { "id": r"re:\d+", "user": { "id" : "-165740836", - "info": "Предложка открыта, кидайте ваши косплейчики. При " - "правильном оформлении они будут опубликованы", + "info": str, "name": "cosplayinrussia", "nick": "Косплей | Cosplay 18+", }, diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 74da615..03fd909 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -111,13 +111,15 @@ class ZerochanTagExtractor(ZerochanExtractor): test = ("https://www.zerochan.net/Perth+%28Kantai+Collection%29", { "pattern": r"https://static\.zerochan\.net/.+\.full\.\d+\.(jpg|png)", "count": "> 24", - "keywords": { + "keyword": { "extension": r"re:jpg|png", - "file_url": "", - "filename": r"re:Perth.\(Kantai.Collection\).full.\d+", + "file_url": r"re:https://static\.zerochan\.net" + r"/.+\.full\.\d+\.(jpg|png)", + "filename": r"re:(Perth\.\(Kantai\.Collection\)" + r"|Kantai\.Collection)\.full\.\d+", "height": r"re:^\d+$", "id": r"re:^\d+$", - "name": "Perth (Kantai Collection)", + "name": r"re:(Perth \(Kantai Collection\)|Kantai Collection)", "search_tags": "Perth (Kantai Collection)", "size": r"re:^\d+k$", "width": r"re:^\d+$", diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index 8a45330..58bf48d 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -60,14 +60,21 @@ class StringFormatter(): - "u": calls str.upper - "c": calls str.capitalize - "C": calls string.capwords - - "j". calls json.dumps + - "g": calls text.slugify() + - "j": calls json.dumps - "t": calls str.strip + - "T": calls util.datetime_to_timestamp_string() - "d": calls text.parse_timestamp - - "U": calls urllib.parse.unescape + - "s": calls str() - "S": calls util.to_string() - - "T": calls util.to_timestamü() + - "U": calls urllib.parse.unescape + - "r": calls repr() + - "a": calls ascii() - Example: {f!l} -> "example"; {f!u} -> "EXAMPLE" + # Go to _CONVERSIONS and _SPECIFIERS below to se all of them, read: + # https://github.com/mikf/gallery-dl/blob/master/docs/formatting.md + Extra Format Specifiers: - "?<before>/<after>/": Adds <before> and <after> to the actual value if it evaluates to True. diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 91e9169..32cac79 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -325,7 +325,7 @@ def build_parser(): configuration.add_argument( "--ignore-config", dest="load_config", action="store_false", - help="Do not read the default configuration files", + help="Do not read default configuration files", ) authentication = parser.add_argument_group("Authentication Options") @@ -349,7 +349,7 @@ def build_parser(): selection.add_argument( "--download-archive", dest="archive", metavar="FILE", action=ConfigAction, - help=("Record all downloaded files in the archive file and " + help=("Record all downloaded or skipped files in FILE and " "skip downloading any file already in it"), ) selection.add_argument( @@ -367,19 +367,20 @@ def build_parser(): selection.add_argument( "--range", dest="image-range", metavar="RANGE", action=ConfigAction, - help=("Index-range(s) specifying which images to download. " - "For example '5-10' or '1,3-5,10-'"), + help=("Index range(s) specifying which files to download. " + "These can be either a constant value, range, or slice " + "(e.g. '5', '8-20', or '1:24:3')"), ) selection.add_argument( "--chapter-range", dest="chapter-range", metavar="RANGE", action=ConfigAction, - help=("Like '--range', but applies to manga-chapters " + help=("Like '--range', but applies to manga chapters " "and other delegated URLs"), ) selection.add_argument( "--filter", dest="image-filter", metavar="EXPR", action=ConfigAction, - help=("Python expression controlling which images to download. " + help=("Python expression controlling which files to download. " "Files for which the expression evaluates to False are ignored. " "Available keys are the filename-specific ones listed by '-K'. " "Example: --filter \"image_width >= 1000 and " @@ -388,7 +389,7 @@ def build_parser(): selection.add_argument( "--chapter-filter", dest="chapter-filter", metavar="EXPR", action=ConfigAction, - help=("Like '--filter', but applies to manga-chapters " + help=("Like '--filter', but applies to manga chapters " "and other delegated URLs"), ) @@ -472,7 +473,7 @@ def build_parser(): dest="postprocessors", metavar="CMD", action=AppendCommandAction, const={"name": "exec"}, help=("Execute CMD for each downloaded file. " - "Example: --exec 'convert {} {}.png && rm {}'"), + "Example: --exec \"convert {} {}.png && rm {}\""), ) postprocessor.add_argument( "--exec-after", @@ -480,7 +481,7 @@ def build_parser(): action=AppendCommandAction, const={ "name": "exec", "event": "finalize"}, help=("Execute CMD after all files were downloaded successfully. " - "Example: --exec-after 'cd {} && convert * ../doc.pdf'"), + "Example: --exec-after \"cd {} && convert * ../doc.pdf\""), ) postprocessor.add_argument( "-P", "--postprocessor", diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 23d5bc8..543fb10 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -714,74 +714,71 @@ def chain_predicates(predicates, url, kwdict): class RangePredicate(): - """Predicate; True if the current index is in the given range""" + """Predicate; True if the current index is in the given range(s)""" + def __init__(self, rangespec): - self.ranges = self.optimize_range(self.parse_range(rangespec)) + self.ranges = ranges = self._parse(rangespec) self.index = 0 - if self.ranges: - self.lower, self.upper = self.ranges[0][0], self.ranges[-1][1] + if ranges: + # technically wrong, but good enough for now + # and evaluating min/max for a large range is slow + self.lower = min(r.start for r in ranges) + self.upper = max(r.stop for r in ranges) - 1 else: - self.lower, self.upper = 0, 0 + self.lower = self.upper = 0 - def __call__(self, url, _): - self.index += 1 + def __call__(self, _url, _kwdict): + self.index = index = self.index + 1 - if self.index > self.upper: + if index > self.upper: raise exception.StopExtraction() - for lower, upper in self.ranges: - if lower <= self.index <= upper: + for range in self.ranges: + if index in range: return True return False @staticmethod - def parse_range(rangespec): + def _parse(rangespec): """Parse an integer range string and return the resulting ranges Examples: - parse_range("-2,4,6-8,10-") -> [(1,2), (4,4), (6,8), (10,INTMAX)] - parse_range(" - 3 , 4- 4, 2-6") -> [(1,3), (4,4), (2,6)] + _parse("-2,4,6-8,10-") -> [(1,3), (4,5), (6,9), (10,INTMAX)] + _parse(" - 3 , 4- 4, 2-6") -> [(1,4), (4,5), (2,7)] + _parse("1:2,4:8:2") -> [(1,1), (4,7,2)] """ ranges = [] + append = ranges.append - for group in rangespec.split(","): + if isinstance(rangespec, str): + rangespec = rangespec.split(",") + + for group in rangespec: if not group: continue - first, sep, last = group.partition("-") - if not sep: - beg = end = int(first) - else: - beg = int(first) if first.strip() else 1 - end = int(last) if last.strip() else sys.maxsize - ranges.append((beg, end) if beg <= end else (end, beg)) - return ranges + elif ":" in group: + start, _, stop = group.partition(":") + stop, _, step = stop.partition(":") + append(range( + int(start) if start.strip() else 1, + int(stop) if stop.strip() else sys.maxsize, + int(step) if step.strip() else 1, + )) + + elif "-" in group: + start, _, stop = group.partition("-") + append(range( + int(start) if start.strip() else 1, + int(stop) + 1 if stop.strip() else sys.maxsize, + )) - @staticmethod - def optimize_range(ranges): - """Simplify/Combine a parsed list of ranges - - Examples: - optimize_range([(2,4), (4,6), (5,8)]) -> [(2,8)] - optimize_range([(1,1), (2,2), (3,6), (8,9))]) -> [(1,6), (8,9)] - """ - if len(ranges) <= 1: - return ranges - - ranges.sort() - riter = iter(ranges) - result = [] + else: + start = int(group) + append(range(start, start+1)) - beg, end = next(riter) - for lower, upper in riter: - if lower > end+1: - result.append((beg, end)) - beg, end = lower, upper - elif upper > end: - end = upper - result.append((beg, end)) - return result + return ranges class UniquePredicate(): @@ -802,6 +799,8 @@ class FilterPredicate(): """Predicate; True if evaluating the given expression returns True""" def __init__(self, expr, target="image"): + if not isinstance(expr, str): + expr = "(" + ") and (".join(expr) + ")" name = "<{} filter>".format(target) self.expr = compile_expression(expr, name) diff --git a/gallery_dl/version.py b/gallery_dl/version.py index d832185..5e3b507 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.24.2" +__version__ = "1.24.3" diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index db313c3..7b71349 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -199,13 +199,27 @@ def parse_command_line(module, argv): action += args yield action - if getattr(opts, "parse_metadata", None) is None: - opts.parse_metadata = [] - if opts.metafromtitle is not None: - opts.parse_metadata.append("title:%s" % opts.metafromtitle) - opts.metafromtitle = None - opts.parse_metadata = list(itertools.chain.from_iterable(map( - metadataparser_actions, opts.parse_metadata))) + parse_metadata = getattr(opts, "parse_metadata", None) + if isinstance(parse_metadata, dict): + if opts.metafromtitle is not None: + if "pre_process" not in parse_metadata: + parse_metadata["pre_process"] = [] + parse_metadata["pre_process"].append( + "title:%s" % opts.metafromtitle) + opts.parse_metadata = { + k: list(itertools.chain.from_iterable(map( + metadataparser_actions, v))) + for k, v in parse_metadata.items() + } + else: + if parse_metadata is None: + parse_metadata = [] + if opts.metafromtitle is not None: + parse_metadata.append("title:%s" % opts.metafromtitle) + opts.parse_metadata = list(itertools.chain.from_iterable(map( + metadataparser_actions, parse_metadata))) + + opts.metafromtitle = None else: opts.parse_metadata = () |
