Diffstat (limited to 'gallery_dl/extractor')
 gallery_dl/extractor/__init__.py      |   1
 gallery_dl/extractor/arcalive.py      |  14
 gallery_dl/extractor/bbc.py           |  33
 gallery_dl/extractor/bunkr.py         |   3
 gallery_dl/extractor/common.py        |  47
 gallery_dl/extractor/danbooru.py      | 101
 gallery_dl/extractor/deviantart.py    |  84
 gallery_dl/extractor/hentaifox.py     | 119
 gallery_dl/extractor/hitomi.py        |  69
 gallery_dl/extractor/imhentai.py      |  50
 gallery_dl/extractor/instagram.py     |  11
 gallery_dl/extractor/kemonoparty.py   |  68
 gallery_dl/extractor/mangapark.py     | 280
 gallery_dl/extractor/mastodon.py      |   3
 gallery_dl/extractor/nozomi.py        |  11
 gallery_dl/extractor/patreon.py       |   9
 gallery_dl/extractor/pinterest.py     |   3
 gallery_dl/extractor/sexcom.py        | 121
 gallery_dl/extractor/skeb.py          |   7
 gallery_dl/extractor/subscribestar.py |   6
 gallery_dl/extractor/tiktok.py        | 110
 gallery_dl/extractor/zerochan.py      |  22
 22 files changed, 615 insertions(+), 557 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 8198619..87c3798 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -68,7 +68,6 @@ modules = [ "hentai2read", "hentaicosplays", "hentaifoundry", - "hentaifox", "hentaihand", "hentaihere", "hentainexus", diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py index 8e832fe..8c44256 100644 --- a/gallery_dl/extractor/arcalive.py +++ b/gallery_dl/extractor/arcalive.py @@ -41,7 +41,9 @@ class ArcalivePostExtractor(ArcaliveExtractor): def items(self): self.emoticons = self.config("emoticons", False) - self.gifs = self.config("gifs", True) + self.gifs = gifs = self.config("gifs", True) + if gifs: + self.gifs_fallback = (gifs != "check") post = self.api.post(self.groups[0]) files = self._extract_files(post) @@ -90,11 +92,15 @@ class ArcalivePostExtractor(ArcaliveExtractor): url = path + "." + orig elif video and self.gifs: url_gif = url.rpartition(".")[0] + ".gif" - response = self.request( - url_gif + "?type=orig", method="HEAD", fatal=False) - if response.status_code < 400: + if self.gifs_fallback: fallback = (url + "?type=orig",) url = url_gif + else: + response = self.request( + url_gif + "?type=orig", method="HEAD", fatal=False) + if response.status_code < 400: + fallback = (url + "?type=orig",) + url = url_gif files.append({ "url" : url + "?type=orig", diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py index 113a669..b398152 100644 --- a/gallery_dl/extractor/bbc.py +++ b/gallery_dl/extractor/bbc.py @@ -27,7 +27,12 @@ class BbcGalleryExtractor(GalleryExtractor): def metadata(self, page): data = self._extract_jsonld(page) + return { + "title": text.unescape(text.extr( + page, "<h1>", "</h1>").rpartition("</span>")[2]), + "description": text.unescape(text.extr( + page, 'property="og:description" content="', '"')), "programme": self.gallery_url.split("/")[4], "path": list(util.unique_sequence( element["name"] @@ -40,11 +45,20 @@ class BbcGalleryExtractor(GalleryExtractor): width = width - width % 16 if width else 1920 dimensions = "/{}xn/".format(width) - return [ - (src.replace("/320x180_b/", dimensions), - {"_fallback": self._fallback_urls(src, width)}) - for src in text.extract_iter(page, 'data-image-src="', '"') - ] + results = [] + for img in text.extract_iter(page, 'class="gallery__thumbnail', ">"): + src = text.extr(img, 'data-image-src="', '"') + results.append(( + src.replace("/320x180_b/", dimensions), + { + "title_image": text.unescape(text.extr( + img, 'data-gallery-title="', '"')), + "synopsis": text.unescape(text.extr( + img, 'data-gallery-synopsis="', '"')), + "_fallback": self._fallback_urls(src, width), + }, + )) + return results @staticmethod def _fallback_urls(src, max_width): @@ -62,14 +76,11 @@ class BbcProgrammeExtractor(Extractor): pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?" 
example = "https://www.bbc.co.uk/programmes/ID/galleries" - def __init__(self, match): - Extractor.__init__(self, match) - self.path, self.page = match.groups() - def items(self): + path, pnum = self.groups data = {"_extractor": BbcGalleryExtractor} - params = {"page": text.parse_int(self.page, 1)} - galleries_url = self.root + self.path + params = {"page": text.parse_int(pnum, 1)} + galleries_url = self.root + path while True: page = self.request(galleries_url, params=params).text diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index d74f59c..481e962 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -189,8 +189,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): else: file_url = data["url"] - file_name = (text.extr(page, 'property="og:title" content="', '"') or - text.extr(page, "<title>", " | Bunkr<")) + file_name = text.extr(page, "<h1", "<").rpartition(">")[2] fallback = text.extr(page, 'property="og:url" content="', '"') return { diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index a85eedd..995505f 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -539,7 +539,7 @@ class Extractor(): for name, value in cookiedict.items(): set_cookie(name, value, domain=domain) - def cookies_check(self, cookies_names, domain=None): + def cookies_check(self, cookies_names, domain=None, subdomains=False): """Check if all 'cookies_names' are in the session's cookiejar""" if not self.cookies: return False @@ -550,26 +550,31 @@ class Extractor(): now = time.time() for cookie in self.cookies: - if cookie.name in names and ( - not domain or cookie.domain == domain): - - if cookie.expires: - diff = int(cookie.expires - now) - - if diff <= 0: - self.log.warning( - "Cookie '%s' has expired", cookie.name) - continue - - elif diff <= 86400: - hours = diff // 3600 - self.log.warning( - "Cookie '%s' will expire in less than %s hour%s", - cookie.name, hours + 1, "s" if hours else "") - - names.discard(cookie.name) - if not names: - return True + if cookie.name not in names: + continue + + if not domain or cookie.domain == domain: + pass + elif not subdomains or not cookie.domain.endswith(domain): + continue + + if cookie.expires: + diff = int(cookie.expires - now) + + if diff <= 0: + self.log.warning( + "Cookie '%s' has expired", cookie.name) + continue + + elif diff <= 86400: + hours = diff // 3600 + self.log.warning( + "Cookie '%s' will expire in less than %s hour%s", + cookie.name, hours + 1, "s" if hours else "") + + names.discard(cookie.name) + if not names: + return True return False def _extract_jsonld(self, page): diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 8d00728..741800c 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -175,6 +175,51 @@ class DanbooruExtractor(BaseExtractor): return [{"file": fmt(index), "delay": delay} for index, delay in enumerate(delays)] + def _collection_posts(self, cid, ctype): + reverse = prefix = None + + order = self.config("order-posts") + if not order or order in {"asc", "pool", "pool_asc", "asc_pool"}: + params = {"tags": "ord{}:{}".format(ctype, cid)} + elif order in {"id", "desc_id", "id_desc"}: + params = {"tags": "{}:{}".format(ctype, cid)} + prefix = "b" + elif order in {"desc", "desc_pool", "pool_desc"}: + params = {"tags": "ord{}:{}".format(ctype, cid)} + reverse = True + elif order in {"asc_id", "id_asc"}: + params = {"tags": "{}:{}".format(ctype, cid)} + reverse = 
True + + posts = self._pagination("/posts.json", params, prefix) + if reverse: + self.log.info("Collecting posts of %s %s", ctype, cid) + return self._collection_enumerate_reverse(posts) + else: + return self._collection_enumerate(posts) + + def _collection_metadata(self, cid, ctype, cname=None): + url = "{}/{}s/{}.json".format(self.root, cname or ctype, cid) + collection = self.request(url).json() + collection["name"] = collection["name"].replace("_", " ") + self.post_ids = collection.pop("post_ids", ()) + return {ctype: collection} + + def _collection_enumerate(self, posts): + pid_to_num = {pid: num for num, pid in enumerate(self.post_ids, 1)} + for post in posts: + post["num"] = pid_to_num[post["id"]] + yield post + + def _collection_enumerate_reverse(self, posts): + posts = list(posts) + posts.reverse() + + pid_to_num = {pid: num for num, pid in enumerate(self.post_ids, 1)} + for post in posts: + post["num"] = pid_to_num[post["id"]] + return posts + BASE_PATTERN = DanbooruExtractor.update({ "danbooru": { @@ -228,7 +273,7 @@ class DanbooruTagExtractor(DanbooruExtractor): class DanbooruPoolExtractor(DanbooruExtractor): - """Extractor for posts from danbooru pools""" + """Extractor for Danbooru pools""" subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}") filename_fmt = "{num:>04}_{id}_{filename}.{extension}" @@ -237,50 +282,28 @@ class DanbooruPoolExtractor(DanbooruExtractor): example = "https://danbooru.donmai.us/pools/12345" def metadata(self): - self.pool_id = self.groups[-1] - url = "{}/pools/{}.json".format(self.root, self.pool_id) - pool = self.request(url).json() - pool["name"] = pool["name"].replace("_", " ") - self.post_ids = pool.pop("post_ids", ()) - return {"pool": pool} + return self._collection_metadata(self.groups[-1], "pool") def posts(self): - reverse = prefix = None + return self._collection_posts(self.groups[-1], "pool") - order = self.config("order-posts") - if not order or order in ("asc", "pool", "pool_asc", "asc_pool"): - params = {"tags": "ordpool:" + self.pool_id} - elif order in ("id", "desc_id", "id_desc"): - params = {"tags": "pool:" + self.pool_id} - prefix = "b" - elif order in ("desc", "desc_pool", "pool_desc"): - params = {"tags": "ordpool:" + self.pool_id} - reverse = True - elif order in ("asc_id", "id_asc"): - params = {"tags": "pool:" + self.pool_id} - reverse = True - posts = self._pagination("/posts.json", params, prefix) - if reverse: - return self._enumerate_posts_reverse(posts) - else: - return self._enumerate_posts(posts) - - def _enumerate_posts(self, posts): - pid_to_num = {pid: num+1 for num, pid in enumerate(self.post_ids)} - for post in posts: - post["num"] = pid_to_num[post["id"]] - yield post +class DanbooruFavgroupExtractor(DanbooruExtractor): + """Extractor for Danbooru favorite groups""" + subcategory = "favgroup" + directory_fmt = ("{category}", "Favorite Groups", + "{favgroup[id]} {favgroup[name]}") + filename_fmt = "{num:>04}_{id}_{filename}.{extension}" + archive_fmt = "fg_{favgroup[id]}_{id}" + pattern = BASE_PATTERN + r"/favorite_group(?:s|/show)/(\d+)" + example = "https://danbooru.donmai.us/favorite_groups/12345" - def _enumerate_posts_reverse(self, posts): - self.log.info("Collecting posts of pool %s", self.pool_id) - posts = list(posts) - posts.reverse() + def metadata(self): + return self._collection_metadata( + self.groups[-1], "favgroup", "favorite_group") - pid_to_num = {pid: num+1 for num, pid in enumerate(self.post_ids)} - for post in posts: - post["num"] = pid_to_num[post["id"]] - 
return posts + def posts(self): + return self._collection_posts(self.groups[-1], "favgroup") class DanbooruPostExtractor(DanbooruExtractor): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 59b2d6d..3a862c1 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -687,10 +687,18 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\ for folder in folders: if match(folder["name"]): return folder + elif folder["has_subfolders"]: + for subfolder in folder["subfolders"]: + if match(subfolder["name"]): + return subfolder else: for folder in folders: if folder["folderid"] == uuid: return folder + elif folder["has_subfolders"]: + for subfolder in folder["subfolders"]: + if subfolder["folderid"] == uuid: + return subfolder raise exception.NotFoundError("folder") def _folder_urls(self, folders, category, extractor): @@ -891,7 +899,8 @@ class DeviantartGalleryExtractor(DeviantartExtractor): """Extractor for all deviations from an artist's gallery""" subcategory = "gallery" archive_fmt = "g_{_username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery(?:/all|/?\?catpath=)?/?$" + pattern = (BASE_PATTERN + r"/gallery" + r"(?:/all|/recommended-for-you|/?\?catpath=)?/?$") example = "https://www.deviantart.com/USER/gallery/" def deviations(self): @@ -987,13 +996,36 @@ class DeviantartFolderExtractor(DeviantartExtractor): def deviations(self): folders = self.api.gallery_folders(self.user) folder = self._find_folder(folders, self.folder_name, self.folder_id) + + # Leaving this here for backwards compatibility self.folder = { "title": folder["name"], "uuid" : folder["folderid"], "index": self.folder_id, "owner": self.user, + "parent_uuid": folder["parent"], } - return self.api.gallery(self.user, folder["folderid"], self.offset) + + if folder.get("subfolder"): + self.folder["parent_folder"] = folder["parent_folder"] + self.archive_fmt = "F_{folder[parent_uuid]}_{index}.{extension}" + + if self.flat: + self.directory_fmt = ("{category}", "{username}", + "{folder[parent_folder]}") + else: + self.directory_fmt = ("{category}", "{username}", + "{folder[parent_folder]}", + "{folder[title]}") + + if folder.get("has_subfolders") and self.config("subfolders", True): + for subfolder in folder["subfolders"]: + subfolder["parent_folder"] = folder["name"] + subfolder["subfolder"] = True + yield from self._folder_urls( + folder["subfolders"], "gallery", DeviantartFolderExtractor) + + yield from self.api.gallery(self.user, folder["folderid"], self.offset) def prepare(self, deviation): DeviantartExtractor.prepare(self, deviation) @@ -1004,7 +1036,7 @@ class DeviantartStashExtractor(DeviantartExtractor): """Extractor for sta.sh-ed deviations""" subcategory = "stash" archive_fmt = "{index}.{extension}" - pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)" + pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.s(h))" r"/([a-z0-9]+)") example = "https://www.deviantart.com/stash/abcde" @@ -1016,9 +1048,18 @@ class DeviantartStashExtractor(DeviantartExtractor): def deviations(self, stash_id=None): if stash_id is None: - stash_id = self.groups[0] - url = "https://www.deviantart.com/stash/" + stash_id - page = self._limited_request(url).text + legacy_url, stash_id = self.groups + else: + legacy_url = False + + if legacy_url and stash_id[0] == "2": + url = "https://sta.sh/" + stash_id + response = self._limited_request(url) + stash_id = response.url.rpartition("/")[2] + page = response.text + else: + url = 
"https://www.deviantart.com/stash/" + stash_id + page = self._limited_request(url).text if stash_id[0] == "0": uuid = text.extr(page, '//deviation/', '"') @@ -1235,7 +1276,34 @@ class DeviantartDeviationExtractor(DeviantartExtractor): deviation = self.api.deviation(uuid) deviation["_page"] = page - return (deviation,) + + _dev_info = text.extr( + page, '\\"deviationExtended\\":', ',\\"deviation\\":', None) + # Clean up escaped quotes + _json_str = re.sub( + r'(?<!\\)\\{1}"', '"', _dev_info).replace("\\'", "'") + _extended_info = util.json_loads(_json_str)[self.deviation_id] + additional_media = _extended_info.get("additionalMedia") or () + + if additional_media: + self.filename_fmt = ("{category}_{index}_{index_file}_{title}_" + "{num:>02}.{extension}") + self.archive_fmt = ("g_{_username}_{index}{index_file:?_//}." + "{extension}") + + deviation["index_file"] = 0 + deviation["count"] = 1 + len(additional_media) + deviation["num"] = 1 + yield deviation + + for index, post in enumerate(additional_media): + uri = post["media"]["baseUri"].encode().decode("unicode-escape") + deviation["content"]["src"] = uri + deviation["num"] += 1 + deviation["index_file"] = post["fileId"] + # Download only works on purchased materials - no way to check + deviation["is_downloadable"] = False + yield deviation class DeviantartScrapsExtractor(DeviantartExtractor): @@ -1366,7 +1434,7 @@ class DeviantartOAuthAPI(): def __init__(self, extractor): self.extractor = extractor self.log = extractor.log - self.headers = {"dA-minor-version": "20200519"} + self.headers = {"dA-minor-version": "20210526"} self._warn_429 = True self.delay = extractor.config("wait-min", 0) diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py deleted file mode 100644 index 31a302d..0000000 --- a/gallery_dl/extractor/hentaifox.py +++ /dev/null @@ -1,119 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2019-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://hentaifox.com/""" - -from .common import GalleryExtractor, Extractor, Message -from .. 
import text, util - - -class HentaifoxBase(): - """Base class for hentaifox extractors""" - category = "hentaifox" - root = "https://hentaifox.com" - - -class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): - """Extractor for image galleries on hentaifox.com""" - pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" - example = "https://hentaifox.com/gallery/12345/" - - def __init__(self, match): - GalleryExtractor.__init__(self, match) - self.gallery_id = match.group(2) - - @staticmethod - def _split(txt): - return [ - text.remove_html(tag.partition(">")[2], "", "") - for tag in text.extract_iter( - txt, "class='tag_btn", "<span class='t_badge") - ] - - def metadata(self, page): - extr = text.extract_from(page) - split = self._split - - return { - "gallery_id": text.parse_int(self.gallery_id), - "parody" : split(extr(">Parodies:" , "</ul>")), - "characters": split(extr(">Characters:", "</ul>")), - "tags" : split(extr(">Tags:" , "</ul>")), - "artist" : split(extr(">Artists:" , "</ul>")), - "group" : split(extr(">Groups:" , "</ul>")), - "type" : text.remove_html(extr(">Category:", "<span")), - "title" : text.unescape(extr( - 'id="gallery_title" value="', '"')), - "language" : "English", - "lang" : "en", - } - - def images(self, page): - cover, pos = text.extract(page, '<img src="', '"') - data , pos = text.extract(page, "$.parseJSON('", "');", pos) - path = "/".join(cover.split("/")[3:-1]) - - result = [] - append = result.append - extmap = {"j": "jpg", "p": "png", "g": "gif"} - urlfmt = ("/" + path + "/{}.{}").format - - server1 = "https://i.hentaifox.com" - server2 = "https://i2.hentaifox.com" - - for num, image in util.json_loads(data).items(): - ext, width, height = image.split(",") - path = urlfmt(num, extmap[ext]) - append((server1 + path, { - "width" : width, - "height" : height, - "_fallback": (server2 + path,), - })) - - return result - - -class HentaifoxSearchExtractor(HentaifoxBase, Extractor): - """Extractor for search results and listings on hentaifox.com""" - subcategory = "search" - pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com" - r"(/(?:parody|tag|artist|character|search|group)/[^/?%#]+)") - example = "https://hentaifox.com/tag/TAG/" - - def __init__(self, match): - Extractor.__init__(self, match) - self.path = match.group(1) - - def items(self): - for gallery in self.galleries(): - yield Message.Queue, gallery["url"], gallery - - def galleries(self): - num = 1 - - while True: - url = "{}{}/pag/{}/".format(self.root, self.path, num) - page = self.request(url).text - - for info in text.extract_iter( - page, 'class="g_title"><a href="', '</a>'): - url, _, title = info.partition('">') - - yield { - "url" : text.urljoin(self.root, url), - "gallery_id": text.parse_int( - url.strip("/").rpartition("/")[2]), - "title" : text.unescape(title), - "_extractor": HentaifoxGalleryExtractor, - } - - pos = page.find(">Next<") - url = text.rextract(page, "href=", ">", pos)[0] - if pos == -1 or "/pag" not in url: - return - num += 1 diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index e15e13c..086b77c 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -16,19 +16,25 @@ import string import re -class HitomiGalleryExtractor(GalleryExtractor): - """Extractor for image galleries from hitomi.la""" +class HitomiExtractor(Extractor): + """Base class for hitomi extractors""" category = "hitomi" root = "https://hitomi.la" + domain = "gold-usergeneratedcontent.net" + + +class 
HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor): + """Extractor for hitomi.la galleries""" pattern = (r"(?:https?://)?hitomi\.la" r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)" r"/(?:[^/?#]+-)?(\d+)") example = "https://hitomi.la/manga/TITLE-867789.html" def __init__(self, match): - self.gid = match.group(1) - url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gid) - GalleryExtractor.__init__(self, match, url) + GalleryExtractor.__init__(self, match, False) + self.gid = gid = self.groups[0] + self.gallery_url = "https://ltn.{}/galleries/{}.js".format( + self.domain, gid) def _init(self): self.session.headers["Referer"] = "{}/reader/{}.html".format( @@ -71,43 +77,34 @@ class HitomiGalleryExtractor(GalleryExtractor): } def images(self, _): - # see https://ltn.hitomi.la/gg.js + # https://ltn.gold-usergeneratedcontent.net/gg.js gg_m, gg_b, gg_default = _parse_gg(self) - fmt = self.config("format") or "webp" - if fmt == "original": - subdomain, path, ext, check = "b", "images", None, False - else: - subdomain, path, ext, check = "a", fmt, fmt, (fmt != "webp") + fmt = ext = self.config("format") or "webp" + check = (fmt != "webp") result = [] for image in self.info["files"]: if check: - if image.get("has" + fmt): - path = ext = fmt - else: - path = ext = "webp" + ext = fmt if image.get("has" + fmt) else "webp" ihash = image["hash"] idata = text.nameext_from_url(image["name"]) idata["extension_original"] = idata["extension"] - if ext: - idata["extension"] = ext + idata["extension"] = ext - # see https://ltn.hitomi.la/common.js + # https://ltn.gold-usergeneratedcontent.net/common.js inum = int(ihash[-1] + ihash[-3:-1], 16) - url = "https://{}{}.hitomi.la/{}/{}/{}/{}.{}".format( - chr(97 + gg_m.get(inum, gg_default)), - subdomain, path, gg_b, inum, ihash, idata["extension"], + url = "https://{}{}.{}/{}/{}/{}.{}".format( + ext[0], gg_m.get(inum, gg_default) + 1, self.domain, + gg_b, inum, ihash, ext, ) result.append((url, idata)) return result -class HitomiTagExtractor(Extractor): +class HitomiTagExtractor(HitomiExtractor): """Extractor for galleries from tag searches on hitomi.la""" - category = "hitomi" subcategory = "tag" - root = "https://hitomi.la" pattern = (r"(?:https?://)?hitomi\.la" r"/(tag|artist|group|series|type|character)" r"/([^/?#]+)\.html") @@ -126,8 +123,8 @@ class HitomiTagExtractor(Extractor): "_extractor": HitomiGalleryExtractor, "search_tags": text.unquote(self.tag.rpartition("-")[0]), } - nozomi_url = "https://ltn.hitomi.la/{}/{}.nozomi".format( - self.type, self.tag) + nozomi_url = "https://ltn.{}/{}/{}.nozomi".format( + self.domain, self.type, self.tag) headers = { "Origin": self.root, "Cache-Control": "max-age=0", @@ -166,8 +163,8 @@ class HitomiIndexExtractor(HitomiTagExtractor): def items(self): data = {"_extractor": HitomiGalleryExtractor} - nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format( - self.tag, self.language) + nozomi_url = "https://ltn.{}/{}-{}.nozomi".format( + self.domain, self.tag, self.language) headers = { "Origin": self.root, "Cache-Control": "max-age=0", @@ -194,11 +191,9 @@ class HitomiIndexExtractor(HitomiTagExtractor): return -class HitomiSearchExtractor(Extractor): +class HitomiSearchExtractor(HitomiExtractor): """Extractor for galleries from multiple tag searches on hitomi.la""" - category = "hitomi" subcategory = "search" - root = "https://hitomi.la" pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)" example = "https://hitomi.la/search.html?QUERY" @@ -224,11 +219,11 @@ class 
HitomiSearchExtractor(Extractor): area, tag, language = self.get_nozomi_args(full_tag) if area: - nozomi_url = "https://ltn.hitomi.la/n/{}/{}-{}.nozomi".format( - area, tag, language) + nozomi_url = "https://ltn.{}/n/{}/{}-{}.nozomi".format( + self.domain, area, tag, language) else: - nozomi_url = "https://ltn.hitomi.la/n/{}-{}.nozomi".format( - tag, language) + nozomi_url = "https://ltn.{}/n/{}-{}.nozomi".format( + self.domain, tag, language) headers = { "Origin": self.root, @@ -257,7 +252,7 @@ class HitomiSearchExtractor(Extractor): @memcache(maxage=1800) def _parse_gg(extr): - page = extr.request("https://ltn.hitomi.la/gg.js").text + page = extr.request("https://ltn.gold-usergeneratedcontent.net/gg.js").text m = {} @@ -280,4 +275,4 @@ def _parse_gg(extr): d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page) b = re.search(r"b:\s*[\"'](.+)[\"']", page) - return m, b.group(1).strip("/"), int(d.group(1)) if d else 1 + return m, b.group(1).strip("/"), int(d.group(1)) if d else 0 diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py index 0439f5b..1b0fba3 100644 --- a/gallery_dl/extractor/imhentai.py +++ b/gallery_dl/extractor/imhentai.py @@ -22,10 +22,15 @@ class ImhentaiExtractor(BaseExtractor): while True: page = self.request(url).text + + pos = page.find('class="ranking_list"') + if pos >= 0: + page = page[:pos] + extr = text.extract_from(page) while True: - gallery_id = extr('<a href="/gallery/', '"') + gallery_id = extr('href="/gallery/', '"') if gallery_id == prev: continue if not gallery_id: @@ -57,6 +62,18 @@ BASE_PATTERN = ImhentaiExtractor.update({ "root": "https://hentairox.com", "pattern": r"(?:www\.)?hentairox\.com", }, + "hentaifox": { + "root": "https://hentaifox.com", + "pattern": r"(?:www\.)?hentaifox\.com", + }, + "hentaienvy": { + "root": "https://hentaienvy.com", + "pattern": r"(?:www\.)?hentaienvy\.com", + }, + "hentaizap": { + "root": "https://hentaizap.com", + "pattern": r"(?:www\.)?hentaizap\.com", + }, }) @@ -72,17 +89,20 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor): def metadata(self, page): extr = text.extract_from(page) + title = extr("<h1>", "<") + title_alt = extr('class="subtitle">', "<") + end = "</li>" if extr('<ul class="galleries_info', ">") else "</ul>" data = { "gallery_id": text.parse_int(self.gallery_id), - "title" : text.unescape(extr("<h1>", "<")), - "title_alt" : text.unescape(extr('class="subtitle">', "<")), - "parody" : self._split(extr(">Parodies", "</li>")), - "character" : self._split(extr(">Characters", "</li>")), - "tags" : self._split(extr(">Tags", "</li>")), - "artist" : self._split(extr(">Artists", "</li>")), - "group" : self._split(extr(">Groups", "</li>")), - "language" : self._split(extr(">Languages", "</li>")), + "title" : text.unescape(title), + "title_alt" : text.unescape(title_alt), + "parody" : self._split(extr(">Parodies", end)), + "character" : self._split(extr(">Characters", end)), + "tags" : self._split(extr(">Tags", end)), + "artist" : self._split(extr(">Artists", end)), + "group" : self._split(extr(">Groups", end)), + "language" : self._split(extr(">Languages", end)), "type" : extr("href='/category/", "/"), } @@ -94,10 +114,12 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor): def _split(self, html): results = [] for tag in text.extract_iter(html, ">", "</a>"): - tag = tag.partition(" <span class='badge'>")[0] - if "<" in tag: - tag = text.remove_html(tag) + badge = ("badge'>" in tag or "class='badge" in tag) + tag = text.remove_html(tag) + if 
badge: + tag = tag.rpartition(" ")[0] results.append(tag) + results.sort() return results def images(self, page): @@ -132,9 +154,9 @@ class ImhentaiTagExtractor(ImhentaiExtractor): class ImhentaiSearchExtractor(ImhentaiExtractor): """Extractor for imhentai search results""" subcategory = "search" - pattern = BASE_PATTERN + r"/search/?\?([^#]+)" + pattern = BASE_PATTERN + r"/search(/?\?[^#]+|/[^/?#]+/?)" example = "https://imhentai.xxx/search/?key=QUERY" def items(self): - url = self.root + "/search/?" + self.groups[-1] + url = self.root + "/search" + self.groups[-1] return self._pagination(url) diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index e344b2f..aa26408 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -56,9 +56,11 @@ class InstagramExtractor(Extractor): data = self.metadata() videos = self.config("videos", True) + if videos: + videos_dash = (videos != "merged") + videos_headers = {"User-Agent": "Mozilla/5.0"} previews = self.config("previews", False) max_posts = self.config("max-posts") - video_headers = {"User-Agent": "Mozilla/5.0"} order = self.config("order-files") reverse = order[0] in ("r", "d") if order else False @@ -92,8 +94,12 @@ class InstagramExtractor(Extractor): url = file.get("video_url") if url: if videos: - file["_http_headers"] = video_headers + file["_http_headers"] = videos_headers text.nameext_from_url(url, file) + if videos_dash: + file["_fallback"] = (url,) + file["_ytdl_manifest"] = "dash" + url = "ytdl:dash" yield Message.Url, url, file if previews: file["media_id"] += "p" @@ -246,6 +252,7 @@ class InstagramExtractor(Extractor): "video_url" : video["url"] if video else None, "width" : media["width"], "height" : media["height"], + "_ytdl_manifest_data": item.get("video_dash_manifest"), } if "expiring_at" in item: diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 788b5d9..860e771 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -57,11 +57,13 @@ class KemonopartyExtractor(Extractor): find_hash = re.compile(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) announcements = True if self.config("announcements") else None + archives = True if self.config("archives") else False comments = True if self.config("comments") else False duplicates = True if self.config("duplicates") else False dms = True if self.config("dms") else None max_posts = self.config("max-posts") - creator_info = {} if self.config("metadata") else None + creator_info = {} if self.config("metadata", True) else None + exts_archive = {"zip", "rar", "7z"} # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} @@ -115,6 +117,7 @@ class KemonopartyExtractor(Extractor): files = [] hashes = set() + post_archives = post["archives"] = [] for file in itertools.chain.from_iterable( g(post) for g in generators): @@ -129,31 +132,45 @@ class KemonopartyExtractor(Extractor): continue hashes.add(hash) else: - file["hash"] = "" + file["hash"] = hash = "" + + if url[0] == "/": + url = self.root + "/data" + url + elif url.startswith(self.root): + url = self.root + "/data" + url[20:] + file["url"] = url + + text.nameext_from_url(file.get("name", url), file) + ext = text.ext_from_url(url) + if not file["extension"]: + file["extension"] = ext + elif ext == "txt" and file["extension"] != "txt": + file["_http_validate"] = _validate + elif ext in exts_archive: + file["type"] = "archive" 
+ if archives: + try: + data = self.api.posts_archives(file["hash"]) + data.update(file) + post_archives.append(data) + except Exception as exc: + self.log.warning( + "%s: Failed to retrieve archive metadata of " + "'%s' (%s: %s)", post["id"], file.get("name"), + exc.__class__.__name__, exc) + post_archives.append(file.copy()) + else: + post_archives.append(file.copy()) files.append(file) post["count"] = len(files) yield Message.Directory, post - for post["num"], file in enumerate(files, 1): - post["_http_validate"] = None - post["hash"] = file["hash"] - post["type"] = file["type"] - url = file["path"] - - text.nameext_from_url(file.get("name", url), post) - ext = text.ext_from_url(url) - if not post["extension"]: - post["extension"] = ext - elif ext == "txt" and post["extension"] != "txt": - post["_http_validate"] = _validate - - if url[0] == "/": - url = self.root + "/data" + url - elif url.startswith(self.root): - url = self.root + "/data" + url[20:] - yield Message.Url, url, post + if "id" in file: + del file["id"] + post.update(file) + yield Message.Url, file["url"], post def login(self): username, password = self._get_auth_info() @@ -368,17 +385,18 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): key = "id" else: key = "name" + else: + key = "id" + channel = channel_id + if not channel_name or not channel_id: for ch in self.api.discord_server(server_id): if ch[key] == channel: break else: raise exception.NotFoundError("channel") - channel_id = ch["id"] channel_name = ch["name"] - elif channel_name is None: - channel_name = "" find_inline = re.compile( r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" @@ -503,6 +521,10 @@ class KemonoAPI(): params = {"q": query, "o": offset, "tag": tags} return self._pagination(endpoint, params, 50, "posts") + def posts_archives(self, file_hash): + endpoint = "/posts/archives/" + file_hash + return self._call(endpoint)["archive"] + def creator_posts(self, service, creator_id, offset=0, query=None): endpoint = "/{}/user/{}".format(service, creator_id) params = {"q": query, "o": offset} diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 6f7a238..b11f81d 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -10,9 +10,13 @@ from .common import ChapterExtractor, Extractor, Message from .. 
import text, util, exception +from ..cache import memcache import re -BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangapark\.(?:net|com|org|io|me)" +BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:" + r"(?:manga|comic|read)park\.(?:com|net|org|me|io|to)|" + r"parkmanga\.(?:com|net|org)|" + r"mpark\.to)") class MangaparkBase(): @@ -31,57 +35,87 @@ class MangaparkBase(): match = self._match_title(title) return match.groups() if match else (0, 0, "", "") + @memcache(keyarg=1) + def _extract_manga(self, manga_id): + variables = { + "getComicNodeId": manga_id, + } + return self._request_graphql("Get_comicNode", variables)["data"] + + def _extract_chapter(self, chapter_id): + variables = { + "getChapterNodeId": chapter_id, + } + return self._request_graphql("Get_chapterNode", variables)["data"] + + def _extract_chapters_all(self, manga_id): + variables = { + "comicId": manga_id, + } + return self._request_graphql("Get_comicChapterList", variables) + + def _extract_chapters_source(self, source_id): + variables = { + "sourceId": source_id, + } + return self._request_graphql( + "get_content_source_chapterList", variables) + + def _request_graphql(self, opname, variables): + url = self.root + "/apo/" + data = { + "query" : QUERIES[opname], + "variables" : variables, + "operationName": opname, + } + return self.request( + url, method="POST", json=data).json()["data"].popitem()[1] + class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" - pattern = BASE_PATTERN + r"/title/[^/?#]+/(\d+)" + pattern = (BASE_PATTERN + + r"/(?:title/[^/?#]+/|comic/\d+/[^/?#]+/[^/?#]+-i)(\d+)") example = "https://mangapark.net/title/MANGA/12345-en-ch.01" def __init__(self, match): self.root = text.root_from_url(match.group(0)) - url = "{}/title/_/{}".format(self.root, match.group(1)) - ChapterExtractor.__init__(self, match, url) - - def metadata(self, page): - data = self._extract_nextdata(page) - chapter = (data["props"]["pageProps"]["dehydratedState"] - ["queries"][0]["state"]["data"]["data"]) - manga = chapter["comicNode"]["data"] - source = chapter["sourceNode"]["data"] - - self._urls = chapter["imageSet"]["httpLis"] - self._params = chapter["imageSet"]["wordLis"] + ChapterExtractor.__init__(self, match, False) + + def metadata(self, _): + chapter = self._extract_chapter(self.groups[0]) + manga = self._extract_manga(chapter["comicNode"]["id"]) + + self._urls = chapter["imageFile"]["urlList"] vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + lang = chapter.get("lang") or "en" return { "manga" : manga["name"], - "manga_id" : manga["id"], - "artist" : source["artists"], - "author" : source["authors"], - "genre" : source["genres"], + "manga_id" : text.parse_int(manga["id"]), + "artist" : manga["artists"], + "author" : manga["authors"], + "genre" : manga["genres"], "volume" : text.parse_int(vol), "chapter" : text.parse_int(ch), "chapter_minor": minor, - "chapter_id": chapter["id"], - "title" : chapter["title"] or title or "", - "lang" : chapter["lang"], - "language" : util.code_to_language(chapter["lang"]), - "source" : source["srcTitle"], - "source_id" : source["id"], + "chapter_id": text.parse_int(chapter["id"]), + "title" : title or "", + "lang" : lang, + "language" : util.code_to_language(lang), + "source" : chapter["srcTitle"], + "source_id" : chapter["sourceId"], "date" : text.parse_timestamp(chapter["dateCreate"] // 1000), } - def images(self, page): - return [ - (url + "?" 
+ params, None) - for url, params in zip(self._urls, self._params) - ] + def images(self, _): + return [(url, None) for url in self._urls] class MangaparkMangaExtractor(MangaparkBase, Extractor): """Extractor for manga from mangapark.net""" subcategory = "manga" - pattern = BASE_PATTERN + r"/title/(\d+)(?:-[^/?#]*)?/?$" + pattern = BASE_PATTERN + r"/(?:title|comic)/(\d+)(?:[/-][^/?#]*)?/?$" example = "https://mangapark.net/title/12345-MANGA" def __init__(self, match): @@ -95,6 +129,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): url = self.root + chapter["urlPath"] vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + lang = chapter.get("lang") or "en" + data = { "manga_id" : self.manga_id, "volume" : text.parse_int(vol), @@ -102,8 +138,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): "chapter_minor": minor, "chapter_id": chapter["id"], "title" : chapter["title"] or title or "", - "lang" : chapter["lang"], - "language" : util.code_to_language(chapter["lang"]), + "lang" : lang, + "language" : util.code_to_language(lang), "source" : chapter["srcTitle"], "source_id" : chapter["sourceId"], "date" : text.parse_timestamp( @@ -114,45 +150,12 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): def chapters(self): source = self.config("source") - if not source: - return self.chapters_all() - - source_id = self._select_source(source) - self.log.debug("Requesting chapters for source_id %s", source_id) - return self.chapters_source(source_id) - - def chapters_all(self): - pnum = 0 - variables = { - "select": { - "comicId": self.manga_id, - "range" : None, - "isAsc" : not self.config("chapter-reverse"), - } - } - - while True: - data = self._request_graphql( - "get_content_comicChapterRangeList", variables) - - for item in data["items"]: - yield from item["chapterNodes"] - - if not pnum: - pager = data["pager"] - pnum += 1 - - try: - variables["select"]["range"] = pager[pnum] - except IndexError: - return - - def chapters_source(self, source_id): - variables = { - "sourceId": source_id, - } - chapters = self._request_graphql( - "get_content_source_chapterList", variables) + if source: + source_id = self._select_source(source) + self.log.debug("Requesting chapters for source_id %s", source_id) + chapters = self._extract_chapters_source(source_id) + else: + chapters = self._extract_chapters_all(self.groups[0]) if self.config("chapter-reverse"): chapters.reverse() @@ -180,101 +183,58 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor): raise exception.StopExtraction( "'%s' does not match any available source", source) - def _request_graphql(self, opname, variables): - url = self.root + "/apo/" - data = { - "query" : QUERIES[opname], - "variables" : util.json_dumps(variables), - "operationName": opname, - } - return self.request( - url, method="POST", json=data).json()["data"][opname] - QUERIES = { - "get_content_comicChapterRangeList": """ - query get_content_comicChapterRangeList($select: Content_ComicChapterRangeList_Select) { - get_content_comicChapterRangeList( - select: $select - ) { - reqRange{x y} - missing - pager {x y} - items{ - serial - chapterNodes { - - id - data { - - - id - sourceId - - dbStatus - isNormal - isHidden - isDeleted - isFinal - - dateCreate - datePublic - dateModify - lang - volume - serial - dname - title - urlPath - - srcTitle srcColor - - count_images - - stat_count_post_child - stat_count_post_reply - stat_count_views_login - stat_count_views_guest - - userId - userNode { - - id - data { - -id -name -uniq -avatarUrl 
-urlPath - -verified -deleted -banned - -dateCreate -dateOnline - -stat_count_chapters_normal -stat_count_chapters_others - -is_adm is_mod is_vip is_upr - - } - - } - - disqusId - - - } + "Get_comicChapterList": """ +query Get_comicChapterList($comicId: ID!) { + get_comicChapterList(comicId: $comicId) { + data { + id + dname + title + lang + urlPath + srcTitle + sourceId + dateCreate + } + } +} +""", - sser_read + "Get_chapterNode": """ +query Get_chapterNode($getChapterNodeId: ID!) { + get_chapterNode(id: $getChapterNodeId) { + data { + id + dname + lang + sourceId + srcTitle + dateCreate + comicNode{ + id + } + imageFile { + urlList + } } - } + } +} +""", + "Get_comicNode": """ +query Get_comicNode($getComicNodeId: ID!) { + get_comicNode(id: $getComicNodeId) { + data { + id + name + artists + authors + genres + } } - } +} """, "get_content_source_chapterList": """ diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 5b354ac..5e78ad4 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -196,7 +196,8 @@ class MastodonFollowingExtractor(MastodonExtractor): class MastodonStatusExtractor(MastodonExtractor): """Extractor for images from a status""" subcategory = "status" - pattern = BASE_PATTERN + r"/@[^/?#]+/(?!following)([^/?#]+)" + pattern = (BASE_PATTERN + r"/(?:@[^/?#]+|(?:users/[^/?#]+/)?statuses)" + r"/(?!following)([^/?#]+)") example = "https://mastodon.social/@USER/12345" def statuses(self): diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index 851f663..3d1722a 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -21,6 +21,7 @@ class NozomiExtractor(Extractor): """Base class for nozomi extractors""" category = "nozomi" root = "https://nozomi.la" + domain = "gold-usergeneratedcontent.net" filename_fmt = "{postid} {dataid}.{extension}" archive_fmt = "{dataid}" @@ -31,8 +32,8 @@ class NozomiExtractor(Extractor): data = self.metadata() for post_id in map(str, self.posts()): - url = "https://j.nozomi.la/post/{}/{}/{}.json".format( - post_id[-1], post_id[-3:-1], post_id) + url = "https://j.{}/post/{}/{}/{}.json".format( + self.domain, post_id[-1], post_id[-3:-1], post_id) response = self.request(url, fatal=False) if response.status_code >= 400: @@ -76,8 +77,8 @@ class NozomiExtractor(Extractor): ext = "webp" post["extension"] = ext - post["url"] = url = "https://{}.nozomi.la/{}/{}/{}.{}".format( - subdomain, did[-1], did[-3:-1], did, ext) + post["url"] = url = "https://{}.{}/{}/{}/{}.{}".format( + subdomain, self.domain, did[-1], did[-3:-1], did, ext) yield Message.Url, url, post def posts(self): @@ -168,7 +169,7 @@ class NozomiSearchExtractor(NozomiExtractor): negative = [] def nozomi(path): - url = "https://j.nozomi.la/" + path + ".nozomi" + url = "https://j.{}/{}.nozomi".format(self.domain, path) return decode_nozomi(self.request(url).content) for tag in self.tags: diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index f5a33d5..b8c6acb 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -27,7 +27,7 @@ class PatreonExtractor(Extractor): _warning = True def _init(self): - if not self.cookies_check(("session_id",)): + if not self.cookies_check(("session_id",), subdomains=True): if self._warning: PatreonExtractor._warning = False self.log.warning("no 'session_id' cookie set") @@ -329,10 +329,11 @@ class PatreonCreatorExtractor(PatreonExtractor): """Extractor for a creator's works""" subcategory = 
"creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" - r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))" + r"/(?!(?:home|create|login|signup|search|posts|messages)" + r"(?:$|[/?#]))" r"(?:profile/creators|(?:c/)?([^/?#]+)(?:/posts)?)" r"/?(?:\?([^#]+))?") - example = "https://www.patreon.com/USER" + example = "https://www.patreon.com/c/USER" def posts(self): creator, query = self.groups @@ -370,7 +371,7 @@ class PatreonCreatorExtractor(PatreonExtractor): data = None data = self._extract_bootstrap(page) return data["campaign"]["data"]["id"] - except (KeyError, ValueError) as exc: + except Exception as exc: if data: self.log.debug(data) raise exception.StopExtraction( diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 121c7bf..1a299c1 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -132,6 +132,9 @@ class PinterestExtractor(Extractor): "extension": "txt", "media_id": block.get("id")} + elif type == "story_pin_static_sticker_block": + continue + else: self.log.warning("%s: Unsupported story block '%s'", pin.get("id"), type) diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 7708b5c..9e7d75d 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -10,6 +10,9 @@ from .common import Extractor, Message from .. import text +from datetime import datetime + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com" class SexcomExtractor(Extractor): @@ -23,8 +26,20 @@ class SexcomExtractor(Extractor): def items(self): yield Message.Directory, self.metadata() for pin in map(self._parse_pin, self.pins()): - if pin: - yield Message.Url, pin["url"], pin + if not pin: + continue + + url = pin["url"] + parts = url.rsplit("/", 4) + try: + pin["date_url"] = dt = datetime( + int(parts[1]), int(parts[2]), int(parts[3])) + if "date" not in pin: + pin["date"] = dt + except Exception: + pass + + yield Message.Url, url, pin def metadata(self): return {} @@ -53,10 +68,18 @@ class SexcomExtractor(Extractor): self.log.warning('Unable to fetch %s ("%s %s")', url, response.status_code, response.reason) return None + + if "/pin/" in response.url: + return self._parse_pin_legacy(response) + if "/videos/" in response.url: + return self._parse_pin_video(response) + return self._parse_pin_gifs(response) + + def _parse_pin_legacy(self, response): extr = text.extract_from(response.text) data = {} - data["_http_headers"] = {"Referer": url} + data["_http_headers"] = {"Referer": response.url} data["thumbnail"] = extr('itemprop="thumbnail" content="', '"') data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower() data["title"] = text.unescape(extr('itemprop="name">' , '<')) @@ -82,7 +105,8 @@ class SexcomExtractor(Extractor): src = (text.extr(iframe, ' src="', '"') or text.extr(iframe, " src='", "'")) if not src: - self.log.warning("Unable to fetch media from %s", url) + self.log.warning( + "Unable to fetch media from %s", response.url) return None data["extension"] = None data["url"] = "ytdl:" + src @@ -100,27 +124,60 @@ class SexcomExtractor(Extractor): return data + def _parse_pin_gifs(self, response): + extr = text.extract_from(response.text) + + data = { + "_http_headers": {"Referer": response.url}, + "type": "gif", + "url": extr(' href="', '"'), + "title": text.unescape(extr("<title>", " Gif | Sex.com<")), + "pin_id": text.parse_int(extr( + 'rel="canonical" href="', '"').rpartition("/")[2]), + "tags": text.split_html(extr("</h1>", "</section>")), + } + + return 
text.nameext_from_url(data["url"], data) + + def _parse_pin_video(self, response): + extr = text.extract_from(response.text) + + if not self.cookies.get("CloudFront-Key-Pair-Id", domain=".sex.com"): + self.log.warning("CloudFront cookies required for video downloads") + + data = { + "_ytdl_manifest": "hls", + "extension": "mp4", + "type": "video", + "title": text.unescape(extr("<title>", " | Sex.com<")), + "pin_id": text.parse_int(extr( + 'rel="canonical" href="', '"').rpartition("/")[2]), + "tags": text.split_html(extr( + 'event_name="video_tags_click"', "<div data-testid=") + .partition(">")[2]), + "url": "ytdl:" + extr('<source src="', '"'), + } + + return data + class SexcomPinExtractor(SexcomExtractor): """Extractor for a pinned image or video on www.sex.com""" subcategory = "pin" directory_fmt = ("{category}",) - pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)(?!.*#related$)" + pattern = (BASE_PATTERN + + r"(/(?:pin|\w\w/(?:gif|video)s)/\d+/?)(?!.*#related$)") example = "https://www.sex.com/pin/12345-TITLE/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.pin_id = match.group(1) - def pins(self): - return ("{}/pin/{}/".format(self.root, self.pin_id),) + return (self.root + self.groups[0],) class SexcomRelatedPinExtractor(SexcomPinExtractor): """Extractor for related pins on www.sex.com""" subcategory = "related-pin" directory_fmt = ("{category}", "related {original_pin[pin_id]}") - pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$" + pattern = BASE_PATTERN + r"(/pin/(\d+)/?).*#related$" example = "https://www.sex.com/pin/12345#related" def metadata(self): @@ -129,7 +186,7 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor): def pins(self): url = "{}/pin/related?pinId={}&limit=24&offset=0".format( - self.root, self.pin_id) + self.root, self.groups[1]) return self._pagination(url) @@ -137,18 +194,14 @@ class SexcomPinsExtractor(SexcomExtractor): """Extractor for a user's pins on www.sex.com""" subcategory = "pins" directory_fmt = ("{category}", "{user}") - pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/pins/" + pattern = BASE_PATTERN + r"/user/([^/?#]+)/pins/" example = "https://www.sex.com/user/USER/pins/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.user = match.group(1) - def metadata(self): - return {"user": text.unquote(self.user)} + return {"user": text.unquote(self.groups[0])} def pins(self): - url = "{}/user/{}/pins/".format(self.root, self.user) + url = "{}/user/{}/pins/".format(self.root, self.groups[0]) return self._pagination(url) @@ -156,18 +209,14 @@ class SexcomLikesExtractor(SexcomExtractor): """Extractor for a user's liked pins on www.sex.com""" subcategory = "likes" directory_fmt = ("{category}", "{user}", "Likes") - pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/" + pattern = BASE_PATTERN + r"/user/([^/?#]+)/likes/" example = "https://www.sex.com/user/USER/likes/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.user = match.group(1) - def metadata(self): - return {"user": text.unquote(self.user)} + return {"user": text.unquote(self.groups[0])} def pins(self): - url = "{}/user/{}/likes/".format(self.root, self.user) + url = "{}/user/{}/likes/".format(self.root, self.groups[0]) return self._pagination(url) @@ -175,15 +224,12 @@ class SexcomBoardExtractor(SexcomExtractor): """Extractor for pins from a board on www.sex.com""" subcategory = "board" directory_fmt = ("{category}", "{user}", "{board}") - pattern = 
(r"(?:https?://)?(?:www\.)?sex\.com/user" + pattern = (BASE_PATTERN + r"/user" r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)") example = "https://www.sex.com/user/USER/BOARD/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.user, self.board = match.groups() - def metadata(self): + self.user, self.board = self.groups return { "user" : text.unquote(self.user), "board": text.unquote(self.board), @@ -198,19 +244,18 @@ class SexcomSearchExtractor(SexcomExtractor): """Extractor for search results on www.sex.com""" subcategory = "search" directory_fmt = ("{category}", "search", "{search[query]}") - pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:" + pattern = (BASE_PATTERN + r"/((?:" r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s" r")/?(?:\?([^#]+))?)") example = "https://www.sex.com/search/pics?query=QUERY" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.path = match.group(1) + def _init(self): + self.path, t1, query_alt, t2, query = self.groups - self.search = text.parse_query(match.group(5)) - self.search["type"] = match.group(2) or match.group(4) + self.search = text.parse_query(query) + self.search["type"] = t1 or t2 if "query" not in self.search: - self.search["query"] = match.group(3) or "" + self.search["query"] = query_alt or "" def metadata(self): return {"search": self.search} diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 07c9b21..cdccd4c 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -48,7 +48,12 @@ class SkebExtractor(Extractor): def items(self): metadata = self.metadata() for user_name, post_num in self.posts(): - response, post = self._get_post_data(user_name, post_num) + try: + response, post = self._get_post_data(user_name, post_num) + except Exception as exc: + self.log.error("@%s/%s: %s: %s", user_name, post_num, + exc.__class__.__name__, exc) + continue if metadata: post.update(metadata) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 6c43941..5d0ec46 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -39,6 +39,8 @@ class SubscribestarExtractor(Extractor): for post_html in self.posts(): media = self._media_from_post(post_html) data = self._data_from_post(post_html) + data["title"] = text.unescape(text.extr( + data["content"], "<h1>", "</h1>")) yield Message.Directory, data for num, item in enumerate(media, 1): item.update(data) @@ -55,7 +57,9 @@ class SubscribestarExtractor(Extractor): while True: response = Extractor.request(self, url, **kwargs) - if response.history and "/verify_subscriber" in response.url: + if response.history and ( + "/verify_subscriber" in response.url or + "/age_confirmation_warning" in response.url): raise exception.StopExtraction( "HTTP redirect to %s", response.url) diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py index 30f310d..4c1da7a 100644 --- a/gallery_dl/extractor/tiktok.py +++ b/gallery_dl/extractor/tiktok.py @@ -25,14 +25,8 @@ class TiktokExtractor(Extractor): def _init(self): self.audio = self.config("audio", True) self.video = self.config("videos", True) - if not self.config("avatar", True): - self.avatar = util.false def items(self): - # We assume that all of the URLs served by urls() come from the same - # author. 
- downloaded_avatar = not self.avatar() - for tiktok_url in self.urls(): tiktok_url = self._sanitize_url(tiktok_url) data = self._extract_rehydration_data(tiktok_url) @@ -49,18 +43,10 @@ class TiktokExtractor(Extractor): post = video_detail["itemInfo"]["itemStruct"] author = post["author"] - post["user"] = user = author["uniqueId"] + post["user"] = author["uniqueId"] post["date"] = text.parse_timestamp(post["createTime"]) original_title = title = post["desc"] - if not downloaded_avatar: - avatar_url = author["avatarLarger"] - avatar = self._generate_avatar( - avatar_url, post, user, author["id"]) - yield Message.Directory, avatar - yield Message.Url, avatar_url, avatar - downloaded_avatar = True - yield Message.Directory, post ytdl_media = False @@ -111,44 +97,29 @@ class TiktokExtractor(Extractor): }) yield Message.Url, "ytdl:" + tiktok_url, post - # If we couldn't download the avatar because the given user has no - # posts, we'll need to make a separate request for the user's page - # and download the avatar that way. - if not downloaded_avatar: - user_name = self.avatar() - profile_url = "https://www.tiktok.com/@{}".format(user_name) - data = self._extract_rehydration_data(profile_url) - data = data["webapp.user-detail"]["userInfo"]["user"] - data["user"] = user_name - avatar_url = data["avatarLarger"] - avatar = self._generate_avatar( - avatar_url, data, user_name, data["id"]) - yield Message.Directory, avatar - yield Message.Url, avatar_url, avatar - - def avatar(self): - return False - - def _generate_avatar(self, avatar_url, data, user_name, user_id): - avatar = text.nameext_from_url(avatar_url, data.copy()) - avatar.update({ - "type" : "avatar", - "title" : "@" + user_name, - "id" : user_id, - "img_id": avatar["filename"].partition("~")[0], - "num" : 0, - }) - return avatar - def _sanitize_url(self, url): return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1)) def _extract_rehydration_data(self, url): - html = self.request(url).text - data = text.extr( - html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" ' - 'type="application/json">', '</script>') - return util.json_loads(data)["__DEFAULT_SCOPE__"] + tries = 0 + while True: + try: + html = self.request(url).text + data = text.extr( + html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" ' + 'type="application/json">', '</script>') + return util.json_loads(data)["__DEFAULT_SCOPE__"] + except ValueError: + # We failed to retrieve rehydration data. This happens + # relatively frequently when making many requests, so + # retry. 
+ if tries >= self._retries: + raise + tries += 1 + self.log.warning("%s: Failed to retrieve rehydration data " + "(%s/%s)", url.rpartition("/")[2], tries, + self._retries) + self.sleep(self._timeout, "retry") def _extract_audio(self, post): audio = post["music"] @@ -179,7 +150,7 @@ class TiktokExtractor(Extractor): elif status == 10204: self.log.error("%s: Requested post not available", url) elif status == 10231: - self.log.error("%s: Region locked - Try downloading with a" + self.log.error("%s: Region locked - Try downloading with a " "VPN/proxy connection", url) else: self.log.error( @@ -230,7 +201,10 @@ class TiktokUserExtractor(TiktokExtractor): pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)" example = "https://www.tiktok.com/@USER" - def urls(self): + def _init(self): + self.avatar = self.config("avatar", True) + + def items(self): """Attempt to use yt-dlp/youtube-dl to extract links from a user's page""" @@ -263,19 +237,39 @@ class TiktokUserExtractor(TiktokExtractor): ytdl_instance = ytdl.construct_YoutubeDL( module, self, user_opts, extr_opts) - # transfer cookies to ytdl + # Transfer cookies to ytdl. if self.cookies: set_cookie = ytdl_instance.cookiejar.set_cookie for cookie in self.cookies: set_cookie(cookie) + user_name = self.groups[0] + profile_url = "{}/@{}".format(self.root, user_name) + if self.avatar: + avatar_url, avatar = self._generate_avatar(user_name, profile_url) + yield Message.Directory, avatar + yield Message.Url, avatar_url, avatar + with ytdl_instance as ydl: info_dict = ydl._YoutubeDL__extract_info( - "{}/@{}".format(self.root, self.groups[0]), - ydl.get_info_extractor("TikTokUser"), + profile_url, ydl.get_info_extractor("TikTokUser"), False, {}, True) # This should include video and photo posts in /video/ URL form. 
- return [video["url"] for video in info_dict["entries"]] - - def avatar(self): - return self.groups[0] + for video in info_dict["entries"]: + data = {"_extractor": TiktokPostExtractor} + yield Message.Queue, video["url"].partition("?")[0], data + + def _generate_avatar(self, user_name, profile_url): + data = self._extract_rehydration_data(profile_url) + data = data["webapp.user-detail"]["userInfo"]["user"] + data["user"] = user_name + avatar_url = data["avatarLarger"] + avatar = text.nameext_from_url(avatar_url, data.copy()) + avatar.update({ + "type" : "avatar", + "title" : "@" + user_name, + "id" : data["id"], + "img_id": avatar["filename"].partition("~")[0], + "num" : 0, + }) + return (avatar_url, avatar) diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index bc135ad..ac1400e 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -64,16 +64,22 @@ class ZerochanExtractor(BooruExtractor): def _parse_entry_html(self, entry_id): url = "{}/{}".format(self.root, entry_id) - extr = text.extract_from(self.request(url).text) + page = self.request(url).text + try: + jsonld = self._extract_jsonld(page) + except Exception: + return {"id": entry_id} + + extr = text.extract_from(page) data = { "id" : text.parse_int(entry_id), - "author" : text.parse_unicode_escapes(extr(' "name": "', '"')), - "file_url": extr('"contentUrl": "', '"'), - "date" : text.parse_datetime(extr('"datePublished": "', '"')), - "width" : text.parse_int(extr('"width": "', ' ')), - "height" : text.parse_int(extr('"height": "', ' ')), - "size" : text.parse_bytes(extr('"contentSize": "', 'B')), + "author" : jsonld["author"]["name"], + "file_url": jsonld["contentUrl"], + "date" : text.parse_datetime(jsonld["datePublished"]), + "width" : text.parse_int(jsonld["width"][:-3]), + "height" : text.parse_int(jsonld["height"][:-3]), + "size" : text.parse_bytes(jsonld["contentSize"][:-1]), "path" : text.split_html(extr( 'class="breadcrumbs', '</nav>'))[2:], "uploader": extr('href="/user/', '"'), @@ -86,7 +92,7 @@ class ZerochanExtractor(BooruExtractor): tags = data["tags"] = [] for tag in html.split("<li class=")[1:]: category = text.extr(tag, '"', '"') - name = text.extr(tag, 'data-tag="', '"') + name = text.unescape(text.extr(tag, 'data-tag="', '"')) tags.append(category.partition(" ")[0].capitalize() + ":" + name) return data |
