Diffstat (limited to 'gallery_dl/extractor')
52 files changed, 2057 insertions, 438 deletions
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index ce1c52a..3e30ddc 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -26,10 +26,6 @@ class _8chanExtractor(Extractor): self.root = "https://8chan." + match.group(1) Extractor.__init__(self, match) - def _init(self): - tos = self.cookies_tos_name() - self.cookies.set(tos, "1", domain=self.root[8:]) - @memcache() def cookies_tos_name(self): url = self.root + "/.static/pages/confirmed.html" @@ -79,6 +75,7 @@ class _8chanThreadExtractor(_8chanExtractor): def items(self): _, board, thread = self.groups + self.cookies.set(self.cookies_tos_name(), "1", domain=self.root[8:]) # fetch thread data url = "{}/{}/res/{}.".format(self.root, board, thread) @@ -116,6 +113,8 @@ class _8chanBoardExtractor(_8chanExtractor): def items(self): _, board, pnum = self.groups + self.cookies.set(self.cookies_tos_name(), "1", domain=self.root[8:]) + pnum = text.parse_int(pnum, 1) url = "{}/{}/{}.json".format(self.root, board, pnum) data = self.request(url).json() diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 4e9fa50..594ce41 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -30,6 +30,7 @@ modules = [ "batoto", "bbc", "behance", + "bilibili", "blogger", "bluesky", "boosty", @@ -47,7 +48,9 @@ modules = [ "dynastyscans", "e621", "erome", + "everia", "exhentai", + "facebook", "fanbox", "fanleaks", "fantia", @@ -107,6 +110,7 @@ modules = [ "mangasee", "mangoxo", "misskey", + "motherless", "myhentaigallery", "myportfolio", "naver", @@ -139,6 +143,9 @@ modules = [ "reddit", "redgifs", "rule34us", + "rule34vault", + "rule34xyz", + "saint", "sankaku", "sankakucomplex", "scrolller", @@ -200,6 +207,7 @@ modules = [ "directlink", "recursive", "oauth", + "noop", "ytdl", "generic", ] diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py new file mode 100644 index 0000000..d5c419e --- /dev/null +++ b/gallery_dl/extractor/bilibili.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.bilibili.com/""" + +from .common import Extractor, Message +from .. import text, util, exception + + +class BilibiliExtractor(Extractor): + """Base class for bilibili extractors""" + category = "bilibili" + root = "https://www.bilibili.com" + request_interval = (3.0, 6.0) + + def _init(self): + self.api = BilibiliAPI(self) + + +class BilibiliUserArticlesExtractor(BilibiliExtractor): + """Extractor for a bilibili user's articles""" + subcategory = "user-articles" + pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article" + example = "https://space.bilibili.com/12345/article" + + def items(self): + for article in self.api.user_articles(self.groups[0]): + article["_extractor"] = BilibiliArticleExtractor + url = "{}/opus/{}".format(self.root, article["opus_id"]) + yield Message.Queue, url, article + + +class BilibiliArticleExtractor(BilibiliExtractor): + """Extractor for a bilibili article""" + subcategory = "article" + pattern = (r"(?:https?://)?" 
+ r"(?:t\.bilibili\.com|(?:www\.)?bilibili.com/opus)/(\d+)") + example = "https://www.bilibili.com/opus/12345" + directory_fmt = ("{category}", "{username}") + filename_fmt = "{id}_{num}.{extension}" + archive_fmt = "{id}_{num}" + + def items(self): + article = self.api.article(self.groups[0]) + + # Flatten modules list + modules = {} + for module in article["detail"]["modules"]: + del module['module_type'] + modules.update(module) + article["detail"]["modules"] = modules + + article["username"] = modules["module_author"]["name"] + + pics = [] + for paragraph in modules['module_content']['paragraphs']: + if "pic" not in paragraph: + continue + + try: + pics.extend(paragraph["pic"]["pics"]) + except Exception: + pass + + article["count"] = len(pics) + yield Message.Directory, article + for article["num"], pic in enumerate(pics, 1): + url = pic["url"] + article.update(pic) + yield Message.Url, url, text.nameext_from_url(url, article) + + +class BilibiliAPI(): + def __init__(self, extractor): + self.extractor = extractor + + def _call(self, endpoint, params): + url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint + data = self.extractor.request(url, params=params).json() + + if data["code"] != 0: + self.extractor.log.debug("Server response: %s", data) + raise exception.StopExtraction("API request failed") + + return data + + def user_articles(self, user_id): + endpoint = "/opus/feed/space" + params = {"host_mid": user_id} + + while True: + data = self._call(endpoint, params) + + for item in data["data"]["items"]: + params["offset"] = item["opus_id"] + yield item + + if not data["data"]["has_more"]: + break + + def article(self, article_id): + url = "https://www.bilibili.com/opus/" + article_id + + while True: + page = self.extractor.request(url).text + try: + return util.json_loads(text.extr( + page, "window.__INITIAL_STATE__=", "};") + "}") + except Exception: + if "window._riskdata_" not in page: + raise exception.StopExtraction( + "%s: Unable to extract INITIAL_STATE data", article_id) + self.extractor.wait(seconds=300) diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py index 37075ea..ef117da 100644 --- a/gallery_dl/extractor/blogger.py +++ b/gallery_dl/extractor/blogger.py @@ -159,7 +159,7 @@ class BloggerAPI(): def __init__(self, extractor): self.extractor = extractor - self.api_key = extractor.config("api-key", self.API_KEY) + self.api_key = extractor.config("api-key") or self.API_KEY def blog_by_url(self, url): return self._call("blogs/byurl", {"url": url}, "blog") diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index a1a488e..bbff17c 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -12,7 +12,8 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache, memcache -BASE_PATTERN = r"(?:https?://)?bsky\.app" +BASE_PATTERN = (r"(?:https?://)?" 
+ r"(?:(?:www\.)?(?:c|[fv]x)?bs[ky]y[ex]?\.app|main\.bsky\.dev)") USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)" @@ -60,8 +61,10 @@ class BlueskyExtractor(Extractor): yield Message.Directory, post if files: - base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob" - "?did={}&cid=".format(post["author"]["did"])) + did = post["author"]["did"] + base = ( + "{}/xrpc/com.atproto.sync.getBlob?did={}&cid=".format( + self.api.service_endpoint(did), did)) for post["num"], file in enumerate(files, 1): post.update(file) yield Message.Url, base + file["filename"], post @@ -84,7 +87,14 @@ class BlueskyExtractor(Extractor): def _pid(self, post): return post["uri"].rpartition("/")[2] + @memcache(keyarg=1) + def _instance(self, handle): + return ".".join(handle.rsplit(".", 2)[-2:]) + def _prepare(self, post): + author = post["author"] + author["instance"] = self._instance(author["handle"]) + if self._metadata_facets: if "facets" in post: post["hashtags"] = tags = [] @@ -102,7 +112,7 @@ class BlueskyExtractor(Extractor): post["hashtags"] = post["mentions"] = post["uris"] = () if self._metadata_user: - post["user"] = self._user or post["author"] + post["user"] = self._user or author post["instance"] = self.instance post["post_id"] = self._pid(post) @@ -317,6 +327,15 @@ class BlueskySearchExtractor(BlueskyExtractor): return self.api.search_posts(self.user) +class BlueskyHashtagExtractor(BlueskyExtractor): + subcategory = "hashtag" + pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)(?:/(top|latest))?" + example = "https://bsky.app/hashtag/NAME" + + def posts(self): + return self.api.search_posts("#"+self.user, self.groups[1]) + + class BlueskyAPI(): """Interface for the Bluesky API @@ -412,11 +431,28 @@ class BlueskyAPI(): params = {"handle": handle} return self._call(endpoint, params)["did"] - def search_posts(self, query): + @memcache(keyarg=1) + def service_endpoint(self, did): + if did.startswith('did:web:'): + url = "https://" + did[8:] + "/.well-known/did.json" + else: + url = "https://plc.directory/" + did + + try: + data = self.extractor.request(url).json() + for service in data["service"]: + if service["type"] == "AtprotoPersonalDataServer": + return service["serviceEndpoint"] + except Exception: + pass + return "https://bsky.social" + + def search_posts(self, query, sort=None): endpoint = "app.bsky.feed.searchPosts" params = { "q" : query, "limit": "100", + "sort" : sort, } return self._pagination(endpoint, params, "posts") @@ -430,7 +466,8 @@ class BlueskyAPI(): if user_did and not extr.config("reposts", False): extr._user_did = did if extr._metadata_user: - extr._user = self.get_profile(did) + extr._user = user = self.get_profile(did) + user["instance"] = extr._instance(user["handle"]) return did diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py index 997de4a..33823be 100644 --- a/gallery_dl/extractor/boosty.py +++ b/gallery_dl/extractor/boosty.py @@ -35,8 +35,16 @@ class BoostyExtractor(Extractor): if isinstance(videos, str): videos = videos.split(",") elif not isinstance(videos, (list, tuple)): - videos = ("quad_hd", "ultra_hd", "full_hd", - "high", "medium", "low") + # ultra_hd: 2160p + # quad_hd: 1440p + # full_hd: 1080p + # high: 720p + # medium: 480p + # low: 360p + # lowest: 240p + # tiny: 144p + videos = ("ultra_hd", "quad_hd", "full_hd", + "high", "medium", "low", "lowest", "tiny") self.videos = videos def items(self): @@ -325,6 +333,7 @@ class BoostyAPI(): def _pagination(self, endpoint, params, transform=None, key=None): if "is_only_allowed" not in params 
and self.extractor.only_allowed: + params["only_allowed"] = "true" params["is_only_allowed"] = "true" while True: diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 6c79d0a..3e12452 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -22,13 +22,14 @@ else: BASE_PATTERN = ( r"(?:bunkr:(?:https?://)?([^/?#]+)|" r"(?:https?://)?(?:app\.)?(bunkr+" - r"\.(?:s[kiu]|[cf]i|p[hks]|ru|la|is|to|a[cx]" + r"\.(?:s[kiu]|c[ir]|fi|p[hks]|ru|la|is|to|a[cx]" r"|black|cat|media|red|site|ws|org)))" ) DOMAINS = [ "bunkr.ac", "bunkr.ci", + "bunkr.cr", "bunkr.fi", "bunkr.ph", "bunkr.pk", @@ -110,13 +111,17 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): def fetch_album(self, album_id): # album metadata - page = self.request(self.root + "/a/" + self.album_id).text + page = self.request(self.root + "/a/" + album_id).text title, size = text.split_html(text.extr( page, "<h1", "</span>").partition(">")[2]) + if "&" in title: + title = title.replace( + "&lt;", "<").replace("&gt;", ">").replace("&amp;", "&") + # files items = list(text.extract_iter(page, "<!-- item -->", "<!-- -->")) return self._extract_files(items), { - "album_id" : self.album_id, + "album_id" : album_id, "album_name" : title, "album_size" : text.extr(size, "(", ")"), "count" : len(items), diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py index 0b1e44a..1e8cb42 100644 --- a/gallery_dl/extractor/civitai.py +++ b/gallery_dl/extractor/civitai.py @@ -44,6 +44,16 @@ class CivitaiExtractor(Extractor): self._image_quality = "original=true" self._image_ext = "png" + metadata = self.config("metadata") + if metadata: + if isinstance(metadata, str): + metadata = metadata.split(",") + elif not isinstance(metadata, (list, tuple)): + metadata = ("generation",) + self._meta_generation = ("generation" in metadata) + else: + self._meta_generation = False + def items(self): models = self.models() if models: @@ -81,6 +91,9 @@ class CivitaiExtractor(Extractor): if images: for image in images: url = self._url(image) + if self._meta_generation: + image["generation"] = self.api.image_generationdata( + image["id"]) image["date"] = text.parse_datetime( image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") text.nameext_from_url(url, image) @@ -127,6 +140,8 @@ class CivitaiExtractor(Extractor): data["extension"] = self._image_ext if "id" not in file and data["filename"].isdecimal(): file["id"] = text.parse_int(data["filename"]) + if self._meta_generation: + file["generation"] = self.api.image_generationdata(file["id"]) yield data @@ -469,7 +484,7 @@ class CivitaiTrpcAPI(): self.root = extractor.root + "/api/trpc/" self.headers = { "content-type" : "application/json", - "x-client-version": "5.0.185", + "x-client-version": "5.0.211", "x-client-date" : "", "x-client" : "web", "x-fingerprint" : "undefined", @@ -491,6 +506,11 @@ class CivitaiTrpcAPI(): params = {"id": int(image_id)} return (self._call(endpoint, params),) + def image_generationdata(self, image_id): + endpoint = "image.getGenerationData" + params = {"id": int(image_id)} + return self._call(endpoint, params) + def images(self, params, defaults=True): endpoint = "image.getInfinite" diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 2146fa6..f364124 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -11,7 +11,6 @@ import os import re import ssl -import sys import time import netrc import queue @@ -23,7 +22,7 @@ import requests import threading from requests.adapters import 
HTTPAdapter from .message import Message -from .. import config, text, util, cache, exception +from .. import config, output, text, util, cache, exception urllib3 = requests.packages.urllib3 @@ -43,6 +42,8 @@ class Extractor(): ciphers = None tls12 = True browser = None + useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " + "rv:128.0) Gecko/20100101 Firefox/128.0") request_interval = 0.0 request_interval_min = 0.0 request_interval_429 = 60.0 @@ -289,13 +290,8 @@ class Extractor(): def _check_input_allowed(self, prompt=""): input = self.config("input") - if input is None: - try: - input = sys.stdin.isatty() - except Exception: - input = False - + input = output.TTY_STDIN if not input: raise exception.StopExtraction( "User input required (%s)", prompt.strip(" :")) @@ -351,6 +347,9 @@ class Extractor(): headers.clear() ssl_options = ssl_ciphers = 0 + # .netrc Authorization headers are alwsays disabled + session.trust_env = True if self.config("proxy-env", False) else False + browser = self.config("browser") if browser is None: browser = self.browser @@ -384,11 +383,13 @@ class Extractor(): ssl_ciphers = SSL_CIPHERS[browser] else: useragent = self.config("user-agent") - if useragent is None: - useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " - "rv:128.0) Gecko/20100101 Firefox/128.0") + if useragent is None or useragent == "auto": + useragent = self.useragent elif useragent == "browser": useragent = _browser_useragent() + elif useragent is config.get(("extractor",), "user-agent") and \ + useragent == Extractor.useragent: + useragent = self.useragent headers["User-Agent"] = useragent headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" @@ -660,6 +661,8 @@ class Extractor(): headers=(self._write_pages in ("all", "ALL")), hide_auth=(self._write_pages != "ALL") ) + self.log.info("Writing '%s' response to '%s'", + response.url, path + ".txt") except Exception as e: self.log.warning("Failed to dump HTTP request (%s: %s)", e.__class__.__name__, e) @@ -1008,6 +1011,12 @@ SSL_CIPHERS = { } +# disable Basic Authorization header injection from .netrc data +try: + requests.sessions.get_netrc_auth = lambda _: None +except Exception: + pass + # detect brotli support try: BROTLI = urllib3.response.brotli is not None diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 1746647..c3dfd91 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -20,12 +20,22 @@ class DanbooruExtractor(BaseExtractor): page_limit = 1000 page_start = None per_page = 200 + useragent = util.USERAGENT request_interval = (0.5, 1.5) def _init(self): self.ugoira = self.config("ugoira", False) self.external = self.config("external", False) - self.includes = False + + includes = self.config("metadata") + if includes: + if isinstance(includes, (list, tuple)): + includes = ",".join(includes) + elif not isinstance(includes, str): + includes = "artist_commentary,children,notes,parent,uploader" + self.includes = includes + ",id" + else: + self.includes = False threshold = self.config("threshold") if isinstance(threshold, int): @@ -46,16 +56,6 @@ class DanbooruExtractor(BaseExtractor): return pages * self.per_page def items(self): - self.session.headers["User-Agent"] = util.USERAGENT - - includes = self.config("metadata") - if includes: - if isinstance(includes, (list, tuple)): - includes = ",".join(includes) - elif not isinstance(includes, str): - includes = "artist_commentary,children,notes,parent,uploader" - self.includes = includes + ",id" - 
data = self.metadata() for post in self.posts(): @@ -108,6 +108,13 @@ class DanbooruExtractor(BaseExtractor): yield Message.Directory, post yield Message.Url, url, post + def items_artists(self): + for artist in self.artists(): + artist["_extractor"] = DanbooruTagExtractor + url = "{}/posts?tags={}".format( + self.root, text.quote(artist["name"])) + yield Message.Queue, url, artist + def metadata(self): return () @@ -294,3 +301,39 @@ class DanbooruPopularExtractor(DanbooruExtractor): def posts(self): return self._pagination("/explore/posts/popular.json", self.params) + + +class DanbooruArtistExtractor(DanbooruExtractor): + """Extractor for danbooru artists""" + subcategory = "artist" + pattern = BASE_PATTERN + r"/artists/(\d+)" + example = "https://danbooru.donmai.us/artists/12345" + + items = DanbooruExtractor.items_artists + + def artists(self): + url = "{}/artists/{}.json".format(self.root, self.groups[-1]) + return (self.request(url).json(),) + + +class DanbooruArtistSearchExtractor(DanbooruExtractor): + """Extractor for danbooru artist searches""" + subcategory = "artist-search" + pattern = BASE_PATTERN + r"/artists/?\?([^#]+)" + example = "https://danbooru.donmai.us/artists?QUERY" + + items = DanbooruExtractor.items_artists + + def artists(self): + url = self.root + "/artists.json" + params = text.parse_query(self.groups[-1]) + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + artists = self.request(url, params=params).json() + + yield from artists + + if len(artists) < 20: + return + params["page"] += 1 diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 693def9..ea3f13d 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -31,7 +31,7 @@ class DeviantartExtractor(Extractor): root = "https://www.deviantart.com" directory_fmt = ("{category}", "{username}") filename_fmt = "{category}_{index}_{title}.{extension}" - cookies_domain = None + cookies_domain = ".deviantart.com" cookies_names = ("auth", "auth_secure", "userinfo") _last_request = 0 @@ -399,7 +399,7 @@ class DeviantartExtractor(Extractor): def _textcontent_to_html(self, deviation, content): html = content["html"] - markup = html["markup"] + markup = html.get("markup") if not markup or markup[0] != "{": return markup @@ -1144,7 +1144,6 @@ class DeviantartScrapsExtractor(DeviantartExtractor): subcategory = "scraps" directory_fmt = ("{category}", "{username}", "Scraps") archive_fmt = "s_{_username}_{index}.{extension}" - cookies_domain = ".deviantart.com" pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b" example = "https://www.deviantart.com/USER/gallery/scraps" @@ -1161,7 +1160,6 @@ class DeviantartSearchExtractor(DeviantartExtractor): subcategory = "search" directory_fmt = ("{category}", "Search", "{search_tags}") archive_fmt = "Q_{search_tags}_{index}.{extension}" - cookies_domain = ".deviantart.com" pattern = (r"(?:https?://)?www\.deviantart\.com" r"/search(?:/deviations)?/?\?([^#]+)") example = "https://www.deviantart.com/search?q=QUERY" @@ -1213,7 +1211,6 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): """Extractor for deviantart gallery searches""" subcategory = "gallery-search" archive_fmt = "g_{_username}_{index}.{extension}" - cookies_domain = ".deviantart.com" pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)" example = "https://www.deviantart.com/USER/gallery?q=QUERY" diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 553ec22..4a6624d 100644 --- 
a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -20,11 +20,10 @@ class E621Extractor(danbooru.DanbooruExtractor): page_limit = 750 page_start = None per_page = 320 + useragent = util.USERAGENT + " (by mikf)" request_interval_min = 1.0 def items(self): - self.session.headers["User-Agent"] = util.USERAGENT + " (by mikf)" - includes = self.config("metadata") or () if includes: if isinstance(includes, str): diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py new file mode 100644 index 0000000..94444ff --- /dev/null +++ b/gallery_dl/extractor/everia.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://everia.club""" + +from .common import Extractor, Message +from .. import text +import re + +BASE_PATTERN = r"(?:https?://)?everia\.club" + + +class EveriaExtractor(Extractor): + category = "everia" + root = "https://everia.club" + + def items(self): + data = {"_extractor": EveriaPostExtractor} + for url in self.posts(): + yield Message.Queue, url, data + + def posts(self): + return self._pagination(self.groups[0]) + + def _pagination(self, path, params=None, pnum=1): + find_posts = re.compile(r'thumbnail">\s*<a href="([^"]+)').findall + + while True: + if pnum == 1: + url = "{}{}/".format(self.root, path) + else: + url = "{}{}/page/{}/".format(self.root, path, pnum) + response = self.request(url, params=params, allow_redirects=False) + + if response.status_code >= 300: + return + + yield from find_posts(response.text) + pnum += 1 + + +class EveriaPostExtractor(EveriaExtractor): + subcategory = "post" + directory_fmt = ("{category}", "{title}") + archive_fmt = "{post_url}_{num}" + pattern = BASE_PATTERN + r"(/\d{4}/\d{2}/\d{2}/[^/?#]+)" + example = "https://everia.club/0000/00/00/TITLE" + + def items(self): + url = self.root + self.groups[0] + page = self.request(url).text + content = text.extr(page, 'itemprop="text">', "</div>") + urls = re.findall(r'img.*?src="([^"]+)', content) + + data = { + "title": text.unescape( + text.extr(page, 'itemprop="headline">', "</h1>")), + "tags": list(text.extract_iter(page, 'rel="tag">', "</a>")), + "post_url": url, + "post_category": text.extr( + page, "post-in-category-", " ").capitalize(), + "count": len(urls), + } + + yield Message.Directory, data + for data["num"], url in enumerate(urls, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + +class EveriaTagExtractor(EveriaExtractor): + subcategory = "tag" + pattern = BASE_PATTERN + r"(/tag/[^/?#]+)" + example = "https://everia.club/tag/TAG" + + +class EveriaCategoryExtractor(EveriaExtractor): + subcategory = "category" + pattern = BASE_PATTERN + r"(/category/[^/?#]+)" + example = "https://everia.club/category/CATEGORY" + + +class EveriaDateExtractor(EveriaExtractor): + subcategory = "date" + pattern = (BASE_PATTERN + + r"(/\d{4}(?:/\d{2})?(?:/\d{2})?)(?:/page/\d+)?/?$") + example = "https://everia.club/0000/00/00" + + +class EveriaSearchExtractor(EveriaExtractor): + subcategory = "search" + pattern = BASE_PATTERN + r"/(?:page/\d+/)?\?s=([^&#]+)" + example = "https://everia.club/?s=SEARCH" + + def posts(self): + params = {"s": self.groups[0]} + return self._pagination("", params) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 3e6d537..e7ba78e 100644 --- a/gallery_dl/extractor/exhentai.py +++ 
b/gallery_dl/extractor/exhentai.py @@ -11,6 +11,7 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache +import collections import itertools import math @@ -227,6 +228,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): if self.config("metadata", False): data.update(self.metadata_from_api()) data["date"] = text.parse_timestamp(data["posted"]) + if self.config("tags", False): + tags = collections.defaultdict(list) + for tag in data["tags"]: + type, _, value = tag.partition(":") + tags[type].append(value) + for type, values in tags.items(): + data["tags_" + type] = values return data def metadata_from_page(self, page): diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py new file mode 100644 index 0000000..04acfc5 --- /dev/null +++ b/gallery_dl/extractor/facebook.py @@ -0,0 +1,447 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.facebook.com/""" + +from .common import Extractor, Message +from .. import text, exception + +BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com" + + +class FacebookExtractor(Extractor): + """Base class for Facebook extractors""" + category = "facebook" + root = "https://www.facebook.com" + directory_fmt = ("{category}", "{username}", "{title} ({set_id})") + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}.{extension}" + + set_url_fmt = root + "/media/set/?set={set_id}" + photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}" + + def _init(self): + headers = self.session.headers + headers["Accept"] = ( + "text/html,application/xhtml+xml,application/xml;q=0.9," + "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8" + ) + headers["Sec-Fetch-Dest"] = "empty" + headers["Sec-Fetch-Mode"] = "navigate" + headers["Sec-Fetch-Site"] = "same-origin" + + self.fallback_retries = self.config("fallback-retries", 2) + self.videos = self.config("videos", True) + self.author_followups = self.config("author-followups", False) + + @staticmethod + def decode_all(txt): + return text.unescape( + txt.encode("utf-8").decode("unicode_escape") + ).replace("\\/", "/") + + @staticmethod + def parse_set_page(set_page): + directory = { + "set_id": text.extr( + set_page, '"mediaSetToken":"', '"' + ) or text.extr( + set_page, '"mediasetToken":"', '"' + ), + "username": FacebookExtractor.decode_all( + text.extr( + set_page, '"user":{"__isProfile":"User","name":"', '","' + ) or text.extr( + set_page, '"actors":[{"__typename":"User","name":"', '","' + ) + ), + "user_id": text.extr( + set_page, '"owner":{"__typename":"User","id":"', '"' + ), + "title": FacebookExtractor.decode_all(text.extr( + set_page, '"title":{"text":"', '"' + )), + "first_photo_id": text.extr( + set_page, + '{"__typename":"Photo","__isMedia":"Photo","', + '","creation_story"' + ).rsplit('"id":"', 1)[-1] or + text.extr( + set_page, '{"__typename":"Photo","id":"', '"' + ) + } + + return directory + + @staticmethod + def parse_photo_page(photo_page): + photo = { + "id": text.extr( + photo_page, '"__isNode":"Photo","id":"', '"' + ), + "set_id": text.extr( + photo_page, + '"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=', + '"' + ).rsplit("&set=", 1)[-1], + "username": FacebookExtractor.decode_all(text.extr( + photo_page, '"owner":{"__typename":"User","name":"', '"' + )), + "user_id": text.extr( + photo_page, 
'"owner":{"__typename":"User","id":"', '"' + ), + "caption": FacebookExtractor.decode_all(text.extr( + photo_page, + '"message":{"delight_ranges"', + '"},"message_preferred_body"' + ).rsplit('],"text":"', 1)[-1]), + "date": text.parse_timestamp(text.extr( + photo_page, '\\"publish_time\\":', ',' + )), + "url": FacebookExtractor.decode_all(text.extr( + photo_page, ',"image":{"uri":"', '","' + )), + "next_photo_id": text.extr( + photo_page, + '"nextMediaAfterNodeId":{"__typename":"Photo","id":"', + '"' + ) or text.extr( + photo_page, + '"nextMedia":{"edges":[{"node":{"__typename":"Photo","id":"', + '"' + ) + } + + text.nameext_from_url(photo["url"], photo) + + photo["followups_ids"] = [] + for comment_raw in text.extract_iter( + photo_page, '{"node":{"id"', '"cursor":null}' + ): + if ('"is_author_original_poster":true' in comment_raw and + '{"__typename":"Photo","id":"' in comment_raw): + photo["followups_ids"].append(text.extr( + comment_raw, + '{"__typename":"Photo","id":"', + '"' + )) + + return photo + + @staticmethod + def parse_post_page(post_page): + first_photo_url = text.extr( + text.extr( + post_page, '"__isMedia":"Photo"', '"target_group"' + ), '"url":"', ',' + ) + + post = { + "set_id": text.extr(post_page, '{"mediaset_token":"', '"') or + text.extr(first_photo_url, 'set=', '"').rsplit("&", 1)[0] + } + + return post + + @staticmethod + def parse_video_page(video_page): + video = { + "id": text.extr( + video_page, '\\"video_id\\":\\"', '\\"' + ), + "username": FacebookExtractor.decode_all(text.extr( + video_page, '"actors":[{"__typename":"User","name":"', '","' + )), + "user_id": text.extr( + video_page, '"owner":{"__typename":"User","id":"', '"' + ), + "date": text.parse_timestamp(text.extr( + video_page, '\\"publish_time\\":', ',' + )), + "type": "video" + } + + if not video["username"]: + video["username"] = FacebookExtractor.decode_all(text.extr( + video_page, + '"__typename":"User","id":"' + video["user_id"] + '","name":"', + '","' + )) + + first_video_raw = text.extr( + video_page, '"permalink_url"', '\\/Period>\\u003C\\/MPD>' + ) + + audio = { + **video, + "url": FacebookExtractor.decode_all(text.extr( + text.extr( + first_video_raw, + "AudioChannelConfiguration", + "BaseURL>\\u003C" + ), + "BaseURL>", "\\u003C\\/" + )), + "type": "audio" + } + + video["urls"] = {} + + for raw_url in text.extract_iter( + first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>' + ): + resolution = raw_url.split('\\"', 1)[0] + video["urls"][resolution] = FacebookExtractor.decode_all( + raw_url.split('BaseURL>', 1)[1] + ) + + if not video["urls"]: + return video, audio + + video["url"] = max( + video["urls"].items(), + key=lambda x: text.parse_int(x[0][:-1]) + )[1] + + text.nameext_from_url(video["url"], video) + audio["filename"] = video["filename"] + audio["extension"] = "m4a" + + return video, audio + + def photo_page_request_wrapper(self, url, **kwargs): + LEFT_OFF_TXT = "" if url.endswith("&set=") else ( + "\nYou can use this URL to continue from " + "where you left off (added \"&setextract\"): " + "\n" + url + "&setextract" + ) + + res = self.request(url, **kwargs) + + if res.url.startswith(self.root + "/login"): + raise exception.AuthenticationError( + "You must be logged in to continue viewing images." + + LEFT_OFF_TXT + ) + + if b'{"__dr":"CometErrorRoot.react"}' in res.content: + raise exception.StopExtraction( + "You've been temporarily blocked from viewing images. " + "\nPlease try using a different account, " + "using a VPN or waiting before you retry." 
+ + LEFT_OFF_TXT + ) + + return res + + def extract_set(self, first_photo_id, set_id): + all_photo_ids = [first_photo_id] + + retries = 0 + i = 0 + + while i < len(all_photo_ids): + photo_id = all_photo_ids[i] + photo_url = self.photo_url_fmt.format( + photo_id=photo_id, set_id=set_id + ) + photo_page = self.photo_page_request_wrapper(photo_url).text + + photo = self.parse_photo_page(photo_page) + photo["set_id"] = set_id + photo["num"] = i + 1 + + if self.author_followups: + for followup_id in photo["followups_ids"]: + if followup_id not in all_photo_ids: + self.log.debug( + "Found a followup in comments: %s", followup_id + ) + all_photo_ids.append(followup_id) + + if not photo["url"]: + if retries < self.fallback_retries and self._interval_429: + seconds = self._interval_429() + self.log.warning( + "Failed to find photo download URL for %s. " + "Retrying in %s seconds.", photo_url, seconds, + ) + self.wait(seconds=seconds, reason="429 Too Many Requests") + retries += 1 + continue + else: + self.log.error( + "Failed to find photo download URL for " + photo_url + + ". Skipping." + ) + retries = 0 + else: + retries = 0 + yield Message.Url, photo["url"], photo + + if photo["next_photo_id"] == "": + self.log.debug( + "Can't find next image in the set. " + "Extraction is over." + ) + elif photo["next_photo_id"] in all_photo_ids: + if photo["next_photo_id"] != photo["id"]: + self.log.debug( + "Detected a loop in the set, it's likely finished. " + "Extraction is over." + ) + else: + all_photo_ids.append(photo["next_photo_id"]) + + i += 1 + + +class FacebookSetExtractor(FacebookExtractor): + """Base class for Facebook Set extractors""" + subcategory = "set" + pattern = ( + BASE_PATTERN + + r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)" + r"[^/?#]*(?<!&setextract)$" + r"|([^/?#]+/posts/[^/?#]+)" + r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)" + ) + example = "https://www.facebook.com/media/set/?set=SET_ID" + + def items(self): + set_id = self.groups[0] or self.groups[3] + path = self.groups[1] + if path: + post_url = self.root + "/" + path + post_page = self.request(post_url).text + set_id = self.parse_post_page(post_page)["set_id"] + + set_url = self.set_url_fmt.format(set_id=set_id) + set_page = self.request(set_url).text + + directory = self.parse_set_page(set_page) + + yield Message.Directory, directory + + yield from self.extract_set( + self.groups[2] or directory["first_photo_id"], + directory["set_id"] + ) + + +class FacebookPhotoExtractor(FacebookExtractor): + """Base class for Facebook Photo extractors""" + subcategory = "photo" + pattern = (BASE_PATTERN + + r"/(?:[^/?#]+/photos/[^/?#]+/|photo(?:.php)?/?\?" 
+ r"(?:[^&#]+&)*fbid=)([^/?&#]+)[^/?#]*(?<!&setextract)$") + example = "https://www.facebook.com/photo/?fbid=PHOTO_ID" + + def items(self): + photo_id = self.groups[0] + photo_url = self.photo_url_fmt.format(photo_id=photo_id, set_id="") + photo_page = self.photo_page_request_wrapper(photo_url).text + + i = 1 + photo = self.parse_photo_page(photo_page) + photo["num"] = i + + set_page = self.request( + self.set_url_fmt.format(set_id=photo["set_id"]) + ).text + + directory = self.parse_set_page(set_page) + + yield Message.Directory, directory + yield Message.Url, photo["url"], photo + + if self.author_followups: + for comment_photo_id in photo["followups_ids"]: + comment_photo = self.parse_photo_page( + self.photo_page_request_wrapper( + self.photo_url_fmt.format( + photo_id=comment_photo_id, set_id="" + ) + ).text + ) + i += 1 + comment_photo["num"] = i + yield Message.Url, comment_photo["url"], comment_photo + + +class FacebookVideoExtractor(FacebookExtractor): + """Base class for Facebook Video extractors""" + subcategory = "video" + directory_fmt = ("{category}", "{username}", "{subcategory}") + pattern = BASE_PATTERN + r"/(?:[^/?#]+/videos/|watch/?\?v=)([^/?&#]+)" + example = "https://www.facebook.com/watch/?v=VIDEO_ID" + + def items(self): + video_id = self.groups[0] + video_url = self.root + "/watch/?v=" + video_id + video_page = self.request(video_url).text + + video, audio = self.parse_video_page(video_page) + + if "url" not in video: + return + + yield Message.Directory, video + + if self.videos == "ytdl": + yield Message.Url, "ytdl:" + video_url, video + elif self.videos: + yield Message.Url, video["url"], video + if audio["url"]: + yield Message.Url, audio["url"], audio + + +class FacebookProfileExtractor(FacebookExtractor): + """Base class for Facebook Profile Photos Set extractors""" + subcategory = "profile" + pattern = ( + BASE_PATTERN + + r"/(?!media/|photo/|photo.php|watch/)" + r"(?:profile\.php\?id=|people/[^/?#]+/)?" 
+ r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)" + ) + example = "https://www.facebook.com/USERNAME" + + @staticmethod + def get_profile_photos_set_id(profile_photos_page): + set_ids_raw = text.extr( + profile_photos_page, '"pageItems"', '"page_info"' + ) + + set_id = text.extr( + set_ids_raw, 'set=', '"' + ).rsplit("&", 1)[0] or text.extr( + set_ids_raw, '\\/photos\\/', '\\/' + ) + + return set_id + + def items(self): + profile_photos_url = ( + self.root + "/" + self.groups[0] + "/photos_by" + ) + profile_photos_page = self.request(profile_photos_url).text + + set_id = self.get_profile_photos_set_id(profile_photos_page) + + if set_id: + set_url = self.set_url_fmt.format(set_id=set_id) + set_page = self.request(set_url).text + + directory = self.parse_set_page(set_page) + + yield Message.Directory, directory + + yield from self.extract_set( + directory["first_photo_id"], directory["set_id"] + ) + else: + self.log.debug("Profile photos set ID not found.") diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index df252ee..e85a375 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -20,7 +20,6 @@ class FlickrExtractor(Extractor): filename_fmt = "{category}_{id}.{extension}" directory_fmt = ("{category}", "{user[username]}") archive_fmt = "{id}" - cookies_domain = None request_interval = (1.0, 2.0) request_interval_min = 0.5 @@ -45,7 +44,7 @@ class FlickrExtractor(Extractor): self.log.debug("", exc_info=exc) else: photo.update(data) - url = photo["url"] + url = self._file_url(photo) yield Message.Directory, photo yield Message.Url, url, text.nameext_from_url(url, photo) @@ -57,6 +56,15 @@ class FlickrExtractor(Extractor): def photos(self): """Return an iterable with all relevant photo objects""" + def _file_url(self, photo): + url = photo["url"] + + if "/video/" in url: + return url + + path, _, ext = url.rpartition(".") + return path + "_d." 
+ ext + class FlickrImageExtractor(FlickrExtractor): """Extractor for individual images from flickr.com""" @@ -98,7 +106,7 @@ class FlickrImageExtractor(FlickrExtractor): if isinstance(value, dict): location[key] = value["_content"] - url = photo["url"] + url = self._file_url(photo) yield Message.Directory, photo yield Message.Url, url, text.nameext_from_url(url, photo) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 0baad2f..aad5752 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -22,14 +22,14 @@ class GelbooruV02Extractor(booru.BooruExtractor): def _init(self): self.api_key = self.config("api-key") self.user_id = self.config("user-id") - self.api_root = self.config_instance("api_root") or self.root + self.root_api = self.config_instance("root-api") or self.root if self.category == "realbooru": self.items = self._items_realbooru self._tags = self._tags_realbooru def _api_request(self, params): - url = self.api_root + "/index.php?page=dapi&s=post&q=index" + url = self.root_api + "/index.php?page=dapi&s=post&q=index" return ElementTree.fromstring(self.request(url, params=params).text) def _pagination(self, params): @@ -191,8 +191,8 @@ BASE_PATTERN = GelbooruV02Extractor.update({ }, "rule34": { "root": "https://rule34.xxx", + "root-api": "https://api.rule34.xxx", "pattern": r"(?:www\.)?rule34\.xxx", - "api_root": "https://api.rule34.xxx", }, "safebooru": { "root": "https://safebooru.org", diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index c75c90d..7e128a4 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -254,6 +254,22 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): self.root, self.user) +class HentaifoundryTagExtractor(HentaifoundryExtractor): + """Extractor for tag searches on hentaifoundry.com""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{index}" + pattern = BASE_PATTERN + r"/pictures/tagged/([^/?#]+)" + example = "https://www.hentai-foundry.com/pictures/tagged/TAG" + + def __init__(self, match): + HentaifoundryExtractor.__init__(self, match) + self.page_url = "{}/pictures/tagged/{}".format(self.root, self.user) + + def metadata(self): + return {"search_tags": self.user} + + class HentaifoundryRecentExtractor(HentaifoundryExtractor): """Extractor for 'Recent Pictures' on hentaifoundry.com""" subcategory = "recent" diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 4a9759f..c939a3c 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -6,7 +6,7 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://hiperdex.top/""" +"""Extractors for https://hipertoon.com/""" from .common import ChapterExtractor, MangaExtractor from .. import text @@ -14,13 +14,13 @@ from ..cache import memcache import re BASE_PATTERN = (r"((?:https?://)?(?:www\.)?" 
- r"(?:1st)?hiperdex\d?\.(?:com|net|info|top))") + r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))") class HiperdexBase(): """Base class for hiperdex extractors""" category = "hiperdex" - root = "https://hiperdex.top" + root = "https://hipertoon.com" @memcache(keyarg=1) def manga_data(self, manga, page=None): @@ -49,7 +49,7 @@ class HiperdexBase(): "status" : extr( 'class="summary-content">', '<').strip(), "description": text.remove_html(text.unescape(extr( - 'class="description-summary">', '</div>'))), + "Summary </h5>", "</div>"))), "language": "English", "lang" : "en", } @@ -69,7 +69,7 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): """Extractor for hiperdex manga chapters""" pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))" - example = "https://hiperdex.top/manga/MANGA/CHAPTER/" + example = "https://hipertoon.com/manga/MANGA/CHAPTER/" def __init__(self, match): root, path, self.manga, self.chapter = match.groups() @@ -91,7 +91,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): """Extractor for hiperdex manga""" chapterclass = HiperdexChapterExtractor pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$" - example = "https://hiperdex.top/manga/MANGA/" + example = "https://hipertoon.com/manga/MANGA/" def __init__(self, match): root, path, self.manga = match.groups() @@ -127,7 +127,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): chapterclass = HiperdexMangaExtractor reverse = False pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))" - example = "https://hiperdex.top/manga-artist/NAME/" + example = "https://hipertoon.com/manga-artist/NAME/" def __init__(self, match): self.root = text.ensure_http_scheme(match.group(1)) diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 18df9df..308b42c 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -108,9 +108,9 @@ class HitomiTagExtractor(Extractor): category = "hitomi" subcategory = "tag" root = "https://hitomi.la" - pattern = (r"(?:https?://)?hitomi\.la/" - r"(tag|artist|group|series|type|character)/" - r"([^/?#]+)\.html") + pattern = (r"(?:https?://)?hitomi\.la" + r"/(tag|artist|group|series|type|character)" + r"/([^/?#]+)\.html") example = "https://hitomi.la/tag/TAG-LANG.html" def __init__(self, match): @@ -151,6 +151,109 @@ class HitomiTagExtractor(Extractor): return +class HitomiIndexExtractor(HitomiTagExtractor): + """Extractor for galleries from index searches on hitomi.la""" + subcategory = "index" + pattern = r"(?:https?://)?hitomi\.la/(\w+)-(\w+)\.html" + example = "https://hitomi.la/index-LANG.html" + + def __init__(self, match): + Extractor.__init__(self, match) + self.tag, self.language = match.groups() + + def items(self): + data = {"_extractor": HitomiGalleryExtractor} + nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format( + self.tag, self.language) + headers = { + "Origin": self.root, + "Cache-Control": "max-age=0", + } + + offset = 0 + total = None + while True: + headers["Referer"] = "{}/{}-{}.html?page={}".format( + self.root, self.tag, self.language, offset // 100 + 1) + headers["Range"] = "bytes={}-{}".format(offset, offset+99) + response = self.request(nozomi_url, headers=headers) + + for gallery_id in decode_nozomi(response.content): + gallery_url = "{}/galleries/{}.html".format( + self.root, gallery_id) + yield Message.Queue, gallery_url, data + + offset += 100 + if total is None: + total = text.parse_int( + response.headers["content-range"].rpartition("/")[2]) 
+ if offset >= total: + return + + +class HitomiSearchExtractor(Extractor): + """Extractor for galleries from multiple tag searches on hitomi.la""" + category = "hitomi" + subcategory = "search" + root = "https://hitomi.la" + pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)" + example = "https://hitomi.la/search.html?QUERY" + + def __init__(self, match): + Extractor.__init__(self, match) + self.query = match.group(1) + self.tags = text.unquote(self.query).split(" ") + + def items(self): + data = {"_extractor": HitomiGalleryExtractor} + + results = [self.get_nozomi_items(tag) for tag in self.tags] + intersects = set.intersection(*results) + + for gallery_id in sorted(intersects, reverse=True): + gallery_url = "{}/galleries/{}.html".format( + self.root, gallery_id) + yield Message.Queue, gallery_url, data + + def get_nozomi_items(self, full_tag): + area, tag, language = self.get_nozomi_args(full_tag) + + if area: + referer_base = "{}/n/{}/{}-{}.html".format( + self.root, area, tag, language) + nozomi_url = "https://ltn.hitomi.la/{}/{}-{}.nozomi".format( + area, tag, language) + else: + referer_base = "{}/n/{}-{}.html".format( + self.root, tag, language) + nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format( + tag, language) + + headers = { + "Origin": self.root, + "Cache-Control": "max-age=0", + "Referer": "{}/search.html?{}".format(referer_base, self.query), + } + + response = self.request(nozomi_url, headers=headers) + return set(decode_nozomi(response.content)) + + def get_nozomi_args(self, query): + ns, _, tag = query.strip().partition(":") + area = ns + language = "all" + + if ns == "female" or ns == "male": + area = "tag" + tag = query + elif ns == "language": + area = None + language = tag + tag = "index" + + return area, tag, language + + @memcache(maxage=1800) def _parse_gg(extr): page = extr.request("https://ltn.hitomi.la/gg.js").text diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 115fff3..159feba 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -10,7 +10,7 @@ """Extractors for https://imgchest.com/""" from .common import GalleryExtractor, Extractor, Message -from .. import text, exception +from .. import text, util, exception BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgchest\.com" @@ -33,35 +33,23 @@ class ImagechestGalleryExtractor(GalleryExtractor): self.api = ImagechestAPI(self, access_token) self.gallery_url = None self.metadata = self._metadata_api - self.images = self._images_api def metadata(self, page): - if "Sorry, but the page you requested could not be found." 
in page: - raise exception.NotFoundError("gallery") - - return { - "gallery_id": self.gallery_id, - "title": text.unescape(text.extr( - page, 'property="og:title" content="', '"').strip()) - } + try: + data = util.json_loads(text.unescape(text.extr( + page, 'data-page="', '"'))) + post = data["props"]["post"] + except Exception: + if "<title>Not Found</title>" in page: + raise exception.NotFoundError("gallery") + self.files = () + return {} + + self.files = post.pop("files", ()) + post["gallery_id"] = self.gallery_id + post["tags"] = [tag["name"] for tag in post["tags"]] - def images(self, page): - if ' load-all">' in page: - url = "{}/p/{}/loadAll".format(self.root, self.gallery_id) - headers = { - "X-Requested-With": "XMLHttpRequest", - "Origin" : self.root, - "Referer" : self.gallery_url, - } - csrf_token = text.extr(page, 'name="csrf-token" content="', '"') - data = {"_token": csrf_token} - page += self.request( - url, method="POST", headers=headers, data=data).text - - return [ - (url, None) - for url in text.extract_iter(page, 'data-url="', '"') - ] + return post def _metadata_api(self, page): post = self.api.post(self.gallery_id) @@ -74,15 +62,18 @@ class ImagechestGalleryExtractor(GalleryExtractor): post["gallery_id"] = self.gallery_id post.pop("image_count", None) - self._image_list = post.pop("images") + self.files = post.pop("images") return post - def _images_api(self, page): - return [ - (img["link"], img) - for img in self._image_list - ] + def images(self, page): + try: + return [ + (file["link"], file) + for file in self.files + ] + except Exception: + return () class ImagechestUserExtractor(Extractor): @@ -93,10 +84,6 @@ class ImagechestUserExtractor(Extractor): pattern = BASE_PATTERN + r"/u/([^/?#]+)" example = "https://imgchest.com/u/USER" - def __init__(self, match): - Extractor.__init__(self, match) - self.user = match.group(1) - def items(self): url = self.root + "/api/posts" params = { @@ -104,7 +91,7 @@ class ImagechestUserExtractor(Extractor): "sort" : "new", "tag" : "", "q" : "", - "username": text.unquote(self.user), + "username": text.unquote(self.groups[0]), "nsfw" : "true", } @@ -114,6 +101,9 @@ class ImagechestUserExtractor(Extractor): except (TypeError, KeyError): return + if not data: + return + for gallery in data: gallery["_extractor"] = ImagechestGalleryExtractor yield Message.Queue, gallery["link"], gallery diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index dd1272f..a866f45 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -101,7 +101,10 @@ class InstagramExtractor(Extractor): continue url = file["display_url"] - yield Message.Url, url, text.nameext_from_url(url, file) + text.nameext_from_url(url, file) + if file["extension"] == "webp" and "stp=dst-jpg" in url: + file["extension"] = "jpg" + yield Message.Url, url, file def metadata(self): return () @@ -390,10 +393,11 @@ class InstagramExtractor(Extractor): def _init_cursor(self): cursor = self.config("cursor", True) - if not cursor: + if cursor is True: + return None + elif not cursor: self._update_cursor = util.identity - elif isinstance(cursor, str): - return cursor + return cursor def _update_cursor(self, cursor): self.log.debug("Cursor: %s", cursor) diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 6f2d5f3..3d04f75 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. 
import text, util, exception -from ..cache import cache, memcache +from ..cache import cache import itertools import json import re @@ -38,6 +38,7 @@ class KemonopartyExtractor(Extractor): Extractor.__init__(self, match) def _init(self): + self.api = KemonoAPI(self) self.revisions = self.config("revisions") if self.revisions: self.revisions_unique = (self.revisions == "unique") @@ -53,48 +54,60 @@ class KemonopartyExtractor(Extractor): sort_keys=True, separators=(",", ":")).encode def items(self): + service = self.groups[2] + creator_id = self.groups[3] + find_hash = re.compile(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) - duplicates = self.config("duplicates") - comments = self.config("comments") - username = dms = announcements = None + announcements = True if self.config("announcements") else None + comments = True if self.config("comments") else False + duplicates = True if self.config("duplicates") else False + dms = True if self.config("dms") else None + profile = username = None # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} if self.config("metadata"): - username = text.unescape(text.extract( - self.request(self.user_url).text, - '<meta name="artist_name" content="', '"')[0]) - if self.config("dms"): - dms = True - if self.config("announcements"): - announcements = True + profile = self.api.creator_profile(service, creator_id) + username = profile["name"] posts = self.posts() max_posts = self.config("max-posts") if max_posts: posts = itertools.islice(posts, max_posts) + if self.revisions: + posts = self._revisions(posts) for post in posts: - headers["Referer"] = "{}/{}/user/{}/post/{}".format( self.root, post["service"], post["user"], post["id"]) post["_http_headers"] = headers post["date"] = self._parse_datetime( post.get("published") or post.get("added") or "") - if username: + if profile is not None: post["username"] = username + post["user_profile"] = profile if comments: - post["comments"] = self._extract_comments(post) + try: + post["comments"] = self.api.creator_post_comments( + service, creator_id, post["id"]) + except exception.HttpError: + post["comments"] = () if dms is not None: if dms is True: - dms = self._extract_cards(post, "dms") + dms = self.api.creator_dms( + post["service"], post["user"]) + try: + dms = dms["props"]["dms"] + except Exception: + dms = () post["dms"] = dms if announcements is not None: if announcements is True: - announcements = self._extract_cards(post, "announcements") + announcements = self.api.creator_announcements( + post["service"], post["user"]) post["announcements"] = announcements files = [] @@ -145,20 +158,23 @@ class KemonopartyExtractor(Extractor): self.cookies_update(self._login_impl( (username, self.cookies_domain), password)) - @cache(maxage=28*86400, keyarg=1) + @cache(maxage=3650*86400, keyarg=1) def _login_impl(self, username, password): username = username[0] self.log.info("Logging in as %s", username) - url = self.root + "/account/login" + url = self.root + "/api/v1/authentication/login" data = {"username": username, "password": password} - response = self.request(url, method="POST", data=data) - if response.url.endswith("/account/login") and \ - "Username or password is incorrect" in response.text: - raise exception.AuthenticationError() + response = self.request(url, method="POST", json=data, fatal=False) + if response.status_code >= 400: + try: + msg = '"' + response.json()["error"] + '"' + except Exception: + msg = '"0/1 Username or 
password is incorrect"' + raise exception.AuthenticationError(msg) - return {c.name: c.value for c in response.history[0].cookies} + return {c.name: c.value for c in response.cookies} def _file(self, post): file = post["file"] @@ -188,56 +204,21 @@ class KemonopartyExtractor(Extractor): filetypes = filetypes.split(",") return [genmap[ft] for ft in filetypes] - def _extract_comments(self, post): - url = "{}/{}/user/{}/post/{}".format( - self.root, post["service"], post["user"], post["id"]) - page = self.request(url).text - - comments = [] - for comment in text.extract_iter(page, "<article", "</article>"): - extr = text.extract_from(comment) - cid = extr('id="', '"') - comments.append({ - "id" : cid, - "user": extr('href="#' + cid + '"', '</').strip(" \n\r>"), - "body": extr( - '<section class="comment__body">', '</section>').strip(), - "date": extr('datetime="', '"'), - }) - return comments - - def _extract_cards(self, post, type): - url = "{}/{}/user/{}/{}".format( - self.root, post["service"], post["user"], type) - page = self.request(url).text - - cards = [] - for card in text.extract_iter(page, "<article", "</article>"): - footer = text.extr(card, "<footer", "</footer>") - cards.append({ - "body": text.unescape(text.extr( - card, "<pre>", "</pre></", - ).strip()), - "date": text.extr(footer, ': ', '\n'), - }) - return cards - def _parse_datetime(self, date_string): if len(date_string) > 19: date_string = date_string[:19] return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S") - @memcache(keyarg=1) - def _discord_channels(self, server): - url = "{}/api/v1/discord/channel/lookup/{}".format( - self.root, server) - return self.request(url).json() + def _revisions(self, posts): + return itertools.chain.from_iterable( + self._revisions_post(post) for post in posts) - def _revisions_post(self, post, url): + def _revisions_post(self, post): post["revision_id"] = 0 try: - revs = self.request(url + "/revisions").json() + revs = self.api.creator_post_revisions( + post["service"], post["user"], post["id"]) except exception.HttpError: post["revision_hash"] = self._revision_hash(post) post["revision_index"] = 1 @@ -268,8 +249,8 @@ class KemonopartyExtractor(Extractor): return revs - def _revisions_all(self, url): - revs = self.request(url + "/revisions").json() + def _revisions_all(self, service, creator_id, post_id): + revs = self.api.creator_post_revisions(service, creator_id, post_id) cnt = idx = len(revs) for rev in revs: @@ -305,50 +286,30 @@ def _validate(response): class KemonopartyUserExtractor(KemonopartyExtractor): """Extractor for all posts from a kemono.su user listing""" subcategory = "user" - pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|[?#])" + pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)" example = "https://kemono.su/SERVICE/user/12345" def __init__(self, match): - _, _, service, user_id, self.query = match.groups() - self.subcategory = service + self.subcategory = match.group(3) KemonopartyExtractor.__init__(self, match) - self.api_url = "{}/api/v1/{}/user/{}".format( - self.root, service, user_id) - self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) def posts(self): - url = self.api_url - params = text.parse_query(self.query) - params["o"] = text.parse_int(params.get("o")) - - while True: - posts = self.request(url, params=params).json() - - if self.revisions: - for post in posts: - post_url = "{}/api/v1/{}/user/{}/post/{}".format( - self.root, post["service"], post["user"], post["id"]) - yield from self._revisions_post(post, post_url) - else: - 
yield from posts - - if len(posts) < 50: - break - params["o"] += 50 + _, _, service, creator_id, query = self.groups + params = text.parse_query(query) + return self.api.creator_posts( + service, creator_id, params.get("o"), params.get("q")) class KemonopartyPostsExtractor(KemonopartyExtractor): """Extractor for kemono.su post listings""" subcategory = "posts" - pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?" + pattern = BASE_PATTERN + r"/posts()()(?:/?\?([^#]+))?" example = "https://kemono.su/posts" - def __init__(self, match): - KemonopartyExtractor.__init__(self, match) - self.query = match.group(3) - self.api_url = self.root + "/api/v1/posts" - - posts = KemonopartyUserExtractor.posts + def posts(self): + params = text.parse_query(self.groups[4]) + return self.api.posts( + params.get("o"), params.get("q"), params.get("tag")) class KemonopartyPostExtractor(KemonopartyExtractor): @@ -358,27 +319,23 @@ class KemonopartyPostExtractor(KemonopartyExtractor): example = "https://kemono.su/SERVICE/user/12345/post/12345" def __init__(self, match): - _, _, service, user_id, post_id, self.revision, self.revision_id = \ - match.groups() - self.subcategory = service + self.subcategory = match.group(3) KemonopartyExtractor.__init__(self, match) - self.api_url = "{}/api/v1/{}/user/{}/post/{}".format( - self.root, service, user_id, post_id) - self.user_url = "{}/{}/user/{}".format(self.root, service, user_id) def posts(self): - if not self.revision: - post = self.request(self.api_url).json() - if self.revisions: - return self._revisions_post(post, self.api_url) - return (post,) + _, _, service, creator_id, post_id, revision, revision_id = self.groups + post = self.api.creator_post(service, creator_id, post_id) + if not revision: + return (post["post"],) - revs = self._revisions_all(self.api_url) - if not self.revision_id: + self.revisions = False + + revs = self._revisions_all(service, creator_id, post_id) + if not revision_id: return revs for rev in revs: - if str(rev["revision_id"]) == self.revision_id: + if str(rev["revision_id"]) == revision_id: return (rev,) raise exception.NotFoundError("revision") @@ -391,40 +348,37 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): "{channel_name|channel}") filename_fmt = "{id}_{num:>02}_{filename}.{extension}" archive_fmt = "discord_{server}_{id}_{num}" - pattern = BASE_PATTERN + r"/discord/server/(\d+)(?:/channel/(\d+))?#(.*)" - example = "https://kemono.su/discord/server/12345#CHANNEL" - - def __init__(self, match): - KemonopartyExtractor.__init__(self, match) - _, _, self.server, self.channel_id, self.channel = match.groups() - self.channel_name = "" + pattern = (BASE_PATTERN + r"/discord/server/(\d+)" + r"(?:/(?:channel/)?(\d+)(?:#(.+))?|#(.+))") + example = "https://kemono.su/discord/server/12345/12345" def items(self): self._prepare_ddosguard_cookies() + _, _, server_id, channel_id, channel_name, channel = self.groups - if self.channel_id: - self.channel_name = self.channel - else: - if self.channel.isdecimal() and len(self.channel) >= 16: + if channel_id is None: + if channel.isdecimal() and len(channel) >= 16: key = "id" else: key = "name" - for channel in self._discord_channels(self.server): - if channel[key] == self.channel: + for ch in self.api.discord_server(server_id): + if ch[key] == channel: break else: raise exception.NotFoundError("channel") - self.channel_id = channel["id"] - self.channel_name = channel["name"] + channel_id = ch["id"] + channel_name = ch["name"] + elif channel_name is None: + channel_name = "" find_inline = 
re.compile( r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)" r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall find_hash = re.compile(HASH_PATTERN).match - posts = self.posts() + posts = self.api.discord_channel(channel_id) max_posts = self.config("max-posts") if max_posts: posts = itertools.islice(posts, max_posts) @@ -441,7 +395,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): append({"path": "https://cdn.discordapp.com" + path, "name": path, "type": "inline", "hash": ""}) - post["channel_name"] = self.channel_name + post["channel_name"] = channel_name post["date"] = self._parse_datetime(post["published"]) post["count"] = len(files) yield Message.Directory, post @@ -461,33 +415,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): url = self.root + "/data" + url[20:] yield Message.Url, url, post - def posts(self): - url = "{}/api/v1/discord/channel/{}".format( - self.root, self.channel_id) - params = {"o": 0} - - while True: - posts = self.request(url, params=params).json() - yield from posts - - if len(posts) < 150: - break - params["o"] += 150 - class KemonopartyDiscordServerExtractor(KemonopartyExtractor): subcategory = "discord-server" pattern = BASE_PATTERN + r"/discord/server/(\d+)$" example = "https://kemono.su/discord/server/12345" - def __init__(self, match): - KemonopartyExtractor.__init__(self, match) - self.server = match.group(3) - def items(self): - for channel in self._discord_channels(self.server): - url = "{}/discord/server/{}/channel/{}#{}".format( - self.root, self.server, channel["id"], channel["name"]) + server_id = self.groups[2] + for channel in self.api.discord_server(server_id): + url = "{}/discord/server/{}/{}#{}".format( + self.root, server_id, channel["id"], channel["name"]) channel["_extractor"] = KemonopartyDiscordExtractor yield Message.Queue, url, channel @@ -495,26 +433,21 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor): class KemonopartyFavoriteExtractor(KemonopartyExtractor): """Extractor for kemono.su favorites""" subcategory = "favorite" - pattern = BASE_PATTERN + r"/favorites(?:/?\?([^#]+))?" + pattern = BASE_PATTERN + r"/favorites()()(?:/?\?([^#]+))?" 
example = "https://kemono.su/favorites" - def __init__(self, match): - KemonopartyExtractor.__init__(self, match) - self.params = text.parse_query(match.group(3)) - self.favorites = (self.params.get("type") or - self.config("favorites") or - "artist") - def items(self): self._prepare_ddosguard_cookies() self.login() - sort = self.params.get("sort") - order = self.params.get("order") or "desc" + params = text.parse_query(self.groups[4]) + type = params.get("type") or self.config("favorites") or "artist" - if self.favorites == "artist": - users = self.request( - self.root + "/api/v1/account/favorites?type=artist").json() + sort = params.get("sort") + order = params.get("order") or "desc" + + if type == "artist": + users = self.api.account_favorites("artist") if not sort: sort = "updated" @@ -527,9 +460,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): self.root, user["service"], user["id"]) yield Message.Queue, url, user - elif self.favorites == "post": - posts = self.request( - self.root + "/api/v1/account/favorites?type=post").json() + elif type == "post": + posts = self.api.account_favorites("post") if not sort: sort = "faved_seq" @@ -541,3 +473,95 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): url = "{}/{}/user/{}/post/{}".format( self.root, post["service"], post["user"], post["id"]) yield Message.Queue, url, post + + +class KemonoAPI(): + """Interface for the Kemono API v1.1.0 + + https://kemono.su/documentation/api + """ + + def __init__(self, extractor): + self.extractor = extractor + self.root = extractor.root + "/api/v1" + + def posts(self, offset=0, query=None, tags=None): + endpoint = "/posts" + params = {"q": query, "o": offset, "tags": tags} + return self._pagination(endpoint, params, 50, "posts") + + def creator_posts(self, service, creator_id, offset=0, query=None): + endpoint = "/{}/user/{}".format(service, creator_id) + params = {"q": query, "o": offset} + return self._pagination(endpoint, params, 50) + + def creator_announcements(self, service, creator_id): + endpoint = "/{}/user/{}/announcements".format(service, creator_id) + return self._call(endpoint) + + def creator_dms(self, service, creator_id): + endpoint = "/{}/user/{}/dms".format(service, creator_id) + return self._call(endpoint) + + def creator_fancards(self, service, creator_id): + endpoint = "/{}/user/{}/fancards".format(service, creator_id) + return self._call(endpoint) + + def creator_post(self, service, creator_id, post_id): + endpoint = "/{}/user/{}/post/{}".format(service, creator_id, post_id) + return self._call(endpoint) + + def creator_post_comments(self, service, creator_id, post_id): + endpoint = "/{}/user/{}/post/{}/comments".format( + service, creator_id, post_id) + return self._call(endpoint) + + def creator_post_revisions(self, service, creator_id, post_id): + endpoint = "/{}/user/{}/post/{}/revisions".format( + service, creator_id, post_id) + return self._call(endpoint) + + def creator_profile(self, service, creator_id): + endpoint = "/{}/user/{}/profile".format(service, creator_id) + return self._call(endpoint) + + def creator_links(self, service, creator_id): + endpoint = "/{}/user/{}/links".format(service, creator_id) + return self._call(endpoint) + + def creator_tags(self, service, creator_id): + endpoint = "/{}/user/{}/tags".format(service, creator_id) + return self._call(endpoint) + + def discord_channel(self, channel_id): + endpoint = "/discord/channel/{}".format(channel_id) + return self._pagination(endpoint, {}, 150) + + def discord_server(self, server_id): + 
endpoint = "/discord/channel/lookup/{}".format(server_id) + return self._call(endpoint) + + def account_favorites(self, type): + endpoint = "/account/favorites" + params = {"type": type} + return self._call(endpoint, params) + + def _call(self, endpoint, params=None): + url = self.root + endpoint + response = self.extractor.request(url, params=params) + return response.json() + + def _pagination(self, endpoint, params, batch=50, key=False): + params["o"] = text.parse_int(params.get("o")) % 50 + + while True: + data = self._call(endpoint, params) + + if key: + yield from data[key] + else: + yield from data + + if len(data) < batch: + return + params["o"] += batch diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py index cacf504..b60157e 100644 --- a/gallery_dl/extractor/koharu.py +++ b/gallery_dl/extractor/koharu.py @@ -6,20 +6,27 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://koharu.to/""" +"""Extractors for https://niyaniya.moe/""" from .common import GalleryExtractor, Extractor, Message from .. import text, exception from ..cache import cache +import collections -BASE_PATTERN = r"(?i)(?:https?://)?(?:koharu|anchira)\.to" +BASE_PATTERN = ( + r"(?i)(?:https?://)?(" + r"(?:niyaniya|shupogaki)\.moe|" + r"(?:koharu|anchira|seia)\.to|" + r"(?:hoshino)\.one" + r")" +) class KoharuExtractor(Extractor): """Base class for koharu extractors""" category = "koharu" - root = "https://koharu.to" - root_api = "https://api.koharu.to" + root = "https://niyaniya.moe" + root_api = "https://api.schale.network" request_interval = (0.5, 1.5) def _init(self): @@ -62,7 +69,7 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor): archive_fmt = "{id}_{num}" request_interval = 0.0 pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)" - example = "https://koharu.to/g/12345/67890abcde/" + example = "https://niyaniya.moe/g/12345/67890abcde/" TAG_TYPES = { 0 : "general", @@ -100,16 +107,26 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor): def metadata(self, _): url = "{}/books/detail/{}/{}".format( - self.root_api, self.groups[0], self.groups[1]) + self.root_api, self.groups[1], self.groups[2]) self.data = data = self.request(url, headers=self.headers).json() + data["date"] = text.parse_timestamp(data["created_at"] // 1000) tags = [] - for tag in data["tags"]: + types = self.TAG_TYPES + tags_data = data["tags"] + + for tag in tags_data: name = tag["name"] namespace = tag.get("namespace", 0) - tags.append(self.TAG_TYPES[namespace] + ":" + name) + tags.append(types[namespace] + ":" + name) data["tags"] = tags - data["date"] = text.parse_timestamp(data["created_at"] // 1000) + + if self.config("tags", False): + tags = collections.defaultdict(list) + for tag in tags_data : + tags[tag.get("namespace", 0)].append(tag["name"]) + for type, values in tags.items(): + data["tags_" + types[type]] = values try: if self.cbz: @@ -179,11 +196,11 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor): break except KeyError: self.log.debug("%s: Format %s is not available", - self.groups[0], fmtid) + self.groups[1], fmtid) else: raise exception.NotFoundError("format") - self.log.debug("%s: Selected format %s", self.groups[0], fmtid) + self.log.debug("%s: Selected format %s", self.groups[1], fmtid) fmt["w"] = fmtid return fmt @@ -192,10 +209,10 @@ class KoharuSearchExtractor(KoharuExtractor): """Extractor for koharu search results""" subcategory = "search" pattern = 
BASE_PATTERN + r"/\?([^#]*)" - example = "https://koharu.to/?s=QUERY" + example = "https://niyaniya.moe/?s=QUERY" def items(self): - params = text.parse_query(self.groups[0]) + params = text.parse_query(self.groups[1]) params["page"] = text.parse_int(params.get("page"), 1) return self._pagination("/books", params) @@ -204,12 +221,12 @@ class KoharuFavoriteExtractor(KoharuExtractor): """Extractor for koharu favorites""" subcategory = "favorite" pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?" - example = "https://koharu.to/favorites" + example = "https://niyaniya.moe/favorites" def items(self): self.login() - params = text.parse_query(self.groups[0]) + params = text.parse_query(self.groups[1]) params["page"] = text.parse_int(params.get("page"), 1) return self._pagination("/favorites", params) @@ -226,7 +243,7 @@ class KoharuFavoriteExtractor(KoharuExtractor): def _login_impl(self, username, password): self.log.info("Logging in as %s", username) - url = "https://auth.koharu.to/login" + url = "https://auth.schale.network/login" data = {"uname": username, "passwd": password} response = self.request( url, method="POST", headers=self.headers, data=data) diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 044f4f5..295b9c4 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -46,12 +46,17 @@ class LolisafeAlbumExtractor(LolisafeExtractor): for data["num"], file in enumerate(files, 1): url = file["file"] file.update(data) - text.nameext_from_url(url, file) + + if "extension" not in file: + text.nameext_from_url(url, file) if "name" in file: name = file["name"] file["name"] = name.rpartition(".")[0] or name file["id"] = file["filename"].rpartition("-")[2] + elif "id" in file: + file["name"] = file["filename"] + file["filename"] = "{}-{}".format(file["name"], file["id"]) else: file["name"], sep, file["id"] = \ file["filename"].rpartition("-") diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 1f24593..7f87cff 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -26,6 +26,7 @@ class MangadexExtractor(Extractor): "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") archive_fmt = "{chapter_id}_{page}" root = "https://mangadex.org" + useragent = util.USERAGENT _cache = {} def __init__(self, match): @@ -33,7 +34,6 @@ class MangadexExtractor(Extractor): self.uuid = match.group(1) def _init(self): - self.session.headers["User-Agent"] = util.USERAGENT self.api = MangadexAPI(self) def items(self): @@ -221,7 +221,7 @@ class MangadexAPI(): return self._call("/list/" + uuid)["data"] def list_feed(self, uuid): - return self._pagination("/list/" + uuid + "/feed") + return self._pagination_chapters("/list/" + uuid + "/feed") @memcache(keyarg=1) def manga(self, uuid): @@ -230,7 +230,7 @@ class MangadexAPI(): def manga_author(self, uuid_author): params = {"authorOrArtist": uuid_author} - return self._pagination("/manga", params) + return self._pagination_manga("/manga", params) def manga_feed(self, uuid): order = "desc" if self.extractor.config("chapter-reverse") else "asc" @@ -238,11 +238,11 @@ class MangadexAPI(): "order[volume]" : order, "order[chapter]": order, } - return self._pagination("/manga/" + uuid + "/feed", params) + return self._pagination_chapters("/manga/" + uuid + "/feed", params) def user_follows_manga_feed(self): params = {"order[publishAt]": "desc"} - return self._pagination("/user/follows/manga/feed", params) + return 
self._pagination_chapters("/user/follows/manga/feed", params) def authenticate(self): self.headers["Authorization"] = \ @@ -289,22 +289,31 @@ class MangadexAPI(): raise exception.StopExtraction( "%s %s (%s)", response.status_code, response.reason, msg) - def _pagination(self, endpoint, params=None): + def _pagination_chapters(self, endpoint, params=None): if params is None: params = {} + lang = self.extractor.config("lang") + if isinstance(lang, str) and "," in lang: + lang = lang.split(",") + params["translatedLanguage[]"] = lang + params["includes[]"] = ("scanlation_group",) + + return self._pagination(endpoint, params) + + def _pagination_manga(self, endpoint, params=None): + if params is None: + params = {} + + return self._pagination(endpoint, params) + + def _pagination(self, endpoint, params): config = self.extractor.config + ratings = config("ratings") if ratings is None: ratings = ("safe", "suggestive", "erotica", "pornographic") - - lang = config("lang") - if isinstance(lang, str) and "," in lang: - lang = lang.split(",") - params["contentRating[]"] = ratings - params["translatedLanguage[]"] = lang - params["includes[]"] = ("scanlation_group",) params["offset"] = 0 api_params = config("api-parameters") diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index cb7f701..5b354ac 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -19,7 +19,6 @@ class MastodonExtractor(BaseExtractor): directory_fmt = ("mastodon", "{instance}", "{account[username]}") filename_fmt = "{category}_{id}_{media[id]}.{extension}" archive_fmt = "{media[id]}" - cookies_domain = None def __init__(self, match): BaseExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/motherless.py b/gallery_dl/extractor/motherless.py new file mode 100644 index 0000000..c5b9322 --- /dev/null +++ b/gallery_dl/extractor/motherless.py @@ -0,0 +1,167 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://motherless.com/""" + +from .common import Extractor, Message +from .. import text, util +from ..cache import memcache +from datetime import timedelta + +BASE_PATTERN = r"(?:https?://)?motherless\.com" + + +class MotherlessExtractor(Extractor): + """Base class for motherless extractors""" + category = "motherless" + root = "https://motherless.com" + filename_fmt = "{id} {title}.{extension}" + archive_fmt = "{id}" + + +class MotherlessMediaExtractor(MotherlessExtractor): + """Extractor for a single image/video from motherless.com""" + subcategory = "media" + pattern = (BASE_PATTERN + + r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?" 
+ r"(?!G)[A-Z0-9]+)") + example = "https://motherless.com/ABC123" + + def items(self): + file = self._extract_media(self.groups[0]) + url = file["url"] + yield Message.Directory, file + yield Message.Url, url, text.nameext_from_url(url, file) + + def _extract_media(self, path): + url = self.root + "/" + path + page = self.request(url).text + extr = text.extract_from(page) + + path, _, media_id = path.rpartition("/") + data = { + "id" : media_id, + "type" : extr("__mediatype = '", "'"), + "group": extr("__group = '", "'"), + "url" : extr("__fileurl = '", "'"), + "tags" : [ + text.unescape(tag) + for tag in text.extract_iter( + extr('class="media-meta-tags">', "</div>"), ">#", "<") + ], + "title": text.unescape(extr("<h1>", "<")), + "views": text.parse_int(extr( + 'class="count">', " ").replace(",", "")), + "favorites": text.parse_int(extr( + 'class="count">', " ").replace(",", "")), + "date" : self._parse_datetime(extr('class="count">', "<")), + "uploader": text.unescape(extr('class="username">', "<").strip()), + } + + if path and path[0] == "G": + data["gallery_id"] = path[1:] + data["gallery_title"] = self._extract_gallery_title( + page, data["gallery_id"]) + + return data + + def _parse_datetime(self, dt): + if " ago" not in dt: + return text.parse_datetime(dt, "%d %b %Y") + + value = text.parse_int(dt[:-5]) + delta = timedelta(0, value*3600) if dt[-5] == "h" else timedelta(value) + return (util.datetime_utcnow() - delta).replace( + hour=0, minute=0, second=0) + + @memcache(keyarg=2) + def _extract_gallery_title(self, page, gallery_id): + title = text.extr( + text.extr(page, '<h1 class="content-title">', "</h1>"), + "From the gallery:", "<") + if title: + return text.unescape(title.strip()) + + pos = page.find(' href="/G' + gallery_id + '"') + if pos >= 0: + return text.unescape(text.extract( + page, ' title="', '"', pos)[0]) + + return "" + + +class MotherlessGalleryExtractor(MotherlessExtractor): + """Extractor for a motherless.com gallery""" + subcategory = "gallery" + directory_fmt = ("{category}", "{uploader}", + "{gallery_id} {gallery_title}") + archive_fmt = "{gallery_id}_{id}" + pattern = BASE_PATTERN + "/G([IVG])?([A-Z0-9]+)/?$" + example = "https://motherless.com/GABC123" + + def items(self): + type, gid = self.groups + + if not type: + data = {"_extractor": MotherlessGalleryExtractor} + yield Message.Queue, self.root + "/GI" + gid, data + yield Message.Queue, self.root + "/GV" + gid, data + return + + url = "{}/G{}{}".format(self.root, type, gid) + page = self.request(url).text + data = self._extract_gallery_data(page) + + for num, thumb in enumerate(self._pagination(page), 1): + file = self._parse_thumb_data(thumb) + file.update(data) + file["num"] = num + url = file["url"] + yield Message.Directory, file + yield Message.Url, url, text.nameext_from_url(url, file) + + def _pagination(self, page): + while True: + for thumb in text.extract_iter( + page, 'class="thumb-container', "</div>"): + yield thumb + + url = text.extr(page, '<link rel="next" href="', '"') + if not url: + return + page = self.request(text.unescape(url)).text + + def _extract_gallery_data(self, page): + extr = text.extract_from(page) + return { + "gallery_id": self.groups[-1], + "gallery_title": text.unescape(extr( + "<title>", "<").rpartition(" | ")[0]), + "uploader": text.remove_html(extr( + 'class="gallery-member-username">', "</")), + "count": text.parse_int( + extr('<span class="active">', ")") + .rpartition("(")[2].replace(",", "")), + } + + def _parse_thumb_data(self, thumb): + extr = 
text.extract_from(thumb) + data = { + "id" : extr('data-codename="', '"'), + "type" : extr('data-mediatype="', '"'), + "thumbnail": extr('class="static" src="', '"'), + "title" : extr(' alt="', '"'), + } + + type = data["type"] + url = data["thumbnail"].replace("thumb", type) + if type == "video": + url = "{}/{}.mp4".format(url.rpartition("/")[0], data["id"]) + data["url"] = url + + return data diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 61ffdee..8ffa14b 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -193,7 +193,8 @@ class NewgroundsExtractor(Extractor): data["_comment"] = extr( 'id="author_comments"', '</div>').partition(">")[2] data["comment"] = text.unescape(text.remove_html( - data["_comment"], "", "")) + data["_comment"] + .replace("<p><br></p>", "\n\n").replace("<br>", "\n"), "", "")) data["favorites"] = text.parse_int(extr( 'id="faves_load">', '<').replace(",", "")) data["score"] = text.parse_float(extr('id="score_number">', '<')) @@ -214,7 +215,7 @@ class NewgroundsExtractor(Extractor): data = { "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), - "type" : extr('og:type" content="', '"'), + "type" : "art", "_type" : "i", "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), @@ -231,7 +232,7 @@ class NewgroundsExtractor(Extractor): if image_data: data["_multi"] = self._extract_images_multi(image_data) else: - art_images = extr('<div class="art-images', '\n</div>') + art_images = extr('<div class="art-images', '\n\t\t</div>') if art_images: data["_multi"] = self._extract_images_art(art_images, data) @@ -263,7 +264,7 @@ class NewgroundsExtractor(Extractor): return { "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), - "type" : extr('og:type" content="', '"'), + "type" : "audio", "_type" : "a", "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), @@ -283,8 +284,13 @@ class NewgroundsExtractor(Extractor): if src: src = src.replace("\\/", "/") formats = () + type = extr(',"description":"', '"') date = text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')) + if type: + type = type.rpartition(" ")[2].lower() + else: + type = "flash" if text.ext_from_url(url) == "swf" else "game" else: url = self.root + "/portal/video/" + index headers = { @@ -295,6 +301,7 @@ class NewgroundsExtractor(Extractor): formats = self._video_formats(sources) src = next(formats, "") date = text.parse_timestamp(src.rpartition("?")[2]) + type = "movie" return { "title" : text.unescape(title), @@ -513,7 +520,9 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor): class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor): """Extractor for a newgrounds user's favorited users""" subcategory = "following" - pattern = USER_PATTERN + r"/favorites/(following)" + pattern = (USER_PATTERN + r"/favorites/(following)" + r"(?:(?:/page/|/?\?page=)(\d+))?") + example = "https://USER.newgrounds.com/favorites/following" def items(self): diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py index 09b2b16..90c5420 100644 --- a/gallery_dl/extractor/nhentai.py +++ b/gallery_dl/extractor/nhentai.py @@ -61,7 +61,7 @@ class NhentaiGalleryExtractor(GalleryExtractor): def images(self, _): ufmt = ("https://i.nhentai.net/galleries/" + self.data["media_id"] + "/{}.{}") - extdict = {"j": "jpg", "p": 
"png", "g": "gif"} + extdict = {"j": "jpg", "p": "png", "g": "gif", "w": "webp"} return [ (ufmt.format(num, extdict.get(img["t"], "jpg")), { diff --git a/gallery_dl/extractor/noop.py b/gallery_dl/extractor/noop.py new file mode 100644 index 0000000..df2316c --- /dev/null +++ b/gallery_dl/extractor/noop.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""noop extractor""" + +from .common import Extractor, Message + + +class NoopExtractor(Extractor): + category = "noop" + pattern = r"(?i)noo?p$" + example = "noop" + + def items(self): + # yield *something* to prevent a 'No results' message + yield Message.Version, 1 + + # Save cookies manually, since it happens automatically only after + # extended extractor initialization, i.e. Message.Directory, which + # itself might cause some unintended effects. + if self.cookies: + self.cookies_store() diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 0b64ea3..3eacf1a 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -26,12 +26,15 @@ class PatreonExtractor(Extractor): _warning = True def _init(self): - self.session.headers["User-Agent"] = \ - "Patreon/72.2.28 (Android; Android 14; Scale/2.10)" - if self._warning: - if not self.cookies_check(("session_id",)): + if self.cookies_check(("session_id",)): + self.session.headers["User-Agent"] = \ + "Patreon/72.2.28 (Android; Android 14; Scale/2.10)" + else: + if self._warning: + PatreonExtractor._warning = False self.log.warning("no 'session_id' cookie set") - PatreonExtractor._warning = False + self.session.headers["User-Agent"] = \ + "Patreon/7.6.28 (Android; Android 11; Scale/2.10)" def items(self): generators = self._build_file_generators(self.config("files")) diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py index 150efed..1b67272 100644 --- a/gallery_dl/extractor/philomena.py +++ b/gallery_dl/extractor/philomena.py @@ -46,7 +46,7 @@ BASE_PATTERN = PhilomenaExtractor.update({ "ponybooru": { "root": "https://ponybooru.org", "pattern": r"(?:www\.)?ponybooru\.org", - "filter_id": "2", + "filter_id": "3", }, "furbooru": { "root": "https://furbooru.org", diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py index 422325f..fe26704 100644 --- a/gallery_dl/extractor/piczel.py +++ b/gallery_dl/extractor/piczel.py @@ -19,7 +19,7 @@ class PiczelExtractor(Extractor): filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" root = "https://piczel.tv" - api_root = root + root_api = root def items(self): for post in self.posts(): @@ -75,7 +75,7 @@ class PiczelUserExtractor(PiczelExtractor): self.user = match.group(1) def posts(self): - url = "{}/api/users/{}/gallery".format(self.api_root, self.user) + url = "{}/api/users/{}/gallery".format(self.root_api, self.user) return self._pagination(url) @@ -93,7 +93,7 @@ class PiczelFolderExtractor(PiczelExtractor): self.user, self.folder_id = match.groups() def posts(self): - url = "{}/api/users/{}/gallery".format(self.api_root, self.user) + url = "{}/api/users/{}/gallery".format(self.root_api, self.user) return self._pagination(url, int(self.folder_id)) @@ -108,5 +108,5 @@ class PiczelImageExtractor(PiczelExtractor): self.image_id = match.group(1) def posts(self): - url = 
"{}/api/gallery/{}".format(self.api_root, self.image_id) + url = "{}/api/gallery/{}".format(self.root_api, self.image_id) return (self.request(url).json(),) diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py index 5362f13..5749240 100644 --- a/gallery_dl/extractor/pillowfort.py +++ b/gallery_dl/extractor/pillowfort.py @@ -52,6 +52,7 @@ class PillowfortExtractor(Extractor): post["date"] = text.parse_datetime( post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") post["post_id"] = post.pop("id") + post["count"] = len(files) yield Message.Directory, post post["num"] = 0 diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 499c579..121c7bf 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -117,11 +117,16 @@ class PinterestExtractor(Extractor): else: media = self._extract_image(page, block) - elif type == "story_pin_video_block": + elif type == "story_pin_video_block" or "video" in block: video = block["video"] media = self._extract_video(video) media["media_id"] = video.get("id") or "" + elif type == "story_pin_music_block" or "audio" in block: + media = block["audio"] + media["url"] = media["audio_url"] + media["media_id"] = media.get("id") or "" + elif type == "story_pin_paragraph_block": media = {"url": "text:" + block["text"], "extension": "txt", @@ -130,7 +135,10 @@ class PinterestExtractor(Extractor): else: self.log.warning("%s: Unsupported story block '%s'", pin.get("id"), type) - continue + try: + media = self._extract_image(page, block) + except Exception: + continue media["story_id"] = story_id media["page_id"] = page_id @@ -397,14 +405,19 @@ class PinterestAPI(): self.root = extractor.root self.cookies = {"csrftoken": csrf_token} self.headers = { - "Accept" : "application/json, text/javascript, " - "*/*, q=0.01", - "Accept-Language" : "en-US,en;q=0.5", - "X-Requested-With" : "XMLHttpRequest", - "X-APP-VERSION" : "0c4af40", - "X-CSRFToken" : csrf_token, - "X-Pinterest-AppState": "active", - "Origin" : self.root, + "Accept" : "application/json, text/javascript, " + "*/*, q=0.01", + "X-Requested-With" : "XMLHttpRequest", + "X-APP-VERSION" : "a89153f", + "X-Pinterest-AppState" : "active", + "X-Pinterest-Source-Url" : None, + "X-Pinterest-PWS-Handler": "www/[username].js", + "Alt-Used" : "www.pinterest.com", + "Connection" : "keep-alive", + "Cookie" : None, + "Sec-Fetch-Dest" : "empty", + "Sec-Fetch-Mode" : "cors", + "Sec-Fetch-Site" : "same-origin", } def pin(self, pin_id): @@ -437,7 +450,12 @@ class PinterestAPI(): def board_pins(self, board_id): """Yield all pins of a specific board""" - options = {"board_id": board_id} + options = { + "board_id": board_id, + "field_set_key": "react_grid_pin", + "prepend": False, + "bookmarks": None, + } return self._pagination("BoardFeed", options) def board_section(self, section_id): diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 8c6e6d8..8ad061d 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -26,13 +26,14 @@ class PixivExtractor(Extractor): directory_fmt = ("{category}", "{user[id]} {user[account]}") filename_fmt = "{id}_p{num}.{extension}" archive_fmt = "{id}{suffix}.{extension}" - cookies_domain = None + cookies_domain = ".pixiv.net" sanity_url = "https://s.pximg.net/common/images/limit_sanity_level_360.png" mypixiv_url = "https://s.pximg.net/common/images/limit_mypixiv_360.png" def _init(self): self.api = PixivAppAPI(self) self.load_ugoira = self.config("ugoira", True) + 
self.load_ugoira_original = (self.load_ugoira == "original") self.max_posts = self.config("max-posts", 0) self.sanity_workaround = self.config("sanity", True) self.meta_user = self.config("metadata") @@ -105,34 +106,7 @@ class PixivExtractor(Extractor): del work["image_urls"] del work["meta_pages"] - if work["type"] == "ugoira": - if self.load_ugoira: - try: - return self._extract_ugoira(work) - except Exception as exc: - self.log.warning( - "%s: Unable to retrieve Ugoira metatdata (%s - %s)", - work["id"], exc.__class__.__name__, exc) - - elif work["page_count"] == 1: - url = meta_single_page["original_image_url"] - if url == self.sanity_url: - if self.sanity_workaround: - self.log.warning("%s: 'sanity_level' warning", work["id"]) - body = self._request_ajax("/illust/" + str(work["id"])) - return self._extract_ajax(work, body) - else: - self.log.warning( - "%s: Unable to download work ('sanity_level' warning)", - work["id"]) - elif url == self.mypixiv_url: - work["_mypixiv"] = True - self.log.warning("%s: 'My pixiv' locked", work["id"]) - return () - else: - return ({"url": url},) - - else: + if meta_pages: return [ { "url" : img["image_urls"]["original"], @@ -141,30 +115,58 @@ class PixivExtractor(Extractor): for num, img in enumerate(meta_pages) ] + url = meta_single_page["original_image_url"] + if url == self.sanity_url: + work["_ajax"] = True + self.log.warning("%s: 'limit_sanity_level' warning", work["id"]) + if self.sanity_workaround: + body = self._request_ajax("/illust/" + str(work["id"])) + return self._extract_ajax(work, body) + + elif url == self.mypixiv_url: + work["_mypixiv"] = True + self.log.warning("%s: 'My pixiv' locked", work["id"]) + + elif work["type"] != "ugoira": + return ({"url": url},) + + elif self.load_ugoira: + try: + return self._extract_ugoira(work, url) + except Exception as exc: + self.log.warning( + "%s: Unable to retrieve Ugoira metatdata (%s - %s)", + work["id"], exc.__class__.__name__, exc) + return () - def _extract_ugoira(self, work): + def _extract_ugoira(self, work, img_url): ugoira = self.api.ugoira_metadata(work["id"]) - url = ugoira["zip_urls"]["medium"] work["_ugoira_frame_data"] = work["frames"] = frames = ugoira["frames"] - work["date_url"] = self._date_from_url(url) + work["_ugoira_original"] = self.load_ugoira_original work["_http_adjust_extension"] = False - if self.load_ugoira == "original": - work["_ugoira_original"] = True - base, sep, _ = url.rpartition("_ugoira") - base = base.replace("/img-zip-ugoira/", "/img-original/", 1) + sep + if self.load_ugoira_original: + work["date_url"] = self._date_from_url(img_url) - for ext in ("jpg", "png", "gif"): - try: - url = "{}0.{}".format(base, ext) - self.request(url, method="HEAD") - break - except exception.HttpError: - pass + base, sep, ext = img_url.rpartition("_ugoira0.") + if sep: + base += "_ugoira" else: - self.log.warning( - "Unable to find Ugoira frame URLs (%s)", work["id"]) + base, sep, _ = img_url.rpartition("_ugoira") + base = base.replace( + "/img-zip-ugoira/", "/img-original/", 1) + sep + + for ext in ("jpg", "png", "gif"): + try: + url = "{}0.{}".format(base, ext) + self.request(url, method="HEAD") + break + except exception.HttpError: + pass + else: + self.log.warning( + "Unable to find Ugoira frame URLs (%s)", work["id"]) return [ { @@ -174,9 +176,11 @@ class PixivExtractor(Extractor): } for num in range(len(frames)) ] + else: - work["_ugoira_original"] = False - url = url.replace("_ugoira600x600", "_ugoira1920x1080", 1) + zip_url = ugoira["zip_urls"]["medium"] + 
work["date_url"] = self._date_from_url(zip_url) + url = zip_url.replace("_ugoira600x600", "_ugoira1920x1080", 1) return ({"url": url},) def _request_ajax(self, endpoint): @@ -333,12 +337,12 @@ class PixivUserExtractor(PixivExtractor): class PixivArtworksExtractor(PixivExtractor): """Extractor for artworks of a pixiv user""" subcategory = "artworks" - _warning = True pattern = (BASE_PATTERN + r"/(?:" r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" r"(?:/([^/?#]+))?/?(?:$|[?#])" r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") example = "https://www.pixiv.net/en/users/12345/artworks" + _warn_phpsessid = True def _init(self): PixivExtractor._init(self) @@ -352,12 +356,13 @@ class PixivArtworksExtractor(PixivExtractor): self.tag = t1 or t2 if self.sanity_workaround: - self.cookies_domain = d = ".pixiv.net" + self.cookies_domain = domain = ".pixiv.net" self._init_cookies() - if self._warning and not self.cookies.get("PHPSESSID", domain=d): - PixivArtworksExtractor._warning = False - self.log.warning("No 'PHPSESSID' cookie set. Can detect only " - "non R-18 'sanity_level' works.") + if self._warn_phpsessid: + PixivArtworksExtractor._warn_phpsessid = False + if not self.cookies.get("PHPSESSID", domain=domain): + self.log.warning("No 'PHPSESSID' cookie set. Can detect on" + "ly non R-18 'limit_sanity_level' works.") def metadata(self): if self.config("metadata"): @@ -601,7 +606,10 @@ class PixivRankingExtractor(PixivExtractor): self.mode = self.date = None def works(self): - return self.api.illust_ranking(self.mode, self.date) + ranking = self.ranking + for ranking["rank"], work in enumerate( + self.api.illust_ranking(self.mode, self.date), 1): + yield work def metadata(self): query = text.parse_query(self.query) @@ -640,10 +648,12 @@ class PixivRankingExtractor(PixivExtractor): date = (now - timedelta(days=1)).strftime("%Y-%m-%d") self.date = date - return {"ranking": { + self.ranking = ranking = { "mode": mode, "date": self.date, - }} + "rank": 0, + } + return {"ranking": ranking} class PixivSearchExtractor(PixivExtractor): @@ -734,7 +744,6 @@ class PixivPixivisionExtractor(PixivExtractor): directory_fmt = ("{category}", "pixivision", "{pixivision_id} {pixivision_title}") archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}" - cookies_domain = ".pixiv.net" pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)" example = "https://www.pixivision.net/en/a/12345" diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index bd22283..e09a7aa 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -41,7 +41,7 @@ class PoipikuExtractor(Extractor): post = { "post_category": extr("<title>[", "]"), - "count" : extr("(", " "), + "count" : text.parse_int(extr("(", " ")), "post_id" : parts[-1].partition(".")[0], "user_id" : parts[-2], "user_name" : text.unescape(extr( diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 8577e74..89eafc8 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -340,18 +340,16 @@ class RedditRedirectExtractor(Extractor): category = "reddit" subcategory = "redirect" pattern = (r"(?:https?://)?(?:" - r"(?:\w+\.)?reddit\.com/(?:(?:r)/([^/?#]+)))" + r"(?:\w+\.)?reddit\.com/(?:(r|u|user)/([^/?#]+)))" r"/s/([a-zA-Z0-9]{10})") example = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ" - def __init__(self, match): - Extractor.__init__(self, match) - self.subreddit = match.group(1) - self.share_url = match.group(2) - def items(self): - url = 
"https://www.reddit.com/r/" + self.subreddit + "/s/" + \ - self.share_url + sub_type, subreddit, share_url = self.groups + if sub_type == "u": + sub_type = "user" + url = "https://www.reddit.com/{}/{}/s/{}".format( + sub_type, subreddit, share_url) data = {"_extractor": RedditSubmissionExtractor} response = self.request(url, method="HEAD", allow_redirects=False, notfound="submission") diff --git a/gallery_dl/extractor/rule34vault.py b/gallery_dl/extractor/rule34vault.py new file mode 100644 index 0000000..8c8abfa --- /dev/null +++ b/gallery_dl/extractor/rule34vault.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://rule34vault.com/""" + +from .booru import BooruExtractor +from .. import text +import collections + +BASE_PATTERN = r"(?:https?://)?rule34vault\.com" + + +class Rule34vaultExtractor(BooruExtractor): + category = "rule34vault" + root = "https://rule34vault.com" + root_cdn = "https://r34xyz.b-cdn.net" + filename_fmt = "{category}_{id}.{extension}" + per_page = 100 + + TAG_TYPES = { + 1: "general", + 2: "copyright", + 4: "character", + 8: "artist", + } + + def _file_url(self, post): + post_id = post["id"] + extension = "jpg" if post["type"] == 0 else "mp4" + post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format( + self.root_cdn, post_id // 1000, post_id, post_id, extension) + return url + + def _prepare(self, post): + post.pop("files", None) + post["date"] = text.parse_datetime( + post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + if "tags" in post: + post["tags"] = [t["value"] for t in post["tags"]] + + def _tags(self, post, _): + if "tags" not in post: + post.update(self._fetch_post(post["id"])) + + tags = collections.defaultdict(list) + for tag in post["tags"]: + tags[tag["type"]].append(tag["value"]) + types = self.TAG_TYPES + for type, values in tags.items(): + post["tags_" + types[type]] = values + + def _fetch_post(self, post_id): + url = "{}/api/v2/post/{}".format(self.root, post_id) + return self.request(url).json() + + def _pagination(self, endpoint, params=None): + url = "{}/api{}".format(self.root, endpoint) + + if params is None: + params = {} + params["CountTotal"] = False + params["Skip"] = self.page_start * self.per_page + params["take"] = self.per_page + threshold = self.per_page + + while True: + data = self.request(url, method="POST", json=params).json() + + yield from data["items"] + + if len(data["items"]) < threshold: + return + params["cursor"] = data.get("cursor") + params["Skip"] += params["take"] + + +class Rule34vaultPostExtractor(Rule34vaultExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/post/(\d+)" + example = "https://rule34vault.com/post/12345" + + def posts(self): + return (self._fetch_post(self.groups[0]),) + + +class Rule34vaultPlaylistExtractor(Rule34vaultExtractor): + subcategory = "playlist" + directory_fmt = ("{category}", "{playlist_id}") + archive_fmt = "p_{playlist_id}_{id}" + pattern = BASE_PATTERN + r"/playlists/view/(\d+)" + example = "https://rule34vault.com/playlists/view/12345" + + def metadata(self): + return {"playlist_id": self.groups[0]} + + def posts(self): + endpoint = "/v2/post/search/playlist/" + self.groups[0] + return self._pagination(endpoint) + + +class Rule34vaultTagExtractor(Rule34vaultExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + 
archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/(?!p(?:ost|laylists)/)([^/?#]+)" + example = "https://rule34vault.com/TAG" + + def metadata(self): + self.tags = text.unquote(self.groups[0]).split("%7C") + return {"search_tags": " ".join(self.tags)} + + def posts(self): + endpoint = "/v2/post/search/root" + params = {"includeTags": [t.replace("_", " ") for t in self.tags]} + return self._pagination(endpoint, params) diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py new file mode 100644 index 0000000..f1e7518 --- /dev/null +++ b/gallery_dl/extractor/rule34xyz.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://rule34.xyz/""" + +from .booru import BooruExtractor +from .. import text +import collections + +BASE_PATTERN = r"(?:https?://)?rule34\.xyz" + + +class Rule34xyzExtractor(BooruExtractor): + category = "rule34xyz" + root = "https://rule34.xyz" + root_cdn = "https://rule34xyz.b-cdn.net" + filename_fmt = "{category}_{id}.{extension}" + per_page = 60 + + TAG_TYPES = { + 0: "general", + 1: "copyright", + 2: "character", + 3: "artist", + } + + def _init(self): + formats = self.config("format") + if formats: + if isinstance(formats, str): + formats = formats.split(",") + self.formats = formats + else: + self.formats = ("10", "40", "41", "2") + + def _file_url(self, post): + post["files"] = files = { + str(link["type"]): link["url"] + for link in post.pop("imageLinks") + } + + for fmt in self.formats: + if fmt in files: + break + else: + fmt = "2" + self.log.warning("%s: Requested format not available", post["id"]) + + post["file_url"] = url = files[fmt] + post["format_id"] = fmt + post["format"] = url.rsplit(".", 2)[1] + return url + + def _prepare(self, post): + post.pop("filesPreview", None) + post.pop("tagsWithType", None) + post["date"] = text.parse_datetime( + post["created"], "%Y-%m-%dT%H:%M:%S.%f") + + def _tags(self, post, _): + if post.get("tagsWithType") is None: + post.update(self._fetch_post(post["id"])) + + tags = collections.defaultdict(list) + for tag in post["tagsWithType"]: + tags[tag["type"]].append(tag["value"]) + types = self.TAG_TYPES + for type, values in tags.items(): + post["tags_" + types[type]] = values + + def _fetch_post(self, post_id): + url = "{}/api/post/{}".format(self.root, post_id) + return self.request(url).json() + + def _pagination(self, endpoint, params=None): + url = "{}/api{}".format(self.root, endpoint) + + if params is None: + params = {} + params["IncludeLinks"] = "true" + params["IncludeTags"] = "true" + params["OrderBy"] = "0" + params["Skip"] = self.page_start * self.per_page + params["Take"] = self.per_page + params["DisableTotal"] = "true" + threshold = self.per_page + + while True: + data = self.request(url, params=params).json() + + yield from data["items"] + + if len(data["items"]) < threshold: + return + params["Skip"] += params["Take"] + + +class Rule34xyzPostExtractor(Rule34xyzExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/post/(\d+)" + example = "https://rule34.xyz/post/12345" + + def posts(self): + return (self._fetch_post(self.groups[0]),) + + +class Rule34xyzPlaylistExtractor(Rule34xyzExtractor): + subcategory = "playlist" + directory_fmt = ("{category}", "{playlist_id}") + archive_fmt = 
"p_{playlist_id}_{id}" + pattern = BASE_PATTERN + r"/playlists/view/(\d+)" + example = "https://rule34.xyz/playlists/view/12345" + + def metadata(self): + return {"playlist_id": self.groups[0]} + + def posts(self): + endpoint = "/playlist-item" + params = {"PlaylistId": self.groups[0]} + return self._pagination(endpoint, params) + + +class Rule34xyzTagExtractor(Rule34xyzExtractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/([^/?#]+)$" + example = "https://rule34.xyz/TAG" + + def metadata(self): + self.tags = text.unquote(self.groups[0]).replace("_", " ") + return {"search_tags": self.tags} + + def posts(self): + endpoint = "/post/search" + params = {"Tag": self.tags} + return self._pagination(endpoint, params) diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py new file mode 100644 index 0000000..784cdc0 --- /dev/null +++ b/gallery_dl/extractor/saint.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://saint2.su/""" + +from .lolisafe import LolisafeAlbumExtractor +from .. import text + +BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|to)" + + +class SaintAlbumExtractor(LolisafeAlbumExtractor): + """Extractor for saint albums""" + category = "saint" + root = "https://saint2.su" + pattern = BASE_PATTERN + r"/a/([^/?#]+)" + example = "https://saint2.su/a/ID" + + def fetch_album(self, album_id): + # album metadata + response = self.request(self.root + "/a/" + album_id) + extr = text.extract_from(response.text) + + title = extr("<title>", "<") + descr = extr('name="description" content="', '"') + files = [] + + while True: + id2 = extr("/thumbs/", "-") + if not id2: + break + files.append({ + "id2" : id2, + "date" : text.parse_timestamp(extr("", ".")), + "id" : extr("/embed/", '"'), + "size" : text.parse_int(extr('data="', '"')), + "file" : text.unescape(extr( + "onclick=\"play(", ")").strip("\"'")), + "id_dl": extr("/d/", ")").rstrip("\"'"), + }) + + return files, { + "album_id" : album_id, + "album_name" : text.unescape(title.rpartition(" - ")[0]), + "album_size" : sum(file["size"] for file in files), + "description" : text.unescape(descr), + "count" : len(files), + "_http_headers": {"Referer": response.url} + } + + +class SaintMediaExtractor(SaintAlbumExtractor): + """Extractor for saint media links""" + subcategory = "media" + directory_fmt = ("{category}",) + pattern = BASE_PATTERN + r"(/(embe)?d/([^/?#]+))" + example = "https://saint2.su/embed/ID" + + def fetch_album(self, album_id): + try: + path, embed, _ = self.groups + + url = self.root + path + response = self.request(url) + extr = text.extract_from(response.text) + + if embed: + file = { + "id" : album_id, + "id2" : extr("/thumbs/", "-"), + "date" : text.parse_timestamp(extr("", ".")), + "file" : text.unescape(extr('<source src="', '"')), + "id_dl": extr("/d/", "'"), + } + + else: # /d/ + file = { + "file" : text.unescape(extr('<a href="', '"')), + "id_dl" : album_id, + "name" : album_id, + "filename" : album_id, + "extension": "mp4", + } + + file["_http_headers"] = {"Referer": response.url} + except Exception as exc: + self.log.error("%s: %s", exc.__class__.__name__, exc) + return (), {} + + return (file,), { + "album_id" : "", + "album_name" : "", + "album_size" : -1, 
+ "description": "", + "count" : 1, + } diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index 7db8172..d5309dc 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -76,14 +76,15 @@ class SankakuExtractor(BooruExtractor): def _tags(self, post, page): tags = collections.defaultdict(list) - types = self.TAG_TYPES for tag in post["tags"]: name = tag["name"] if name: - tags[types[tag["type"]]].append(name.lower().replace(" ", "_")) - for key, value in tags.items(): - post["tags_" + key] = value - post["tag_string_" + key] = " ".join(value) + tags[tag["type"]].append(name.lower().replace(" ", "_")) + types = self.TAG_TYPES + for type, values in tags.items(): + name = types[type] + post["tags_" + name] = values + post["tag_string_" + name] = " ".join(values) def _notes(self, post, page): if post.get("has_notes"): diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py index 9f9f0c4..c818c98 100644 --- a/gallery_dl/extractor/scrolller.py +++ b/gallery_dl/extractor/scrolller.py @@ -32,7 +32,12 @@ class ScrolllerExtractor(Extractor): for post in self.posts(): - src = max(post["mediaSources"], key=self._sort_key) + media_sources = post.get("mediaSources") + if not media_sources: + self.log.warning("%s: No media files", post.get("id")) + continue + + src = max(media_sources, key=self._sort_key) post.update(src) url = src["url"] text.nameext_from_url(url, post) diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 3639c0b..48bd918 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -21,7 +21,6 @@ class SmugmugExtractor(Extractor): category = "smugmug" filename_fmt = ("{category}_{User[NickName]:?/_/}" "{Image[UploadKey]}_{Image[ImageKey]}.{extension}") - cookies_domain = None empty_user = { "Uri": "", "ResponseLevel": "Public", diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py index 8582824..c120ee5 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -56,14 +56,19 @@ class SteamgriddbExtractor(Extractor): download_fake_png = self.config("download-fake-png", True) for asset in self.assets(): - if download_fake_png and asset.get("fake_png"): - urls = (asset["url"], asset["fake_png"]) - else: - urls = (asset["url"],) + fake_png = download_fake_png and asset.get("fake_png") - asset["count"] = len(urls) + asset["count"] = 2 if fake_png else 1 yield Message.Directory, asset - for asset["num"], url in enumerate(urls, 1): + + asset["num"] = 1 + url = asset["url"] + yield Message.Url, url, text.nameext_from_url(url, asset) + + if fake_png: + asset["num"] = 2 + asset["_http_adjust_extension"] = False + url = fake_png yield Message.Url, url, text.nameext_from_url(url, asset) def _call(self, endpoint, **kwargs): diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 73455d2..8d1fcde 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -21,8 +21,8 @@ BASE_PATTERN = ( r"([\w-]+\.tumblr\.com)))" ) -POST_TYPES = frozenset(( - "text", "quote", "link", "answer", "video", "audio", "photo", "chat")) +POST_TYPES = frozenset(("text", "quote", "link", "answer", "video", + "audio", "photo", "chat", "search")) class TumblrExtractor(Extractor): @@ -31,7 +31,6 @@ class TumblrExtractor(Extractor): directory_fmt = ("{category}", "{blog_name}") filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" - 
cookies_domain = None def __init__(self, match): Extractor.__init__(self, match) @@ -83,14 +82,21 @@ class TumblrExtractor(Extractor): return if post["type"] not in self.types: continue - if not blog: - blog = self.api.info(self.blog) - blog["uuid"] = self.blog - if self.avatar: - url = self.api.avatar(self.blog) - yield Message.Directory, {"blog": blog} - yield self._prepare_avatar(url, post.copy(), blog) + if "blog" in post: + blog = post["blog"] + self.blog = blog["name"] + ".tumblr.com" + else: + if not blog: + blog = self.api.info(self.blog) + blog["uuid"] = self.blog + + if self.avatar: + url = self.api.avatar(self.blog) + yield Message.Directory, {"blog": blog} + yield self._prepare_avatar(url, post.copy(), blog) + + post["blog"] = blog reblog = "reblogged_from_id" in post if reblog and self._skip_reblog(post): @@ -99,7 +105,6 @@ class TumblrExtractor(Extractor): if "trail" in post: del post["trail"] - post["blog"] = blog post["date"] = text.parse_timestamp(post["timestamp"]) posts = [] @@ -349,6 +354,19 @@ class TumblrLikesExtractor(TumblrExtractor): return self.api.likes(self.blog) +class TumblrSearchExtractor(TumblrExtractor): + """Extractor for a Tumblr search""" + subcategory = "search" + pattern = (BASE_PATTERN + r"/search/([^/?#]+)" + r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?") + example = "https://www.tumblr.com/search/QUERY" + + def posts(self): + _, _, _, search, mode, post_type, query = self.groups + params = text.parse_query(query) + return self.api.search(text.unquote(search), params, mode, post_type) + + class TumblrAPI(oauth.OAuth1API): """Interface for the Tumblr API v2 @@ -394,7 +412,8 @@ class TumblrAPI(oauth.OAuth1API): if self.before and params["offset"]: self.log.warning("'offset' and 'date-max' cannot be used together") - return self._pagination(blog, "/posts", params, cache=True) + endpoint = "/v2/blog/{}/posts".format(blog) + return self._pagination(endpoint, params, blog=blog, cache=True) def likes(self, blog): """Retrieve liked posts""" @@ -410,6 +429,20 @@ class TumblrAPI(oauth.OAuth1API): yield from posts params["before"] = posts[-1]["liked_timestamp"] + def search(self, query, params, mode="top", post_type=None): + """Retrieve search results""" + endpoint = "/v2/timeline/search" + + params["limit"] = "50" + params["days"] = params.pop("t", None) + params["query"] = query + params["mode"] = mode + params["reblog_info"] = "true" if self.extractor.reblogs else "false" + if post_type: + params["post_type_filter"] = post_type + + return self._pagination(endpoint, params) + def _call(self, endpoint, params, **kwargs): url = self.ROOT + endpoint kwargs["params"] = params @@ -478,20 +511,28 @@ class TumblrAPI(oauth.OAuth1API): raise exception.StopExtraction(data) - def _pagination(self, blog, endpoint, params, key="posts", cache=False): - endpoint = "/v2/blog/{}{}".format(blog, endpoint) + def _pagination(self, endpoint, params, + blog=None, key="posts", cache=False): if self.api_key: params["api_key"] = self.api_key strategy = self.extractor.config("pagination") + if not strategy and "offset" not in params: + strategy = "api" + while True: data = self._call(endpoint, params) - if cache: - self.BLOG_CACHE[blog] = data["blog"] - cache = False + if "timeline" in data: + data = data["timeline"] + posts = data["elements"] + + else: + if cache: + self.BLOG_CACHE[blog] = data["blog"] + cache = False + posts = data[key] - posts = data[key] yield from posts if strategy == "api": diff --git a/gallery_dl/extractor/tumblrgallery.py 
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
index 27cc9d0..448625e 100644
--- a/gallery_dl/extractor/tumblrgallery.py
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -18,6 +18,7 @@ class TumblrgalleryExtractor(GalleryExtractor):
     filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}"
     directory_fmt = ("{category}", "{gallery_id} {title}")
     root = "https://tumblrgallery.xyz"
+    referer = False
 
     @staticmethod
     def _urls_from_page(page):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 9c9d505..090b11a 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -305,6 +305,7 @@ class TwitterExtractor(Extractor):
                 legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
         except Exception:
             date = util.NONE
+        source = tweet.get("source")
 
         tdata = {
             "tweet_id"      : tweet_id,
@@ -320,7 +321,7 @@
             "author"        : author,
             "user"          : self._user or author,
             "lang"          : legacy["lang"],
-            "source"        : text.extr(tweet["source"], ">", "<"),
+            "source"        : text.extr(source, ">", "<") if source else "",
             "sensitive"     : tget("possibly_sensitive"),
             "favorite_count": tget("favorite_count"),
             "quote_count"   : tget("quote_count"),
@@ -538,12 +539,6 @@ class TwitterExtractor(Extractor):
         if username:
             return self.cookies_update(_login_impl(self, username, password))
 
-        for cookie in self.cookies:
-            if cookie.domain == ".twitter.com":
-                self.cookies.set(
-                    cookie.name, cookie.value, domain=self.cookies_domain,
-                    expires=cookie.expires, secure=cookie.secure)
-
 
 class TwitterUserExtractor(TwitterExtractor):
     """Extractor for a Twitter user"""
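
The Twitter change above avoids a KeyError for tweets without a "source" field by fetching it once with tweet.get() and only parsing it when present. A self-contained illustration, with _extr as a simplified stand-in for gallery_dl's text.extr helper:

    # _extr returns the text between two markers, "" on failure (stand-in).
    def _extr(txt, begin, end):
        try:
            first = txt.index(begin) + len(begin)
            return txt[first:txt.index(end, first)]
        except ValueError:
            return ""

    for tweet in ({"source": '<a href="https://example.org">Example App</a>'},
                  {}):                      # second tweet has no "source"
        source = tweet.get("source")
        print(repr(_extr(source, ">", "<") if source else ""))
    # 'Example App'
    # ''
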
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index f7ce44b..bb80055 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -13,8 +13,8 @@ from .. import text, exception
 
 class UrlgalleriesGalleryExtractor(GalleryExtractor):
     """Base class for Urlgalleries extractors"""
     category = "urlgalleries"
-    root = "urlgalleries.net"
-    request_interval = (0.5, 1.0)
+    root = "https://urlgalleries.net"
+    request_interval = (0.5, 1.5)
     pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)"
     example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE"
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 949c7cb..70ab259 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -155,7 +155,10 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
 
     def items(self):
         page = None
-        data = {"_extractor": WebtoonsEpisodeExtractor}
+        data = {
+            "_extractor": WebtoonsEpisodeExtractor,
+            "title_no"  : text.parse_int(self.title_no),
+        }
 
         while True:
             path = "/{}/list?title_no={}&page={}".format(
@@ -173,6 +176,8 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
             data["page"] = self.page_no
 
             for url in self.get_episode_urls(page):
+                params = text.parse_query(url.rpartition("?")[2])
+                data["episode_no"] = text.parse_int(params.get("episode_no"))
                 yield Message.Queue, url, data
 
             self.page_no += 1
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 83b1642..9885d79 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -126,11 +126,7 @@ class WeiboExtractor(Extractor):
 
                 elif pic_type == "livephoto" and self.livephoto:
                     append(pic["largest"].copy())
-
-                    file = {"url": pic["video"]}
-                    file["filename"], _, file["extension"] = \
-                        pic["video"].rpartition("%2F")[2].rpartition(".")
-                    append(file)
+                    append({"url": pic["video"]})
 
                 else:
                     append(pic["largest"].copy())
@@ -251,6 +247,11 @@ class WeiboUserExtractor(WeiboExtractor):
     pattern = USER_PATTERN + r"(?:$|#)"
     example = "https://weibo.com/USER"
 
+    # do NOT override 'initialize()'
+    # it is needed for 'self._user_id()'
+    # def initialize(self):
+    #     pass
+
     def items(self):
         base = "{}/u/{}?tabtype=".format(self.root, self._user_id())
         return self._dispatch_extractors((
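
The Webtoons change above derives each queued episode's episode_no from its URL query string before yielding it. A quick standalone check of that parsing, using urllib's parse_qsl in place of gallery_dl's text.parse_query and an invented episode URL:

    # Sketch only: the URL is a made-up example.
    from urllib.parse import parse_qsl

    url = ("https://www.webtoons.com/en/fantasy/some-title"
           "/episode-12/viewer?title_no=1234&episode_no=12")
    params = dict(parse_qsl(url.rpartition("?")[2]))
    print(int(params.get("episode_no", 0)))  # 12
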
