Diffstat (limited to 'gallery_dl')
47 files changed, 1978 insertions, 1410 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 6c2c713..c1f80b6 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -196,7 +196,7 @@ def main(): cnt, "entry" if cnt == 1 else "entries", cache._path(), ) else: - if not args.urls and not args.inputfile: + if not args.urls and not args.inputfiles: parser.error( "The following arguments are required: URL\n" "Use 'gallery-dl --help' to get a list of all options.") @@ -208,18 +208,19 @@ def main(): jobtype = args.jobtype or job.DownloadJob urls = args.urls - if args.inputfile: - try: - if args.inputfile == "-": - if sys.stdin: - urls += parse_inputfile(sys.stdin, log) + if args.inputfiles: + for inputfile in args.inputfiles: + try: + if inputfile == "-": + if sys.stdin: + urls += parse_inputfile(sys.stdin, log) + else: + log.warning("input file: stdin is not readable") else: - log.warning("input file: stdin is not readable") - else: - with open(args.inputfile, encoding="utf-8") as file: - urls += parse_inputfile(file, log) - except OSError as exc: - log.warning("input file: %s", exc) + with open(inputfile, encoding="utf-8") as file: + urls += parse_inputfile(file, log) + except OSError as exc: + log.warning("input file: %s", exc) # unsupported file logging handler handler = output.setup_logging_handler( diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py deleted file mode 100644 index 0f49d61..0000000 --- a/gallery_dl/cloudflare.py +++ /dev/null @@ -1,201 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2015-2020 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Methods to access sites behind Cloudflare protection""" - -import time -import operator -import collections -import urllib.parse -from xml.etree import ElementTree -from . 
import text -from .cache import memcache - - -def is_challenge(response): - return (response.status_code == 503 and - response.headers.get("Server", "").startswith("cloudflare") and - b"jschl-answer" in response.content) - - -def is_captcha(response): - return (response.status_code == 403 and - b'name="captcha-bypass"' in response.content) - - -def solve_challenge(session, response, kwargs): - """Solve Cloudflare challenge and get cfclearance cookie""" - parsed = urllib.parse.urlsplit(response.url) - root = parsed.scheme + "://" + parsed.netloc - page = response.text - - cf_kwargs = {} - headers = cf_kwargs["headers"] = collections.OrderedDict() - params = cf_kwargs["data"] = collections.OrderedDict() - headers["Referer"] = response.url - - form = text.extract(page, 'id="challenge-form"', '</form>')[0] - for element in ElementTree.fromstring( - "<f>" + form + "</f>").findall("input"): - name = element.attrib.get("name") - if not name: - continue - if name == "jschl_answer": - try: - value = solve_js_challenge(page, parsed.netloc) - except Exception: - return response, None, None - else: - value = element.attrib.get("value") - params[name] = value - - try: - params = {"ray": text.extract(page, '?ray=', '"')[0]} - - url = root + "/cdn-cgi/images/trace/jschal/nojs/transparent.gif" - session.request("GET", url, params=params) - - url = root + "/cdn-cgi/images/trace/jschal/js/nocookie/transparent.gif" - session.request("GET", url, params=params) - except Exception: - pass - - time.sleep(4) - url = root + text.unescape(text.extract(page, 'action="', '"')[0]) - cf_response = session.request("POST", url, **cf_kwargs) - - if cf_response.history: - initial_response = cf_response.history[0] - else: - initial_response = cf_response - - cookies = { - cookie.name: cookie.value - for cookie in initial_response.cookies - } - - if not cookies: - import logging - log = logging.getLogger("cloudflare") - log.debug("Headers:\n%s", initial_response.headers) - log.debug("Content:\n%s", initial_response.text) - return cf_response, None, None - - domain = next(iter(initial_response.cookies)).domain - cookies["__cfduid"] = response.cookies.get("__cfduid", "") - return cf_response, domain, cookies - - -def solve_js_challenge(page, netloc): - """Evaluate JS challenge in 'page' to get 'jschl_answer' value""" - - # build variable name - # e.g. '...f, wqnVscP={"DERKbJk":+(...' 
--> wqnVscP.DERKbJk - data, pos = text.extract_all(page, ( - ('var' , ',f, ', '='), - ('key' , '"' , '"'), - ('expr', ':' , '}'), - )) - variable = "{}.{}".format(data["var"], data["key"]) - vlength = len(variable) - - k = text.extract(page, "k = '", "'")[0] - - # evaluate the initial expression - solution = evaluate_expression(data["expr"], page, netloc) - - # iterator over all remaining expressions - # and combine their values in 'solution' - expressions = text.extract( - page, "'challenge-form');", "f.submit();", pos)[0] - for expr in expressions.split(";")[1:]: - - if expr.startswith(variable): - # select arithmetc function based on operator (+/-/*) - func = OPERATORS[expr[vlength]] - # evaluate the rest of the expression - value = evaluate_expression(expr[vlength+2:], page, netloc, k) - # combine expression value with our current solution - solution = func(solution, value) - - elif expr.startswith("a.value"): - if "t.length)" in expr: - # add length of hostname - solution += len(netloc) - if ".toFixed(" in expr: - # trim solution to 10 decimal places - solution = "{:.10f}".format(solution) - return solution - - elif expr.startswith("k+="): - k += str(evaluate_expression(expr[3:], page, netloc)) - - -def evaluate_expression(expr, page, netloc, k=""): - """Evaluate a single Javascript expression for the challenge""" - - if expr.startswith("function(p)"): - # get HTML element with ID k and evaluate the expression inside - # 'eval(eval("document.getElementById(k).innerHTML"))' - expr = text.extract(page, 'id="'+k+'"', '<')[0] - return evaluate_expression(expr.partition(">")[2], page, netloc) - - if "/" in expr: - # split the expression in numerator and denominator subexpressions, - # evaluate them separately, - # and return their fraction-result - num, _, denom = expr.partition("/") - num = evaluate_expression(num, page, netloc) - denom = evaluate_expression(denom, page, netloc) - return num / denom - - if "function(p)" in expr: - # split initial expression and function code - initial, _, func = expr.partition("function(p)") - # evaluate said expression - initial = evaluate_expression(initial, page, netloc) - # get function argument and use it as index into 'netloc' - index = evaluate_expression(func[func.index("}")+1:], page, netloc) - return initial + ord(netloc[int(index)]) - - # iterate over all subexpressions, - # evaluate them, - # and accumulate their values in 'result' - result = "" - for subexpr in expr.strip("+()").split(")+("): - value = 0 - for part in subexpr.split("+"): - if "-" in part: - p1, _, p2 = part.partition("-") - value += VALUES[p1] - VALUES[p2] - else: - value += VALUES[part] - result += str(value) - return int(result) - - -OPERATORS = { - "+": operator.add, - "-": operator.sub, - "*": operator.mul, -} - - -VALUES = { - "": 0, - "!": 1, - "[]": 0, - "!![]": 1, - "(!![]": 1, - "(!![])": 1, -} - - -@memcache(keyarg=0) -def cookies(category): - return None diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py index 6fb09e1..e1b936e 100644 --- a/gallery_dl/downloader/__init__.py +++ b/gallery_dl/downloader/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,8 +8,6 @@ """Downloader modules""" -import importlib - modules = [ "http", "text", @@ -24,22 +22,22 @@ def find(scheme): except KeyError: pass - klass = None + cls = 
None if scheme == "https": scheme = "http" if scheme in modules: # prevent unwanted imports try: - module = importlib.import_module("." + scheme, __package__) + module = __import__(scheme, globals(), None, (), 1) except ImportError: pass else: - klass = module.__downloader__ + cls = module.__downloader__ if scheme == "http": - _cache["http"] = _cache["https"] = klass + _cache["http"] = _cache["https"] = cls else: - _cache[scheme] = klass - return klass + _cache[scheme] = cls + return cls # -------------------------------------------------------------------- diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 8d72dc2..bc42d7c 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -31,6 +31,7 @@ class HttpDownloader(DownloaderBase): self.downloading = False self.adjust_extension = self.config("adjust-extensions", True) + self.headers = self.config("headers") self.minsize = self.config("filesize-min") self.maxsize = self.config("filesize-max") self.retries = self.config("retries", extractor._retries) @@ -93,13 +94,16 @@ class HttpDownloader(DownloaderBase): time.sleep(tries) tries += 1 - headers = {} + headers = {"Accept": "*/*"} file_header = None # check for .part file file_size = pathfmt.part_size() if file_size: headers["Range"] = "bytes={}-".format(file_size) + # general headers + if self.headers: + headers.update(self.headers) # file-specific headers extra = pathfmt.kwdict.get("_http_headers") if extra: diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 8086b5d..e116188 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,6 @@ """Downloader module for URLs requiring youtube-dl support""" -from youtube_dl import YoutubeDL, DEFAULT_OUTTMPL from .common import DownloaderBase from .. 
import text import os @@ -16,8 +15,14 @@ import os class YoutubeDLDownloader(DownloaderBase): scheme = "ytdl" + module = None def __init__(self, job): + module = self.module + if not module: + module_name = self.config("module") or "youtube_dl" + module = YoutubeDLDownloader.module = __import__(module_name) + DownloaderBase.__init__(self, job) extractor = job.extractor @@ -42,10 +47,11 @@ class YoutubeDLDownloader(DownloaderBase): options["logger"] = self.log self.forward_cookies = self.config("forward-cookies", False) - outtmpl = self.config("outtmpl") - self.outtmpl = DEFAULT_OUTTMPL if outtmpl == "default" else outtmpl + self.outtmpl = self.config("outtmpl") + if self.outtmpl == "default": + self.outtmpl = module.DEFAULT_OUTTMPL - self.ytdl = YoutubeDL(options) + self.ytdl = module.YoutubeDL(options) def download(self, url, pathfmt): if self.forward_cookies: diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 81b11fd..aa0e8ad 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -50,6 +50,8 @@ class _500pxExtractor(Extractor): def _extend(self, edges): """Extend photos with additional metadata and higher resolution URLs""" + ids = [str(edge["node"]["legacyId"]) for edge in edges] + url = "https://api.500px.com/v1/photos" params = { "expanded_user_info" : "true", @@ -62,14 +64,14 @@ class _500pxExtractor(Extractor): "liked_by" : "1", "following_sample" : "100", "image_size" : "4096", - "ids" : ",".join( - str(edge["node"]["legacyId"]) for edge in edges), + "ids" : ",".join(ids), } - data = self._request_api(url, params)["photos"] + photos = self._request_api(url, params)["photos"] return [ - data[str(edge["node"]["legacyId"])] - for edge in edges + photos[pid] for pid in ids + if pid in photos or + self.log.warning("Unable to fetch photo %s", pid) ] def _request_api(self, url, params, csrf_token=None): @@ -142,6 +144,10 @@ class _500pxGalleryExtractor(_500pxExtractor): "user": dict, }, }), + # unavailable photos (#1335) + ("https://500px.com/p/Light_Expression_Photography/galleries/street", { + "count": 0, + }), ("https://500px.com/fashvamp/galleries/lera"), ) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 923a78b..57794d0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -7,7 +7,6 @@ # published by the Free Software Foundation. 
import re -import importlib modules = [ "2chan", @@ -23,6 +22,7 @@ modules = [ "bcy", "behance", "blogger", + "cyberdrop", "danbooru", "derpibooru", "deviantart", @@ -35,6 +35,8 @@ modules = [ "furaffinity", "fuskator", "gelbooru", + "gelbooru_v01", + "gelbooru_v02", "gfycat", "hbrowse", "hentai2read", @@ -76,6 +78,7 @@ modules = [ "myhentaigallery", "myportfolio", "naver", + "naverwebtoon", "newgrounds", "ngomik", "nhentai", @@ -111,6 +114,7 @@ modules = [ "subscribestar", "tsumino", "tumblr", + "tumblrgallery", "twitter", "unsplash", "vanillarock", @@ -182,11 +186,12 @@ def _list_classes(): """Yield all available extractor classes""" yield from _cache + globals_ = globals() for module_name in _module_iter: - module = importlib.import_module("."+module_name, __package__) + module = __import__(module_name, globals_, None, (), 1) yield from add_module(module) - globals()["_list_classes"] = lambda : _cache + globals_["_list_classes"] = lambda : _cache def _get_classes(module): diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 64cde80..c3cf3f7 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,16 +8,12 @@ """Extractors for *booru sites""" -from .common import Extractor, Message, generate_extractors -from .. import text, util, exception - -from xml.etree import ElementTree -import collections +from .common import BaseExtractor, Message +from .. import text import operator -import re -class BooruExtractor(Extractor): +class BooruExtractor(BaseExtractor): """Base class for *booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" @@ -66,191 +62,8 @@ class BooruExtractor(Extractor): _file_url = operator.itemgetter("file_url") - @staticmethod - def _prepare(post): - post["date"] = text.parse_datetime( - post["created_at"], "%a %b %d %H:%M:%S %z %Y") + def _prepare(self, post): + """Prepare the 'post's metadata""" def _extended_tags(self, post, page=None): - if not page: - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"]) - page = self.request(url).text - html = text.extract(page, '<ul id="tag-', '</ul>')[0] - if html: - tags = collections.defaultdict(list) - pattern = re.compile( - r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S) - for tag_type, tag_name in pattern.findall(html): - tags[tag_type].append(text.unquote(tag_name)) - for key, value in tags.items(): - post["tags_" + key] = " ".join(value) - - def _api_request(self, params): - url = self.root + "/index.php?page=dapi&s=post&q=index" - return ElementTree.fromstring(self.request(url, params=params).text) - - def _pagination(self, params): - params["pid"] = self.page_start - params["limit"] = self.per_page - - while True: - root = self._api_request(params) - for post in root: - yield post.attrib - - if len(root) < self.per_page: - return - params["pid"] += 1 - - -class BooruPostExtractor(BooruExtractor): - subcategory = "post" - archive_fmt = "{id}" - pattern_fmt = r"/index\.php\?page=post&s=view&id=(\d+)" - - def __init__(self, match): - BooruExtractor.__init__(self, match) - self.post_id = match.group(1) - - def posts(self): - return self._pagination({"id": self.post_id}) - - -class BooruTagExtractor(BooruExtractor): - subcategory = "tag" - directory_fmt = 
("{category}", "{search_tags}") - archive_fmt = "t_{search_tags}_{id}" - pattern_fmt = r"/index\.php\?page=post&s=list&tags=([^&#]+)" - - def __init__(self, match): - BooruExtractor.__init__(self, match) - self.tags = text.unquote(match.group(1).replace("+", " ")) - - def metadata(self): - return {"search_tags": self.tags} - - def posts(self): - return self._pagination({"tags" : self.tags}) - - -class BooruPoolExtractor(BooruExtractor): - subcategory = "pool" - directory_fmt = ("{category}", "pool", "{pool}") - archive_fmt = "p_{pool}_{id}" - pattern_fmt = r"/index\.php\?page=pool&s=show&id=(\d+)" - - def __init__(self, match): - BooruExtractor.__init__(self, match) - self.pool_id = match.group(1) - self.post_ids = () - - def skip(self, num): - self.page_start += num - return num - - def metadata(self): - url = "{}/index.php?page=pool&s=show&id={}".format( - self.root, self.pool_id) - page = self.request(url).text - - name, pos = text.extract(page, "<h4>Pool: ", "</h4>") - if not name: - raise exception.NotFoundError("pool") - self.post_ids = text.extract_iter( - page, 'class="thumb" id="p', '"', pos) - - return { - "pool": text.parse_int(self.pool_id), - "pool_name": text.unescape(name), - } - - def posts(self): - params = {} - for params["id"] in util.advance(self.post_ids, self.page_start): - for post in self._api_request(params): - yield post.attrib - - -EXTRACTORS = { - "rule34": { - "root": "https://rule34.xxx", - "test-tag": ( - ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { - "content": "97e4bbf86c3860be18de384d02d544251afe1d45", - "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", - "count": 1, - }), - ), - "test-pool": ( - ("https://rule34.xxx/index.php?page=pool&s=show&id=179", { - "count": 3, - }), - ), - "test-post": ( - ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", { - "content": "97e4bbf86c3860be18de384d02d544251afe1d45", - "options": (("tags", True),), - "keyword": { - "tags_artist": "danraku", - "tags_character": "kashima_(kantai_collection)", - "tags_copyright": "kantai_collection", - "tags_general": str, - "tags_metadata": str, - }, - }), - ), - }, - "safebooru": { - "root": "https://safebooru.org", - "test-tag": ( - ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", { - "url": "17c61b386530cf4c30842c9f580d15ef1cd09586", - "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", - }), - ), - "test-pool": ( - ("https://safebooru.org/index.php?page=pool&s=show&id=11", { - "count": 5, - }), - ), - "test-post": ( - ("https://safebooru.org/index.php?page=post&s=view&id=1169132", { - "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b", - "content": "93b293b27dabd198afafabbaf87c49863ac82f27", - "options": (("tags", True),), - "keyword": { - "tags_artist": "kawanakajima", - "tags_character": "heath_ledger ronald_mcdonald the_joker", - "tags_copyright": "dc_comics mcdonald's the_dark_knight", - "tags_general": str, - }, - }), - ), - }, - "realbooru": { - "root": "https://realbooru.com", - "test-tag": ( - ("https://realbooru.com/index.php?page=post&s=list&tags=wine", { - "count": ">= 64", - }), - ), - "test-pool": ( - ("https://realbooru.com/index.php?page=pool&s=show&id=1", { - "count": 3, - }), - ), - "test-post": ( - ("https://realbooru.com/index.php?page=post&s=view&id=668483", { - "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9", - "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", - }), - ), - }, -} - -generate_extractors(EXTRACTORS, globals(), ( - BooruTagExtractor, - BooruPoolExtractor, - BooruPostExtractor, -)) 
+ """Generate extended tag information""" diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 15cc776..e9b9718 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,6 +9,7 @@ """Common classes and constants used by extractor modules.""" import re +import ssl import time import netrc import queue @@ -16,8 +17,9 @@ import logging import datetime import requests import threading +from requests.adapters import HTTPAdapter from .message import Message -from .. import config, text, util, exception, cloudflare +from .. import config, text, util, exception class Extractor(): @@ -30,6 +32,7 @@ class Extractor(): filename_fmt = "{filename}.{extension}" archive_fmt = "" cookiedomain = "" + browser = None root = "" test = None request_interval = 0.0 @@ -37,15 +40,15 @@ class Extractor(): request_timestamp = 0.0 def __init__(self, match): - self.session = requests.Session() self.log = logging.getLogger(self.category) self.url = match.string - self._cookiefile = None - self._cookiejar = self.session.cookies + if self.basecategory: + self.config = self._config_shared + self.config_accumulate = self._config_shared_accumulate + self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" - self._cfgpath = ("extractor", self.category, self.subcategory) self._write_pages = self.config("write-pages", False) self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) @@ -58,11 +61,7 @@ class Extractor(): if self.request_interval < self.request_interval_min: self.request_interval = self.request_interval_min - if self.basecategory: - self.config = self._config_shared - self.config_accumulate = self._config_shared_accumulate - - self._init_headers() + self._init_session() self._init_cookies() self._init_proxies() @@ -140,21 +139,20 @@ class Extractor(): if notfound and code == 404: raise exception.NotFoundError(notfound) - reason = response.reason - if cloudflare.is_challenge(response): - self.log.info("Solving Cloudflare challenge") - response, domain, cookies = cloudflare.solve_challenge( - session, response, kwargs) - if cookies: - cloudflare.cookies.update( - self.category, (domain, cookies)) - return response - if cloudflare.is_captcha(response): - self.log.warning("Cloudflare CAPTCHA") - - msg = "'{} {}' for '{}'".format(code, reason, url) + msg = "'{} {}' for '{}'".format(code, response.reason, url) + server = response.headers.get("Server") + if server and server.startswith("cloudflare"): + if code == 503 and \ + b"jschl-answer" in response.content: + self.log.warning("Cloudflare IUAM challenge") + break + if code == 403 and \ + b'name="captcha-bypass"' in response.content: + self.log.warning("Cloudflare CAPTCHA") + break if code < 500 and code != 429 and code != 430: break + finally: Extractor.request_timestamp = time.time() @@ -212,19 +210,46 @@ class Extractor(): return username, password - def _init_headers(self): - """Initialize HTTP headers for the 'session' object""" - headers = self.session.headers + def _init_session(self): + self.session = session = requests.Session() + headers = session.headers headers.clear() - headers["User-Agent"] = self.config( - "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) " - "Gecko/20100101 
Firefox/68.0")) - headers["Accept"] = "*/*" - headers["Accept-Language"] = "en-US,en;q=0.5" - headers["Accept-Encoding"] = "gzip, deflate" - headers["Connection"] = "keep-alive" - headers["Upgrade-Insecure-Requests"] = "1" + browser = self.config("browser") or self.browser + if browser: + browser, _, platform = browser.lower().partition(":") + + if not platform or platform == "auto": + platform = ("Windows NT 10.0; Win64; x64" + if util.WINDOWS else "X11; Linux x86_64") + elif platform == "windows": + platform = "Windows NT 10.0; Win64; x64" + elif platform == "linux": + platform = "X11; Linux x86_64" + elif platform == "macos": + platform = "Macintosh; Intel Mac OS X 11.2" + + if browser == "chrome": + _emulate_browser_chrome(session, platform) + else: + _emulate_browser_firefox(session, platform) + else: + headers["User-Agent"] = self.config("user-agent", ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; " + "rv:78.0) Gecko/20100101 Firefox/78.0")) + headers["Accept"] = "*/*" + headers["Accept-Language"] = "en-US,en;q=0.5" + headers["Accept-Encoding"] = "gzip, deflate" + + custom_headers = self.config("headers") + if custom_headers: + headers.update(custom_headers) + + ciphers = self.config("ciphers") + if ciphers: + if isinstance(ciphers, list): + ciphers = ":".join(ciphers) + session.mount("https://", HTTPSAdapter(ciphers)) def _init_proxies(self): """Update the session's proxy map""" @@ -242,6 +267,8 @@ class Extractor(): def _init_cookies(self): """Populate the session's cookiejar""" + self._cookiefile = None + self._cookiejar = self.session.cookies if self.cookiedomain is None: return @@ -264,11 +291,6 @@ class Extractor(): "expected 'dict' or 'str' value for 'cookies' option, " "got '%s' (%s)", cookies.__class__.__name__, cookies) - cookies = cloudflare.cookies(self.category) - if cookies: - domain, cookies = cookies - self._update_cookies_dict(cookies, domain) - def _store_cookies(self): """Store the session's cookiejar in a cookies.txt file""" if self._cookiefile and self.config("cookies-update", True): @@ -527,46 +549,126 @@ class AsynchronousMixin(): messages.put(None) -def generate_extractors(extractor_data, symtable, classes): - """Dynamically generate Extractor classes""" - extractors = config.get(("extractor",), classes[0].basecategory) - ckey = extractor_data.get("_ckey") - prev = None - - if extractors: - extractor_data.update(extractors) - - for category, info in extractor_data.items(): - - if not isinstance(info, dict) or "root" not in info: - continue - - root = info["root"] - domain = root[root.index(":") + 3:] - pattern = info.get("pattern") or re.escape(domain) - name = (info.get("name") or category).capitalize() - - for cls in classes: - - class Extr(cls): - pass - Extr.__module__ = cls.__module__ - Extr.__name__ = Extr.__qualname__ = \ - name + cls.subcategory.capitalize() + "Extractor" - Extr.__doc__ = \ - "Extractor for " + cls.subcategory + "s from " + domain - Extr.category = category - Extr.pattern = r"(?:https?://)?" 
+ pattern + cls.pattern_fmt - Extr.test = info.get("test-" + cls.subcategory) - Extr.root = root +class BaseExtractor(Extractor): + instances = () - if "extra" in info: - for key, value in info["extra"].items(): - setattr(Extr, key, value) - if prev and ckey: - setattr(Extr, ckey, prev) + def __init__(self, match): + if not self.category: + for index, group in enumerate(match.groups()): + if group is not None: + self.category, self.root = self.instances[index] + break + Extractor.__init__(self, match) - symtable[Extr.__name__] = prev = Extr + @classmethod + def update(cls, instances): + extra_instances = config.get(("extractor",), cls.basecategory) + if extra_instances: + for category, info in extra_instances.items(): + if isinstance(info, dict) and "root" in info: + instances[category] = info + + pattern_list = [] + instance_list = cls.instances = [] + for category, info in instances.items(): + root = info["root"].rstrip("/") + instance_list.append((category, root)) + + pattern = info.get("pattern") + if not pattern: + pattern = re.escape(root[root.index(":") + 3:]) + pattern_list.append(pattern + "()") + + return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")" + + +class HTTPSAdapter(HTTPAdapter): + + def __init__(self, ciphers): + context = self.ssl_context = ssl.create_default_context() + context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | + ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) + context.set_ecdh_curve("prime256v1") + context.set_ciphers(ciphers) + HTTPAdapter.__init__(self) + + def init_poolmanager(self, *args, **kwargs): + kwargs["ssl_context"] = self.ssl_context + return HTTPAdapter.init_poolmanager(self, *args, **kwargs) + + def proxy_manager_for(self, *args, **kwargs): + kwargs["ssl_context"] = self.ssl_context + return HTTPAdapter.proxy_manager_for(self, *args, **kwargs) + + +def _emulate_browser_firefox(session, platform): + headers = session.headers + headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:78.0) " + "Gecko/20100101 Firefox/78.0") + headers["Accept"] = ("text/html,application/xhtml+xml," + "application/xml;q=0.9,image/webp,*/*;q=0.8") + headers["Accept-Language"] = "en-US,en;q=0.5" + headers["Accept-Encoding"] = "gzip, deflate" + headers["Referer"] = None + headers["Upgrade-Insecure-Requests"] = "1" + headers["Cookie"] = None + + session.mount("https://", HTTPSAdapter( + "TLS_AES_128_GCM_SHA256:" + "TLS_CHACHA20_POLY1305_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-AES256-SHA:" + "ECDHE-ECDSA-AES128-SHA:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "DHE-RSA-AES128-SHA:" + "DHE-RSA-AES256-SHA:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + )) + + +def _emulate_browser_chrome(session, platform): + if platform.startswith("Macintosh"): + platform = platform.replace(".", "_") + "_0" + + headers = session.headers + headers["Upgrade-Insecure-Requests"] = "1" + headers["User-Agent"] = ( + "Mozilla/5.0 (" + platform + ") AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36") + headers["Accept"] = ("text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/webp,image/apng,*/*;q=0.8") + headers["Referer"] = None + headers["Accept-Encoding"] = "gzip, deflate" + headers["Accept-Language"] = "en-US,en;q=0.9" + headers["Cookie"] = None + + session.mount("https://", HTTPSAdapter( + "TLS_AES_128_GCM_SHA256:" + 
"TLS_AES_256_GCM_SHA384:" + "TLS_CHACHA20_POLY1305_SHA256:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "AES128-GCM-SHA256:" + "AES256-GCM-SHA384:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + )) # Undo automatic pyOpenSSL injection by requests @@ -578,38 +680,3 @@ if not pyopenssl: except ImportError: pass del pyopenssl - - -# Replace urllib3's default cipher list to avoid Cloudflare CAPTCHAs -ciphers = config.get((), "ciphers", True) -if ciphers: - - if ciphers is True: - ciphers = ( - # Firefox's list - "TLS_AES_128_GCM_SHA256:" - "TLS_CHACHA20_POLY1305_SHA256:" - "TLS_AES_256_GCM_SHA384:" - "ECDHE-ECDSA-AES128-GCM-SHA256:" - "ECDHE-RSA-AES128-GCM-SHA256:" - "ECDHE-ECDSA-CHACHA20-POLY1305:" - "ECDHE-RSA-CHACHA20-POLY1305:" - "ECDHE-ECDSA-AES256-GCM-SHA384:" - "ECDHE-RSA-AES256-GCM-SHA384:" - "ECDHE-ECDSA-AES256-SHA:" - "ECDHE-ECDSA-AES128-SHA:" - "ECDHE-RSA-AES128-SHA:" - "ECDHE-RSA-AES256-SHA:" - "DHE-RSA-AES128-SHA:" - "DHE-RSA-AES256-SHA:" - "AES128-SHA:" - "AES256-SHA:" - "DES-CBC3-SHA" - ) - elif isinstance(ciphers, list): - ciphers = ":".join(ciphers) - - from requests.packages.urllib3.util import ssl_ # noqa - ssl_.DEFAULT_CIPHERS = ciphers - del ssl_ -del ciphers diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py new file mode 100644 index 0000000..a057b84 --- /dev/null +++ b/gallery_dl/extractor/cyberdrop.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://cyberdrop.me/""" + +from .common import Extractor, Message +from .. 
import text +import base64 + + +class CyberdropAlbumExtractor(Extractor): + category = "cyberdrop" + subcategory = "album" + root = "https://cyberdrop.me" + directory_fmt = ("{category}", "{album_id} {album_name}") + archive_fmt = "{album_id}_{id}" + pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.me/a/([^/?#]+)" + test = ("https://cyberdrop.me/a/keKRjm4t", { + "pattern": r"https://f\.cyberdrop\.cc/.*\.[a-z]+$", + "keyword": { + "album_id": "keKRjm4t", + "album_name": "Fate (SFW)", + "album_size": 150069254, + "count": 62, + "date": "dt:2020-06-18 13:14:20", + "description": "", + "id": r"re:\w{8}", + }, + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.album_id = match.group(1) + + def items(self): + url = self.root + "/a/" + self.album_id + extr = text.extract_from(self.request(url).text) + extr("const albumData = {", "") + + data = { + "album_id" : self.album_id, + "album_name" : extr("name: '", "'"), + "date" : text.parse_timestamp(extr("timestamp: ", ",")), + "album_size" : text.parse_int(extr("totalSize: ", ",")), + "description": extr("description: `", "`"), + } + files = extr("fl: '", "'").split(",") + data["count"] = len(files) + + yield Message.Directory, data + for file_b64 in files: + file = base64.b64decode(file_b64.encode()).decode() + text.nameext_from_url(file, data) + data["filename"], _, data["id"] = data["filename"].rpartition("-") + yield Message.Url, "https://f.cyberdrop.cc/" + file, data diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2eb3b28..47286b7 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -78,6 +78,10 @@ class DeviantartExtractor(Extractor): else: self.user = profile["user"]["username"] + if self.extra: + finditer_stash = DeviantartStashExtractor.pattern.finditer + finditer_deviation = DeviantartDeviationExtractor.pattern.finditer + yield Message.Version, 1 for deviation in self.deviations(): if isinstance(deviation, tuple): @@ -134,10 +138,14 @@ class DeviantartExtractor(Extractor): if self.extra: txt = (deviation.get("description", "") + deviation.get("_journal", "")) - for match in DeviantartStashExtractor.pattern.finditer(txt): + for match in finditer_stash(txt): url = text.ensure_http_scheme(match.group(0)) deviation["_extractor"] = DeviantartStashExtractor yield Message.Queue, url, deviation + for match in finditer_deviation(txt): + url = text.ensure_http_scheme(match.group(0)) + deviation["_extractor"] = DeviantartDeviationExtractor + yield Message.Queue, url, deviation def deviations(self): """Return an iterable containing all relevant Deviation-objects""" diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 1c6ebb4..842de7e 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -50,7 +50,8 @@ class EromeExtractor(Extractor): for data["num"], group in enumerate(util.advance(groups, 1), 1): url = (text.extract(group, '<source src="', '"')[0] or text.extract(group, 'data-src="', '"')[0]) - yield Message.Url, url, text.nameext_from_url(url, data) + if url: + yield Message.Url, url, text.nameext_from_url(url, data) def albums(self): return () @@ -84,14 +85,14 @@ class EromeAlbumExtractor(EromeExtractor): """Extractor for albums on erome.com""" subcategory = "album" pattern = BASE_PATTERN + r"/a/(\w+)" - test = ("https://www.erome.com/a/UHUX1B73", { - "pattern": r"https://s\d+\.erome\.com/342/UHUX1B73/\w+", - "count": 5, + test = ("https://www.erome.com/a/KandxY7y", { + "pattern": 
r"https://s\d+\.erome\.com/355/KandxY7y/\w+", + "count": 26, "keyword": { - "album_id": "UHUX1B73", + "album_id": "KandxY7y", "num": int, - "title": "Ryan Ryans", - "user": "gutiquq", + "title": "Therealbrittfitt", + "user": "pokow", }, }) diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 4ead3fb..5a7de23 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -12,24 +12,22 @@ from .common import Extractor, Message from .. import text, util, exception from ..cache import cache import itertools -import random -import time import math - BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org" class ExhentaiExtractor(Extractor): """Base class for exhentai extractors""" category = "exhentai" - directory_fmt = ("{category}", "{gallery_id} {title[:247]}") + directory_fmt = ("{category}", "{gid} {title[:247]}") filename_fmt = ( - "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}") - archive_fmt = "{gallery_id}_{num}" + "{gid}_{num:>04}_{image_token}_{filename}.{extension}") + archive_fmt = "{gid}_{num}" cookienames = ("ipb_member_id", "ipb_pass_hash") cookiedomain = ".exhentai.org" root = "https://exhentai.org" + request_interval = 5.0 LIMIT = False @@ -47,8 +45,6 @@ class ExhentaiExtractor(Extractor): Extractor.__init__(self, match) self.limits = self.config("limits", True) self.original = self.config("original", True) - self.wait_min = self.config("wait-min", 3) - self.wait_max = self.config("wait-max", 6) if type(self.limits) is int: self._limit_max = self.limits @@ -57,8 +53,6 @@ class ExhentaiExtractor(Extractor): self._limit_max = 0 self._remaining = 0 - if self.wait_max < self.wait_min: - self.wait_max = self.wait_min self.session.headers["Referer"] = self.root + "/" if version != "ex": self.session.cookies.set("nw", "1", domain=self.cookiedomain) @@ -70,14 +64,6 @@ class ExhentaiExtractor(Extractor): raise exception.AuthorizationError() return response - def wait(self, waittime=None): - """Wait for a randomly chosen amount of seconds""" - if not waittime: - waittime = random.uniform(self.wait_min, self.wait_max) - else: - waittime = random.uniform(waittime * 0.66, waittime * 1.33) - time.sleep(waittime) - def login(self): """Login and set necessary cookies""" if self.LIMIT: @@ -132,7 +118,39 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): r"|/s/([\da-f]{10})/(\d+)-(\d+))") test = ( ("https://exhentai.org/g/1200119/d55c44d3d0/", { - "keyword": "199db053b4ccab94463b459e1cfe079df8cdcdd1", + "keyword": { + "cost": int, + "date": "dt:2018-03-18 20:15:00", + "eh_category": "Non-H", + "expunged": False, + "favorites": "17", + "filecount": "4", + "filesize": 1488978, + "gid": 1200119, + "height": int, + "image_token": "re:[0-9a-f]{10}", + "lang": "jp", + "language": "Japanese", + "parent": "", + "rating": r"re:\d\.\d+", + "size": int, + "tags": [ + "parody:komi-san wa komyushou desu.", + "character:shouko komi", + "group:seventh lowlife", + "sample", + ], + "thumb": "https://exhentai.org/t/ce/0a/ce0a5bcb583229a9b07c0f8" + "3bcb1630ab1350640-624622-736-1036-jpg_250.jpg", + "title": "C93 [Seventh_Lowlife] Komi-san ha Tokidoki Daitan de" + "su (Komi-san wa Komyushou desu) [Sample]", + "title_jpn": "(C93) [Comiketjack (わ!)] 古見さんは、時々大胆" + "です。 (古見さんは、コミュ症です。) [見本]", 
+ "token": "d55c44d3d0", + "torrentcount": "0", + "uploader": "klorpa", + "width": int, + }, "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff", }), ("https://exhentai.org/g/960461/4f0e369d82/", { @@ -169,7 +187,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.log.error("Failed to extract initial image token") self.log.debug("Page content:\n%s", gpage) return - self.wait() ipage = self._image_page() else: ipage = self._image_page() @@ -179,13 +196,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.log.debug("Page content:\n%s", ipage) return self.gallery_token = part.split("/")[1] - self.wait() gpage = self._gallery_page() data = self.get_metadata(gpage) - self.count = data["count"] - - yield Message.Version, 1 + self.count = text.parse_int(data["filecount"]) yield Message.Directory, data images = itertools.chain( @@ -196,39 +210,64 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self._check_limits(data) if "/fullimg.php" in url: data["extension"] = "" - self.wait(self.wait_max / 4) yield Message.Url, url, data def get_metadata(self, page): """Extract gallery metadata""" + data = self.metadata_from_page(page) + if self.config("metadata", False): + data.update(self.metadata_from_api()) + data["date"] = text.parse_timestamp(data["posted"]) + return data + + def metadata_from_page(self, page): extr = text.extract_from(page) data = { - "gallery_id" : self.gallery_id, - "gallery_token": self.gallery_token, + "gid" : self.gallery_id, + "token" : self.gallery_token, + "thumb" : extr("background:transparent url(", ")"), "title" : text.unescape(extr('<h1 id="gn">', '</h1>')), - "title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')), + "title_jpn" : text.unescape(extr('<h1 id="gj">', '</h1>')), + "_" : extr('<div id="gdc"><div class="cs ct', '"'), + "eh_category" : extr('>', '<'), + "uploader" : text.unquote(extr('/uploader/', '"')), "date" : text.parse_datetime(extr( '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"), "parent" : extr( '>Parent:</td><td class="gdt2"><a href="', '"'), - "visible" : extr( + "expunged" : "Yes" != extr( '>Visible:</td><td class="gdt2">', '<'), - "language" : extr( - '>Language:</td><td class="gdt2">', ' '), - "gallery_size" : text.parse_bytes(extr( + "language" : extr('>Language:</td><td class="gdt2">', ' '), + "filesize" : text.parse_bytes(extr( '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")), - "count" : text.parse_int(extr( - '>Length:</td><td class="gdt2">', ' ')), + "filecount" : extr('>Length:</td><td class="gdt2">', ' '), + "favorites" : extr('id="favcount">', ' '), + "rating" : extr(">Average: ", "<"), + "torrentcount" : extr('>Torrent Download (', ')'), } data["lang"] = util.language_to_code(data["language"]) data["tags"] = [ - text.unquote(tag) + text.unquote(tag.replace("+", " ")) for tag in text.extract_iter(page, 'hentai.org/tag/', '"') ] return data + def metadata_from_api(self): + url = self.root + "/api.php" + data = { + "method": "gdata", + "gidlist": ((self.gallery_id, self.gallery_token),), + "namespace": 1, + } + + data = self.request(url, method="POST", json=data).json() + if "error" in data: + raise exception.StopExtraction(data["error"]) + + return data["gmetadata"][0] + def image_from_page(self, page): """Get image url and data from webpage""" pos = page.index('<div id="i3"><a onclick="return load_image(') + 26 @@ -267,7 +306,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): "showkey": self.key["show"], } for request["page"] in range(self.image_num + 1, self.count + 1): - self.wait() 
page = self.request(api_url, method="POST", json=request).json() imgkey = nextkey nextkey, pos = text.extract(page["i3"], "'", "'") @@ -317,7 +355,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): return page def _check_limits(self, data): - if not self._remaining or data["num"] % 20 == 0: + if not self._remaining or data["num"] % 25 == 0: self._update_limits() self._remaining -= data["cost"] @@ -400,7 +438,6 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): def items(self): self.login() - yield Message.Version, 1 data = {"_extractor": ExhentaiGalleryExtractor} while True: @@ -417,7 +454,6 @@ class ExhentaiSearchExtractor(ExhentaiExtractor): if 'class="ptdd">><' in page or ">No hits found</p>" in page: return self.params["page"] += 1 - self.wait() class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 319ebe2..0bcec2b 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -8,21 +8,21 @@ """Extractors for 4chan archives based on FoolFuuka""" -from .common import Extractor, Message, generate_extractors +from .common import BaseExtractor, Message from .. import text import itertools -class FoolfuukaExtractor(Extractor): +class FoolfuukaExtractor(BaseExtractor): """Base extractor for FoolFuuka based boards/archives""" basecategory = "foolfuuka" archive_fmt = "{board[shortname]}_{num}_{timestamp}" external = "default" def __init__(self, match): - Extractor.__init__(self, match) + BaseExtractor.__init__(self, match) self.session.headers["Referer"] = self.root - if self.external == "direct": + if self.category == "b4k": self.remote = self._remote_direct def items(self): @@ -43,7 +43,7 @@ class FoolfuukaExtractor(Extractor): yield Message.Url, url, post def metadata(self): - """ """ + """Return general metadata""" def posts(self): """Return an iterable with all relevant posts""" @@ -59,16 +59,90 @@ class FoolfuukaExtractor(Extractor): return media["remote_media_link"] +BASE_PATTERN = FoolfuukaExtractor.update({ + "4plebs": { + "root": "https://archive.4plebs.org", + "pattern": r"(?:archive\.)?4plebs\.org", + }, + "archivedmoe": { + "root": "https://archived.moe", + }, + "archiveofsins": { + "root": "https://archiveofsins.com", + "pattern": r"(?:www\.)?archiveofsins\.com", + }, + "b4k": { + "root": "https://arch.b4k.co", + }, + "desuarchive": { + "root": "https://desuarchive.org", + }, + "fireden": { + "root": "https://boards.fireden.net", + }, + "nyafuu": { + "root": "https://archive.nyafuu.org", + "pattern": r"(?:archive\.)?nyafuu\.org", + }, + "rbt": { + "root": "https://rbt.asia", + "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)", + }, + "thebarchive": { + "root": "https://thebarchive.com", + "pattern": r"thebarchive\.com", + }, +}) + + class FoolfuukaThreadExtractor(FoolfuukaExtractor): """Base extractor for threads on FoolFuuka based boards/archives""" subcategory = "thread" directory_fmt = ("{category}", "{board[shortname]}", "{thread_num}{title:? 
- //}") - pattern_fmt = r"/([^/?#]+)/thread/(\d+)" + pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)" + test = ( + ("https://archive.4plebs.org/tg/thread/54059290", { + "url": "07452944164b602502b02b24521f8cee5c484d2a", + }), + ("https://archived.moe/gd/thread/309639/", { + "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8", + "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", + }), + ("https://archived.moe/a/thread/159767162/", { + "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87", + }), + ("https://archiveofsins.com/h/thread/4668813/", { + "url": "f612d287087e10a228ef69517cf811539db9a102", + "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4", + }), + ("https://arch.b4k.co/meta/thread/196/", { + "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a", + }), + ("https://desuarchive.org/a/thread/159542679/", { + "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406", + }), + ("https://boards.fireden.net/sci/thread/11264294/", { + "url": "3adfe181ee86a8c23021c705f623b3657a9b0a43", + }), + ("https://archive.nyafuu.org/c/thread/2849220/", { + "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f", + }), + ("https://rbt.asia/g/thread/61487650/", { + "url": "61896d9d9a2edb556b619000a308a984307b6d30", + }), + ("https://archive.rebeccablacktech.com/g/thread/61487650/", { + "url": "61896d9d9a2edb556b619000a308a984307b6d30", + }), + ("https://thebarchive.com/b/thread/739772332/", { + "url": "e8b18001307d130d67db31740ce57c8561b5d80c", + }), + ) def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board, self.thread = match.groups() + self.board = match.group(match.lastindex-1) + self.thread = match.group(match.lastindex) self.data = None def metadata(self): @@ -78,23 +152,34 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): return self.data["op"] def posts(self): + op = (self.data["op"],) posts = self.data.get("posts") if posts: posts = list(posts.values()) posts.sort(key=lambda p: p["timestamp"]) - else: - posts = () - return itertools.chain((self.data["op"],), posts) + return itertools.chain(op, posts) + return op class FoolfuukaBoardExtractor(FoolfuukaExtractor): """Base extractor for FoolFuuka based boards/archives""" subcategory = "board" - pattern_fmt = r"/([^/?#]+)/\d*$" + pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$" + test = ( + ("https://archive.4plebs.org/tg/"), + ("https://archived.moe/gd/"), + ("https://archiveofsins.com/h/"), + ("https://arch.b4k.co/meta/"), + ("https://desuarchive.org/a/"), + ("https://boards.fireden.net/sci/"), + ("https://archive.nyafuu.org/c/"), + ("https://rbt.asia/g/"), + ("https://thebarchive.com/b/"), + ) def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(1) + self.board = match.group(match.lastindex) def items(self): index_base = "{}/_/api/chan/index/?board={}&page=".format( @@ -113,7 +198,7 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): for num, thread in threads.items(): thread["url"] = thread_base + format(num) - thread["_extractor"] = self.childclass + thread["_extractor"] = FoolfuukaThreadExtractor yield Message.Queue, thread["url"], thread @@ -121,15 +206,24 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): """Base extractor for search results on FoolFuuka based boards/archives""" subcategory = "search" directory_fmt = ("{category}", "search", "{search}") - pattern_fmt = r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" + pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)" request_interval = 1.0 + test = ( + ("https://archive.4plebs.org/_/search/text/test/"), + 
("https://archived.moe/_/search/text/test/"), + ("https://archiveofsins.com/_/search/text/test/"), + ("https://archiveofsins.com/_/search/text/test/"), + ("https://desuarchive.org/_/search/text/test/"), + ("https://boards.fireden.net/_/search/text/test/"), + ("https://archive.nyafuu.org/_/search/text/test/"), + ("https://rbt.asia/_/search/text/test/"), + ("https://thebarchive.com/_/search/text/test/"), + ) def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - board, search = match.groups() - self.params = params = {} - args = search.split("/") + args = match.group(match.lastindex).split("/") key = None for arg in args: @@ -138,6 +232,8 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): key = None else: key = arg + + board = match.group(match.lastindex-1) if board != "_": params["boards"] = board @@ -170,105 +266,3 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): if len(posts) <= 3: return params["page"] += 1 - - -EXTRACTORS = { - "4plebs": { - "name": "_4plebs", - "root": "https://archive.4plebs.org", - "pattern": r"(?:archive\.)?4plebs\.org", - "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", { - "url": "07452944164b602502b02b24521f8cee5c484d2a", - }), - "test-board": ("https://archive.4plebs.org/tg/",), - "test-search": ("https://archive.4plebs.org/_/search/text/test/",), - }, - "archivedmoe": { - "root": "https://archived.moe", - "test-thread": ( - ("https://archived.moe/gd/thread/309639/", { - "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8", - "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", - }), - ("https://archived.moe/a/thread/159767162/", { - "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87", - }), - ), - "test-board": ("https://archived.moe/gd/",), - "test-search": ("https://archived.moe/_/search/text/test/",), - }, - "archiveofsins": { - "root": "https://archiveofsins.com", - "pattern": r"(?:www\.)?archiveofsins\.com", - "test-thread": ("https://archiveofsins.com/h/thread/4668813/", { - "url": "f612d287087e10a228ef69517cf811539db9a102", - "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4", - }), - "test-board": ("https://archiveofsins.com/h/",), - "test-search": ("https://archiveofsins.com/_/search/text/test/",), - }, - "b4k": { - "root": "https://arch.b4k.co", - "extra": {"external": "direct"}, - "test-thread": ("https://arch.b4k.co/meta/thread/196/", { - "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a", - }), - "test-board": ("https://arch.b4k.co/meta/",), - "test-search": ("https://arch.b4k.co/_/search/text/test/",), - }, - "desuarchive": { - "root": "https://desuarchive.org", - "test-thread": ("https://desuarchive.org/a/thread/159542679/", { - "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406", - }), - "test-board": ("https://desuarchive.org/a/",), - "test-search": ("https://desuarchive.org/_/search/text/test/",), - }, - "fireden": { - "root": "https://boards.fireden.net", - "test-thread": ("https://boards.fireden.net/sci/thread/11264294/", { - "url": "3adfe181ee86a8c23021c705f623b3657a9b0a43", - }), - "test-board": ("https://boards.fireden.net/sci/",), - "test-search": ("https://boards.fireden.net/_/search/text/test/",), - }, - "nyafuu": { - "root": "https://archive.nyafuu.org", - "pattern": r"(?:archive\.)?nyafuu\.org", - "test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", { - "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f", - }), - "test-board": ("https://archive.nyafuu.org/c/",), - "test-search": ("https://archive.nyafuu.org/_/search/text/test/",), - }, - "rbt": { - "root": "https://rbt.asia", - 
"pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)", - "test-thread": ( - ("https://rbt.asia/g/thread/61487650/", { - "url": "61896d9d9a2edb556b619000a308a984307b6d30", - }), - ("https://archive.rebeccablacktech.com/g/thread/61487650/", { - "url": "61896d9d9a2edb556b619000a308a984307b6d30", - }), - ), - "test-board": ("https://rbt.asia/g/",), - "test-search": ("https://rbt.asia/_/search/text/test/",), - }, - "thebarchive": { - "root": "https://thebarchive.com", - "pattern": r"thebarchive\.com", - "test-thread": ("https://thebarchive.com/b/thread/739772332/", { - "url": "e8b18001307d130d67db31740ce57c8561b5d80c", - }), - "test-board": ("https://thebarchive.com/b/",), - "test-search": ("https://thebarchive.com/_/search/text/test/",), - }, - "_ckey": "childclass", -} - -generate_extractors(EXTRACTORS, globals(), ( - FoolfuukaThreadExtractor, - FoolfuukaBoardExtractor, - FoolfuukaSearchExtractor, -)) diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index db5e250..f8664e7 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2020 Mike Fährmann +# Copyright 2016-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,23 +8,21 @@ """Extractors for FoOlSlide based sites""" -from .common import ( - Extractor, - ChapterExtractor, - MangaExtractor, - Message, - generate_extractors, -) +from .common import BaseExtractor, Message from .. import text, util import json -class FoolslideBase(): +class FoolslideExtractor(BaseExtractor): """Base class for FoOlSlide extractors""" basecategory = "foolslide" + def __init__(self, match): + BaseExtractor.__init__(self, match) + self.gallery_url = self.root + match.group(match.lastindex) + def request(self, url): - return Extractor.request( + return BaseExtractor.request( self, url, encoding="utf-8", method="POST", data={"adult": "true"}) @staticmethod @@ -40,12 +38,53 @@ class FoolslideBase(): return data -class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): +BASE_PATTERN = FoolslideExtractor.update({ + "dokireader": { + "root": "https://kobato.hologfx.com/reader", + }, + "kireicake": { + "root": "https://reader.kireicake.com", + }, + "powermanga": { + "root": "https://read.powermanga.org", + "pattern": r"read(?:er)?\.powermanga\.org", + }, + "sensescans": { + "root": "https://sensescans.com/reader", + "pattern": r"(?:(?:www\.)?sensescans\.com/reader" + r"|reader\.sensescans\.com)", + }, +}) + + +class FoolslideChapterExtractor(FoolslideExtractor): """Base class for chapter extractors for FoOlSlide based sites""" + subcategory = "chapter" directory_fmt = ("{category}", "{manga}", "{chapter_string}") + filename_fmt = ( + "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") archive_fmt = "{id}" - pattern_fmt = r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" - decode = "default" + pattern = BASE_PATTERN + r"(/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" + test = ( + (("https://kobato.hologfx.com/reader/read/" + "hitoribocchi_no_oo_seikatsu/en/3/34"), { + "keyword": "6e719ac86f0c6dab89390dd7e507e678459e0dbc", + }), + ("https://reader.kireicake.com/read/wonderland/en/1/1/", { + "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e", + "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6", + }), + (("https://read.powermanga.org" + "/read/one_piece_digital_colour_comics/en/0/75/"), { + "url": 
"854c5817f8f767e1bccd05fa9d58ffb5a4b09384", + "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe", + }), + ("https://sensescans.com/reader/read/ao_no_orchestra/en/0/26/", { + "url": "bbd428dc578f5055e9f86ad635b510386cd317cd", + "keyword": "083ef6f8831c84127fe4096fa340a249be9d1424", + }), + ("https://reader.sensescans.com/read/ao_no_orchestra/en/0/26/"), + ) def items(self): page = self.request(self.gallery_url).text @@ -83,9 +122,51 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): return json.loads(text.extract(page, "var pages = ", ";")[0]) -class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): +class FoolslideMangaExtractor(FoolslideExtractor): """Base class for manga extractors for FoOlSlide based sites""" - pattern_fmt = r"(/series/[^/?#]+)" + subcategory = "manga" + categorytransfer = True + pattern = BASE_PATTERN + r"(/series/[^/?#]+)" + test = ( + (("https://kobato.hologfx.com/reader/series/" + "boku_ha_ohimesama_ni_narenai/"), { + "url": "1c1f5a7258ce4f631f5fc32be548d78a6a57990d", + "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995", + }), + ("https://reader.kireicake.com/series/wonderland/", { + "url": "d067b649af1cc88fa8c8b698fde04a10909fd169", + "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb", + }), + (("https://read.powermanga.org" + "/series/one_piece_digital_colour_comics/"), { + "count": ">= 1", + "keyword": { + "chapter": int, + "chapter_minor": str, + "chapter_string": str, + "group": "PowerManga", + "lang": "en", + "language": "English", + "manga": "One Piece Digital Colour Comics", + "title": str, + "volume": int, + }, + }), + ("https://sensescans.com/reader/series/yotsubato/", { + "count": ">= 3", + }), + ) + + def items(self): + page = self.request(self.gallery_url).text + + chapters = self.chapters(page) + if not self.config("chapter-reverse", False): + chapters.reverse() + + for chapter, data in chapters: + data["_extractor"] = FoolslideChapterExtractor + yield Message.Queue, chapter, data def chapters(self, page): extr = text.extract_from(page) @@ -103,82 +184,3 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): "chapter_string": extr('title="', '"'), "group" : extr('title="', '"'), }))) - - -EXTRACTORS = { - "dokireader": { - "root": "https://kobato.hologfx.com/reader", - "test-chapter": - (("https://kobato.hologfx.com/reader/read/" - "hitoribocchi_no_oo_seikatsu/en/3/34"), { - "keyword": "6e719ac86f0c6dab89390dd7e507e678459e0dbc", - }), - "test-manga": - (("https://kobato.hologfx.com/reader/series/" - "boku_ha_ohimesama_ni_narenai/"), { - "url": "1c1f5a7258ce4f631f5fc32be548d78a6a57990d", - "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995", - }), - }, - "kireicake": { - "root": "https://reader.kireicake.com", - "test-chapter": - ("https://reader.kireicake.com/read/wonderland/en/1/1/", { - "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e", - "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6", - }), - "test-manga": - ("https://reader.kireicake.com/series/wonderland/", { - "url": "d067b649af1cc88fa8c8b698fde04a10909fd169", - "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb", - }), - }, - "powermanga": { - "root": "https://read.powermanga.org", - "pattern": r"read(?:er)?\.powermanga\.org", - "test-chapter": - (("https://read.powermanga.org" - "/read/one_piece_digital_colour_comics/en/0/75/"), { - "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384", - "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe", - }), - "test-manga": - (("https://read.powermanga.org" - "/series/one_piece_digital_colour_comics/"), { 
- "count": ">= 1", - "keyword": { - "chapter": int, - "chapter_minor": str, - "chapter_string": str, - "group": "PowerManga", - "lang": "en", - "language": "English", - "manga": "One Piece Digital Colour Comics", - "title": str, - "volume": int, - }, - }), - }, - "sensescans": { - "root": "https://sensescans.com/reader", - "pattern": r"(?:(?:www\.)?sensescans\.com/reader" - r"|reader\.sensescans\.com)", - "test-chapter": ( - ("https://sensescans.com/reader/read/ao_no_orchestra/en/0/26/", { - "url": "bbd428dc578f5055e9f86ad635b510386cd317cd", - "keyword": "083ef6f8831c84127fe4096fa340a249be9d1424", - }), - ("https://reader.sensescans.com/read/ao_no_orchestra/en/0/26/"), - ), - "test-manga": - ("https://sensescans.com/reader/series/yotsubato/", { - "count": ">= 3", - }), - }, - "_ckey": "chapterclass", -} - -generate_extractors(EXTRACTORS, globals(), ( - FoolslideChapterExtractor, - FoolslideMangaExtractor, -)) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 7a28e9c..92d27a9 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,13 +8,14 @@ """Extractors for https://gelbooru.com/""" -from . import booru +from . import gelbooru_v02 from .. import text, exception class GelbooruBase(): """Base class for gelbooru extractors""" category = "gelbooru" + basecategory = "booru" root = "https://gelbooru.com" @staticmethod @@ -27,7 +28,8 @@ class GelbooruBase(): return url -class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor): +class GelbooruTagExtractor(GelbooruBase, + gelbooru_v02.GelbooruV02TagExtractor): """Extractor for images from gelbooru.com based on search-tags""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") @@ -42,7 +44,8 @@ class GelbooruTagExtractor(GelbooruBase, booru.BooruTagExtractor): ) -class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor): +class GelbooruPoolExtractor(GelbooruBase, + gelbooru_v02.GelbooruV02PoolExtractor): """Extractor for image-pools from gelbooru.com""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=pool&s=show&id=(?P<pool>\d+)") @@ -72,7 +75,8 @@ class GelbooruPoolExtractor(GelbooruBase, booru.BooruPoolExtractor): } -class GelbooruPostExtractor(GelbooruBase, booru.BooruPostExtractor): +class GelbooruPostExtractor(GelbooruBase, + gelbooru_v02.GelbooruV02PostExtractor): """Extractor for single images from gelbooru.com""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=post&s=view&id=(?P<post>\d+)") diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py new file mode 100644 index 0000000..0935998 --- /dev/null +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -0,0 +1,143 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Gelbooru v0.1 sites""" + +from . import booru +from .. 
import text + + +class GelbooruV01Extractor(booru.BooruExtractor): + basecategory = "gelbooru_v01" + per_page = 20 + + def _parse_post(self, post_id): + url = "{}/index.php?page=post&s=view&id={}".format( + self.root, post_id) + page = self.request(url).text + + post = text.extract_all(page, ( + ("created_at", 'Posted: ', ' <'), + ("uploader" , 'By: ', ' <'), + ("width" , 'Size: ', 'x'), + ("height" , '', ' <'), + ("source" , 'Source: <a href="', '"'), + ("rating" , 'Rating: ', '<'), + ("score" , 'Score: ', ' <'), + ("file_url" , '<img alt="img" src="', '"'), + ("tags" , 'id="tags" name="tags" cols="40" rows="5">', '<'), + ))[0] + + post["id"] = post_id + post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] + post["rating"] = (post["rating"] or "?")[0].lower() + post["tags"] = text.unescape(post["tags"]) + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%d %H:%M:%S") + + return post + + +BASE_PATTERN = GelbooruV01Extractor.update({ + "thecollection" : {"root": "https://the-collection.booru.org"}, + "illusioncardsbooru": {"root": "https://illusioncards.booru.org"}, + "allgirlbooru" : {"root": "https://allgirl.booru.org"}, +}) + + +class GelbooruV01TagExtractor(GelbooruV01Extractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" + test = ( + (("https://the-collection.booru.org" + "/index.php?page=post&s=list&tags=parody"), { + "range": "1-25", + "count": 25, + }), + (("https://illusioncards.booru.org" + "/index.php?page=post&s=list&tags=koikatsu"), { + "range": "1-25", + "count": 25, + }), + ("https://allgirl.booru.org/index.php?page=post&s=list&tags=dress", { + "range": "1-25", + "count": 25, + }), + ) + + def __init__(self, match): + GelbooruV01Extractor.__init__(self, match) + self.tags = match.group(match.lastindex) + + def metadata(self): + return {"search_tags": text.unquote(self.tags.replace("+", " "))} + + def posts(self): + url = "{}/index.php?page=post&s=list&tags={}&pid=".format( + self.root, self.tags) + pid = self.page_start + + while True: + page = self.request(url + str(pid)).text + + cnt = 0 + for post_id in text.extract_iter( + page, 'class="thumb"><a id="p', '"'): + yield self._parse_post(post_id) + cnt += 1 + + if cnt < self.per_page: + return + pid += self.per_page + + +class GelbooruV01PostExtractor(GelbooruV01Extractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" + test = ( + (("https://the-collection.booru.org" + "/index.php?page=post&s=view&id=100520"), { + "url": "0329ac8588bb93cf242ca0edbe3e995b4ba554e8", + "content": "1e585874e7b874f7937df1060dd1517fef2f4dfb", + }), + (("https://illusioncards.booru.org" + "/index.php?page=post&s=view&id=82746"), { + "url": "3f9cd2fadf78869b90bc5422f27b48f1af0e0909", + "content": "159e60b92d05597bd1bb63510c2c3e4a4bada1dc", + }), + ("https://allgirl.booru.org/index.php?page=post&s=view&id=107213", { + "url": "b416800d2d2b072f80d3b37cfca9cb806fb25d51", + "content": "3e3c65e0854a988696e11adf0de52f8fa90a51c7", + "keyword": { + "created_at": "2021-02-13 16:27:39", + "date": "dt:2021-02-13 16:27:39", + "file_url": "https://img.booru.org/allgirl//images/107" + "/2aaa0438d58fc7baa75a53b4a9621bb89a9d3fdb.jpg", + "height": "1200", + "id": "107213", + "md5": "2aaa0438d58fc7baa75a53b4a9621bb89a9d3fdb", + "rating": "s", + "score": str, + "source": None, + "tags": "blush dress green_eyes green_hair 
hatsune_miku " + "long_hair twintails vocaloid", + "uploader": "Honochi31", + "width": "1600" + }, + }), + ) + + def __init__(self, match): + GelbooruV01Extractor.__init__(self, match) + self.post_id = match.group(match.lastindex) + + def posts(self): + return (self._parse_post(self.post_id),) diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py new file mode 100644 index 0000000..51fb478 --- /dev/null +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Gelbooru v0.2 sites""" + +from . import booru +from .. import text, util, exception + +from xml.etree import ElementTree +import collections +import re + + +class GelbooruV02Extractor(booru.BooruExtractor): + basecategory = "gelbooru_v02" + + def _api_request(self, params): + url = self.root + "/index.php?page=dapi&s=post&q=index" + return ElementTree.fromstring(self.request(url, params=params).text) + + def _pagination(self, params): + params["pid"] = self.page_start + params["limit"] = self.per_page + + while True: + root = self._api_request(params) + for post in root: + yield post.attrib + + if len(root) < self.per_page: + return + params["pid"] += 1 + + @staticmethod + def _prepare(post): + post["date"] = text.parse_datetime( + post["created_at"], "%a %b %d %H:%M:%S %z %Y") + + def _extended_tags(self, post, page=None): + if not page: + url = "{}/index.php?page=post&s=view&id={}".format( + self.root, post["id"]) + page = self.request(url).text + html = text.extract(page, '<ul id="tag-', '</ul>')[0] + if html: + tags = collections.defaultdict(list) + pattern = re.compile( + r"tag-type-([^\"' ]+).*?[?;]tags=([^\"'&]+)", re.S) + for tag_type, tag_name in pattern.findall(html): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + post["tags_" + key] = " ".join(value) + + +BASE_PATTERN = GelbooruV02Extractor.update({ + "realbooru": {"root": "https://realbooru.com"}, + "rule34" : {"root": "https://rule34.xxx"}, + "safebooru": {"root": "https://safebooru.org"}, + "tbib" : {"root": "https://tbib.org"}, +}) + + +class GelbooruV02TagExtractor(GelbooruV02Extractor): + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)" + test = ( + ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { + "content": "97e4bbf86c3860be18de384d02d544251afe1d45", + "pattern": r"https?://.*rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", + "count": 1, + }), + ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", { + "url": "17c61b386530cf4c30842c9f580d15ef1cd09586", + "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", + }), + ("https://realbooru.com/index.php?page=post&s=list&tags=wine", { + "count": ">= 64", + }), + ("https://tbib.org/index.php?page=post&s=list&tags=yuyaiyaui", { + "count": ">= 120", + }), + ) + + def __init__(self, match): + GelbooruV02Extractor.__init__(self, match) + tags = match.group(match.lastindex) + self.tags = text.unquote(tags.replace("+", " ")) + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + return self._pagination({"tags" : self.tags}) + + +class GelbooruV02PoolExtractor(GelbooruV02Extractor): + subcategory = "pool" + 
directory_fmt = ("{category}", "pool", "{pool}") + archive_fmt = "p_{pool}_{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=pool&s=show&id=(\d+)" + test = ( + ("https://rule34.xxx/index.php?page=pool&s=show&id=179", { + "count": 3, + }), + ("https://safebooru.org/index.php?page=pool&s=show&id=11", { + "count": 5, + }), + ("https://realbooru.com/index.php?page=pool&s=show&id=1", { + "count": 3, + }), + ) + + def __init__(self, match): + GelbooruV02Extractor.__init__(self, match) + self.pool_id = match.group(match.lastindex) + self.post_ids = () + + def skip(self, num): + self.page_start += num + return num + + def metadata(self): + url = "{}/index.php?page=pool&s=show&id={}".format( + self.root, self.pool_id) + page = self.request(url).text + + name, pos = text.extract(page, "<h4>Pool: ", "</h4>") + if not name: + raise exception.NotFoundError("pool") + self.post_ids = text.extract_iter( + page, 'class="thumb" id="p', '"', pos) + + return { + "pool": text.parse_int(self.pool_id), + "pool_name": text.unescape(name), + } + + def posts(self): + params = {} + for params["id"] in util.advance(self.post_ids, self.page_start): + for post in self._api_request(params): + yield post.attrib + + +class GelbooruV02PostExtractor(GelbooruV02Extractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/index\.php\?page=post&s=view&id=(\d+)" + test = ( + ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", { + "content": "97e4bbf86c3860be18de384d02d544251afe1d45", + "options": (("tags", True),), + "keyword": { + "tags_artist": "danraku", + "tags_character": "kashima_(kantai_collection)", + "tags_copyright": "kantai_collection", + "tags_general": str, + "tags_metadata": str, + }, + }), + ("https://safebooru.org/index.php?page=post&s=view&id=1169132", { + "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b", + "content": "93b293b27dabd198afafabbaf87c49863ac82f27", + "options": (("tags", True),), + "keyword": { + "tags_artist": "kawanakajima", + "tags_character": "heath_ledger ronald_mcdonald the_joker", + "tags_copyright": "dc_comics mcdonald's the_dark_knight", + "tags_general": str, + }, + }), + ("https://realbooru.com/index.php?page=post&s=view&id=668483", { + "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9", + "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da", + }), + ("https://tbib.org/index.php?page=post&s=view&id=9233957", { + "url": "5a6ebe07bfff8e6d27f7c30b5480f27abcb577d2", + "content": "1c3831b6fbaa4686e3c79035b5d98460b1c85c43", + }), + ) + + def __init__(self, match): + GelbooruV02Extractor.__init__(self, match) + self.post_id = match.group(match.lastindex) + + def posts(self): + return self._pagination({"id": self.post_id}) diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py index 462d3e9..aa79b67 100644 --- a/gallery_dl/extractor/hentaicafe.py +++ b/gallery_dl/extractor/hentaicafe.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,23 +10,46 @@ from . import foolslide from .. 
import text -from .common import Extractor +from .common import Extractor, Message from ..cache import memcache import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?hentai\.cafe" -class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor): - """Extractor for manga-chapters from hentai.cafe""" + +class HentaicafeBase(): + """Base class for hentaicafe extractors""" category = "hentaicafe" + root = "https://hentai.cafe" + + def _pagination(self, urlfmt): + data = {"_extractor": HentaicafeMangaExtractor} + pnum = text.parse_int(self.page_start, 1) + + while True: + page = self.request(urlfmt(pnum)).text + + for entry in text.extract_iter( + page, 'class="entry-featured', 'title="'): + url = text.extract(entry, 'href="', '"')[0] + if url: + yield Message.Queue, url, data + + if '>→<' not in page: + return + pnum += 1 + + +class HentaicafeChapterExtractor(HentaicafeBase, + foolslide.FoolslideChapterExtractor): + """Extractor for manga-chapters from hentai.cafe""" directory_fmt = ("{category}", "{manga}") filename_fmt = "c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}" - pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe" - r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)") + pattern = BASE_PATTERN + r"(/manga/read/[^/?#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", { "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2", "keyword": "6913608267d883c82b887303b9ced13821188329", }) - root = "https://hentai.cafe" def metadata(self, page): info = text.unescape(text.extract(page, '<title>', '</title>')[0]) @@ -43,11 +66,10 @@ class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor): return {"artist": (), "tags": ()} -class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): +class HentaicafeMangaExtractor(HentaicafeBase, + foolslide.FoolslideMangaExtractor): """Extractor for manga from hentai.cafe""" - category = "hentaicafe" - pattern = (r"(?:https?://)?" 
+ r"(?:www\.)?hentai\.cafe" - r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?#]+)/?$") + pattern = BASE_PATTERN + r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?#]+)/?$" test = ( # single chapter ("https://hentai.cafe/hazuki-yuuto-summer-blues/", { @@ -71,13 +93,20 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): }), ) - root = "https://hentai.cafe" - reverse = False - request = Extractor.request - chapterclass = HentaicafeChapterExtractor + + def items(self): + page = Extractor.request(self, self.gallery_url).text + + chapters = self.chapters(page) + if self.config("chapter-reverse", False): + chapters.reverse() + + for chapter, data in chapters: + data["_extractor"] = HentaicafeChapterExtractor + yield Message.Queue, chapter, data def chapters(self, page): - if "/manga/series/" in self.manga_url: + if "/manga/series/" in self.gallery_url: chapters = foolslide.FoolslideMangaExtractor.chapters(self, page) chapters.reverse() return chapters @@ -100,3 +129,45 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): for url in re.findall( r'<a +class="x-btn[^"]*" +href="([^"]+)"', page) ] + + +class HentaicafeSearchExtractor(HentaicafeBase, Extractor): + """Extractor for hentaicafe search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/(?:page/(\d+)/?)?\?s=([^&#]+)" + test = ("https://hentai.cafe/?s=benimura", { + "pattern": HentaicafeMangaExtractor.pattern, + "count": ">= 10", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.page_start, self.search = match.groups() + + def items(self): + fmt = "{}/page/{}?s={}".format + return self._pagination(lambda pnum: fmt(self.root, pnum, self.search)) + + +class HentaicafeTagExtractor(HentaicafeBase, Extractor): + """Extractor for hentaicafe tag/artist searches""" + subcategory = "tag" + pattern = (BASE_PATTERN + + r"/hc\.fyi/(tag|artist|category)/([^/?#]+)(?:/page/(\d+))?") + test = ( + ("https://hentai.cafe/hc.fyi/tag/vanilla"), + ("https://hentai.cafe/hc.fyi/category/book/page/5"), + ("https://hentai.cafe/hc.fyi/artist/benimura-karu", { + "pattern": HentaicafeMangaExtractor.pattern, + "count": ">= 10", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.type, self.search, self.page_start = match.groups() + + def items(self): + fmt = "{}/hc.fyi/{}/{}/page/{}".format + return self._pagination( + lambda pnum: fmt(self.root, self.type, self.search, pnum)) diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index 6d3ed74..6c1879c 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -63,14 +63,18 @@ class HentainexusGalleryExtractor(GalleryExtractor): data = json.loads(self._decode(text.extract( page, 'initReader("', '"')[0])) + headers = None + if not self.config("original", True): + headers = {"_http_headers": {"Accept": "image/webp,*/*"}} + pages = data.get("pages") if pages: - return [(page, None) for page in pages] + return [(page, headers) for page in pages] base = data["b"] + data["r"] gid = data["i"] return [ - ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), None) + ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), headers) for page in data["f"] ] diff --git a/gallery_dl/extractor/idolcomplex.py 
b/gallery_dl/extractor/idolcomplex.py index 16fe0a0..3d4bcfb 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -13,8 +13,6 @@ from .common import Message from ..cache import cache from .. import text, util, exception import collections -import random -import time import re @@ -24,6 +22,7 @@ class IdolcomplexExtractor(SankakuExtractor): cookienames = ("login", "pass_hash") cookiedomain = "idol.sankakucomplex.com" root = "https://" + cookiedomain + request_interval = 5.0 def __init__(self, match): SankakuExtractor.__init__(self, match) @@ -31,17 +30,12 @@ class IdolcomplexExtractor(SankakuExtractor): self.start_page = 1 self.start_post = 0 self.extags = self.config("tags", False) - self.wait_min = self.config("wait-min", 3.0) - self.wait_max = self.config("wait-max", 6.0) - if self.wait_max < self.wait_min: - self.wait_max = self.wait_min def items(self): self.login() data = self.metadata() for post_id in util.advance(self.post_ids(), self.start_post): - self.wait() post = self._parse_post(post_id) url = post["file_url"] post.update(data) @@ -130,10 +124,6 @@ class IdolcomplexExtractor(SankakuExtractor): return data - def wait(self): - """Wait for a randomly chosen amount of seconds""" - time.sleep(random.uniform(self.wait_min, self.wait_max)) - class IdolcomplexTagExtractor(IdolcomplexExtractor): """Extractor for images from idol.sankakucomplex.com by search-tags""" @@ -192,7 +182,6 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor): params["page"] = self.start_page while True: - self.wait() page = self.request(self.root, params=params, retries=10).text pos = page.find("<div id=more-popular-posts-link>") + 1 diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index ae4e606..f6e8f2d 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -166,8 +166,6 @@ class ImgurAlbumExtractor(ImgurExtractor): "privacy" : "private", "score" : int, "title" : "138", - "topic" : "", - "topic_id" : 0, "upvote_count" : int, "url" : "https://imgur.com/a/TcBmP", "view_count" : int, diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 84018a9..81355ce 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -46,10 +46,10 @@ class InstagramExtractor(Extractor): for post in self.posts(): - if post["__typename"] == "GraphReel": - post = self._parse_reel(post["id"]) + if "__typename" in post: + post = self._parse_post_graphql(post) else: - post = self._parse_post(post) + post = self._parse_post_reel(post) post.update(data) files = post.pop("_files") @@ -85,21 +85,19 @@ class InstagramExtractor(Extractor): return response - def _api_request(self, endpoint, params): - url = "https://i.instagram.com/api/" + endpoint - headers = { + def _request_api(self, endpoint, **kwargs): + url = "https://i.instagram.com/api" + endpoint + kwargs["headers"] = { "X-CSRFToken" : self.csrf_token, "X-IG-App-ID" : "936619743392459", "X-IG-WWW-Claim": self.www_claim, } - cookies = { + kwargs["cookies"] = { "csrftoken": self.csrf_token, } - return self.request( - url, params=params, headers=headers, cookies=cookies, - ).json() + return self.request(url, **kwargs).json() - def _graphql_request(self, query_hash, variables): + def 
_request_graphql(self, query_hash, variables): url = self.root + "/graphql/query/" params = { "query_hash": query_hash, @@ -162,7 +160,7 @@ class InstagramExtractor(Extractor): for key in ("sessionid", "mid", "csrftoken") } - def _parse_post(self, post): + def _parse_post_graphql(self, post): if post.get("is_video") and "video_url" not in post: url = "{}/tv/{}/".format(self.root, post["shortcode"]) post = self._extract_post_page(url) @@ -230,27 +228,31 @@ class InstagramExtractor(Extractor): return data - def _parse_reel(self, reel_id): - params = {"reel_ids": reel_id} - data = self._api_request("v1/feed/reels_media/", params) - if not data["reels_media"]: - raise exception.NotFoundError("reel") - reel = data["reels_media"][0] - - reel_id = reel_id.rpartition(":")[2] - owner = reel["user"] + def _parse_post_reel(self, post): - data = { - "expires" : text.parse_timestamp(reel.get("expiring_at")), - "owner_id" : owner["pk"], - "username" : owner.get("username"), - "fullname" : owner.get("full_name"), - "post_id" : reel_id, - "post_shortcode": self._shortcode_from_id(reel_id), - } + if "media" in post: + media = post["media"] + owner = media["user"] + post["items"] = (media,) + data = { + "post_id" : media["pk"], + "post_shortcode": self._shortcode_from_id(media["pk"]), + } + else: + reel_id = str(post["id"]).rpartition(":")[2] + owner = post["user"] + data = { + "expires" : text.parse_timestamp(post.get("expiring_at")), + "post_id" : reel_id, + "post_shortcode": self._shortcode_from_id(reel_id), + } + data["owner_id"] = owner["pk"] + data["username"] = owner.get("username") + data["fullname"] = owner.get("full_name") data["_files"] = files = [] - for num, item in enumerate(reel["items"], 1): + + for num, item in enumerate(post["items"], 1): image = item["image_versions2"]["candidates"][0] @@ -337,7 +339,7 @@ class InstagramExtractor(Extractor): } return user[key] - def _pagination(self, query_hash, variables, data): + def _pagination_graphql(self, query_hash, variables, data): while True: for edge in data["edges"]: yield edge["node"] @@ -352,9 +354,19 @@ class InstagramExtractor(Extractor): variables["after"] = self._cursor = info["end_cursor"] self.log.debug("Cursor: %s", self._cursor) - data = next(iter(self._graphql_request( + data = next(iter(self._request_graphql( query_hash, variables)["user"].values())) + def _pagination_api(self, endpoint, params): + while True: + data = self._request_api(endpoint, method="POST", data=params) + yield from data["items"] + + info = data["paging_info"] + if not info["more_available"]: + return + params["max_id"] = info["max_id"] + class InstagramUserExtractor(InstagramExtractor): """Extractor for an Instagram user profile""" @@ -366,13 +378,6 @@ class InstagramUserExtractor(InstagramExtractor): ) def items(self): - if self.config("highlights"): - self.log.warning("'highlights' is deprecated, " - "use '\"include\": \"…,highlights\"' instead") - default = ("highlights", "posts") - else: - default = ("posts",) - base = "{}/{}/".format(self.root, self.item) stories = "{}/stories/{}/".format(self.root, self.item) return self._dispatch_extractors(( @@ -380,7 +385,7 @@ class InstagramUserExtractor(InstagramExtractor): (InstagramHighlightsExtractor, base + "highlights/"), (InstagramPostsExtractor , base + "posts/"), (InstagramChannelExtractor , base + "channel/"), - ), default) + ), ("posts",)) class InstagramPostsExtractor(InstagramExtractor): @@ -399,7 +404,7 @@ class InstagramPostsExtractor(InstagramExtractor): query_hash = "003056d32c2554def87228bc3fd9668a" 
variables = {"id": user["id"], "first": 50} edge = self._get_edge_data(user, "edge_owner_to_timeline_media") - return self._pagination(query_hash, variables, edge) + return self._pagination_graphql(query_hash, variables, edge) class InstagramChannelExtractor(InstagramExtractor): @@ -418,7 +423,7 @@ class InstagramChannelExtractor(InstagramExtractor): query_hash = "bc78b344a68ed16dd5d7f264681c4c76" variables = {"id": user["id"], "first": 50} edge = self._get_edge_data(user, "edge_felix_video_timeline") - return self._pagination(query_hash, variables, edge) + return self._pagination_graphql(query_hash, variables, edge) class InstagramSavedExtractor(InstagramExtractor): @@ -434,7 +439,7 @@ class InstagramSavedExtractor(InstagramExtractor): query_hash = "2ce1d673055b99250e93b6f88f878fde" variables = {"id": user["id"], "first": 50} edge = self._get_edge_data(user, "edge_saved_media") - return self._pagination(query_hash, variables, edge) + return self._pagination_graphql(query_hash, variables, edge) class InstagramTagExtractor(InstagramExtractor): @@ -458,9 +463,9 @@ class InstagramTagExtractor(InstagramExtractor): query_hash = "9b498c08113f1e09617a1703c22b2f32" variables = {"tag_name": hashtag["name"], "first": 50} edge = self._get_edge_data(hashtag, "edge_hashtag_to_media") - return self._pagination(query_hash, variables, edge) + return self._pagination_graphql(query_hash, variables, edge) - def _pagination(self, query_hash, variables, data): + def _pagination_graphql(self, query_hash, variables, data): while True: for edge in data["edges"]: yield edge["node"] @@ -471,7 +476,7 @@ class InstagramTagExtractor(InstagramExtractor): variables["after"] = self._cursor = info["end_cursor"] self.log.debug("Cursor: %s", self._cursor) - data = self._graphql_request( + data = self._request_graphql( query_hash, variables)["hashtag"]["edge_hashtag_to_media"] @@ -582,7 +587,7 @@ class InstagramPostExtractor(InstagramExtractor): ) def posts(self): - query_hash = "a9441f24ac73000fa17fe6e6da11d59d" + query_hash = "2c4c2e343a8f64c625ba02b2aa12c7f8" variables = { "shortcode" : self.item, "child_comment_count" : 3, @@ -590,7 +595,7 @@ class InstagramPostExtractor(InstagramExtractor): "parent_comment_count" : 24, "has_threaded_comments": True } - data = self._graphql_request(query_hash, variables) + data = self._request_graphql(query_hash, variables) media = data.get("shortcode_media") if not media: raise exception.NotFoundError("post") @@ -626,7 +631,9 @@ class InstagramStoriesExtractor(InstagramExtractor): return () reel_id = user["id"] - return ({"__typename": "GraphReel", "id": reel_id},) + endpoint = "/v1/feed/reels_media/" + params = {"reel_ids": reel_id} + return self._request_api(endpoint, params=params)["reels"].values() class InstagramHighlightsExtractor(InstagramExtractor): @@ -649,12 +656,35 @@ class InstagramHighlightsExtractor(InstagramExtractor): "include_highlight_reels": True, "include_live_status": True, } - data = self._graphql_request(query_hash, variables) + data = self._request_graphql(query_hash, variables) + edges = data["user"]["edge_highlight_reels"]["edges"] + if not edges: + return () + + reel_ids = ["highlight:" + edge["node"]["id"] for edge in edges] + endpoint = "/v1/feed/reels_media/?reel_ids=" + \ + "&reel_ids=".join(text.quote(rid) for rid in reel_ids) + reels = self._request_api(endpoint)["reels"] + return [reels[rid] for rid in reel_ids] + + +class InstagramReelsExtractor(InstagramExtractor): + """Extractor for an Instagram user's reels""" + subcategory = "reels" + pattern = 
USER_PATTERN + r"/reels" + test = ("https://www.instagram.com/instagram/reels/", { + "range": "40-60", + "count": ">= 20", + }) - return [ - { - "__typename": "GraphReel", - "id" : "highlight:" + edge["node"]["id"], - } - for edge in data["user"]["edge_highlight_reels"]["edges"] - ] + def posts(self): + url = "{}/{}/".format(self.root, self.item) + user = self._extract_profile_page(url) + + endpoint = "/v1/clips/user/" + data = { + "target_user_id": user["id"], + "page_size" : "50", + } + + return self._pagination_api(endpoint, data) diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py index b54afb7..8a4e413 100644 --- a/gallery_dl/extractor/komikcast.py +++ b/gallery_dl/extractor/komikcast.py @@ -69,7 +69,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): @staticmethod def images(page): readerarea = text.extract( - page, '<div id="readerarea">', '<div class="navig">')[0] + page, '<div id="readerarea"', '<div class="navig')[0] return [ (text.unescape(url), None) for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index d59e5bb..6a88d58 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -17,6 +17,7 @@ class MangadexExtractor(Extractor): """Base class for mangadex extractors""" category = "mangadex" root = "https://mangadex.org" + api_root = "https://api.mangadex.org" # mangadex-to-iso639-1 codes iso639_map = { @@ -28,7 +29,10 @@ class MangadexExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.api_root = self.config("api-server") or "https://mangadex.org/api" + + server = self.config("api-server") + if server is not None: + self.api_root = server.rstrip("/") def chapter_data(self, chapter_id): """Request API results for 'chapter_id'""" @@ -177,7 +181,7 @@ class MangadexMangaExtractor(MangadexExtractor): def chapters(self): """Return a sorted list of chapter-metadata dicts""" - manga = self.manga_data(self.manga_id) + manga = self.manga_data(int(self.manga_id)) results = [] for cdata in self.manga_chapters(self.manga_id): diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 0e063d5..daa3d65 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,35 +8,25 @@ """Extractors for mastodon instances""" -from .common import Extractor, Message -from .. import text, util, config, exception -import re +from .common import BaseExtractor, Message +from .. 
import text, exception +from ..cache import cache -class MastodonExtractor(Extractor): +class MastodonExtractor(BaseExtractor): """Base class for mastodon extractors""" basecategory = "mastodon" directory_fmt = ("mastodon", "{instance}", "{account[username]}") filename_fmt = "{category}_{id}_{media[id]}.{extension}" archive_fmt = "{media[id]}" cookiedomain = None - instance = None - root = None def __init__(self, match): - Extractor.__init__(self, match) - self.api = MastodonAPI(self) - - def config(self, key, default=None): - return config.interpolate_common( - ("extractor",), ( - (self.category, self.subcategory), - (self.basecategory, self.instance, self.subcategory), - ), key, default, - ) + BaseExtractor.__init__(self, match) + self.instance = self.root.partition("://")[2] + self.item = match.group(match.lastindex) def items(self): - yield Message.Version, 1 for status in self.statuses(): attachments = status["media_attachments"] if attachments: @@ -60,34 +50,81 @@ class MastodonExtractor(Extractor): status["created_at"][:19], "%Y-%m-%dT%H:%M:%S") +INSTANCES = { + "mastodon.social": { + "root" : "https://mastodon.social", + "access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48", + "client-id" : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo", + "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI", + }, + "pawoo": { + "root" : "https://pawoo.net", + "access-token" : "c12c9d275050bce0dc92169a28db09d7" + "0d62d0a75a8525953098c167eacd3668", + "client-id" : "978a25f843ec01e53d09be2c290cd75c" + "782bc3b7fdbd7ea4164b9f3c3780c8ff", + "client-secret": "9208e3d4a7997032cf4f1b0e12e5df38" + "8428ef1fadb446dcfeb4f5ed6872d97b", + }, + "baraag": { + "root" : "https://baraag.net", + "access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0", + "client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o", + "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY", + } +} + +BASE_PATTERN = MastodonExtractor.update(INSTANCES) + + class MastodonUserExtractor(MastodonExtractor): """Extractor for all images of an account/user""" subcategory = "user" - - def __init__(self, match): - MastodonExtractor.__init__(self, match) - self.account_name = match.group(1) + pattern = BASE_PATTERN + r"/@([^/?#]+)(?:/media)?/?$" + test = ( + ("https://mastodon.social/@jk", { + "pattern": r"https://files.mastodon.social/media_attachments" + r"/files/(\d+/){3,}original/\w+", + "range": "1-60", + "count": 60, + }), + ("https://pawoo.net/@yoru_nine/", { + "range": "1-60", + "count": 60, + }), + ("https://baraag.net/@pumpkinnsfw"), + ) def statuses(self): - handle = "@{}@{}".format(self.account_name, self.instance) - for account in self.api.account_search(handle, 1): - if account["username"] == self.account_name: + api = MastodonAPI(self) + username = self.item + handle = "@{}@{}".format(username, self.instance) + for account in api.account_search(handle, 1): + if account["username"] == username: break else: raise exception.NotFoundError("account") - return self.api.account_statuses(account["id"]) + return api.account_statuses(account["id"]) class MastodonStatusExtractor(MastodonExtractor): """Extractor for images from a status""" subcategory = "status" - - def __init__(self, match): - MastodonExtractor.__init__(self, match) - self.status_id = match.group(1) + pattern = BASE_PATTERN + r"/@[^/?#]+/(\d+)" + test = ( + ("https://mastodon.social/@jk/103794036899778366", { + "count": 4, + }), + ("https://pawoo.net/@yoru_nine/105038878897832922", { + "content": "b52e807f8ab548d6f896b09218ece01eba83987a", + 
}), + ("https://baraag.net/@pumpkinnsfw/104364170556898443", { + "content": "67748c1b828c58ad60d0fe5729b59fb29c872244", + }), + ) def statuses(self): - return (self.api.status(self.status_id),) + return (MastodonAPI(self).status(self.item),) class MastodonAPI(): @@ -97,35 +134,46 @@ class MastodonAPI(): https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md """ - def __init__(self, extractor, access_token=None): + def __init__(self, extractor): self.root = extractor.root self.extractor = extractor + access_token = extractor.config("access-token") + if access_token is None or access_token == "cache": + access_token = _access_token_cache(extractor.instance) if not access_token: - access_token = extractor.config( - "access-token", extractor.access_token) - self.headers = {"Authorization": "Bearer {}".format(access_token)} + try: + access_token = INSTANCES[extractor.category]["access-token"] + except (KeyError, TypeError): + raise exception.StopExtraction( + "Missing access token.\n" + "Run 'gallery-dl oauth:mastodon:%s' to obtain one.", + extractor.instance) + + self.headers = {"Authorization": "Bearer " + access_token} def account_search(self, query, limit=40): """Search for content""" + endpoint = "/v1/accounts/search" params = {"q": query, "limit": limit} - return self._call("accounts/search", params).json() + return self._call(endpoint, params).json() def account_statuses(self, account_id): """Get an account's statuses""" - endpoint = "accounts/{}/statuses".format(account_id) + endpoint = "/v1/accounts/{}/statuses".format(account_id) params = {"only_media": "1"} return self._pagination(endpoint, params) def status(self, status_id): - """Fetch a Status""" - return self._call("statuses/" + status_id).json() + """Fetch a status""" + endpoint = "/v1/statuses/" + status_id + return self._call(endpoint).json() def _call(self, endpoint, params=None): if endpoint.startswith("http"): url = endpoint else: - url = "{}/api/v1/{}".format(self.root, endpoint) + url = self.root + "/api" + endpoint while True: response = self.extractor.request( @@ -145,7 +193,7 @@ class MastodonAPI(): raise exception.StopExtraction(response.json().get("error")) def _pagination(self, endpoint, params): - url = "{}/api/v1/{}".format(self.root, endpoint) + url = endpoint while url: response = self._call(url, params) yield from response.json() @@ -156,86 +204,6 @@ class MastodonAPI(): url = url["url"] -def generate_extractors(): - """Dynamically generate Extractor classes for Mastodon instances""" - - symtable = globals() - extractors = config.get(("extractor",), "mastodon") - if extractors: - util.combine_dict(EXTRACTORS, extractors) - config.set(("extractor",), "mastodon", EXTRACTORS) - - for instance, info in EXTRACTORS.items(): - - if not isinstance(info, dict): - continue - - category = info.get("category") or instance.replace(".", "") - root = info.get("root") or "https://" + instance - name = (info.get("name") or category).capitalize() - token = info.get("access-token") - pattern = info.get("pattern") or re.escape(instance) - - class Extr(MastodonUserExtractor): - pass - - Extr.__name__ = Extr.__qualname__ = name + "UserExtractor" - Extr.__doc__ = "Extractor for all images of a user on " + instance - Extr.category = category - Extr.instance = instance - Extr.pattern = (r"(?:https?://)?" 
+ pattern + - r"/@([^/?#]+)(?:/media)?/?$") - Extr.test = info.get("test-user") - Extr.root = root - Extr.access_token = token - symtable[Extr.__name__] = Extr - - class Extr(MastodonStatusExtractor): - pass - - Extr.__name__ = Extr.__qualname__ = name + "StatusExtractor" - Extr.__doc__ = "Extractor for images from a status on " + instance - Extr.category = category - Extr.instance = instance - Extr.pattern = r"(?:https?://)?" + pattern + r"/@[^/?#]+/(\d+)" - Extr.test = info.get("test-status") - Extr.root = root - Extr.access_token = token - symtable[Extr.__name__] = Extr - - -EXTRACTORS = { - "mastodon.social": { - "category" : "mastodon.social", - "access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48", - "client-id" : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo", - "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI", - "test-user" : ("https://mastodon.social/@jk", { - "pattern": r"https://files.mastodon.social/media_attachments" - r"/files/(\d+/){3,}original/\w+", - "range": "1-60", - "count": 60, - }), - "test-status" : ("https://mastodon.social/@jk/103794036899778366", { - "count": 4, - }), - }, - "pawoo.net": { - "category" : "pawoo", - "access-token" : "c12c9d275050bce0dc92169a28db09d7" - "0d62d0a75a8525953098c167eacd3668", - "client-id" : "978a25f843ec01e53d09be2c290cd75c" - "782bc3b7fdbd7ea4164b9f3c3780c8ff", - "client-secret": "9208e3d4a7997032cf4f1b0e12e5df38" - "8428ef1fadb446dcfeb4f5ed6872d97b", - }, - "baraag.net": { - "category" : "baraag", - "access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0", - "client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o", - "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY", - }, -} - - -generate_extractors() +@cache(maxage=100*365*24*3600, keyarg=0) +def _access_token_cache(instance): + return None diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py index 7bf0084..d5c2554 100644 --- a/gallery_dl/extractor/message.py +++ b/gallery_dl/extractor/message.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2018 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -52,4 +52,4 @@ class Message(): # Cookies = 5 Queue = 6 # Urllist = 7 - Metadata = 8 + # Metadata = 8 diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 0ac55cd..df77110 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Mike Fährmann +# Copyright 2020-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,6 @@ """Extractors for Moebooru based sites""" -from .common import generate_extractors from .booru import BooruExtractor from .. 
import text @@ -52,15 +51,93 @@ class MoebooruExtractor(BooruExtractor): params["page"] += 1 +BASE_PATTERN = MoebooruExtractor.update({ + "yandere": { + "root": "https://yande.re", + }, + "konachan": { + "root": "https://konachan.com", + "pattern": r"konachan\.(?:com|net)", + }, + "hypnohub": { + "root": "https://hypnohub.net", + }, + "sakugabooru": { + "root": "https://www.sakugabooru.com", + "pattern": r"(?:www\.)?sakugabooru\.com", + }, + "lolibooru": { + "root": "https://lolibooru.moe", + }, +}) + + +class MoebooruPostExtractor(MoebooruExtractor): + subcategory = "post" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/post/show/(\d+)" + test = ( + ("https://yande.re/post/show/51824", { + "content": "59201811c728096b2d95ce6896fd0009235fe683", + "options": (("tags", True),), + "keyword": { + "tags_artist": "sasaki_tamaru", + "tags_circle": "softhouse_chara", + "tags_copyright": "ouzoku", + "tags_general": str, + }, + }), + ("https://konachan.com/post/show/205189", { + "content": "674e75a753df82f5ad80803f575818b8e46e4b65", + "options": (("tags", True),), + "keyword": { + "tags_artist": "patata", + "tags_character": "clownpiece", + "tags_copyright": "touhou", + "tags_general": str, + }, + }), + ("https://konachan.net/post/show/205189"), + ("https://hypnohub.net/post/show/73964", { + "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", + }), + ("https://www.sakugabooru.com/post/show/125570"), + ("https://lolibooru.moe/post/show/287835"), + ) + + def __init__(self, match): + MoebooruExtractor.__init__(self, match) + self.post_id = match.group(match.lastindex) + + def posts(self): + params = {"tags": "id:" + self.post_id} + return self.request(self.root + "/post.json", params=params).json() + + class MoebooruTagExtractor(MoebooruExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern_fmt = r"/post\?(?:[^&#]*&)*tags=([^&#]+)" + pattern = BASE_PATTERN + r"/post\?(?:[^&#]*&)*tags=([^&#]+)" + test = ( + ("https://yande.re/post?tags=ouzoku+armor", { + "content": "59201811c728096b2d95ce6896fd0009235fe683", + }), + ("https://konachan.com/post?tags=patata", { + "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d", + }), + ("https://konachan.net/post?tags=patata"), + ("https://hypnohub.net/post?tags=gonoike_biwa", { + "url": "072330c34a1e773d0cafd00e64b8060d34b078b6", + }), + ("https://www.sakugabooru.com/post?tags=nichijou"), + ("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29"), + ) def __init__(self, match): MoebooruExtractor.__init__(self, match) - self.tags = text.unquote(match.group(1).replace("+", " ")) + tags = match.group(match.lastindex) + self.tags = text.unquote(tags.replace("+", " ")) def metadata(self): return {"search_tags": self.tags} @@ -74,11 +151,25 @@ class MoebooruPoolExtractor(MoebooruExtractor): subcategory = "pool" directory_fmt = ("{category}", "pool", "{pool}") archive_fmt = "p_{pool}_{id}" - pattern_fmt = r"/pool/show/(\d+)" + pattern = BASE_PATTERN + r"/pool/show/(\d+)" + test = ( + ("https://yande.re/pool/show/318", { + "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68", + }), + ("https://konachan.com/pool/show/95", { + "content": "cf0546e38a93c2c510a478f8744e60687b7a8426", + }), + ("https://konachan.net/pool/show/95"), + ("https://hypnohub.net/pool/show/61", { + "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf", + }), + ("https://www.sakugabooru.com/pool/show/54"), + ("https://lolibooru.moe/pool/show/239"), + ) def __init__(self, match): MoebooruExtractor.__init__(self, match) - 
self.pool_id = match.group(1) + self.pool_id = match.group(match.lastindex) def metadata(self): return {"pool": text.parse_int(self.pool_id)} @@ -88,29 +179,34 @@ class MoebooruPoolExtractor(MoebooruExtractor): return self._pagination(self.root + "/post.json", params) -class MoebooruPostExtractor(MoebooruExtractor): - subcategory = "post" - archive_fmt = "{id}" - pattern_fmt = r"/post/show/(\d+)" - - def __init__(self, match): - MoebooruExtractor.__init__(self, match) - self.post_id = match.group(1) - - def posts(self): - params = {"tags": "id:" + self.post_id} - return self.request(self.root + "/post.json", params=params).json() - - class MoebooruPopularExtractor(MoebooruExtractor): subcategory = "popular" directory_fmt = ("{category}", "popular", "{scale}", "{date}") archive_fmt = "P_{scale[0]}_{date}_{id}" - pattern_fmt = r"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?" + pattern = BASE_PATTERN + \ + r"/post/popular_(by_(?:day|week|month)|recent)(?:\?([^#]*))?" + test = ( + ("https://yande.re/post/popular_by_month?month=6&year=2014", { + "count": 40, + }), + ("https://yande.re/post/popular_recent"), + ("https://konachan.com/post/popular_by_month?month=11&year=2010", { + "count": 20, + }), + ("https://konachan.com/post/popular_recent"), + ("https://konachan.net/post/popular_recent"), + ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", { + "count": 20, + }), + ("https://hypnohub.net/post/popular_recent"), + ("https://www.sakugabooru.com/post/popular_recent"), + ("https://lolibooru.moe/post/popular_recent"), + ) def __init__(self, match): MoebooruExtractor.__init__(self, match) - self.scale, self.query = match.groups() + self.scale = match.group(match.lastindex-1) + self.query = match.group(match.lastindex) def metadata(self): self.params = params = text.parse_query(self.query) @@ -138,108 +234,3 @@ class MoebooruPopularExtractor(MoebooruExtractor): def posts(self): url = "{}/post/popular_{}.json".format(self.root, self.scale) return self.request(url, params=self.params).json() - - -EXTRACTORS = { - "yandere": { - "root": "https://yande.re", - "test-tag": ("https://yande.re/post?tags=ouzoku+armor", { - "content": "59201811c728096b2d95ce6896fd0009235fe683", - }), - "test-pool": ("https://yande.re/pool/show/318", { - "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68", - }), - "test-post": ("https://yande.re/post/show/51824", { - "content": "59201811c728096b2d95ce6896fd0009235fe683", - "options": (("tags", True),), - "keyword": { - "tags_artist": "sasaki_tamaru", - "tags_circle": "softhouse_chara", - "tags_copyright": "ouzoku", - "tags_general": str, - }, - }), - "test-popular": ( - ("https://yande.re/post/popular_by_month?month=6&year=2014", { - "count": 40, - }), - ("https://yande.re/post/popular_recent"), - ), - }, - "konachan": { - "root": "https://konachan.com", - "pattern": r"konachan\.(?:com|net)", - "test-tag": ( - ("https://konachan.com/post?tags=patata", { - "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d", - }), - ("https://konachan.net/post?tags=patata"), - ), - "test-pool": ( - ("https://konachan.com/pool/show/95", { - "content": "cf0546e38a93c2c510a478f8744e60687b7a8426", - }), - ("https://konachan.net/pool/show/95"), - ), - "test-post": ( - ("https://konachan.com/post/show/205189", { - "content": "674e75a753df82f5ad80803f575818b8e46e4b65", - "options": (("tags", True),), - "keyword": { - "tags_artist": "patata", - "tags_character": "clownpiece", - "tags_copyright": "touhou", - "tags_general": str, - }, - }), - 
("https://konachan.net/post/show/205189"), - ), - "test-popular": ( - ("https://konachan.com/post/popular_by_month?month=11&year=2010", { - "count": 20, - }), - ("https://konachan.com/post/popular_recent"), - ("https://konachan.net/post/popular_recent"), - ), - }, - "hypnohub": { - "root": "https://hypnohub.net", - "test-tag": ("https://hypnohub.net/post?tags=gonoike_biwa", { - "url": "072330c34a1e773d0cafd00e64b8060d34b078b6", - }), - "test-pool": ("https://hypnohub.net/pool/show/61", { - "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf", - }), - "test-post": ("https://hypnohub.net/post/show/73964", { - "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", - }), - "test-popular": ( - ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", { - "count": 20, - }), - ("https://hypnohub.net/post/popular_recent"), - ), - }, - "lolibooru": { - "root": "https://lolibooru.moe", - "test-tag" : ("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29",), - "test-pool" : ("https://lolibooru.moe/pool/show/239",), - "test-post" : ("https://lolibooru.moe/post/show/287835",), - "test-popular": ("https://lolibooru.moe/post/popular_recent",), - }, - "sakugabooru": { - "root": "https://www.sakugabooru.com", - "pattern": r"(?:www\.)?sakugabooru\.com", - "test-tag" : ("https://www.sakugabooru.com/post?tags=nichijou",), - "test-pool" : ("https://www.sakugabooru.com/pool/show/54",), - "test-post" : ("https://www.sakugabooru.com/post/show/125570",), - "test-popular": ("https://www.sakugabooru.com/post/popular_recent",), - }, -} - -generate_extractors(EXTRACTORS, globals(), ( - MoebooruTagExtractor, - MoebooruPoolExtractor, - MoebooruPostExtractor, - MoebooruPopularExtractor, -)) diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py new file mode 100644 index 0000000..db15572 --- /dev/null +++ b/gallery_dl/extractor/naverwebtoon.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Seonghyeon Cho +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://comic.naver.com/""" + +from .common import Extractor, Message +from .. import exception, text + +BASE_PATTERN = r"(?:https?://)?comic\.naver\.com/webtoon" + + +class NaverwebtoonExtractor(Extractor): + category = "naverwebtoon" + root = "https://comic.naver.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.query = match.group(1) + + +class NaverwebtoonEpisodeExtractor(NaverwebtoonExtractor): + subcategory = "episode" + directory_fmt = ("{category}", "{comic}") + filename_fmt = "{episode:>03}-{num:>02}.{extension}" + archive_fmt = "{title_id}_{episode}_{num}" + pattern = (BASE_PATTERN + r"/detail\.nhn\?([^#]+)") + test = ( + (("https://comic.naver.com/webtoon/detail.nhn?" 
+ "titleId=26458&no=1&weekday=tue"), { + "url": "47a956ba8c7a837213d5985f50c569fcff986f75", + "content": "3806b6e8befbb1920048de9888dfce6220f69a60", + "count": 14 + }), + ) + + def __init__(self, match): + NaverwebtoonExtractor.__init__(self, match) + query = text.parse_query(self.query) + self.title_id = query.get("titleId") + if not self.title_id: + raise exception.NotFoundError("titleId") + self.episode = query.get("no") + if not self.episode: + raise exception.NotFoundError("no") + + def items(self): + url = "{}/webtoon/detail.nhn?{}".format(self.root, self.query) + page = self.request(url).text + data = self.get_job_metadata(page) + + yield Message.Directory, data + for data["num"], url in enumerate(self.get_image_urls(page), 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_job_metadata(self, page): + """Collect metadata for extractor-job""" + title, pos = text.extract(page, 'property="og:title" content="', '"') + comic, pos = text.extract(page, '<h2>', '<span', pos) + authors, pos = text.extract(page, 'class="wrt_nm">', '</span>', pos) + authors = authors.strip().split("/") + descr, pos = text.extract(page, '<p class="txt">', '</p>', pos) + genre, pos = text.extract(page, '<span class="genre">', '</span>', pos) + date, pos = text.extract(page, '<dd class="date">', '</dd>', pos) + + return { + "title": title, + "comic": comic, + "authors": authors, + "description": descr, + "genre": genre, + "title_id": self.title_id, + "episode": self.episode, + "date": date, + } + + @staticmethod + def get_image_urls(page): + view_area = text.extract(page, 'id="comic_view_area"', '</div>')[0] + return text.extract_iter(view_area, '<img src="', '"') + + +class NaverwebtoonComicExtractor(NaverwebtoonExtractor): + subcategory = "comic" + categorytransfer = True + pattern = (BASE_PATTERN + r"/list\.nhn\?([^#]+)") + test = ( + ("https://comic.naver.com/webtoon/list.nhn?titleId=22073", { + "pattern": NaverwebtoonEpisodeExtractor.pattern, + "count": 32, + }), + ) + + def __init__(self, match): + NaverwebtoonExtractor.__init__(self, match) + query = text.parse_query(self.query) + self.title_id = query.get("titleId") + if not self.title_id: + raise exception.NotFoundError("titleId") + self.page_no = text.parse_int(query.get("page", 1)) + + def items(self): + url = self.root + "/webtoon/list.nhn" + params = {"titleId": self.title_id, "page": self.page_no} + data = {"_extractor": NaverwebtoonEpisodeExtractor} + + while True: + page = self.request(url, params=params).text + data["page"] = self.page_no + + for episode_url in self.get_episode_urls(page): + yield Message.Queue, episode_url, data + + if 'class="next"' not in page: + return + params["page"] += 1 + + def get_episode_urls(self, page): + """Extract and return all episode urls in page""" + return [ + self.root + "/webtoon/detail.nhn?" + query + for query in text.extract_iter( + page, '<a href="/webtoon/detail.nhn?', '"') + ][::2] diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 2ec7165..483c657 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -9,7 +9,7 @@ """Utility classes to setup OAuth and link accounts to gallery-dl""" from .common import Extractor, Message -from . import deviantart, flickr, pixiv, reddit, smugmug, tumblr +from . import deviantart, flickr, mastodon, pixiv, reddit, smugmug, tumblr from .. 
import text, oauth, util, config, exception from ..cache import cache import urllib.parse @@ -106,9 +106,9 @@ class OAuthBase(Extractor): )) def _oauth2_authorization_code_grant( - self, client_id, client_secret, auth_url, token_url, + self, client_id, client_secret, auth_url, token_url, *, scope="read", key="refresh_token", auth=True, - message_template=None, cache=None): + cache=None, instance=None): """Perform an OAuth2 authorization code grant""" state = "gallery-dl_{}_{}".format( @@ -117,12 +117,12 @@ class OAuthBase(Extractor): ) auth_params = { - "client_id": client_id, + "client_id" : client_id, "response_type": "code", - "state": state, - "redirect_uri": self.redirect_uri, - "duration": "permanent", - "scope": scope, + "state" : state, + "redirect_uri" : self.redirect_uri, + "duration" : "permanent", + "scope" : scope, } # receive an authorization code @@ -140,8 +140,8 @@ class OAuthBase(Extractor): # exchange the authorization code for a token data = { - "grant_type": "authorization_code", - "code": params["code"], + "grant_type" : "authorization_code", + "code" : params["code"], "redirect_uri": self.redirect_uri, } @@ -159,27 +159,18 @@ class OAuthBase(Extractor): self.send(data["error"]) return + token = data[key] + token_name = key.replace("_", "-") + # write to cache if self.cache and cache: - cache.update("#" + str(client_id), data[key]) - self.log.info("Writing 'refresh-token' to cache") + cache.update(instance or ("#" + str(client_id)), token) + self.log.info("Writing '%s' to cache", token_name) # display token - if message_template: - msg = message_template.format( - category=self.subcategory, - key=key.partition("_")[0], - token=data[key], - instance=getattr(self, "instance", ""), - client_id=client_id, - client_secret=client_secret, - ) - else: - msg = self._generate_message( - ("refresh-token",), - (data[key],), - ) - self.send(msg) + self.send(self._generate_message( + (token_name,), (token,), + )) def _generate_message(self, names, values): _vh, _va, _is, _it = ( @@ -326,8 +317,10 @@ class OAuthMastodon(OAuthBase): def items(self): yield Message.Version, 1 - application = self.oauth_config(self.instance) - if not application: + for application in mastodon.INSTANCES.values(): + if self.instance == application["root"].partition("://")[2]: + break + else: application = self._register(self.instance) self._oauth2_authorization_code_grant( @@ -335,8 +328,9 @@ class OAuthMastodon(OAuthBase): application["client-secret"], "https://{}/oauth/authorize".format(self.instance), "https://{}/oauth/token".format(self.instance), + instance=self.instance, key="access_token", - message_template=MASTODON_MSG_TEMPLATE, + cache=mastodon._access_token_cache, ) @cache(maxage=10*365*24*3600, keyarg=1) @@ -425,29 +419,3 @@ class OAuthPixiv(OAuthBase): """) code = input("code: ") return code.rpartition("=")[2].strip() - - -MASTODON_MSG_TEMPLATE = """ -Your 'access-token' is - -{token} - -Put this value into your configuration file as -'extractor.mastodon.{instance}.{key}-token'. - -You can also add your 'client-id' and 'client-secret' values -if you want to register another account in the future. 
- -Example: -{{ - "extractor": {{ - "mastodon": {{ - "{instance}": {{ - "{key}-token": "{token}", - "client-id": "{client_id}", - "client-secret": "{client_secret}" - }} - }} - }} -}} -""" diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 688c005..839e0b8 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,6 +23,7 @@ class PatreonExtractor(Extractor): directory_fmt = ("{category}", "{creator[full_name]}") filename_fmt = "{id}_{title}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" + browser = "firefox" _warning = True def items(self): @@ -42,8 +43,6 @@ class PatreonExtractor(Extractor): hashes = set() yield Message.Directory, post - yield Message.Metadata, post - for kind, url, name in itertools.chain( self._images(post), self._attachments(post), @@ -249,9 +248,9 @@ class PatreonCreatorExtractor(PatreonExtractor): creator_id = query.get("u") if creator_id: - url = "{}/user?u={}".format(self.root, creator_id) + url = "{}/user/posts?u={}".format(self.root, creator_id) else: - url = "{}/{}".format(self.root, self.creator.lower()) + url = "{}/{}/posts".format(self.root, self.creator) page = self.request(url, notfound="creator").text campaign_id = text.extract(page, "/campaign/", "/")[0] diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index be976e9..db49b90 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -29,11 +29,12 @@ class PixivExtractor(Extractor): Extractor.__init__(self, match) self.api = PixivAppAPI(self) self.load_ugoira = self.config("ugoira", True) + self.translated_tags = self.config("translated-tags", False) def items(self): + tkey = "translated_name" if self.translated_tags else "name" ratings = {0: "General", 1: "R-18", 2: "R-18G"} metadata = self.metadata() - yield Message.Version, 1 for work in self.works(): if not work["user"]["id"]: @@ -45,7 +46,7 @@ class PixivExtractor(Extractor): del work["image_urls"] del work["meta_pages"] work["num"] = 0 - work["tags"] = [tag["name"] for tag in work["tags"]] + work["tags"] = [tag[tkey] or tag["name"] for tag in work["tags"]] work["date"] = text.parse_datetime(work["create_date"]) work["rating"] = ratings.get(work["x_restrict"]) work["suffix"] = "" diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index aa0ba6d..971347b 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,11 +11,8 @@ from .common import Extractor, Message from .. 
import text import urllib.parse -import random -import time import json - BASE_PATTERN = r"(?:https?://)?((?:[^/.]+\.)?reactor\.cc)" @@ -24,17 +21,14 @@ class ReactorExtractor(Extractor): basecategory = "reactor" filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" archive_fmt = "{post_id}_{num}" + instances = () + request_interval = 5.0 def __init__(self, match): Extractor.__init__(self, match) self.root = "http://" + match.group(1) self.session.headers["Referer"] = self.root - self.wait_min = self.config("wait-min", 3) - self.wait_max = self.config("wait-max", 6) - if self.wait_max < self.wait_min: - self.wait_max = self.wait_min - if not self.category: # set category based on domain name netloc = urllib.parse.urlsplit(self.root).netloc @@ -60,8 +54,6 @@ class ReactorExtractor(Extractor): def _pagination(self, url): while True: - time.sleep(random.uniform(self.wait_min, self.wait_max)) - response = self.request(url) if response.history: # sometimes there is a redirect from @@ -231,11 +223,11 @@ class JoyreactorSearchExtractor(ReactorSearchExtractor): category = "joyreactor" pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)" test = ( - ("http://joyreactor.cc/search/Cirno", { + ("http://joyreactor.cc/search/Nature", { "range": "1-25", "count": ">= 20", }), - ("http://joyreactor.com/search?q=Cirno", { + ("http://joyreactor.com/search?q=Nature", { "range": "1-25", "count": ">= 20", }), @@ -305,10 +297,7 @@ class PornreactorSearchExtractor(ReactorSearchExtractor): category = "pornreactor" pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)" test = ( - ("http://pornreactor.cc/search?q=ecchi+hentai", { - "range": "1-25", - "count": ">= 25", - }), + ("http://pornreactor.cc/search?q=ecchi+hentai"), ("http://fapreactor.com/search/ecchi+hentai"), ) diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index ae1749e..7ffe5dc 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -47,12 +47,13 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" r"(/Comic/[^/?#]+/[^/?#]+\?id=(\d+))") test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", { - "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682", - "keyword": "30fe110273e871305001f33c18634516a0a51421", + "url": "30d29c5afc65043bfd384c010257ec2d0ecbafa6", + "keyword": "2d9ec81ce1b11fac06ebf96ce33cdbfca0e85eb5", }) def __init__(self, match): ChapterExtractor.__init__(self, match) + self.gallery_url += "&quality=hq" self.issue_id = match.group(2) def metadata(self, page): diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py index 972750c..5d83299 100644 --- a/gallery_dl/extractor/sankakucomplex.py +++ b/gallery_dl/extractor/sankakucomplex.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -125,17 +125,14 @@ class SankakucomplexTagExtractor(SankakucomplexExtractor): def items(self): pnum = 1 - last = None data = {"_extractor": SankakucomplexArticleExtractor} - yield Message.Version, 1 while True: url = "{}/{}/page/{}/".format(self.root, self.path, pnum) response = self.request(url, fatal=False) if response.status_code >= 400: return - for url in text.extract_iter(response.text, 'data-direct="', 
'"'): - if url != last: - last = url - yield Message.Queue, url, data + for url in util.unique_sequence(text.extract_iter( + response.text, 'data-direct="', '"')): + yield Message.Queue, url, data pnum += 1 diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index d65f334..ba1ab08 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2020 Mike Fährmann +# Copyright 2019-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,28 +8,23 @@ """Extractors for Shopify instances""" -from .common import Extractor, Message, generate_extractors +from .common import BaseExtractor, Message from .. import text import re -class ShopifyExtractor(Extractor): +class ShopifyExtractor(BaseExtractor): """Base class for Shopify extractors""" basecategory = "shopify" filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}" archive_fmt = "{id}" def __init__(self, match): - Extractor.__init__(self, match) - self.item_url = self.root + match.group(1) - - def request(self, url, **kwargs): - kwargs["retries"] = float("inf") - return Extractor.request(self, url, **kwargs) + BaseExtractor.__init__(self, match) + self.item_url = self.root + match.group(match.lastindex) def items(self): data = self.metadata() - yield Message.Version, 1 yield Message.Directory, data headers = {"X-Requested-With": "XMLHttpRequest"} @@ -58,22 +53,34 @@ class ShopifyExtractor(Extractor): """Return an iterable with all relevant product URLs""" +BASE_PATTERN = ShopifyExtractor.update({ + "fashionnova": { + "root": "https://www.fashionnova.com", + "pattern": r"(?:www\.)?fashionnova\.com", + }, +}) + + class ShopifyCollectionExtractor(ShopifyExtractor): """Base class for collection extractors for Shopify based sites""" subcategory = "collection" directory_fmt = ("{category}", "{collection[title]}") - pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)" - - def __init__(self, match): - ShopifyExtractor.__init__(self, match) - self.params = match.group(2) + pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])" + test = ( + ("https://www.fashionnova.com/collections/mini-dresses", { + "range": "1-20", + "count": 20, + "archive": False, + }), + ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), + ("https://www.fashionnova.com/collections/mini-dresses#1"), + ) def metadata(self): return self.request(self.item_url + ".json").json() def products(self): - params = text.parse_query(self.params) - params["page"] = text.parse_int(params.get("page"), 1) + params = {"page": 1} fetch = True last = None @@ -107,36 +114,14 @@ class ShopifyProductExtractor(ShopifyExtractor): """Base class for product extractors for Shopify based sites""" subcategory = "product" directory_fmt = ("{category}", "Products") - pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)" + pattern = BASE_PATTERN + r"((?:/collections/[\w-]+)?/products/[\w-]+)" + test = ( + ("https://www.fashionnova.com/products/essential-slide-red", { + "pattern": r"https?://cdn\d*\.shopify.com/", + "count": 3, + }), + ("https://www.fashionnova.com/collections/flats/products/name"), + ) def products(self): return (self.item_url,) - - -EXTRACTORS = { - "fashionnova": { - "root": "https://www.fashionnova.com", - "pattern": r"(?:www\.)?fashionnova\.com", - "test-product": ( - ("https://www.fashionnova.com/products/essential-slide-red", { - "pattern": 
r"https?://cdn\d*\.shopify.com/", - "count": 3, - }), - ("https://www.fashionnova.com/collections/flats/products/name"), - ), - "test-collection": ( - ("https://www.fashionnova.com/collections/mini-dresses", { - "range": "1-20", - "count": 20, - "archive": False, - }), - ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), - ("https://www.fashionnova.com/collections/mini-dresses#1"), - ), - }, -} - -generate_extractors(EXTRACTORS, globals(), ( - ShopifyProductExtractor, - ShopifyCollectionExtractor, -)) diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py new file mode 100644 index 0000000..849dc49 --- /dev/null +++ b/gallery_dl/extractor/tumblrgallery.py @@ -0,0 +1,149 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://tumblrgallery.xyz/""" + +from .common import GalleryExtractor +from .. import text + +BASE_PATTERN = r"(?:https?://)?tumblrgallery\.xyz" + + +class TumblrgalleryExtractor(GalleryExtractor): + """Base class for tumblrgallery extractors""" + category = "tumblrgallery" + filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}" + directory_fmt = ("{category}", "{gallery_id} {title}") + root = "https://tumblrgallery.xyz" + + +class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor): + """Extractor for Tumblrblog on tumblrgallery.xyz""" + subcategory = "tumblrblog" + pattern = BASE_PATTERN + r"(/tumblrblog/gallery/(\d+)\.html)" + test = ("https://tumblrgallery.xyz/tumblrblog/gallery/103975.html",) + + def __init__(self, match): + TumblrgalleryExtractor.__init__(self, match) + self.gallery_id = text.parse_int(match.group(2)) + + def metadata(self, page): + return { + "title" : text.unescape(text.extract(page, "<h1>", "</h1>"))[0], + "gallery_id": self.gallery_id, + } + + def images(self, _): + page_num = 1 + while True: + response = self.request( + "{}/tumblrblog/gallery/{}/{}.html" + .format(self.root, self.gallery_id, page_num), + allow_redirects=False + ) + if response.status_code != 200: + return + + page = response.text + page_num += 1 + + urls = list(text.extract_iter( + page, + '<div class="report xx-co-me"> <a href="', + '" data-fancybox="gallery"' + )) + + for image_src in urls: + yield image_src, { + "id": text.extract(image_src, "tumblr_", "_")[0] + } + + +class TumblrgalleryPostExtractor(TumblrgalleryExtractor): + """Extractor for Posts on tumblrgallery.xyz""" + subcategory = "post" + pattern = BASE_PATTERN + r"(/post/(\d+)\.html)" + test = ("https://tumblrgallery.xyz/post/405674.html",) + + def __init__(self, match): + TumblrgalleryExtractor.__init__(self, match) + self.gallery_id = text.parse_int(match.group(2)) + + def metadata(self, page): + return { + "title" : text.remove_html( + text.unescape(text.extract(page, "<title>", "</title>")[0]) + ).replace("_", "-"), + "gallery_id": self.gallery_id, + } + + def images(self, page): + urls = list(text.extract_iter( + page, + '<div class="report xx-co-me"> <a href="', + '" data-fancybox="gallery"' + )) + + for image_src in urls: + yield image_src, { + "id": text.extract(image_src, "tumblr_", "_")[0] or + text.nameext_from_url(image_src)["filename"] + } + + +class TumblrgallerySearchExtractor(TumblrgalleryExtractor): + """Extractor for Search result on tumblrgallery.xyz""" + subcategory = "search" + filename_fmt = 
"{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}" + directory_fmt = ("{category}", "{search_term}") + pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))" + test = ("https://tumblrgallery.xyz/s.php?q=everyday-life",) + + def __init__(self, match): + TumblrgalleryExtractor.__init__(self, match) + self.search_term = match.group(2) + + def metadata(self, page): + return { + "search_term": self.search_term, + } + + def images(self, _): + page_num = 1 + while True: + response = self.request( + "{}/s.php?q={}&page={}" + .format(self.root, self.search_term, page_num), + allow_redirects=False + ) + if response.status_code != 200: + return + + page = response.text + page_num += 1 + + gallery_ids = list(text.extract_iter( + page, + '<div class="title"><a href="post/', + '.html' + )) + + for gallery_id in gallery_ids: + post_page = self.request( + "{}/post/{}.html" + .format(self.root, gallery_id), + allow_redirects=False + ).text + for image_src in TumblrgalleryPostExtractor.images( + self, post_page + ): + image_src[1]["title"] = text.remove_html( + text.unescape( + text.extract(post_page, "<title>", "</title>")[0] + ) + ).replace("_", "-") + image_src[1]["gallery_id"] = gallery_id + yield image_src diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 7b6bf21..a7d2de5 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -362,6 +362,23 @@ class TwitterListMembersExtractor(TwitterExtractor): yield Message.Queue, url, user +class TwitterFollowingExtractor(TwitterExtractor): + """Extractor for followed users""" + subcategory = "following" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/following(?!\w)" + test = ( + ("https://twitter.com/supernaturepics/following"), + ("https://www.twitter.com/id:2976459548/following"), + ) + + def items(self): + self.login() + for user in TwitterAPI(self).user_following(self.user): + user["_extractor"] = TwitterTimelineExtractor + url = "{}/i/user/{}".format(self.root, user["rest_id"]) + yield Message.Queue, url, user + + class TwitterSearchExtractor(TwitterExtractor): """Extractor for all images from a search timeline""" subcategory = "search" @@ -451,6 +468,11 @@ class TwitterTweetExtractor(TwitterExtractor): "date" : "dt:2020-08-20 04:00:28", }, }), + # all Tweets from a conversation (#1319) + ("https://twitter.com/BlankArts_/status/1323314488611872769", { + "options": (("conversations", True),), + "count": ">= 50", + }), ) def __init__(self, match): @@ -458,6 +480,8 @@ class TwitterTweetExtractor(TwitterExtractor): self.tweet_id = match.group(2) def tweets(self): + if self.config("conversations", False): + return TwitterAPI(self).conversation(self.tweet_id) return TwitterAPI(self).tweet(self.tweet_id) @@ -537,6 +561,10 @@ class TwitterAPI(): break return tweets + def conversation(self, conversation_id): + endpoint = "/2/timeline/conversation/{}.json".format(conversation_id) + return self._pagination(endpoint) + def timeline_profile(self, screen_name): user_id = self._user_id_by_screen_name(screen_name) endpoint = "/2/timeline/profile/{}.json".format(user_id) @@ -577,18 +605,8 @@ class TwitterAPI(): params["spelling_corrections"] = "1" return self._pagination(endpoint, params) - def list_members(self, list_id): - endpoint = "/graphql/3pV4YlpljXUTFAa1jVNWQw/ListMembers" - variables = { - "listId": list_id, - "count" : 20, - "withTweetResult": False, - "withUserResult" : False, - } - return self._pagination_members(endpoint, variables) - def list_by_rest_id(self, list_id): - endpoint = 
"/graphql/EhaI2uiCBJI97e28GN8WjQ/ListByRestId" + endpoint = "/graphql/18MAHTcDU-TdJSjWWmoH7w/ListByRestId" params = {"variables": '{"listId":"' + list_id + '"' ',"withUserResult":false}'} try: @@ -596,8 +614,33 @@ class TwitterAPI(): except KeyError: raise exception.NotFoundError("list") + def list_members(self, list_id): + endpoint = "/graphql/tA7h9hy4U0Yc9COfIOh3qQ/ListMembers" + variables = { + "listId": list_id, + "count" : 100, + "withTweetResult": False, + "withUserResult" : False, + } + return self._pagination_graphql( + endpoint, variables, "list", "members_timeline") + + def user_following(self, screen_name): + endpoint = "/graphql/Q_QTiPvoXwsA13eoA7okIQ/Following" + variables = { + "userId": self._user_id_by_screen_name(screen_name), + "count" : 100, + "withTweetResult": False, + "withUserResult" : False, + "withTweetQuoteCount" : False, + "withHighlightedLabel" : False, + "includePromotedContent": False, + } + return self._pagination_graphql( + endpoint, variables, "user", "following_timeline") + def user_by_screen_name(self, screen_name): - endpoint = "/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName" + endpoint = "/graphql/hc-pka9A7gyS3xODIafnrQ/UserByScreenName" params = {"variables": '{"screen_name":"' + screen_name + '"' ',"withHighlightedLabel":true}'} try: @@ -691,6 +734,13 @@ class TwitterAPI(): tweet = True cursor = cursor["value"] + elif entry_startswith("conversationThread-"): + tweet_ids.extend( + item["entryId"][6:] + for item in entry["content"]["timelineModule"]["items"] + if item["entryId"].startswith("tweet-") + ) + # process tweets for tweet_id in tweet_ids: try: @@ -728,15 +778,15 @@ class TwitterAPI(): return params["cursor"] = cursor - def _pagination_members(self, endpoint, variables): + def _pagination_graphql(self, endpoint, variables, key, timeline): while True: cursor = entry = stop = None params = {"variables": json.dumps(variables)} data = self._call(endpoint, params) try: - instructions = (data["data"]["list"]["members_timeline"] - ["timeline"]["instructions"]) + instructions = \ + data["data"][key][timeline]["timeline"]["instructions"] except KeyError: raise exception.AuthorizationError() diff --git a/gallery_dl/extractor/unsplash.py b/gallery_dl/extractor/unsplash.py index 545eb31..c653c01 100644 --- a/gallery_dl/extractor/unsplash.py +++ b/gallery_dl/extractor/unsplash.py @@ -122,7 +122,7 @@ class UnsplashImageExtractor(UnsplashExtractor): "total_photos": int, "twitter_username": None, "updated_at": str, - "username": "johnwestrock" + "username": "davehoefler", }, "views": int, "width": 4480, @@ -138,7 +138,7 @@ class UnsplashUserExtractor(UnsplashExtractor): """Extractor for all photos of an unsplash user""" subcategory = "user" pattern = BASE_PATTERN + r"/@(\w+)/?$" - test = ("https://unsplash.com/@johnwestrock", { + test = ("https://unsplash.com/@davehoefler", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", "range": "1-30", @@ -155,7 +155,7 @@ class UnsplashFavoriteExtractor(UnsplashExtractor): """Extractor for all likes of an unsplash user""" subcategory = "favorite" pattern = BASE_PATTERN + r"/@(\w+)/likes" - test = ("https://unsplash.com/@johnwestrock/likes", { + test = ("https://unsplash.com/@davehoefler/likes", { "pattern": r"https://images\.unsplash\.com/(photo-\d+-\w+" r"|reserve/[^/?#]+)\?ixid=\w+&ixlib=rb-1\.2\.1$", "range": "1-30", diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py index 20980ac..e025a22 100644 --- 
a/gallery_dl/extractor/wallhaven.py +++ b/gallery_dl/extractor/wallhaven.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,11 +16,35 @@ class WallhavenExtractor(Extractor): """Base class for wallhaven extractors""" category = "wallhaven" filename_fmt = "{category}_{id}_{resolution}.{extension}" + archive_fmt = "{id}" root = "https://wallhaven.cc" - def __init__(self, match): - Extractor.__init__(self, match) - self.api = WallhavenAPI(self) + def items(self): + metadata = self.metadata() + for wp in self.wallpapers(): + self._transform(wp) + wp.update(metadata) + url = wp["url"] + yield Message.Directory, wp + yield Message.Url, url, text.nameext_from_url(url, wp) + + def wallpapers(self): + """Return relevant 'wallpaper' objects""" + + def metadata(self): + """Return general metadata""" + return () + + @staticmethod + def _transform(wp): + wp["url"] = wp.pop("path") + if "tags" in wp: + wp["tags"] = [t["name"] for t in wp["tags"]] + wp["date"] = text.parse_datetime( + wp.pop("created_at"), "%Y-%m-%d %H:%M:%S") + wp["width"] = wp.pop("dimension_x") + wp["height"] = wp.pop("dimension_y") + wp["wh_category"] = wp["category"] class WallhavenSearchExtractor(WallhavenExtractor): @@ -42,18 +66,57 @@ class WallhavenSearchExtractor(WallhavenExtractor): WallhavenExtractor.__init__(self, match) self.params = text.parse_query(match.group(1)) + def wallpapers(self): + return WallhavenAPI(self).search(self.params.copy()) + + def metadata(self): + return {"search": self.params} + + +class WallhavenCollectionExtractor(WallhavenExtractor): + """Extractor for a collection on wallhaven.cc""" + subcategory = "collection" + directory_fmt = ("{category}", "{username}", "{collection_id}") + pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/favorites/(\d+)" + test = ("https://wallhaven.cc/user/AksumkA/favorites/74", { + "count": ">= 50", + }) + + def __init__(self, match): + WallhavenExtractor.__init__(self, match) + self.username, self.collection_id = match.groups() + + def wallpapers(self): + return WallhavenAPI(self).collection(self.username, self.collection_id) + + def metadata(self): + return {"username": self.username, "collection_id": self.collection_id} + + +class WallhavenCollectionsExtractor(WallhavenExtractor): + """Extractor for all collections of a wallhaven user""" + subcategory = "collections" + pattern = r"(?:https?://)?wallhaven\.cc/user/([^/?#]+)/favorites/?$" + test = ("https://wallhaven.cc/user/AksumkA/favorites", { + "pattern": WallhavenCollectionExtractor.pattern, + "count": 4, + }) + + def __init__(self, match): + WallhavenExtractor.__init__(self, match) + self.username = match.group(1) + def items(self): - yield Message.Version, 1 - yield Message.Directory, {"search": self.params} - for wp in self.api.search(self.params.copy()): - wp["search"] = self.params - yield Message.Url, wp["url"], wp + for collection in WallhavenAPI(self).collections(self.username): + collection["_extractor"] = WallhavenCollectionExtractor + url = "https://wallhaven.cc/user/{}/favorites/{}".format( + self.username, collection["id"]) + yield Message.Queue, url, collection class WallhavenImageExtractor(WallhavenExtractor): """Extractor for individual wallpaper on wallhaven.cc""" subcategory = "image" - archive_fmt = "{id}" pattern = (r"(?:https?://)?(?:wallhaven\.cc/w/|whvn\.cc/" 
r"|w\.wallhaven\.cc/[a-z]+/\w\w/wallhaven-)(\w+)") test = ( @@ -65,7 +128,7 @@ class WallhavenImageExtractor(WallhavenExtractor): "width" : 1920, "height" : 1200, "resolution" : "1920x1200", - "ratio" : 1.6, + "ratio" : "1.6", "colors" : list, "tags" : list, "file_size" : 278799, @@ -95,15 +158,15 @@ class WallhavenImageExtractor(WallhavenExtractor): WallhavenExtractor.__init__(self, match) self.wallpaper_id = match.group(1) - def items(self): - data = self.api.info(self.wallpaper_id) - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, data["url"], data + def wallpapers(self): + return (WallhavenAPI(self).info(self.wallpaper_id),) class WallhavenAPI(): - """Minimal interface to wallhaven's API""" + """Interface for wallhaven's API + + Ref: https://wallhaven.cc/help/api + """ def __init__(self, extractor): self.extractor = extractor @@ -117,32 +180,35 @@ class WallhavenAPI(): self.headers = {"X-API-Key": key} def info(self, wallpaper_id): - url = "https://wallhaven.cc/api/v1/w/" + wallpaper_id - return self._update(self._call(url)["data"]) + endpoint = "/v1/w/" + wallpaper_id + return self._call(endpoint)["data"] + + def collection(self, username, collection_id): + endpoint = "/v1/collections/{}/{}".format(username, collection_id) + return self._pagination(endpoint) + + def collections(self, username): + endpoint = "/v1/collections/" + username + return self._pagination(endpoint) def search(self, params): - url = "https://wallhaven.cc/api/v1/search" - while True: - data = self._call(url, params) - yield from map(self._update, data["data"]) - if data["meta"]["current_page"] >= data["meta"]["last_page"]: - return - params["page"] = data["meta"]["current_page"] + 1 + endpoint = "/v1/search" + return self._pagination(endpoint, params) - def _call(self, url, params=None): + def _call(self, endpoint, params=None): + url = "https://wallhaven.cc/api" + endpoint return self.extractor.request( url, headers=self.headers, params=params).json() - @staticmethod - def _update(wp): - width, _, height = wp["resolution"].partition("x") - wp["url"] = wp.pop("path") - if "tags" in wp: - wp["tags"] = [t["name"] for t in wp["tags"]] - wp["date"] = text.parse_datetime( - wp.pop("created_at"), "%Y-%m-%d %H:%M:%S") - wp["ratio"] = text.parse_float(wp["ratio"]) - wp["width"] = wp.pop("dimension_x") - wp["height"] = wp.pop("dimension_y") - wp["wh_category"] = wp["category"] - return text.nameext_from_url(wp["url"], wp) + def _pagination(self, endpoint, params=None): + if params is None: + params = {} + + while True: + data = self._call(endpoint, params) + yield from data["data"] + + meta = data.get("meta") + if not meta or meta["current_page"] >= meta["last_page"]: + return + params["page"] = meta["current_page"] + 1 diff --git a/gallery_dl/job.py b/gallery_dl/job.py index c1d32ef..0f40bb9 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -1,15 +1,17 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. import sys +import json import time import errno import logging +import operator import collections from . import extractor, downloader, postprocessor from . 
import config, text, util, output, exception @@ -111,10 +113,6 @@ class Job(): if self.pred_queue(url, kwds): self.handle_queue(url, kwds) - elif msg[0] == Message.Metadata: - self.update_kwdict(msg[1]) - self.handle_metadata(msg[1]) - elif msg[0] == Message.Version: if msg[1] != 1: raise "unsupported message-version ({}, {})".format( @@ -128,9 +126,6 @@ class Job(): def handle_directory(self, kwdict): """Handle Message.Directory""" - def handle_metadata(self, kwdict): - """Handle Message.Metadata""" - def handle_queue(self, url, kwdict): """Handle Message.Queue""" @@ -280,15 +275,6 @@ class DownloadJob(Job): for callback in self.hooks["post"]: callback(self.pathfmt) - def handle_metadata(self, kwdict): - """Run postprocessors with metadata from 'kwdict'""" - if "metadata" in self.hooks: - kwdict["extension"] = "metadata" - pathfmt = self.pathfmt - pathfmt.set_filename(kwdict) - for callback in self.hooks["metadata"]: - callback(pathfmt) - def handle_queue(self, url, kwdict): if url in self.visited: return @@ -456,7 +442,21 @@ class DownloadJob(Job): if wlist is not None: if isinstance(wlist, str): wlist = wlist.split(",") - blist = {e.category for e in extractor._list_classes()} + + # build a set of all categories + blist = set() + add = blist.add + update = blist.update + get = operator.itemgetter(0) + + for extr in extractor._list_classes(): + category = extr.category + if category: + add(category) + else: + update(map(get, extr.instances)) + + # remove whitelisted categories blist.difference_update(wlist) return blist @@ -576,6 +576,38 @@ class UrlJob(Job): self._write_unsupported(url) +class InfoJob(Job): + """Print extractor defaults and settings""" + + def run(self): + ex = self.extractor + pm = self._print_multi + pc = self._print_config + + if ex.basecategory: + pm("Category / Subcategory / Basecategory", + ex.category, ex.subcategory, ex.basecategory) + else: + pm("Category / Subcategory", ex.category, ex.subcategory) + + pc("Filename format", "filename", ex.filename_fmt) + pc("Directory format", "directory", ex.directory_fmt) + pc("Request interval", "sleep-request", ex.request_interval) + + return 0 + + def _print_multi(self, title, *values): + print(title, "\n ", " / ".join(json.dumps(v) for v in values), sep="") + + def _print_config(self, title, optname, value): + optval = self.extractor.config(optname, util.SENTINEL) + if optval is not util.SENTINEL: + print(title, "(custom):\n ", json.dumps(optval)) + print(title, "(default):\n ", json.dumps(value)) + elif value: + print(title, "(default):\n ", json.dumps(value)) + + class DataJob(Job): """Collect extractor results and dump them""" @@ -624,8 +656,5 @@ class DataJob(Job): def handle_directory(self, kwdict): self.data.append((Message.Directory, self.filter(kwdict))) - def handle_metadata(self, kwdict): - self.data.append((Message.Metadata, self.filter(kwdict))) - def handle_queue(self, url, kwdict): self.data.append((Message.Queue, url, self.filter(kwdict))) diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 367b934..3e585fe 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2020 Mike Fährmann +# Copyright 2017-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -98,8 +98,9 @@ def build_parser(): ) general.add_argument( "-i", "--input-file", - dest="inputfile", metavar="FILE", - help="Download URLs found in FILE ('-' for stdin)", + 
dest="inputfiles", metavar="FILE", action="append", + help=("Download URLs found in FILE ('-' for stdin). " + "More than one --input-file can be specified"), ) general.add_argument( "--cookies", @@ -136,9 +137,9 @@ def build_parser(): help="Print URLs instead of downloading", ) output.add_argument( - "-G", + "-G", "--resolve-urls", dest="list_urls", action="store_const", const=128, - help=argparse.SUPPRESS, + help="Print URLs instead of downloading; resolve intermediary URLs", ) output.add_argument( "-j", "--dump-json", @@ -151,6 +152,11 @@ def build_parser(): help="Simulate data extraction; do not download anything", ) output.add_argument( + "-E", "--extractor-info", + dest="jobtype", action="store_const", const=job.InfoJob, + help="Print extractor defaults and settings", + ) + output.add_argument( "-K", "--list-keywords", dest="jobtype", action="store_const", const=job.KeywordJob, help=("Print a list of available keywords and example values " diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py index faa4d6c..ee490e7 100644 --- a/gallery_dl/postprocessor/__init__.py +++ b/gallery_dl/postprocessor/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,8 +8,6 @@ """Post-processing modules""" -import importlib - modules = [ "classify", "compare", @@ -28,16 +26,16 @@ def find(name): except KeyError: pass - klass = None + cls = None if name in modules: # prevent unwanted imports try: - module = importlib.import_module("." + name, __package__) + module = __import__(name, globals(), None, (), 1) except ImportError: pass else: - klass = module.__postprocessor__ - _cache[name] = klass - return klass + cls = module.__postprocessor__ + _cache[name] = cls + return cls # -------------------------------------------------------------------- diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index 5a54a77..2514219 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -39,10 +39,6 @@ class ExecPP(PostProcessor): events = options.get("event") if events is None: events = ("after",) - if options.get("final"): - self.log.warning("'final' is deprecated, " - "use '\"event\": \"finalize\"' instead") - events = ("finalize",) elif isinstance(events, str): events = events.split(",") for event in events: diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index c08f111..49696a0 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -39,7 +39,7 @@ class MetadataPP(PostProcessor): if directory: self._directory = self._directory_custom sep = os.sep + (os.altsep or "") - self._metadir = directory.rstrip(sep) + os.sep + self._metadir = util.expand_path(directory).rstrip(sep) + os.sep filename = options.get("filename") extfmt = options.get("extension-format") @@ -55,10 +55,6 @@ class MetadataPP(PostProcessor): events = options.get("event") if events is None: events = ("file",) - if options.get("bypost"): - self.log.warning("'bypost' is deprecated, use '\"event\": " - "\"post\"' and 
'filename' instead") - events = ("metadata",) elif isinstance(events, str): events = events.split(",") for event in events: diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 2161b9d..2466adf 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2020 Mike Fährmann +# Copyright 2017-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -65,6 +65,15 @@ def unique(iterable): yield element +def unique_sequence(iterable): + """Yield sequentially unique elements from 'iterable'""" + last = None + for element in iterable: + if element != last: + last = element + yield element + + def raises(cls): """Returns a function that raises 'cls' as exception""" def wrap(*args): @@ -731,21 +740,25 @@ class PathFormat(): } def __init__(self, extractor): - filename_fmt = extractor.config("filename", extractor.filename_fmt) - directory_fmt = extractor.config("directory", extractor.directory_fmt) - kwdefault = extractor.config("keywords-default") + filename_fmt = extractor.config("filename") + if filename_fmt is None: + filename_fmt = extractor.filename_fmt + + directory_fmt = extractor.config("directory") + if directory_fmt is None: + directory_fmt = extractor.directory_fmt extension_map = extractor.config("extension-map") if extension_map is None: extension_map = self.EXTENSION_MAP self.extension_map = extension_map.get + kwdefault = extractor.config("keywords-default") try: self.filename_formatter = Formatter( filename_fmt, kwdefault).format_map except Exception as exc: raise exception.FilenameFormatError(exc) - try: self.directory_formatters = [ Formatter(dirfmt, kwdefault).format_map @@ -754,20 +767,23 @@ class PathFormat(): except Exception as exc: raise exception.DirectoryFormatError(exc) - self.directory = self.realdirectory = "" - self.filename = self.extension = self.prefix = "" - self.path = self.realpath = self.temppath = "" self.kwdict = {} + self.directory = self.realdirectory = \ + self.filename = self.extension = self.prefix = \ + self.path = self.realpath = self.temppath = "" self.delete = self._create_directory = False basedir = extractor._parentdir if not basedir: - basedir = expand_path( - extractor.config("base-directory", (".", "gallery-dl"))) - if os.altsep and os.altsep in basedir: - basedir = basedir.replace(os.altsep, os.sep) - if basedir[-1] != os.sep: - basedir += os.sep + basedir = extractor.config("base-directory") + if basedir is None: + basedir = "." + os.sep + "gallery-dl" + os.sep + elif basedir: + basedir = expand_path(basedir) + if os.altsep and os.altsep in basedir: + basedir = basedir.replace(os.altsep, os.sep) + if basedir[-1] != os.sep: + basedir += os.sep self.basedirectory = basedir restrict = extractor.config("path-restrict", "auto") diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 8244a95..f1c49e9 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.16.5" +__version__ = "1.17.0" |
