From 3201d77a148367d739862b4f07868a76eaeb7cb1 Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Sat, 13 Mar 2021 16:26:30 -0500 Subject: New upstream version 1.17.0. --- gallery_dl/__init__.py | 25 +-- gallery_dl/cloudflare.py | 201 ---------------------- gallery_dl/downloader/__init__.py | 16 +- gallery_dl/downloader/http.py | 8 +- gallery_dl/downloader/ytdl.py | 16 +- gallery_dl/extractor/500px.py | 16 +- gallery_dl/extractor/__init__.py | 11 +- gallery_dl/extractor/booru.py | 201 +--------------------- gallery_dl/extractor/common.py | 293 ++++++++++++++++++++------------ gallery_dl/extractor/cyberdrop.py | 58 +++++++ gallery_dl/extractor/deviantart.py | 10 +- gallery_dl/extractor/erome.py | 15 +- gallery_dl/extractor/exhentai.py | 116 ++++++++----- gallery_dl/extractor/foolfuuka.py | 232 ++++++++++++------------- gallery_dl/extractor/foolslide.py | 190 +++++++++++---------- gallery_dl/extractor/gelbooru.py | 14 +- gallery_dl/extractor/gelbooru_v01.py | 143 ++++++++++++++++ gallery_dl/extractor/gelbooru_v02.py | 194 +++++++++++++++++++++ gallery_dl/extractor/hentaicafe.py | 103 +++++++++-- gallery_dl/extractor/hentainexus.py | 10 +- gallery_dl/extractor/idolcomplex.py | 15 +- gallery_dl/extractor/imgur.py | 2 - gallery_dl/extractor/instagram.py | 144 +++++++++------- gallery_dl/extractor/komikcast.py | 2 +- gallery_dl/extractor/mangadex.py | 8 +- gallery_dl/extractor/mastodon.py | 216 ++++++++++------------- gallery_dl/extractor/message.py | 4 +- gallery_dl/extractor/moebooru.py | 245 +++++++++++++------------- gallery_dl/extractor/naverwebtoon.py | 128 ++++++++++++++ gallery_dl/extractor/oauth.py | 80 +++------ gallery_dl/extractor/patreon.py | 9 +- gallery_dl/extractor/pixiv.py | 5 +- gallery_dl/extractor/reactor.py | 23 +-- gallery_dl/extractor/readcomiconline.py | 5 +- gallery_dl/extractor/sankakucomplex.py | 11 +- gallery_dl/extractor/shopify.py | 79 ++++----- gallery_dl/extractor/tumblrgallery.py | 149 ++++++++++++++++ gallery_dl/extractor/twitter.py | 80 +++++++-- gallery_dl/extractor/unsplash.py | 6 +- gallery_dl/extractor/wallhaven.py | 146 +++++++++++----- gallery_dl/job.py | 71 +++++--- gallery_dl/option.py | 16 +- gallery_dl/postprocessor/__init__.py | 14 +- gallery_dl/postprocessor/exec.py | 6 +- gallery_dl/postprocessor/metadata.py | 6 +- gallery_dl/util.py | 44 +++-- gallery_dl/version.py | 2 +- 47 files changed, 1978 insertions(+), 1410 deletions(-) delete mode 100644 gallery_dl/cloudflare.py create mode 100644 gallery_dl/extractor/cyberdrop.py create mode 100644 gallery_dl/extractor/gelbooru_v01.py create mode 100644 gallery_dl/extractor/gelbooru_v02.py create mode 100644 gallery_dl/extractor/naverwebtoon.py create mode 100644 gallery_dl/extractor/tumblrgallery.py (limited to 'gallery_dl') diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 6c2c713..c1f80b6 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -196,7 +196,7 @@ def main(): cnt, "entry" if cnt == 1 else "entries", cache._path(), ) else: - if not args.urls and not args.inputfile: + if not args.urls and not args.inputfiles: parser.error( "The following arguments are required: URL\n" "Use 'gallery-dl --help' to get a list of all options.") @@ -208,18 +208,19 @@ def main(): jobtype = args.jobtype or job.DownloadJob urls = args.urls - if args.inputfile: - try: - if args.inputfile == "-": - if sys.stdin: - urls += parse_inputfile(sys.stdin, log) + if args.inputfiles: + for inputfile in args.inputfiles: + try: + if inputfile == "-": + if sys.stdin: + urls += parse_inputfile(sys.stdin, log) + else: + log.warning("input file: stdin is not readable") else: - log.warning("input file: stdin is not readable") - else: - with open(args.inputfile, encoding="utf-8") as file: - urls += parse_inputfile(file, log) - except OSError as exc: - log.warning("input file: %s", exc) + with open(inputfile, encoding="utf-8") as file: + urls += parse_inputfile(file, log) + except OSError as exc: + log.warning("input file: %s", exc) # unsupported file logging handler handler = output.setup_logging_handler( diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py deleted file mode 100644 index 0f49d61..0000000 --- a/gallery_dl/cloudflare.py +++ /dev/null @@ -1,201 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2015-2020 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Methods to access sites behind Cloudflare protection""" - -import time -import operator -import collections -import urllib.parse -from xml.etree import ElementTree -from . import text -from .cache import memcache - - -def is_challenge(response): - return (response.status_code == 503 and - response.headers.get("Server", "").startswith("cloudflare") and - b"jschl-answer" in response.content) - - -def is_captcha(response): - return (response.status_code == 403 and - b'name="captcha-bypass"' in response.content) - - -def solve_challenge(session, response, kwargs): - """Solve Cloudflare challenge and get cfclearance cookie""" - parsed = urllib.parse.urlsplit(response.url) - root = parsed.scheme + "://" + parsed.netloc - page = response.text - - cf_kwargs = {} - headers = cf_kwargs["headers"] = collections.OrderedDict() - params = cf_kwargs["data"] = collections.OrderedDict() - headers["Referer"] = response.url - - form = text.extract(page, 'id="challenge-form"', '')[0] - for element in ElementTree.fromstring( - "" + form + "").findall("input"): - name = element.attrib.get("name") - if not name: - continue - if name == "jschl_answer": - try: - value = solve_js_challenge(page, parsed.netloc) - except Exception: - return response, None, None - else: - value = element.attrib.get("value") - params[name] = value - - try: - params = {"ray": text.extract(page, '?ray=', '"')[0]} - - url = root + "/cdn-cgi/images/trace/jschal/nojs/transparent.gif" - session.request("GET", url, params=params) - - url = root + "/cdn-cgi/images/trace/jschal/js/nocookie/transparent.gif" - session.request("GET", url, params=params) - except Exception: - pass - - time.sleep(4) - url = root + text.unescape(text.extract(page, 'action="', '"')[0]) - cf_response = session.request("POST", url, **cf_kwargs) - - if cf_response.history: - initial_response = cf_response.history[0] - else: - initial_response = cf_response - - cookies = { - cookie.name: cookie.value - for cookie in initial_response.cookies - } - - if not cookies: - import logging - log = logging.getLogger("cloudflare") - log.debug("Headers:\n%s", initial_response.headers) - log.debug("Content:\n%s", initial_response.text) - return cf_response, None, None - - domain = next(iter(initial_response.cookies)).domain - cookies["__cfduid"] = response.cookies.get("__cfduid", "") - return cf_response, domain, cookies - - -def solve_js_challenge(page, netloc): - """Evaluate JS challenge in 'page' to get 'jschl_answer' value""" - - # build variable name - # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk - data, pos = text.extract_all(page, ( - ('var' , ',f, ', '='), - ('key' , '"' , '"'), - ('expr', ':' , '}'), - )) - variable = "{}.{}".format(data["var"], data["key"]) - vlength = len(variable) - - k = text.extract(page, "k = '", "'")[0] - - # evaluate the initial expression - solution = evaluate_expression(data["expr"], page, netloc) - - # iterator over all remaining expressions - # and combine their values in 'solution' - expressions = text.extract( - page, "'challenge-form');", "f.submit();", pos)[0] - for expr in expressions.split(";")[1:]: - - if expr.startswith(variable): - # select arithmetc function based on operator (+/-/*) - func = OPERATORS[expr[vlength]] - # evaluate the rest of the expression - value = evaluate_expression(expr[vlength+2:], page, netloc, k) - # combine expression value with our current solution - solution = func(solution, value) - - elif expr.startswith("a.value"): - if "t.length)" in expr: - # add length of hostname - solution += len(netloc) - if ".toFixed(" in expr: - # trim solution to 10 decimal places - solution = "{:.10f}".format(solution) - return solution - - elif expr.startswith("k+="): - k += str(evaluate_expression(expr[3:], page, netloc)) - - -def evaluate_expression(expr, page, netloc, k=""): - """Evaluate a single Javascript expression for the challenge""" - - if expr.startswith("function(p)"): - # get HTML element with ID k and evaluate the expression inside - # 'eval(eval("document.getElementById(k).innerHTML"))' - expr = text.extract(page, 'id="'+k+'"', '<')[0] - return evaluate_expression(expr.partition(">")[2], page, netloc) - - if "/" in expr: - # split the expression in numerator and denominator subexpressions, - # evaluate them separately, - # and return their fraction-result - num, _, denom = expr.partition("/") - num = evaluate_expression(num, page, netloc) - denom = evaluate_expression(denom, page, netloc) - return num / denom - - if "function(p)" in expr: - # split initial expression and function code - initial, _, func = expr.partition("function(p)") - # evaluate said expression - initial = evaluate_expression(initial, page, netloc) - # get function argument and use it as index into 'netloc' - index = evaluate_expression(func[func.index("}")+1:], page, netloc) - return initial + ord(netloc[int(index)]) - - # iterate over all subexpressions, - # evaluate them, - # and accumulate their values in 'result' - result = "" - for subexpr in expr.strip("+()").split(")+("): - value = 0 - for part in subexpr.split("+"): - if "-" in part: - p1, _, p2 = part.partition("-") - value += VALUES[p1] - VALUES[p2] - else: - value += VALUES[part] - result += str(value) - return int(result) - - -OPERATORS = { - "+": operator.add, - "-": operator.sub, - "*": operator.mul, -} - - -VALUES = { - "": 0, - "!": 1, - "[]": 0, - "!![]": 1, - "(!![]": 1, - "(!![])": 1, -} - - -@memcache(keyarg=0) -def cookies(category): - return None diff --git a/gallery_dl/downloader/__init__.py b/gallery_dl/downloader/__init__.py index 6fb09e1..e1b936e 100644 --- a/gallery_dl/downloader/__init__.py +++ b/gallery_dl/downloader/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,8 +8,6 @@ """Downloader modules""" -import importlib - modules = [ "http", "text", @@ -24,22 +22,22 @@ def find(scheme): except KeyError: pass - klass = None + cls = None if scheme == "https": scheme = "http" if scheme in modules: # prevent unwanted imports try: - module = importlib.import_module("." + scheme, __package__) + module = __import__(scheme, globals(), None, (), 1) except ImportError: pass else: - klass = module.__downloader__ + cls = module.__downloader__ if scheme == "http": - _cache["http"] = _cache["https"] = klass + _cache["http"] = _cache["https"] = cls else: - _cache[scheme] = klass - return klass + _cache[scheme] = cls + return cls # -------------------------------------------------------------------- diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 8d72dc2..bc42d7c 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -31,6 +31,7 @@ class HttpDownloader(DownloaderBase): self.downloading = False self.adjust_extension = self.config("adjust-extensions", True) + self.headers = self.config("headers") self.minsize = self.config("filesize-min") self.maxsize = self.config("filesize-max") self.retries = self.config("retries", extractor._retries) @@ -93,13 +94,16 @@ class HttpDownloader(DownloaderBase): time.sleep(tries) tries += 1 - headers = {} + headers = {"Accept": "*/*"} file_header = None # check for .part file file_size = pathfmt.part_size() if file_size: headers["Range"] = "bytes={}-".format(file_size) + # general headers + if self.headers: + headers.update(self.headers) # file-specific headers extra = pathfmt.kwdict.get("_http_headers") if extra: diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index 8086b5d..e116188 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2020 Mike Fährmann +# Copyright 2018-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,7 +8,6 @@ """Downloader module for URLs requiring youtube-dl support""" -from youtube_dl import YoutubeDL, DEFAULT_OUTTMPL from .common import DownloaderBase from .. import text import os @@ -16,8 +15,14 @@ import os class YoutubeDLDownloader(DownloaderBase): scheme = "ytdl" + module = None def __init__(self, job): + module = self.module + if not module: + module_name = self.config("module") or "youtube_dl" + module = YoutubeDLDownloader.module = __import__(module_name) + DownloaderBase.__init__(self, job) extractor = job.extractor @@ -42,10 +47,11 @@ class YoutubeDLDownloader(DownloaderBase): options["logger"] = self.log self.forward_cookies = self.config("forward-cookies", False) - outtmpl = self.config("outtmpl") - self.outtmpl = DEFAULT_OUTTMPL if outtmpl == "default" else outtmpl + self.outtmpl = self.config("outtmpl") + if self.outtmpl == "default": + self.outtmpl = module.DEFAULT_OUTTMPL - self.ytdl = YoutubeDL(options) + self.ytdl = module.YoutubeDL(options) def download(self, url, pathfmt): if self.forward_cookies: diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 81b11fd..aa0e8ad 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -50,6 +50,8 @@ class _500pxExtractor(Extractor): def _extend(self, edges): """Extend photos with additional metadata and higher resolution URLs""" + ids = [str(edge["node"]["legacyId"]) for edge in edges] + url = "https://api.500px.com/v1/photos" params = { "expanded_user_info" : "true", @@ -62,14 +64,14 @@ class _500pxExtractor(Extractor): "liked_by" : "1", "following_sample" : "100", "image_size" : "4096", - "ids" : ",".join( - str(edge["node"]["legacyId"]) for edge in edges), + "ids" : ",".join(ids), } - data = self._request_api(url, params)["photos"] + photos = self._request_api(url, params)["photos"] return [ - data[str(edge["node"]["legacyId"])] - for edge in edges + photos[pid] for pid in ids + if pid in photos or + self.log.warning("Unable to fetch photo %s", pid) ] def _request_api(self, url, params, csrf_token=None): @@ -142,6 +144,10 @@ class _500pxGalleryExtractor(_500pxExtractor): "user": dict, }, }), + # unavailable photos (#1335) + ("https://500px.com/p/Light_Expression_Photography/galleries/street", { + "count": 0, + }), ("https://500px.com/fashvamp/galleries/lera"), ) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 923a78b..57794d0 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -7,7 +7,6 @@ # published by the Free Software Foundation. import re -import importlib modules = [ "2chan", @@ -23,6 +22,7 @@ modules = [ "bcy", "behance", "blogger", + "cyberdrop", "danbooru", "derpibooru", "deviantart", @@ -35,6 +35,8 @@ modules = [ "furaffinity", "fuskator", "gelbooru", + "gelbooru_v01", + "gelbooru_v02", "gfycat", "hbrowse", "hentai2read", @@ -76,6 +78,7 @@ modules = [ "myhentaigallery", "myportfolio", "naver", + "naverwebtoon", "newgrounds", "ngomik", "nhentai", @@ -111,6 +114,7 @@ modules = [ "subscribestar", "tsumino", "tumblr", + "tumblrgallery", "twitter", "unsplash", "vanillarock", @@ -182,11 +186,12 @@ def _list_classes(): """Yield all available extractor classes""" yield from _cache + globals_ = globals() for module_name in _module_iter: - module = importlib.import_module("."+module_name, __package__) + module = __import__(module_name, globals_, None, (), 1) yield from add_module(module) - globals()["_list_classes"] = lambda : _cache + globals_["_list_classes"] = lambda : _cache def _get_classes(module): diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index 64cde80..c3cf3f7 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020 Mike Fährmann +# Copyright 2015-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,16 +8,12 @@ """Extractors for *booru sites""" -from .common import Extractor, Message, generate_extractors -from .. import text, util, exception - -from xml.etree import ElementTree -import collections +from .common import BaseExtractor, Message +from .. import text import operator -import re -class BooruExtractor(Extractor): +class BooruExtractor(BaseExtractor): """Base class for *booru extractors""" basecategory = "booru" filename_fmt = "{category}_{id}_{md5}.{extension}" @@ -66,191 +62,8 @@ class BooruExtractor(Extractor): _file_url = operator.itemgetter("file_url") - @staticmethod - def _prepare(post): - post["date"] = text.parse_datetime( - post["created_at"], "%a %b %d %H:%M:%S %z %Y") + def _prepare(self, post): + """Prepare the 'post's metadata""" def _extended_tags(self, post, page=None): - if not page: - url = "{}/index.php?page=post&s=view&id={}".format( - self.root, post["id"]) - page = self.request(url).text - html = text.extract(page, '