diff options
| author | 2020-06-01 23:11:37 -0400 | |
|---|---|---|
| committer | 2020-06-01 23:11:37 -0400 | |
| commit | a70a3246927b72f1ded37acd55ee719515441b5b (patch) | |
| tree | 57f0d3ab0b1387b665325f42a24b8aab63cbce07 /gallery_dl | |
| parent | 90e50db2e3c38f523bb5195d295290b06e5cedb0 (diff) | |
New upstream version 1.14.0.upstream/1.14.0
Diffstat (limited to 'gallery_dl')
45 files changed, 872 insertions, 304 deletions
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py index 6cde65d..3886091 100644 --- a/gallery_dl/cache.py +++ b/gallery_dl/cache.py @@ -57,7 +57,7 @@ class MemoryCacheDecorator(CacheDecorator): value, expires = self.cache[key] except KeyError: expires = 0 - if expires < timestamp: + if expires <= timestamp: value = self.func(*args, **kwargs) expires = timestamp + self.maxage self.cache[key] = value, expires @@ -189,25 +189,26 @@ def clear(): def _path(): - path = config.get(("cache",), "file", -1) - if path != -1: + path = config.get(("cache",), "file", util.SENTINEL) + if path is not util.SENTINEL: return util.expand_path(path) - if os.name == "nt": - import tempfile - return os.path.join(tempfile.gettempdir(), ".gallery-dl.cache") + if util.WINDOWS: + cachedir = os.environ.get("APPDATA", "~") + else: + cachedir = os.environ.get("XDG_CACHE_HOME", "~/.cache") - cachedir = util.expand_path(os.path.join( - os.environ.get("XDG_CACHE_HOME", "~/.cache"), "gallery-dl")) + cachedir = util.expand_path(os.path.join(cachedir, "gallery-dl")) os.makedirs(cachedir, exist_ok=True) return os.path.join(cachedir, "cache.sqlite3") try: dbfile = _path() - if os.name != "nt": - # restrict access permissions for new db files - os.close(os.open(dbfile, os.O_CREAT | os.O_RDONLY, 0o600)) + + # restrict access permissions for new db files + os.close(os.open(dbfile, os.O_CREAT | os.O_RDONLY, 0o600)) + DatabaseCacheDecorator.db = sqlite3.connect( dbfile, timeout=30, check_same_thread=False) except (OSError, TypeError, sqlite3.OperationalError): diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py index 43ccdeb..0cf5a57 100644 --- a/gallery_dl/cloudflare.py +++ b/gallery_dl/cloudflare.py @@ -8,11 +8,11 @@ """Methods to access sites behind Cloudflare protection""" -import re import time import operator import collections import urllib.parse +from xml.etree import ElementTree from . import text from .cache import memcache @@ -41,12 +41,16 @@ def solve_challenge(session, response, kwargs): url = root + text.unescape(text.extract(page, 'action="', '"')[0]) headers["Referer"] = response.url - for inpt in text.extract_iter(page, "<input ", ">"): - name = text.extract(inpt, 'name="', '"')[0] + form = text.extract(page, 'id="challenge-form"', '</form>')[0] + for element in ElementTree.fromstring( + "<f>" + form + "</f>").findall("input"): + name = element.attrib.get("name") + if not name: + continue if name == "jschl_answer": value = solve_js_challenge(page, parsed.netloc) else: - value = text.unescape(text.extract(inpt, 'value="', '"')[0]) + value = element.attrib.get("value") params[name] = value time.sleep(4) @@ -84,6 +88,8 @@ def solve_js_challenge(page, netloc): variable = "{}.{}".format(data["var"], data["key"]) vlength = len(variable) + k = text.extract(page, "k = '", "'")[0] + # evaluate the initial expression solution = evaluate_expression(data["expr"], page, netloc) @@ -97,7 +103,7 @@ def solve_js_challenge(page, netloc): # select arithmetc function based on operator (+/-/*) func = OPERATORS[expr[vlength]] # evaluate the rest of the expression - value = evaluate_expression(expr[vlength+2:], page, netloc) + value = evaluate_expression(expr[vlength+2:], page, netloc, k) # combine expression value with our current solution solution = func(solution, value) @@ -110,17 +116,18 @@ def solve_js_challenge(page, netloc): solution = "{:.10f}".format(solution) return solution + elif expr.startswith("k+="): + k += str(evaluate_expression(expr[3:], page, netloc)) + -def evaluate_expression(expr, page, netloc, *, - split_re=re.compile(r"[(+]+([^)]*)\)")): +def evaluate_expression(expr, page, netloc, k=""): """Evaluate a single Javascript expression for the challenge""" if expr.startswith("function(p)"): # get HTML element with ID k and evaluate the expression inside # 'eval(eval("document.getElementById(k).innerHTML"))' - k, pos = text.extract(page, "k = '", "'") - e, pos = text.extract(page, 'id="'+k+'"', '<') - return evaluate_expression(e.partition(">")[2], page, netloc) + expr = text.extract(page, 'id="'+k+'"', '<')[0] + return evaluate_expression(expr.partition(">")[2], page, netloc) if "/" in expr: # split the expression in numerator and denominator subexpressions, diff --git a/gallery_dl/config.py b/gallery_dl/config.py index c2787ad..5303616 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -22,8 +22,9 @@ log = logging.getLogger("config") _config = {} -if os.name == "nt": +if util.WINDOWS: _default_configs = [ + r"%APPDATA%\gallery-dl\config.json", r"%USERPROFILE%\gallery-dl\config.json", r"%USERPROFILE%\gallery-dl.conf", ] @@ -139,7 +140,6 @@ def unset(path, key, *, conf=_config): class apply(): """Context Manager: apply a collection of key-value pairs""" - _sentinel = object() def __init__(self, kvlist): self.original = [] @@ -147,12 +147,12 @@ class apply(): def __enter__(self): for path, key, value in self.kvlist: - self.original.append((path, key, get(path, key, self._sentinel))) + self.original.append((path, key, get(path, key, util.SENTINEL))) set(path, key, value) def __exit__(self, etype, value, traceback): for path, key, value in self.original: - if value is self._sentinel: + if value is util.SENTINEL: unset(path, key) else: set(path, key, value) diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py index eca1284..d858075 100644 --- a/gallery_dl/downloader/common.py +++ b/gallery_dl/downloader/common.py @@ -9,7 +9,6 @@ """Common classes and constants used by downloader modules.""" import os -import logging from .. import config, util @@ -17,15 +16,12 @@ class DownloaderBase(): """Base class for downloaders""" scheme = "" - def __init__(self, extractor, output): - self.session = extractor.session - self.out = output + def __init__(self, job): + self.out = job.out + self.session = job.extractor.session self.part = self.config("part", True) self.partdir = self.config("part-directory") - - self.log = logging.getLogger("downloader." + self.scheme) - self.log.job = extractor.log.job - self.log.extractor = extractor + self.log = job.get_logger("downloader." + self.scheme) if self.partdir: self.partdir = util.expand_path(self.partdir) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 021dc16..6644827 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -24,16 +24,18 @@ except ImportError: class HttpDownloader(DownloaderBase): scheme = "http" - def __init__(self, extractor, output): - DownloaderBase.__init__(self, extractor, output) + def __init__(self, job): + DownloaderBase.__init__(self, job) + extractor = job.extractor + self.chunk_size = 16384 + self.downloading = False + self.adjust_extension = self.config("adjust-extensions", True) self.retries = self.config("retries", extractor._retries) self.timeout = self.config("timeout", extractor._timeout) self.verify = self.config("verify", extractor._verify) self.mtime = self.config("mtime", True) self.rate = self.config("rate") - self.downloading = False - self.chunk_size = 16384 if self.retries < 0: self.retries = float("inf") diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py index fe6c4bc..c3dd863 100644 --- a/gallery_dl/downloader/ytdl.py +++ b/gallery_dl/downloader/ytdl.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,8 +17,9 @@ import os class YoutubeDLDownloader(DownloaderBase): scheme = "ytdl" - def __init__(self, extractor, output): - DownloaderBase.__init__(self, extractor, output) + def __init__(self, job): + DownloaderBase.__init__(self, job) + extractor = job.extractor retries = self.config("retries", extractor._retries) options = { @@ -35,7 +36,7 @@ class YoutubeDLDownloader(DownloaderBase): if self.config("logging", True): options["logger"] = self.log - self.forward_cookies = self.config("forward-cookies", True) + self.forward_cookies = self.config("forward-cookies", False) outtmpl = self.config("outtmpl") self.outtmpl = DEFAULT_OUTTMPL if outtmpl == "default" else outtmpl @@ -70,6 +71,10 @@ class YoutubeDLDownloader(DownloaderBase): if "url" in info_dict: text.nameext_from_url(info_dict["url"], pathfmt.kwdict) + formats = info_dict.get("requested_formats") + if formats and not compatible_formats(formats): + info_dict["ext"] = "mkv" + if self.outtmpl: self.ytdl.params["outtmpl"] = self.outtmpl pathfmt.filename = filename = self.ytdl.prepare_filename(info_dict) @@ -105,4 +110,15 @@ class YoutubeDLDownloader(DownloaderBase): return True +def compatible_formats(formats): + video_ext = formats[0].get("ext") + audio_ext = formats[1].get("ext") + + if video_ext == "webm" and audio_ext == "webm": + return True + + exts = ("mp3", "mp4", "m4a", "m4p", "m4b", "m4r", "m4v", "ismv", "isma") + return video_ext in exts and audio_ext in exts + + __downloader__ = YoutubeDLDownloader diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 85fbddb..561b484 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -48,6 +48,7 @@ modules = [ "hypnohub", "idolcomplex", "imagebam", + "imagechest", "imagefap", "imgbb", "imgbox", @@ -94,6 +95,7 @@ modules = [ "readcomiconline", "realbooru", "reddit", + "redgifs", "rule34", "safebooru", "sankaku", @@ -113,6 +115,7 @@ modules = [ "vsco", "wallhaven", "warosu", + "webtoons", "weibo", "wikiart", "xhamster", diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 3a282c2..dd685df 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -40,6 +40,7 @@ class Extractor(): self._cookiefile = None self._cookiejar = self.session.cookies self._parentdir = "" + self._write_pages = self.config("write-pages", False) self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) @@ -91,6 +92,8 @@ class Extractor(): raise exception.HttpError(exc) else: code = response.status_code + if self._write_pages: + self._dump_response(response) if 200 <= code < 400 or fatal is None and \ (400 <= code < 500) or not fatal and \ (400 <= code < 429 or 431 <= code < 500): @@ -325,6 +328,33 @@ class Extractor(): test = (test, None) yield test + def _dump_response(self, response): + """Write the response content to a .dump file in the current directory. + + The file name is derived from the response url, + replacing special characters with "_" + """ + for resp in response.history: + self._dump_response(resp) + + if hasattr(Extractor, "_dump_index"): + Extractor._dump_index += 1 + else: + Extractor._dump_index = 1 + Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub + + fname = "{:>02}_{}".format( + Extractor._dump_index, + Extractor._dump_sanitize('_', response.url) + )[:250] + + try: + with open(fname + ".dump", 'wb') as fp: + util.dump_response(response, fp) + except Exception as e: + self.log.warning("Failed to dump HTTP request (%s: %s)", + e.__class__.__name__, e) + class GalleryExtractor(Extractor): @@ -460,7 +490,7 @@ class SharedConfigMixin(): """Enable sharing of config settings based on 'basecategory'""" basecategory = "" - def config(self, key, default=None, *, sentinel=object()): + def config(self, key, default=None, *, sentinel=util.SENTINEL): value = Extractor.config(self, key, sentinel) return value if value is not sentinel else config.interpolate( ("extractor", self.basecategory, self.subcategory), key, default) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index 3a0d0ef..e0edf89 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -32,7 +32,7 @@ class DanbooruExtractor(SharedConfigMixin, Extractor): def __init__(self, match): super().__init__(match) self.root = "https://{}.donmai.us".format(match.group(1)) - self.ugoira = self.config("ugoira", True) + self.ugoira = self.config("ugoira", False) self.params = {} username, api_key = self._get_auth_info() @@ -156,8 +156,8 @@ class DanbooruPostExtractor(DanbooruExtractor): "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", }), ("https://danbooru.donmai.us/posts/3613024", { - "pattern": r"https?://.+\.webm$", - "options": (("ugoira", False),) + "pattern": r"https?://.+\.zip$", + "options": (("ugoira", True),) }) ) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 2631052..cda357a 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -126,8 +126,9 @@ class DeviantartExtractor(Extractor): if self.extra: for match in DeviantartStashExtractor.pattern.finditer( deviation.get("description", "")): + url = text.ensure_http_scheme(match.group(0)) deviation["_extractor"] = DeviantartStashExtractor - yield Message.Queue, match.group(0), deviation + yield Message.Queue, url, deviation def deviations(self): """Return an iterable containing all relevant Deviation-objects""" @@ -849,9 +850,12 @@ class DeviantartOAuthAPI(): self.client_secret = extractor.config( "client-secret", self.CLIENT_SECRET) - self.refresh_token = extractor.config("refresh-token") - if self.refresh_token == "cache": - self.refresh_token = "#" + str(self.client_id) + token = extractor.config("refresh-token") + if token is None or token == "cache": + token = "#" + str(self.client_id) + if not _refresh_token_cache(token): + token = None + self.refresh_token_key = token self.log.debug( "Using %s API credentials (client-id %s)", @@ -904,7 +908,7 @@ class DeviantartOAuthAPI(): """Get extended content of a single Deviation""" endpoint = "deviation/content" params = {"deviationid": deviation_id} - return self._call(endpoint, params) + return self._call(endpoint, params, public=False) def deviation_download(self, deviation_id): """Get the original file download (if allowed)""" @@ -951,18 +955,19 @@ class DeviantartOAuthAPI(): endpoint = "user/profile/" + username return self._call(endpoint, fatal=False) - def authenticate(self, refresh_token): + def authenticate(self, refresh_token_key): """Authenticate the application by requesting an access token""" - self.headers["Authorization"] = self._authenticate_impl(refresh_token) + self.headers["Authorization"] = \ + self._authenticate_impl(refresh_token_key) @cache(maxage=3600, keyarg=1) - def _authenticate_impl(self, refresh_token): + def _authenticate_impl(self, refresh_token_key): """Actual authenticate implementation""" url = "https://www.deviantart.com/oauth2/token" - if refresh_token: + if refresh_token_key: self.log.info("Refreshing private access token") data = {"grant_type": "refresh_token", - "refresh_token": _refresh_token_cache(refresh_token)} + "refresh_token": _refresh_token_cache(refresh_token_key)} else: self.log.info("Requesting public access token") data = {"grant_type": "client_credentials"} @@ -976,8 +981,9 @@ class DeviantartOAuthAPI(): self.log.debug("Server response: %s", data) raise exception.AuthenticationError('"{}" ({})'.format( data.get("error_description"), data.get("error"))) - if refresh_token: - _refresh_token_cache.update(refresh_token, data["refresh_token"]) + if refresh_token_key: + _refresh_token_cache.update( + refresh_token_key, data["refresh_token"]) return "Bearer " + data["access_token"] def _call(self, endpoint, params=None, fatal=True, public=True): @@ -987,7 +993,7 @@ class DeviantartOAuthAPI(): if self.delay >= 0: time.sleep(2 ** self.delay) - self.authenticate(None if public else self.refresh_token) + self.authenticate(None if public else self.refresh_token_key) response = self.extractor.request( url, headers=self.headers, params=params, fatal=None) data = response.json() @@ -1023,7 +1029,7 @@ class DeviantartOAuthAPI(): if extend: if public and len(data["results"]) < params["limit"]: - if self.refresh_token: + if self.refresh_token_key: self.log.debug("Switching to private access token") public = False continue @@ -1154,9 +1160,11 @@ class DeviantartEclipseAPI(): return text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ') -@cache(maxage=10*365*24*3600, keyarg=0) -def _refresh_token_cache(original_token, new_token=None): - return new_token or original_token +@cache(maxage=100*365*24*3600, keyarg=0) +def _refresh_token_cache(token): + if token and token[0] == "#": + return None + return token ############################################################################### diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 0c05a97..612c742 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2019 Mike Fährmann +# Copyright 2014-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,7 @@ from . import booru from .common import Message -from .. import text, util +from .. import text class GelbooruExtractor(booru.XmlParserMixin, @@ -31,6 +31,7 @@ class GelbooruExtractor(booru.XmlParserMixin, else: self.items = self.items_noapi self.session.cookies["fringeBenefits"] = "yup" + self.per_page = 42 def items_noapi(self): yield Message.Version, 1 @@ -46,6 +47,19 @@ class GelbooruExtractor(booru.XmlParserMixin, def get_posts(self): """Return an iterable containing all relevant post objects""" + url = "https://gelbooru.com/index.php?page=post&s=list" + params = { + "tags": self.params["tags"], + "pid" : self.page_start * self.per_page + } + + while True: + page = self.request(url, params=params).text + ids = list(text.extract_iter(page, '<a id="p', '"')) + yield from ids + if len(ids) < self.per_page: + return + params["pid"] += self.per_page def get_post_data(self, post_id): """Extract metadata of a single post""" @@ -88,34 +102,20 @@ class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor): }), ) - def __init__(self, match): - super().__init__(match) - if not self.use_api: - self.per_page = 42 - - def get_posts(self): - url = "https://gelbooru.com/index.php?page=post&s=list" - params = {"tags": self.tags, "pid": self.page_start * self.per_page} - while True: - page = self.request(url, params=params).text - ids = list(text.extract_iter(page, '<a id="p', '"')) - yield from ids - if len(ids) < self.per_page: - return - params["pid"] += self.per_page - - -class GelbooruPoolExtractor(booru.GelbooruPoolMixin, GelbooruExtractor): +class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor): """Extractor for image-pools from gelbooru.com""" pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" r"\?page=pool&s=show&id=(?P<pool>\d+)") - test = ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { - "count": 6, - }) - - def get_posts(self): - return util.advance(self.posts, self.page_start) + test = ( + ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { + "count": 6, + }), + ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { + "options": (("api", False),), + "count": 6, + }), + ) class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor): diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py index ef64942..aa41836 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -51,20 +51,38 @@ class HentainexusGalleryExtractor(GalleryExtractor): "description": rmve(extr('viewcolumn">Description</td>', '</td>')), } data["lang"] = util.language_to_code(data["language"]) - data["type"] = "Doujinshi" if 'doujin' in data["tags"] else "Manga" - data["title_conventional"] = self.join_title( - data["event"], - data["circle"], - data["artist"], - data["title"], - data["parody"], - data["book"], - data["magazine"], - ) + if 'doujin' in data['tags']: + data['type'] = 'Doujinshi' + elif 'illustration' in data['tags']: + data['type'] = 'Illustration' + else: + data['type'] = 'Manga' + data["title_conventional"] = self._join_title(data) return data + def images(self, page): + url = "{}/read/{}".format(self.root, self.gallery_id) + extr = text.extract_from(self.request(url).text) + urls = extr("initReader(", "]") + "]" + return [(url, None) for url in json.loads(urls)] + @staticmethod - def join_title(event, circle, artist, title, parody, book, magazine): + def _join_title(data): + event = data['event'] + artist = data['artist'] + circle = data['circle'] + title = data['title'] + parody = data['parody'] + book = data['book'] + magazine = data['magazine'] + + # a few galleries have a large number of artists or parodies, + # which get replaced with "Various" in the title string + if artist.count(',') >= 3: + artist = 'Various' + if parody.count(',') >= 3: + parody = 'Various' + jt = '' if event: jt += '({}) '.format(event) @@ -81,12 +99,6 @@ class HentainexusGalleryExtractor(GalleryExtractor): jt += ' ({})'.format(magazine) return jt - def images(self, page): - url = "{}/read/{}".format(self.root, self.gallery_id) - extr = text.extract_from(self.request(url).text) - urls = extr("initReader(", "]") + "]" - return [(url, None) for url in json.loads(urls)] - class HentainexusSearchExtractor(Extractor): """Extractor for search results on hentainexus.com""" diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 3883445..1c53723 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -14,6 +14,9 @@ from ..cache import memcache import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net|info)" + + class HiperdexBase(): """Base class for hiperdex extractors""" category = "hiperdex" @@ -61,11 +64,10 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): """Extractor for manga chapters from hiperdex.com""" - pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net)" - r"(/manga/([^/?&#]+)/([^/?&#]+))") + pattern = BASE_PATTERN + r"(/manga/([^/?&#]+)/([^/?&#]+))" test = ( ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", { - "pattern": r"https://hiperdex.com/wp-content/uploads" + "pattern": r"https://hiperdex.(com|net|info)/wp-content/uploads" r"/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp", "count": 9, "keyword": { @@ -82,6 +84,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): }, }), ("https://hiperdex.net/manga/domestic-na-kanojo/154-5/"), + ("https://hiperdex.info/manga/domestic-na-kanojo/154-5/"), ) def __init__(self, match): @@ -102,8 +105,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): """Extractor for manga from hiperdex.com""" chapterclass = HiperdexChapterExtractor - pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net)" - r"(/manga/([^/?&#]+))/?$") + pattern = BASE_PATTERN + r"(/manga/([^/?&#]+))/?$" test = ( ("https://hiperdex.com/manga/youre-not-that-special/", { "count": 51, @@ -123,6 +125,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): }, }), ("https://hiperdex.net/manga/youre-not-that-special/"), + ("https://hiperdex.info/manga/youre-not-that-special/"), ) def __init__(self, match): @@ -154,11 +157,11 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor): categorytransfer = False chapterclass = HiperdexMangaExtractor reverse = False - pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net)" - r"(/manga-a(?:rtist|uthor)/([^/?&#]+))") + pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/([^/?&#]+))" test = ( ("https://hiperdex.com/manga-artist/beck-ho-an/"), ("https://hiperdex.net/manga-artist/beck-ho-an/"), + ("https://hiperdex.info/manga-artist/beck-ho-an/"), ("https://hiperdex.com/manga-author/viagra/", { "pattern": HiperdexMangaExtractor.pattern, "count": ">= 6", diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py new file mode 100644 index 0000000..a1ba0c3 --- /dev/null +++ b/gallery_dl/extractor/imagechest.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Leonid "Bepis" Pavel +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from galleries at https://imgchest.com/""" + +from .common import GalleryExtractor +from .. import text, exception + + +class ImagechestGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from imgchest.com""" + category = "imagechest" + root = "https://imgchest.com" + pattern = r"(?:https?://)?(?:www\.)?imgchest\.com/p/([A-Za-z0-9]{11})" + test = ( + ("https://imgchest.com/p/3na7kr3by8d", { + "url": "f095b4f78c051e5a94e7c663814d1e8d4c93c1f7", + "content": "076959e65be30249a2c651fbe6090dc30ba85193", + "count": 3 + }), + ) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = self.root + "/p/" + self.gallery_id + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + if "Sorry, but the page you requested could not be found." in page: + raise exception.NotFoundError("gallery") + + return { + "gallery_id": self.gallery_id, + "title": text.unescape(text.extract( + page, 'property="og:title" content="', '"')[0].strip()) + } + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter( + page, 'property="og:image" content="', '"') + ] diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 0813ea9..44fa5f2 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -34,7 +34,11 @@ class ImgurExtractor(Extractor): except KeyError: pass - url = image["mp4"] if image["animated"] and self.mp4 else image["link"] + if image["animated"] and self.mp4 and "mp4" in image: + url = image["mp4"] + else: + url = image["link"] + image["date"] = text.parse_timestamp(image["datetime"]) text.nameext_from_url(url, image) @@ -100,6 +104,9 @@ class ImgurImageExtractor(ImgurExtractor): ("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1' "url": "ec2cf11a2bfb4939feff374781a6e6f3e9af8e8e", }), + ("https://imgur.com/1Nily2P", { # animated png + "pattern": "https://i.imgur.com/1Nily2P.png", + }), ("https://imgur.com/zzzzzzz", { # not found "exception": exception.HttpError, }), @@ -130,7 +137,7 @@ class ImgurAlbumExtractor(ImgurExtractor): directory_fmt = ("{category}", "{album[id]}{album[title]:? - //}") filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}" archive_fmt = "{album[id]}_{id}" - pattern = BASE_PATTERN + r"/(?:a|t/unmuted)/(\w{7}|\w{5})" + pattern = BASE_PATTERN + r"/a/(\w{7}|\w{5})" test = ( ("https://imgur.com/a/TcBmP", { "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", @@ -192,9 +199,6 @@ class ImgurAlbumExtractor(ImgurExtractor): ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash "url": "695ef0c950023362a0163ee5041796300db76674", }), - ("https://imgur.com/t/unmuted/YMqBcua", { # unmuted URL - "url": "86b4747f8147cec7602f0214e267309af73a8655", - }), ("https://imgur.com/a/TcBmQ", { "exception": exception.HttpError, }), @@ -225,7 +229,7 @@ class ImgurAlbumExtractor(ImgurExtractor): class ImgurGalleryExtractor(ImgurExtractor): """Extractor for imgur galleries""" subcategory = "gallery" - pattern = BASE_PATTERN + r"/gallery/(\w{7}|\w{5})" + pattern = BASE_PATTERN + r"/(?:gallery|t/unmuted)/(\w{7}|\w{5})" test = ( ("https://imgur.com/gallery/zf2fIms", { # non-album gallery (#380) "pattern": "https://imgur.com/zf2fIms", @@ -233,6 +237,9 @@ class ImgurGalleryExtractor(ImgurExtractor): ("https://imgur.com/gallery/eD9CT", { "pattern": "https://imgur.com/a/eD9CT", }), + ("https://imgur.com/t/unmuted/26sEhNr", { # unmuted URL + "pattern": "https://imgur.com/26sEhNr", + }), ) def items(self): diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index ea39cab..3781711 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -14,6 +14,8 @@ from .. import text, exception from ..cache import cache import itertools import json +import time +import re class InstagramExtractor(Extractor): @@ -26,6 +28,10 @@ class InstagramExtractor(Extractor): cookiedomain = ".instagram.com" cookienames = ("sessionid",) + def __init__(self, match): + Extractor.__init__(self, match) + self._find_tags = re.compile(r'#\w+').findall + def get_metadata(self): return {} @@ -78,9 +84,10 @@ class InstagramExtractor(Extractor): url = self.root + "/accounts/login/ajax/" data = { "username" : username, - "password" : password, + "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format( + int(time.time()), password), "queryParams" : "{}", - "optIntoOneTap": "true", + "optIntoOneTap": "false", } response = self.request(url, method="POST", headers=headers, data=data) @@ -133,12 +140,24 @@ class InstagramExtractor(Extractor): 'fullname': media['owner']['full_name'], 'post_id': media['id'], 'post_shortcode': media['shortcode'], + 'post_url': url, 'description': text.parse_unicode_escapes('\n'.join( edge['node']['text'] for edge in media['edge_media_to_caption']['edges'] )), } + tags = self._find_tags(common['description']) + if tags: + common['tags'] = sorted(set(tags)) + + location = media['location'] + if location: + common['location_id'] = location['id'] + common['location_slug'] = location['slug'] + common['location_url'] = "{}/explore/locations/{}/{}/".format( + self.root, location['id'], location['slug']) + medias = [] if media['__typename'] == 'GraphSidecar': for num, edge in enumerate( @@ -156,6 +175,7 @@ class InstagramExtractor(Extractor): 'sidecar_media_id': media['id'], 'sidecar_shortcode': media['shortcode'], } + self._extract_tagged_users(children, media_data) media_data.update(common) medias.append(media_data) @@ -169,6 +189,7 @@ class InstagramExtractor(Extractor): 'height': text.parse_int(media['dimensions']['height']), 'width': text.parse_int(media['dimensions']['width']), } + self._extract_tagged_users(media, media_data) media_data.update(common) medias.append(media_data) @@ -189,12 +210,12 @@ class InstagramExtractor(Extractor): user_id = '"{}"'.format( shared_data['entry_data']['StoriesPage'][0]['user']['id']) highlight_id = '' - query_hash = 'cda12de4f7fd3719c0569ce03589f4c4' + query_hash = '0a85e6ea60a4c99edc58ab2f3d17cfdf' variables = ( '{{' '"reel_ids":[{}],"tag_names":[],"location_ids":[],' - '"highlight_reel_ids":[{}],"precomposed_overlay":true,' + '"highlight_reel_ids":[{}],"precomposed_overlay":false,' '"show_story_viewer_list":true,' '"story_viewer_fetch_count":50,"story_viewer_cursor":"",' '"stories_video_dash_manifest":false' @@ -250,7 +271,7 @@ class InstagramExtractor(Extractor): data = self._request_graphql( variables, - 'aec5501414615eca36a9acf075655b1e', + 'ad99dd9d3646cc3c0dda65debcd266a7', shared_data['config']['csrf_token'], ) @@ -305,6 +326,18 @@ class InstagramExtractor(Extractor): variables, psdf['query_hash'], csrf, ) + def _extract_tagged_users(self, src_media, dest_dict): + edges = src_media['edge_media_to_tagged_user']['edges'] + if edges: + dest_dict['tagged_users'] = tagged_users = [] + for edge in edges: + user = edge['node']['user'] + tagged_users.append({ + 'id' : user['id'], + 'username' : user['username'], + 'full_name': user['full_name'], + }) + class InstagramImageExtractor(InstagramExtractor): """Extractor for PostPage""" @@ -321,10 +354,15 @@ class InstagramImageExtractor(InstagramExtractor): "description": str, "height": int, "likes": int, + "location_id": "214424288", + "location_slug": "hong-kong", + "location_url": "re:/explore/locations/214424288/hong-kong/", "media_id": "1922949326347663701", "shortcode": "BqvsDleB3lV", "post_id": "1922949326347663701", "post_shortcode": "BqvsDleB3lV", + "post_url": "https://www.instagram.com/p/BqvsDleB3lV/", + "tags": ["#WHPsquares"], "typename": "GraphImage", "username": "instagram", "width": int, @@ -339,6 +377,7 @@ class InstagramImageExtractor(InstagramExtractor): "sidecar_shortcode": "BoHk1haB5tM", "post_id": "1875629777499953996", "post_shortcode": "BoHk1haB5tM", + "post_url": "https://www.instagram.com/p/BoHk1haB5tM/", "num": int, "likes": int, "username": "instagram", @@ -354,7 +393,9 @@ class InstagramImageExtractor(InstagramExtractor): "height": int, "likes": int, "media_id": "1923502432034620000", + "post_url": "https://www.instagram.com/p/Bqxp0VSBgJg/", "shortcode": "Bqxp0VSBgJg", + "tags": ["#ASMR"], "typename": "GraphVideo", "username": "instagram", "width": int, @@ -370,6 +411,7 @@ class InstagramImageExtractor(InstagramExtractor): "height": int, "likes": int, "media_id": "1806097553666903266", + "post_url": "https://www.instagram.com/p/BkQjCfsBIzi/", "shortcode": "BkQjCfsBIzi", "typename": "GraphVideo", "username": "instagram", @@ -381,11 +423,23 @@ class InstagramImageExtractor(InstagramExtractor): ("https://www.instagram.com/p/BtOvDOfhvRr/", { "count": 2, "keyword": { + "post_url": "https://www.instagram.com/p/BtOvDOfhvRr/", "sidecar_media_id": "1967717017113261163", "sidecar_shortcode": "BtOvDOfhvRr", "video_url": str, } - }) + }), + + # GraphImage with tagged user + ("https://www.instagram.com/p/B_2lf3qAd3y/", { + "keyword": { + "tagged_users": [{ + "id": "1246468638", + "username": "kaaymbl", + "full_name": "Call Me Kay", + }] + } + }), ) def __init__(self, match): @@ -476,7 +530,7 @@ class InstagramUserExtractor(InstagramExtractor): 'node_id': 'id', 'variables_id': 'id', 'edge_to_medias': 'edge_owner_to_timeline_media', - 'query_hash': 'f2405b236d85e8296cf30347c9f08c2a', + 'query_hash': '44efc15d3c13342d02df0b5a9fa3d33f', }) if self.config('highlights'): @@ -545,5 +599,5 @@ class InstagramTagExtractor(InstagramExtractor): 'node_id': 'name', 'variables_id': 'tag_name', 'edge_to_medias': 'edge_hashtag_to_media', - 'query_hash': 'f12c9ec5e46a3173b2969c712ad84744', + 'query_hash': '7dabc71d3e758b1ec19ffb85639e427b', }) diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 38c90df..72465f7 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -51,7 +51,7 @@ class MangadexChapterExtractor(MangadexExtractor): test = ( ("https://mangadex.org/chapter/122094", { "keyword": "ef1084c2845825979e150512fed8fdc209baf05a", - "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f", + "content": "50383a4c15124682057b197d40261641a98db514", }), # oneshot ("https://mangadex.cc/chapter/138086", { diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 4f0e38d..002c8f7 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -27,11 +27,9 @@ class MastodonExtractor(Extractor): Extractor.__init__(self, match) self.api = MastodonAPI(self) - def config(self, key, default=None, *, sentinel=object()): + def config(self, key, default=None, *, sentinel=util.SENTINEL): value = Extractor.config(self, key, sentinel) - if value is not sentinel: - return value - return config.interpolate( + return value if value is not sentinel else config.interpolate( ("extractor", "mastodon", self.instance, self.subcategory), key, default, ) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 17fe935..84794ad 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -224,10 +224,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor): self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format( self.user, match.group(3)) else: - url = match.group(0) - if not url.startswith("http"): - url = "https://" + url - self.post_url = url + self.post_url = text.ensure_http_scheme(match.group(0)) def posts(self): return (self.post_url,) @@ -414,6 +411,6 @@ class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor): @staticmethod def _extract_favorites(page): return [ - "https://" + user.rpartition('"')[2].lstrip("/:") + text.ensure_http_scheme(user.rpartition('"')[2]) for user in text.extract_iter(page, 'class="item-user', '"><img') ] diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index c06721c..c07c4b7 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -10,9 +10,8 @@ from .common import Extractor, Message from . import deviantart, flickr, reddit, smugmug, tumblr -from .. import text, oauth, config, exception +from .. import text, oauth, util, config, exception from ..cache import cache -import os import urllib.parse REDIRECT_URI_LOCALHOST = "http://localhost:6414/" @@ -27,6 +26,7 @@ class OAuthBase(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.client = None + self.cache = config.get(("extractor", self.category), "cache", True) def oauth_config(self, key, default=None): return config.interpolate( @@ -42,7 +42,7 @@ class OAuthBase(Extractor): server.listen(1) # workaround for ctrl+c not working during server.accept on Windows - if os.name == "nt": + if util.WINDOWS: server.settimeout(1.0) while True: try: @@ -87,12 +87,20 @@ class OAuthBase(Extractor): # exchange the request token for an access token data = self.session.get(access_token_url, params=data).text - data = text.parse_query(data) - self.send(OAUTH1_MSG_TEMPLATE.format( - category=self.subcategory, - token=data["oauth_token"], - token_secret=data["oauth_token_secret"], + token = data["oauth_token"] + token_secret = data["oauth_token_secret"] + + # write to cache + if self.cache: + key = (self.subcategory, self.session.auth.consumer_key) + oauth._token_cache.update(key, (token, token_secret)) + self.log.info("Writing tokens to cache") + + # display tokens + self.send(self._generate_message( + ("access-token", "access-token-secret"), + (token, token_secret), )) def _oauth2_authorization_code_grant( @@ -149,24 +157,66 @@ class OAuthBase(Extractor): self.send(data["error"]) return - # display token - part = key.partition("_")[0] - template = message_template or OAUTH2_MSG_TEMPLATE - self.send(template.format( - category=self.subcategory, - key=part, - Key=part.capitalize(), - token=data[key], - instance=getattr(self, "instance", ""), - client_id=client_id, - client_secret=client_secret, - )) - # write to cache - if cache and config.get(("extractor", self.category), "cache"): + if self.cache and cache: cache.update("#" + str(client_id), data[key]) self.log.info("Writing 'refresh-token' to cache") + # display token + if message_template: + msg = message_template.format( + category=self.subcategory, + key=key.partition("_")[0], + token=data[key], + instance=getattr(self, "instance", ""), + client_id=client_id, + client_secret=client_secret, + ) + else: + msg = self._generate_message( + ("refresh-token",), + (data[key],), + ) + self.send(msg) + + def _generate_message(self, names, values): + if len(names) == 1: + _vh = "This value has" + _is = "is" + _it = "it" + _va = "this value" + else: + _vh = "These values have" + _is = "are" + _it = "them" + _va = "these values" + + msg = "\nYour {} {}\n\n{}\n\n".format( + " and ".join("'" + n + "'" for n in names), + _is, + "\n".join(values), + ) + + if self.cache: + opt = self.oauth_config(names[0]) + if opt is None or opt == "cache": + msg += _vh + " been cached and will automatically be used." + else: + msg += ( + "Set 'extractor.{}.{}' to \"cache\" to use {}.".format( + self.subcategory, names[0], _it, + ) + ) + else: + msg += "Put " + _va + " into your configuration file as \n" + msg += " and\n".join( + "'extractor." + self.subcategory + "." + n + "'" + for n in names + ) + msg += "." + + return msg + class OAuthDeviantart(OAuthBase): subcategory = "deviantart" @@ -224,6 +274,7 @@ class OAuthReddit(OAuthBase): "https://www.reddit.com/api/v1/authorize", "https://www.reddit.com/api/v1/access_token", scope="read history", + cache=reddit._refresh_token_cache, ) @@ -318,49 +369,8 @@ class OAuthMastodon(OAuthBase): return data -OAUTH1_MSG_TEMPLATE = """ -Your Access Token and Access Token Secret are - -{token} -{token_secret} - -Put these values into your configuration file as -'extractor.{category}.access-token' and -'extractor.{category}.access-token-secret'. - -Example: -{{ - "extractor": {{ - "{category}": {{ - "access-token": "{token}", - "access-token-secret": "{token_secret}" - }} - }} -}} -""" - - -OAUTH2_MSG_TEMPLATE = """ -Your {Key} Token is - -{token} - -Put this value into your configuration file as -'extractor.{category}.{key}-token'. - -Example: -{{ - "extractor": {{ - "{category}": {{ - "{key}-token": "{token}" - }} - }} -}} -""" - - MASTODON_MSG_TEMPLATE = """ -Your {Key} Token is +Your 'access-token' is {token} diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 570bd72..a14ec9c 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -98,8 +98,7 @@ class PatreonExtractor(Extractor): headers = {"Referer": self.root} while url: - if not url.startswith("http"): - url = "https://" + url.lstrip("/:") + url = text.ensure_http_scheme(url) posts = self.request(url, headers=headers).json() if "included" in posts: diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py index 1a793a0..ead5c35 100644 --- a/gallery_dl/extractor/recursive.py +++ b/gallery_dl/extractor/recursive.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -28,6 +28,7 @@ class RecursiveExtractor(Extractor): self.session.mount("file://", FileAdapter()) page = self.request(self.url.partition(":")[2]).text + del self.session.adapters["file://"] yield Message.Version, 1 with extractor.blacklist(blist): diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index d0232cc..2e3864a 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -222,20 +222,25 @@ class RedditAPI(): self.extractor = extractor self.comments = text.parse_int(extractor.config("comments", 0)) self.morecomments = extractor.config("morecomments", False) - self.refresh_token = extractor.config("refresh-token") self.log = extractor.log client_id = extractor.config("client-id", self.CLIENT_ID) user_agent = extractor.config("user-agent", self.USER_AGENT) if (client_id == self.CLIENT_ID) ^ (user_agent == self.USER_AGENT): - self.client_id = None - self.log.warning( + raise exception.StopExtraction( "Conflicting values for 'client-id' and 'user-agent': " "overwrite either both or none of them.") + + self.client_id = client_id + self.headers = {"User-Agent": user_agent} + + token = extractor.config("refresh-token") + if token is None or token == "cache": + key = "#" + self.client_id + self.refresh_token = _refresh_token_cache(key) else: - self.client_id = client_id - extractor.session.headers["User-Agent"] = user_agent + self.refresh_token = token def submission(self, submission_id): """Fetch the (submission, comments)=-tuple for a submission id""" @@ -277,13 +282,15 @@ class RedditAPI(): def authenticate(self): """Authenticate the application by requesting an access token""" - access_token = self._authenticate_impl(self.refresh_token) - self.extractor.session.headers["Authorization"] = access_token + self.headers["Authorization"] = \ + self._authenticate_impl(self.refresh_token) @cache(maxage=3600, keyarg=1) def _authenticate_impl(self, refresh_token=None): """Actual authenticate implementation""" url = "https://www.reddit.com/api/v1/access_token" + self.headers["Authorization"] = None + if refresh_token: self.log.info("Refreshing private access token") data = {"grant_type": "refresh_token", @@ -294,9 +301,9 @@ class RedditAPI(): "grants/installed_client"), "device_id": "DO_NOT_TRACK_THIS_DEVICE"} - auth = (self.client_id, "") response = self.extractor.request( - url, method="POST", data=data, auth=auth, fatal=False) + url, method="POST", headers=self.headers, + data=data, auth=(self.client_id, ""), fatal=False) data = response.json() if response.status_code != 200: @@ -307,9 +314,10 @@ class RedditAPI(): def _call(self, endpoint, params): url = "https://oauth.reddit.com" + endpoint - params["raw_json"] = 1 + params["raw_json"] = "1" self.authenticate() - response = self.extractor.request(url, params=params, fatal=None) + response = self.extractor.request( + url, params=params, headers=self.headers, fatal=None) remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: @@ -380,3 +388,10 @@ class RedditAPI(): @staticmethod def _decode(sid): return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz") + + +@cache(maxage=100*365*24*3600, keyarg=0) +def _refresh_token_cache(token): + if token and token[0] == "#": + return None + return token diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py new file mode 100644 index 0000000..7855eab --- /dev/null +++ b/gallery_dl/extractor/redgifs.py @@ -0,0 +1,58 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://redgifs.com/""" + +from .gfycat import GfycatImageExtractor +from ..cache import cache + + +class RedgifsImageExtractor(GfycatImageExtractor): + """Extractor for individual images from redgifs.com""" + category = "redgifs" + pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/watch/([A-Za-z]+)" + test = ("https://redgifs.com/watch/foolishforkedabyssiniancat", { + "pattern": r"https://\w+.redgifs.com/FoolishForkedAbyssiniancat.mp4", + "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533", + }) + + def _get_info(self, gfycat_id): + api = RedgifsAPI(self) + return api.gfycat(gfycat_id) + + +class RedgifsAPI(): + + def __init__(self, extractor): + self.extractor = extractor + self.headers = {} + + def gfycat(self, gfycat_id): + endpoint = "v1/gfycats/" + gfycat_id + return self._call(endpoint)["gfyItem"] + + @cache(maxage=3600) + def _authenticate_impl(self): + url = "https://weblogin.redgifs.com/oauth/webtoken" + headers = { + "Referer": "https://www.redgifs.com/", + "Origin" : "https://www.redgifs.com", + } + data = { + "access_key": "dBLwVuGn9eq4dtXLs8WSfpjcYFY7bPQe" + "AqGPSFgqeW5B9uzj2cMVhF63pTFF4Rg9", + } + + response = self.extractor.request( + url, method="POST", headers=headers, json=data) + return "Bearer " + response.json()["access_token"] + + def _call(self, endpoint): + self.headers["Authorization"] = self._authenticate_impl() + url = "https://napi.redgifs.com/" + endpoint + return self.extractor.request(url, headers=self.headers).json() diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index b21ad32..2cef430 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -121,9 +121,9 @@ class SexcomPinExtractor(SexcomExtractor): }, }), # gif - ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", { - "pattern": "https://cdn.sex.com/images/.+/2014/01/26/4829951.gif", - "content": "af6726d74d11d819e1c885fe5303f711862eae96", + ("https://www.sex.com/pin/55435122-ecchi/", { + "pattern": "https://cdn.sex.com/images/.+/2017/12/07/18760842.gif", + "content": "176cc63fa05182cb0438c648230c0f324a5965fe", }), # video ("https://www.sex.com/pin/55748341/", { diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 7e99823..3e3a5a0 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -108,11 +108,11 @@ class TumblrExtractor(Extractor): del photo["alt_sizes"] yield self._prepare_image(photo["url"], post) - url = post.get("audio_url") # type: "audio" + url = post.get("audio_url") # type "audio" if url and url.startswith("https://a.tumblr.com/"): yield self._prepare(url, post) - url = post.get("video_url") # type: "video" + url = post.get("video_url") # type "video" if url: yield self._prepare(_original_video(url), post) diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index c409f54..4c7b757 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -360,12 +360,13 @@ class TwitterTweetExtractor(TwitterExtractor): "pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8", }), # content with emoji, newlines, hashtags (#338) - ("https://twitter.com/yumi_san0112/status/1151144618936823808", { + ("https://twitter.com/playpokemon/status/1263832915173048321", { "options": (("content", True),), "keyword": {"content": ( - "re:晴、お誕生日おめでとう🎉!\n実は下の名前が同じなので結構親近感ある" - "アイドルです✨\n今年の晴ちゃんめちゃくちゃ可愛い路線攻めてるから、そろ" - "そろまたかっこいい晴が見たいですねw\n#結城晴生誕祭2019\n#結城晴生誕祭" + r"re:Gear up for #PokemonSwordShieldEX with special Mystery " + "Gifts! \n\nYou’ll be able to receive four Galarian form " + "Pokémon with Hidden Abilities, plus some very useful items. " + "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ " )}, }), # Reply to another tweet (#403) diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py new file mode 100644 index 0000000..86ada49 --- /dev/null +++ b/gallery_dl/extractor/webtoons.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- + +# Copyright 2020 Leonardo Taccari +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.webtoons.com/""" + +from .common import Extractor, Message +from .. import exception, text, util + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/((en|fr)" + + +class WebtoonsExtractor(Extractor): + category = "webtoons" + root = "https://www.webtoons.com" + cookiedomain = "www.webtoons.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.session.cookies.set("ageGatePass", "true", + domain=self.cookiedomain) + self.path, self.lang, self.genre , self.comic, self.query = \ + match.groups() + + +class WebtoonsEpisodeExtractor(WebtoonsExtractor): + """Extractor for an episode on webtoons.com""" + subcategory = "episode" + directory_fmt = ("{category}", "{comic}") + filename_fmt = "{episode}-{num:>02}.{extension}" + archive_fmt = "{title_no}_{episode}_{num}" + pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+)/(?:[^/?&#]+))" + r"/viewer(?:\?([^#]+))") + test = ( + (("https://www.webtoons.com/en/comedy/safely-endangered" + "/ep-572-earth/viewer?title_no=352&episode_no=572"), { + "url": "11041d71a3f92728305c11a228e77cf0f7aa02ef", + "content": "4f7701a750368e377d65900e6e8f64a5f9cb9c86", + "count": 5, + }), + ) + + def __init__(self, match): + WebtoonsExtractor.__init__(self, match) + query = text.parse_query(self.query) + self.title_no = query.get("title_no") + if not self.title_no: + raise exception.NotFoundError("title_no") + self.episode = query.get("episode_no") + if not self.episode: + raise exception.NotFoundError("episode_no") + + def items(self): + url = "{}/{}/viewer?{}".format(self.root, self.path, self.query) + self.session.headers["Referer"] = url + + page = self.request(url).text + data = self.get_job_metadata(page) + imgs = self.get_image_urls(page) + data["count"] = len(imgs) + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], url in enumerate(imgs, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_job_metadata(self, page): + """Collect metadata for extractor-job""" + title, pos = text.extract( + page, '<meta property="og:title" content="', '"') + descr, pos = text.extract( + page, '<meta property="og:description" content="', '"', pos) + + return { + "genre": self.genre, + "comic": self.comic, + "title_no": self.title_no, + "episode": self.episode, + "title": text.unescape(title), + "description": text.unescape(descr), + "lang": self.lang, + "language": util.code_to_language(self.lang), + } + + @staticmethod + def get_image_urls(page): + """Extract and return a list of all image urls""" + return list(text.extract_iter(page, 'class="_images" data-url="', '"')) + + +class WebtoonsComicExtractor(WebtoonsExtractor): + """Extractor for an entire comic on webtoons.com""" + subcategory = "comic" + pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+))" + r"/list(?:\?([^#]+))") + test = ( + # english + (("https://www.webtoons.com/en/comedy/live-with-yourself/" + "list?title_no=919"), { + "pattern": WebtoonsEpisodeExtractor.pattern, + "range": "1-15", + "count": ">= 15", + }), + # french + (("https://www.webtoons.com/fr/romance/subzero/" + "list?title_no=1845&page=3"), { + "count": ">= 15", + }), + ) + + def __init__(self, match): + WebtoonsExtractor.__init__(self, match) + query = text.parse_query(self.query) + self.title_no = query.get("title_no") + if not self.title_no: + raise exception.NotFoundError("title_no") + self.page_no = int(query.get("page", 1)) + + def items(self): + page = None + data = {"_extractor": WebtoonsEpisodeExtractor} + + while True: + path = "/{}/list?title_no={}&page={}".format( + self.path, self.title_no, self.page_no) + + if page and path not in page: + return + + page = self.request(self.root + path).text + data["page"] = self.page_no + + for url in self.get_episode_urls(page): + yield Message.Queue, url, data + + self.page_no += 1 + + @staticmethod + def get_episode_urls(page): + """Extract and return all episode urls in 'page'""" + pos = page.find('id="_listUl"') + return text.extract_iter( + page, '<a href="', '" class="NPI=a:list', pos) diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index b614cab..0ada118 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -69,8 +69,8 @@ class WikiartArtistExtractor(WikiartExtractor): directory_fmt = ("{category}", "{artist[artistName]}") pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)" test = ("https://www.wikiart.org/en/thomas-cole", { - "url": "9049e52e897b9ae6586df4c2c4f827d0a19dafa3", - "keyword": "c3168b21a993707c41efb7674e8c90d53a79d483", + "url": "5ba2fbe6783fcce34e65014d16e5fbc581490c98", + "keyword": "6d92913c55675e05553f000cfee5daff0b4107cf", }) def __init__(self, match): diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 6ba2572..130df58 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -24,20 +24,32 @@ class Job(): extr = extractor.find(extr) if not extr: raise exception.NoExtractorError() - self.extractor = extr - extr.log.extractor = extr - extr.log.job = self + self.pathfmt = None + + self._logger_extra = { + "job" : self, + "extractor": extr, + "path" : output.PathfmtProxy(self), + "keywords" : output.KwdictProxy(self), + } + extr.log = self._wrap_logger(extr.log) extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url) self.status = 0 self.pred_url = self._prepare_predicates("image", True) self.pred_queue = self._prepare_predicates("chapter", False) - if parent and parent.extractor.config( - "category-transfer", parent.extractor.categorytransfer): - self.extractor.category = parent.extractor.category - self.extractor.subcategory = parent.extractor.subcategory + if parent: + pextr = parent.extractor + + # transfer (sub)category + if pextr.config("category-transfer", pextr.categorytransfer): + extr.category = pextr.category + extr.subcategory = pextr.subcategory + + # reuse connection adapters + extr.session.adapters = pextr.session.adapters # user-supplied metadata self.userkwds = self.extractor.config("keywords") @@ -165,6 +177,12 @@ class Job(): return util.build_predicate(predicates) + def get_logger(self, name): + return self._wrap_logger(logging.getLogger(name)) + + def _wrap_logger(self, logger): + return output.LoggerAdapter(logger, self._logger_extra) + def _write_unsupported(self, url): if self.ulog: self.ulog.info(url) @@ -175,8 +193,7 @@ class DownloadJob(Job): def __init__(self, url, parent=None): Job.__init__(self, url, parent) - self.log = logging.getLogger("download") - self.pathfmt = None + self.log = self.get_logger("download") self.archive = None self.sleep = None self.downloaders = {} @@ -325,7 +342,7 @@ class DownloadJob(Job): cls = downloader.find(scheme) if cls and config.get(("downloader", cls.scheme), "enabled", True): - instance = cls(self.extractor, self.out) + instance = cls(self) else: instance = None self.log.error("'%s:' URLs are not supported/enabled", scheme) @@ -338,19 +355,20 @@ class DownloadJob(Job): def initialize(self, kwdict=None): """Delayed initialization of PathFormat, etc.""" - self.pathfmt = util.PathFormat(self.extractor) + config = self.extractor.config + pathfmt = self.pathfmt = util.PathFormat(self.extractor) if kwdict: - self.pathfmt.set_directory(kwdict) + pathfmt.set_directory(kwdict) - self.sleep = self.extractor.config("sleep") - if not self.extractor.config("download", True): - self.download = self.pathfmt.fix_extension + self.sleep = config("sleep") + if not config("download", True): + self.download = pathfmt.fix_extension - skip = self.extractor.config("skip", True) + skip = config("skip", True) if skip: self._skipexc = None if skip == "enumerate": - self.pathfmt.check_file = self.pathfmt._enum_file + pathfmt.check_file = pathfmt._enum_file elif isinstance(skip, str): skip, _, smax = skip.partition(":") if skip == "abort": @@ -360,9 +378,9 @@ class DownloadJob(Job): self._skipcnt = 0 self._skipmax = text.parse_int(smax) else: - self.pathfmt.exists = lambda x=None: False + pathfmt.exists = lambda x=None: False - archive = self.extractor.config("archive") + archive = config("archive") if archive: path = util.expand_path(archive) try: @@ -374,27 +392,28 @@ class DownloadJob(Job): else: self.extractor.log.debug("Using download archive '%s'", path) - postprocessors = self.extractor.config("postprocessors") + postprocessors = config("postprocessors") if postprocessors: + pp_log = self.get_logger("postprocessor") pp_list = [] + category = self.extractor.category for pp_dict in postprocessors: whitelist = pp_dict.get("whitelist") blacklist = pp_dict.get("blacklist") - if (whitelist and self.extractor.category not in whitelist or - blacklist and self.extractor.category in blacklist): + if (whitelist and category not in whitelist or + blacklist and category in blacklist): continue name = pp_dict.get("name") pp_cls = postprocessor.find(name) if not pp_cls: - postprocessor.log.warning("module '%s' not found", name) + pp_log.warning("module '%s' not found", name) continue try: - pp_obj = pp_cls(self.pathfmt, pp_dict) + pp_obj = pp_cls(self, pp_dict) except Exception as exc: - postprocessor.log.error( - "'%s' initialization failed: %s: %s", - name, exc.__class__.__name__, exc) + pp_log.error("'%s' initialization failed: %s: %s", + name, exc.__class__.__name__, exc) else: pp_list.append(pp_obj) diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py index 9ceefbf..e9dfff0 100644 --- a/gallery_dl/oauth.py +++ b/gallery_dl/oauth.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,6 +20,7 @@ import requests import requests.auth from . import text +from .cache import cache def nonce(size, alphabet=string.ascii_letters): @@ -117,6 +118,10 @@ class OAuth1API(): token_secret = extractor.config("access-token-secret") key_type = "default" if api_key == self.API_KEY else "custom" + if token is None or token == "cache": + key = (extractor.category, api_key) + token, token_secret = _token_cache(key) + if api_key and api_secret and token and token_secret: self.log.debug("Using %s OAuth1.0 authentication", key_type) self.session = OAuth1Session( @@ -131,3 +136,8 @@ class OAuth1API(): kwargs["fatal"] = None kwargs["session"] = self.session return self.extractor.request(url, **kwargs) + + +@cache(maxage=100*365*24*3600, keyarg=0) +def _token_cache(key): + return None, None diff --git a/gallery_dl/option.py b/gallery_dl/option.py index 34222a2..5b99bee 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2019 Mike Fährmann +# Copyright 2017-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -173,6 +173,12 @@ def build_parser(): help=("Write URLs, which get emitted by other extractors but cannot " "be handled, to FILE"), ) + output.add_argument( + "--write-pages", + dest="write-pages", nargs=0, action=ConfigConstAction, const=True, + help=("Write downloaded intermediary pages to files " + "in the current directory to debug problems"), + ) downloader = parser.add_argument_group("Downloader Options") downloader.add_argument( @@ -196,7 +202,7 @@ def build_parser(): downloader.add_argument( "--http-timeout", dest="timeout", metavar="SECONDS", type=float, action=ConfigAction, - help="Timeout for HTTP connections (defaut: 30.0)", + help="Timeout for HTTP connections (default: 30.0)", ) downloader.add_argument( "--sleep", diff --git a/gallery_dl/output.py b/gallery_dl/output.py index 9e2f8a6..2d3dc17 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -22,34 +22,93 @@ LOG_LEVEL = logging.INFO class Logger(logging.Logger): - """Custom logger that includes extractor and job info in log records""" - extractor = util.NONE - job = util.NONE + """Custom logger that includes extra info in log records""" def makeRecord(self, name, level, fn, lno, msg, args, exc_info, func=None, extra=None, sinfo=None, factory=logging._logRecordFactory): rv = factory(name, level, fn, lno, msg, args, exc_info, func, sinfo) - rv.extractor = self.extractor - rv.job = self.job + if extra: + rv.__dict__.update(extra) return rv +class LoggerAdapter(): + """Trimmed-down version of logging.LoggingAdapter""" + __slots__ = ("logger", "extra") + + def __init__(self, logger, extra): + self.logger = logger + self.extra = extra + + def debug(self, msg, *args, **kwargs): + if self.logger.isEnabledFor(logging.DEBUG): + kwargs["extra"] = self.extra + self.logger._log(logging.DEBUG, msg, args, **kwargs) + + def info(self, msg, *args, **kwargs): + if self.logger.isEnabledFor(logging.INFO): + kwargs["extra"] = self.extra + self.logger._log(logging.INFO, msg, args, **kwargs) + + def warning(self, msg, *args, **kwargs): + if self.logger.isEnabledFor(logging.WARNING): + kwargs["extra"] = self.extra + self.logger._log(logging.WARNING, msg, args, **kwargs) + + def error(self, msg, *args, **kwargs): + if self.logger.isEnabledFor(logging.ERROR): + kwargs["extra"] = self.extra + self.logger._log(logging.ERROR, msg, args, **kwargs) + + +class PathfmtProxy(): + __slots__ = ("job",) + + def __init__(self, job): + self.job = job + + def __getattribute__(self, name): + pathfmt = object.__getattribute__(self, "job").pathfmt + return pathfmt.__dict__.get(name) if pathfmt else None + + +class KwdictProxy(): + __slots__ = ("job",) + + def __init__(self, job): + self.job = job + + def __getattribute__(self, name): + pathfmt = object.__getattribute__(self, "job").pathfmt + return pathfmt.kwdict.get(name) if pathfmt else None + + class Formatter(logging.Formatter): """Custom formatter that supports different formats per loglevel""" def __init__(self, fmt, datefmt): - if not isinstance(fmt, dict): + if isinstance(fmt, dict): + for key in ("debug", "info", "warning", "error"): + value = fmt[key] if key in fmt else LOG_FORMAT + fmt[key] = (util.Formatter(value).format_map, + "{asctime" in value) + else: + if fmt == LOG_FORMAT: + fmt = (fmt.format_map, False) + else: + fmt = (util.Formatter(fmt).format_map, "{asctime" in fmt) fmt = {"debug": fmt, "info": fmt, "warning": fmt, "error": fmt} + self.formats = fmt self.datefmt = datefmt def format(self, record): record.message = record.getMessage() - fmt = self.formats[record.levelname] - if "{asctime" in fmt: + fmt, asctime = self.formats[record.levelname] + if asctime: record.asctime = self.formatTime(record, self.datefmt) - msg = fmt.format_map(record.__dict__) + msg = fmt(record.__dict__) if record.exc_info and not record.exc_text: record.exc_text = self.formatException(record.exc_info) if record.exc_text: @@ -244,7 +303,7 @@ class ColorOutput(TerminalOutput): print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="") -if os.name == "nt": +if util.WINDOWS: ANSI = os.environ.get("TERM") == "ANSI" OFFSET = 1 CHAR_SKIP = "# " diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py index 7a3bf23..faa4d6c 100644 --- a/gallery_dl/postprocessor/__init__.py +++ b/gallery_dl/postprocessor/__init__.py @@ -9,7 +9,6 @@ """Post-processing modules""" import importlib -import logging modules = [ "classify", @@ -21,8 +20,6 @@ modules = [ "zip", ] -log = logging.getLogger("postprocessor") - def find(name): """Return a postprocessor class with the given name""" diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py index 4a9bde9..0106903 100644 --- a/gallery_dl/postprocessor/classify.py +++ b/gallery_dl/postprocessor/classify.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,8 +22,8 @@ class ClassifyPP(PostProcessor): "Archives" : ("zip", "rar", "7z", "tar", "gz", "bz2"), } - def __init__(self, pathfmt, options): - PostProcessor.__init__(self) + def __init__(self, job, options): + PostProcessor.__init__(self, job) mapping = options.get("mapping", self.DEFAULT_MAPPING) self.mapping = { diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index 70b0dfb..64f978e 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,15 +8,13 @@ """Common classes and constants used by postprocessor modules.""" -import logging - class PostProcessor(): """Base class for postprocessors""" - def __init__(self): + def __init__(self, job): name = self.__class__.__name__[:-2].lower() - self.log = logging.getLogger("postprocessor." + name) + self.log = job.get_logger("postprocessor." + name) @staticmethod def prepare(pathfmt): diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py index ddbcef0..0d11844 100644 --- a/gallery_dl/postprocessor/compare.py +++ b/gallery_dl/postprocessor/compare.py @@ -14,8 +14,8 @@ import os class ComparePP(PostProcessor): - def __init__(self, pathfmt, options): - PostProcessor.__init__(self) + def __init__(self, job, options): + PostProcessor.__init__(self, job) if options.get("action") == "enumerate": self.run = self._run_enumerate if options.get("shallow"): diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index 0a56281..cbe51ae 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,10 +11,9 @@ from .common import PostProcessor from .. import util import subprocess -import os -if os.name == "nt": +if util.WINDOWS: def quote(s): return '"' + s.replace('"', '\\"') + '"' else: @@ -23,8 +22,8 @@ else: class ExecPP(PostProcessor): - def __init__(self, pathfmt, options): - PostProcessor.__init__(self) + def __init__(self, job, options): + PostProcessor.__init__(self, job) args = options["command"] final = options.get("final", False) diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index aa50dfd..a955ba3 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -15,8 +15,8 @@ import os class MetadataPP(PostProcessor): - def __init__(self, pathfmt, options): - PostProcessor.__init__(self) + def __init__(self, job, options): + PostProcessor.__init__(self, job) mode = options.get("mode", "json") if mode == "custom": diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py index 7065428..b8a4988 100644 --- a/gallery_dl/postprocessor/mtime.py +++ b/gallery_dl/postprocessor/mtime.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,8 +14,8 @@ from ..text import parse_int class MtimePP(PostProcessor): - def __init__(self, pathfmt, options): - PostProcessor.__init__(self) + def __init__(self, job, options): + PostProcessor.__init__(self, job) self.key = options.get("key", "date") def run(self, pathfmt): diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index 706e706..1afba86 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2018 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Convert pixiv ugoira to webm""" +"""Convert Pixiv Ugoira to WebM""" from .common import PostProcessor from .. import util @@ -19,8 +19,8 @@ import os class UgoiraPP(PostProcessor): - def __init__(self, pathfmt, options): - PostProcessor.__init__(self) + def __init__(self, job, options): + PostProcessor.__init__(self, job) self.extension = options.get("extension") or "webm" self.args = options.get("ffmpeg-args") or () self.twopass = options.get("ffmpeg-twopass", False) diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py index a43c43a..6970e95 100644 --- a/gallery_dl/postprocessor/zip.py +++ b/gallery_dl/postprocessor/zip.py @@ -22,8 +22,8 @@ class ZipPP(PostProcessor): "lzma" : zipfile.ZIP_LZMA, } - def __init__(self, pathfmt, options): - PostProcessor.__init__(self) + def __init__(self, job, options): + PostProcessor.__init__(self, job) self.delete = not options.get("keep-files", False) ext = "." + options.get("extension", "zip") algorithm = options.get("compression", "store") @@ -33,7 +33,7 @@ class ZipPP(PostProcessor): algorithm) algorithm = "store" - self.path = pathfmt.realdirectory + self.path = job.pathfmt.realdirectory args = (self.path[:-1] + ext, "a", self.COMPRESSION_ALGORITHMS[algorithm], True) diff --git a/gallery_dl/text.py b/gallery_dl/text.py index 3bb6390..4dc0963 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -60,6 +60,13 @@ def split_html(txt, sep=None): return [] +def ensure_http_scheme(url, scheme="https://"): + """Prepend 'scheme' to 'url' if it doesn't have one""" + if url and not url.startswith(("https://", "http://")): + return scheme + url.lstrip("/:") + return url + + def filename_from_url(url): """Extract the last part of an URL to use as a filename""" try: diff --git a/gallery_dl/util.py b/gallery_dl/util.py index 83cf84b..85b871b 100644 --- a/gallery_dl/util.py +++ b/gallery_dl/util.py @@ -113,6 +113,57 @@ def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4): fp.write("\n") +def dump_response(response, fp=sys.stdout, + headers=True, content=True, hide_auth=True): + """Write the contents of 'response' into a file-like object""" + + if headers: + request = response.request + req_headers = request.headers.copy() + outfmt = """\ +{request.method} {request.url} +Status: {response.status_code} {response.reason} + +Request Headers +--------------- +{request_headers} + +Response Headers +---------------- +{response_headers} +""" + if hide_auth: + authorization = req_headers.get("Authorization") + if authorization: + atype, sep, _ = authorization.partition(" ") + req_headers["Authorization"] = atype + " ***" if sep else "***" + + cookies = req_headers.get("Cookie") + if cookies: + req_headers["Cookie"] = ";".join( + cookie.partition("=")[0] + "=***" + for cookie in cookies.split(";") + ) + + fp.write(outfmt.format( + request=request, + response=response, + request_headers="\n".join( + name + ": " + value + for name, value in req_headers.items() + ), + response_headers="\n".join( + name + ": " + value + for name, value in response.headers.items() + ), + ).encode()) + + if content: + if headers: + fp.write(b"\nContent\n-------\n") + fp.write(response.content) + + def expand_path(path): """Expand environment variables and tildes (~)""" if not path: @@ -270,6 +321,8 @@ class UniversalNone(): NONE = UniversalNone() +WINDOWS = (os.name == "nt") +SENTINEL = object() def build_predicate(predicates): @@ -672,22 +725,26 @@ class PathFormat(): self.basedirectory = basedir restrict = extractor.config("path-restrict", "auto") + replace = extractor.config("path-replace", "_") + if restrict == "auto": - restrict = "\\\\|/<>:\"?*" if os.name == "nt" else "/" + restrict = "\\\\|/<>:\"?*" if WINDOWS else "/" elif restrict == "unix": restrict = "/" elif restrict == "windows": restrict = "\\\\|/<>:\"?*" + self.clean_segment = self._build_cleanfunc(restrict, replace) remove = extractor.config("path-remove", "\x00-\x1f\x7f") - - self.clean_segment = self._build_cleanfunc(restrict, "_") self.clean_path = self._build_cleanfunc(remove, "") @staticmethod def _build_cleanfunc(chars, repl): if not chars: return lambda x: x + elif isinstance(chars, dict): + def func(x, table=str.maketrans(chars)): + return x.translate(table) elif len(chars) == 1: def func(x, c=chars, r=repl): return x.replace(c, r) @@ -726,7 +783,7 @@ class PathFormat(): def set_directory(self, kwdict): """Build directory path and create it if necessary""" - windows = os.name == "nt" + self.kwdict = kwdict # Build path segments by applying 'kwdict' to directory format strings segments = [] @@ -734,7 +791,7 @@ class PathFormat(): try: for formatter in self.directory_formatters: segment = formatter(kwdict).strip() - if windows: + if WINDOWS: # remove trailing dots and spaces (#647) segment = segment.rstrip(". ") if segment: @@ -751,7 +808,7 @@ class PathFormat(): directory += sep self.directory = directory - if windows: + if WINDOWS: # Enable longer-than-260-character paths on Windows directory = "\\\\?\\" + os.path.abspath(directory) @@ -772,6 +829,8 @@ class PathFormat(): if self.extension: self.build_path() + else: + self.filename = "" def set_extension(self, extension, real=True): """Set filename extension""" diff --git a/gallery_dl/version.py b/gallery_dl/version.py index 40b5c73..dd6f373 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -__version__ = "1.13.6" +__version__ = "1.14.0" |
