about | summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl
diff options
context:
space:
mode:
author Unit 193 <unit193@unit193.net> 2020-06-01 23:11:37 -0400
committer Unit 193 <unit193@unit193.net> 2020-06-01 23:11:37 -0400
commit a70a3246927b72f1ded37acd55ee719515441b5b (patch)
tree 57f0d3ab0b1387b665325f42a24b8aab63cbce07 /gallery_dl
parent 90e50db2e3c38f523bb5195d295290b06e5cedb0 (diff)
New upstream version 1.14.0. [upstream/1.14.0]
Diffstat (limited to 'gallery_dl')
-rw-r--r-- gallery_dl/cache.py | 23
-rw-r--r-- gallery_dl/cloudflare.py | 27
-rw-r--r-- gallery_dl/config.py | 8
-rw-r--r-- gallery_dl/downloader/common.py | 12
-rw-r--r-- gallery_dl/downloader/http.py | 10
-rw-r--r-- gallery_dl/downloader/ytdl.py | 24
-rw-r--r-- gallery_dl/extractor/__init__.py | 3
-rw-r--r-- gallery_dl/extractor/common.py | 32
-rw-r--r-- gallery_dl/extractor/danbooru.py | 6
-rw-r--r-- gallery_dl/extractor/deviantart.py | 42
-rw-r--r-- gallery_dl/extractor/gelbooru.py | 52
-rw-r--r-- gallery_dl/extractor/hentainexus.py | 46
-rw-r--r-- gallery_dl/extractor/hiperdex.py | 17
-rw-r--r-- gallery_dl/extractor/imagechest.py | 48
-rw-r--r-- gallery_dl/extractor/imgur.py | 19
-rw-r--r-- gallery_dl/extractor/instagram.py | 70
-rw-r--r-- gallery_dl/extractor/mangadex.py | 2
-rw-r--r-- gallery_dl/extractor/mastodon.py | 6
-rw-r--r-- gallery_dl/extractor/newgrounds.py | 7
-rw-r--r-- gallery_dl/extractor/oauth.py | 138
-rw-r--r-- gallery_dl/extractor/patreon.py | 3
-rw-r--r-- gallery_dl/extractor/recursive.py | 3
-rw-r--r-- gallery_dl/extractor/reddit.py | 37
-rw-r--r-- gallery_dl/extractor/redgifs.py | 58
-rw-r--r-- gallery_dl/extractor/sexcom.py | 6
-rw-r--r-- gallery_dl/extractor/tumblr.py | 4
-rw-r--r-- gallery_dl/extractor/twitter.py | 9
-rw-r--r-- gallery_dl/extractor/webtoons.py | 148
-rw-r--r-- gallery_dl/extractor/wikiart.py | 6
-rw-r--r-- gallery_dl/job.py | 73
-rw-r--r-- gallery_dl/oauth.py | 12
-rw-r--r-- gallery_dl/option.py | 10
-rw-r--r-- gallery_dl/output.py | 79
-rw-r--r-- gallery_dl/postprocessor/__init__.py | 3
-rw-r--r-- gallery_dl/postprocessor/classify.py | 6
-rw-r--r-- gallery_dl/postprocessor/common.py | 8
-rw-r--r-- gallery_dl/postprocessor/compare.py | 4
-rw-r--r-- gallery_dl/postprocessor/exec.py | 9
-rw-r--r-- gallery_dl/postprocessor/metadata.py | 6
-rw-r--r-- gallery_dl/postprocessor/mtime.py | 6
-rw-r--r-- gallery_dl/postprocessor/ugoira.py | 8
-rw-r--r-- gallery_dl/postprocessor/zip.py | 6
-rw-r--r-- gallery_dl/text.py | 7
-rw-r--r-- gallery_dl/util.py | 71
-rw-r--r-- gallery_dl/version.py | 2
45 files changed, 872 insertions, 304 deletions
diff --git a/gallery_dl/cache.py b/gallery_dl/cache.py
index 6cde65d..3886091 100644
--- a/gallery_dl/cache.py
+++ b/gallery_dl/cache.py
@@ -57,7 +57,7 @@ class MemoryCacheDecorator(CacheDecorator):
value, expires = self.cache[key]
except KeyError:
expires = 0
- if expires < timestamp:
+ if expires <= timestamp:
value = self.func(*args, **kwargs)
expires = timestamp + self.maxage
self.cache[key] = value, expires
@@ -189,25 +189,26 @@ def clear():
def _path():
- path = config.get(("cache",), "file", -1)
- if path != -1:
+ path = config.get(("cache",), "file", util.SENTINEL)
+ if path is not util.SENTINEL:
return util.expand_path(path)
- if os.name == "nt":
- import tempfile
- return os.path.join(tempfile.gettempdir(), ".gallery-dl.cache")
+ if util.WINDOWS:
+ cachedir = os.environ.get("APPDATA", "~")
+ else:
+ cachedir = os.environ.get("XDG_CACHE_HOME", "~/.cache")
- cachedir = util.expand_path(os.path.join(
- os.environ.get("XDG_CACHE_HOME", "~/.cache"), "gallery-dl"))
+ cachedir = util.expand_path(os.path.join(cachedir, "gallery-dl"))
os.makedirs(cachedir, exist_ok=True)
return os.path.join(cachedir, "cache.sqlite3")
try:
dbfile = _path()
- if os.name != "nt":
- # restrict access permissions for new db files
- os.close(os.open(dbfile, os.O_CREAT | os.O_RDONLY, 0o600))
+
+ # restrict access permissions for new db files
+ os.close(os.open(dbfile, os.O_CREAT | os.O_RDONLY, 0o600))
+
DatabaseCacheDecorator.db = sqlite3.connect(
dbfile, timeout=30, check_same_thread=False)
except (OSError, TypeError, sqlite3.OperationalError):
diff --git a/gallery_dl/cloudflare.py b/gallery_dl/cloudflare.py
index 43ccdeb..0cf5a57 100644
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@@ -8,11 +8,11 @@
"""Methods to access sites behind Cloudflare protection"""
-import re
import time
import operator
import collections
import urllib.parse
+from xml.etree import ElementTree
from . import text
from .cache import memcache
@@ -41,12 +41,16 @@ def solve_challenge(session, response, kwargs):
url = root + text.unescape(text.extract(page, 'action="', '"')[0])
headers["Referer"] = response.url
- for inpt in text.extract_iter(page, "<input ", ">"):
- name = text.extract(inpt, 'name="', '"')[0]
+ form = text.extract(page, 'id="challenge-form"', '</form>')[0]
+ for element in ElementTree.fromstring(
+ "<f>" + form + "</f>").findall("input"):
+ name = element.attrib.get("name")
+ if not name:
+ continue
if name == "jschl_answer":
value = solve_js_challenge(page, parsed.netloc)
else:
- value = text.unescape(text.extract(inpt, 'value="', '"')[0])
+ value = element.attrib.get("value")
params[name] = value
time.sleep(4)
@@ -84,6 +88,8 @@ def solve_js_challenge(page, netloc):
variable = "{}.{}".format(data["var"], data["key"])
vlength = len(variable)
+ k = text.extract(page, "k = '", "'")[0]
+
# evaluate the initial expression
solution = evaluate_expression(data["expr"], page, netloc)
@@ -97,7 +103,7 @@ def solve_js_challenge(page, netloc):
# select arithmetc function based on operator (+/-/*)
func = OPERATORS[expr[vlength]]
# evaluate the rest of the expression
- value = evaluate_expression(expr[vlength+2:], page, netloc)
+ value = evaluate_expression(expr[vlength+2:], page, netloc, k)
# combine expression value with our current solution
solution = func(solution, value)
@@ -110,17 +116,18 @@ def solve_js_challenge(page, netloc):
solution = "{:.10f}".format(solution)
return solution
+ elif expr.startswith("k+="):
+ k += str(evaluate_expression(expr[3:], page, netloc))
+
-def evaluate_expression(expr, page, netloc, *,
- split_re=re.compile(r"[(+]+([^)]*)\)")):
+def evaluate_expression(expr, page, netloc, k=""):
"""Evaluate a single Javascript expression for the challenge"""
if expr.startswith("function(p)"):
# get HTML element with ID k and evaluate the expression inside
# 'eval(eval("document.getElementById(k).innerHTML"))'
- k, pos = text.extract(page, "k = '", "'")
- e, pos = text.extract(page, 'id="'+k+'"', '<')
- return evaluate_expression(e.partition(">")[2], page, netloc)
+ expr = text.extract(page, 'id="'+k+'"', '<')[0]
+ return evaluate_expression(expr.partition(">")[2], page, netloc)
if "/" in expr:
# split the expression in numerator and denominator subexpressions,
diff --git a/gallery_dl/config.py b/gallery_dl/config.py
index c2787ad..5303616 100644
--- a/gallery_dl/config.py
+++ b/gallery_dl/config.py
@@ -22,8 +22,9 @@ log = logging.getLogger("config")
_config = {}
-if os.name == "nt":
+if util.WINDOWS:
_default_configs = [
+ r"%APPDATA%\gallery-dl\config.json",
r"%USERPROFILE%\gallery-dl\config.json",
r"%USERPROFILE%\gallery-dl.conf",
]
@@ -139,7 +140,6 @@ def unset(path, key, *, conf=_config):
class apply():
"""Context Manager: apply a collection of key-value pairs"""
- _sentinel = object()
def __init__(self, kvlist):
self.original = []
@@ -147,12 +147,12 @@ class apply():
def __enter__(self):
for path, key, value in self.kvlist:
- self.original.append((path, key, get(path, key, self._sentinel)))
+ self.original.append((path, key, get(path, key, util.SENTINEL)))
set(path, key, value)
def __exit__(self, etype, value, traceback):
for path, key, value in self.original:
- if value is self._sentinel:
+ if value is util.SENTINEL:
unset(path, key)
else:
set(path, key, value)
diff --git a/gallery_dl/downloader/common.py b/gallery_dl/downloader/common.py
index eca1284..d858075 100644
--- a/gallery_dl/downloader/common.py
+++ b/gallery_dl/downloader/common.py
@@ -9,7 +9,6 @@
"""Common classes and constants used by downloader modules."""
import os
-import logging
from .. import config, util
@@ -17,15 +16,12 @@ class DownloaderBase():
"""Base class for downloaders"""
scheme = ""
- def __init__(self, extractor, output):
- self.session = extractor.session
- self.out = output
+ def __init__(self, job):
+ self.out = job.out
+ self.session = job.extractor.session
self.part = self.config("part", True)
self.partdir = self.config("part-directory")
-
- self.log = logging.getLogger("downloader." + self.scheme)
- self.log.job = extractor.log.job
- self.log.extractor = extractor
+ self.log = job.get_logger("downloader." + self.scheme)
if self.partdir:
self.partdir = util.expand_path(self.partdir)
diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py
index 021dc16..6644827 100644
--- a/gallery_dl/downloader/http.py
+++ b/gallery_dl/downloader/http.py
@@ -24,16 +24,18 @@ except ImportError:
class HttpDownloader(DownloaderBase):
scheme = "http"
- def __init__(self, extractor, output):
- DownloaderBase.__init__(self, extractor, output)
+ def __init__(self, job):
+ DownloaderBase.__init__(self, job)
+ extractor = job.extractor
+ self.chunk_size = 16384
+ self.downloading = False
+
self.adjust_extension = self.config("adjust-extensions", True)
self.retries = self.config("retries", extractor._retries)
self.timeout = self.config("timeout", extractor._timeout)
self.verify = self.config("verify", extractor._verify)
self.mtime = self.config("mtime", True)
self.rate = self.config("rate")
- self.downloading = False
- self.chunk_size = 16384
if self.retries < 0:
self.retries = float("inf")
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index fe6c4bc..c3dd863 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -17,8 +17,9 @@ import os
class YoutubeDLDownloader(DownloaderBase):
scheme = "ytdl"
- def __init__(self, extractor, output):
- DownloaderBase.__init__(self, extractor, output)
+ def __init__(self, job):
+ DownloaderBase.__init__(self, job)
+ extractor = job.extractor
retries = self.config("retries", extractor._retries)
options = {
@@ -35,7 +36,7 @@ class YoutubeDLDownloader(DownloaderBase):
if self.config("logging", True):
options["logger"] = self.log
- self.forward_cookies = self.config("forward-cookies", True)
+ self.forward_cookies = self.config("forward-cookies", False)
outtmpl = self.config("outtmpl")
self.outtmpl = DEFAULT_OUTTMPL if outtmpl == "default" else outtmpl
@@ -70,6 +71,10 @@ class YoutubeDLDownloader(DownloaderBase):
if "url" in info_dict:
text.nameext_from_url(info_dict["url"], pathfmt.kwdict)
+ formats = info_dict.get("requested_formats")
+ if formats and not compatible_formats(formats):
+ info_dict["ext"] = "mkv"
+
if self.outtmpl:
self.ytdl.params["outtmpl"] = self.outtmpl
pathfmt.filename = filename = self.ytdl.prepare_filename(info_dict)
@@ -105,4 +110,15 @@ class YoutubeDLDownloader(DownloaderBase):
return True
+def compatible_formats(formats):
+ video_ext = formats[0].get("ext")
+ audio_ext = formats[1].get("ext")
+
+ if video_ext == "webm" and audio_ext == "webm":
+ return True
+
+ exts = ("mp3", "mp4", "m4a", "m4p", "m4b", "m4r", "m4v", "ismv", "isma")
+ return video_ext in exts and audio_ext in exts
+
+
__downloader__ = YoutubeDLDownloader
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 85fbddb..561b484 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -48,6 +48,7 @@ modules = [
"hypnohub",
"idolcomplex",
"imagebam",
+ "imagechest",
"imagefap",
"imgbb",
"imgbox",
@@ -94,6 +95,7 @@ modules = [
"readcomiconline",
"realbooru",
"reddit",
+ "redgifs",
"rule34",
"safebooru",
"sankaku",
@@ -113,6 +115,7 @@ modules = [
"vsco",
"wallhaven",
"warosu",
+ "webtoons",
"weibo",
"wikiart",
"xhamster",
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 3a282c2..dd685df 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -40,6 +40,7 @@ class Extractor():
self._cookiefile = None
self._cookiejar = self.session.cookies
self._parentdir = ""
+ self._write_pages = self.config("write-pages", False)
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
@@ -91,6 +92,8 @@ class Extractor():
raise exception.HttpError(exc)
else:
code = response.status_code
+ if self._write_pages:
+ self._dump_response(response)
if 200 <= code < 400 or fatal is None and \
(400 <= code < 500) or not fatal and \
(400 <= code < 429 or 431 <= code < 500):
@@ -325,6 +328,33 @@ class Extractor():
test = (test, None)
yield test
+ def _dump_response(self, response):
+ """Write the response content to a .dump file in the current directory.
+
+ The file name is derived from the response url,
+ replacing special characters with "_"
+ """
+ for resp in response.history:
+ self._dump_response(resp)
+
+ if hasattr(Extractor, "_dump_index"):
+ Extractor._dump_index += 1
+ else:
+ Extractor._dump_index = 1
+ Extractor._dump_sanitize = re.compile(r"[\\\\|/<>:\"?*&=#]+").sub
+
+ fname = "{:>02}_{}".format(
+ Extractor._dump_index,
+ Extractor._dump_sanitize('_', response.url)
+ )[:250]
+
+ try:
+ with open(fname + ".dump", 'wb') as fp:
+ util.dump_response(response, fp)
+ except Exception as e:
+ self.log.warning("Failed to dump HTTP request (%s: %s)",
+ e.__class__.__name__, e)
+
class GalleryExtractor(Extractor):
@@ -460,7 +490,7 @@ class SharedConfigMixin():
"""Enable sharing of config settings based on 'basecategory'"""
basecategory = ""
- def config(self, key, default=None, *, sentinel=object()):
+ def config(self, key, default=None, *, sentinel=util.SENTINEL):
value = Extractor.config(self, key, sentinel)
return value if value is not sentinel else config.interpolate(
("extractor", self.basecategory, self.subcategory), key, default)
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 3a0d0ef..e0edf89 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -32,7 +32,7 @@ class DanbooruExtractor(SharedConfigMixin, Extractor):
def __init__(self, match):
super().__init__(match)
self.root = "https://{}.donmai.us".format(match.group(1))
- self.ugoira = self.config("ugoira", True)
+ self.ugoira = self.config("ugoira", False)
self.params = {}
username, api_key = self._get_auth_info()
@@ -156,8 +156,8 @@ class DanbooruPostExtractor(DanbooruExtractor):
"content": "5e255713cbf0a8e0801dc423563c34d896bb9229",
}),
("https://danbooru.donmai.us/posts/3613024", {
- "pattern": r"https?://.+\.webm$",
- "options": (("ugoira", False),)
+ "pattern": r"https?://.+\.zip$",
+ "options": (("ugoira", True),)
})
)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 2631052..cda357a 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -126,8 +126,9 @@ class DeviantartExtractor(Extractor):
if self.extra:
for match in DeviantartStashExtractor.pattern.finditer(
deviation.get("description", "")):
+ url = text.ensure_http_scheme(match.group(0))
deviation["_extractor"] = DeviantartStashExtractor
- yield Message.Queue, match.group(0), deviation
+ yield Message.Queue, url, deviation
def deviations(self):
"""Return an iterable containing all relevant Deviation-objects"""
@@ -849,9 +850,12 @@ class DeviantartOAuthAPI():
self.client_secret = extractor.config(
"client-secret", self.CLIENT_SECRET)
- self.refresh_token = extractor.config("refresh-token")
- if self.refresh_token == "cache":
- self.refresh_token = "#" + str(self.client_id)
+ token = extractor.config("refresh-token")
+ if token is None or token == "cache":
+ token = "#" + str(self.client_id)
+ if not _refresh_token_cache(token):
+ token = None
+ self.refresh_token_key = token
self.log.debug(
"Using %s API credentials (client-id %s)",
@@ -904,7 +908,7 @@ class DeviantartOAuthAPI():
"""Get extended content of a single Deviation"""
endpoint = "deviation/content"
params = {"deviationid": deviation_id}
- return self._call(endpoint, params)
+ return self._call(endpoint, params, public=False)
def deviation_download(self, deviation_id):
"""Get the original file download (if allowed)"""
@@ -951,18 +955,19 @@ class DeviantartOAuthAPI():
endpoint = "user/profile/" + username
return self._call(endpoint, fatal=False)
- def authenticate(self, refresh_token):
+ def authenticate(self, refresh_token_key):
"""Authenticate the application by requesting an access token"""
- self.headers["Authorization"] = self._authenticate_impl(refresh_token)
+ self.headers["Authorization"] = \
+ self._authenticate_impl(refresh_token_key)
@cache(maxage=3600, keyarg=1)
- def _authenticate_impl(self, refresh_token):
+ def _authenticate_impl(self, refresh_token_key):
"""Actual authenticate implementation"""
url = "https://www.deviantart.com/oauth2/token"
- if refresh_token:
+ if refresh_token_key:
self.log.info("Refreshing private access token")
data = {"grant_type": "refresh_token",
- "refresh_token": _refresh_token_cache(refresh_token)}
+ "refresh_token": _refresh_token_cache(refresh_token_key)}
else:
self.log.info("Requesting public access token")
data = {"grant_type": "client_credentials"}
@@ -976,8 +981,9 @@ class DeviantartOAuthAPI():
self.log.debug("Server response: %s", data)
raise exception.AuthenticationError('"{}" ({})'.format(
data.get("error_description"), data.get("error")))
- if refresh_token:
- _refresh_token_cache.update(refresh_token, data["refresh_token"])
+ if refresh_token_key:
+ _refresh_token_cache.update(
+ refresh_token_key, data["refresh_token"])
return "Bearer " + data["access_token"]
def _call(self, endpoint, params=None, fatal=True, public=True):
@@ -987,7 +993,7 @@ class DeviantartOAuthAPI():
if self.delay >= 0:
time.sleep(2 ** self.delay)
- self.authenticate(None if public else self.refresh_token)
+ self.authenticate(None if public else self.refresh_token_key)
response = self.extractor.request(
url, headers=self.headers, params=params, fatal=None)
data = response.json()
@@ -1023,7 +1029,7 @@ class DeviantartOAuthAPI():
if extend:
if public and len(data["results"]) < params["limit"]:
- if self.refresh_token:
+ if self.refresh_token_key:
self.log.debug("Switching to private access token")
public = False
continue
@@ -1154,9 +1160,11 @@ class DeviantartEclipseAPI():
return text.rextract(page, '\\"id\\":', ',', pos)[0].strip('" ')
-@cache(maxage=10*365*24*3600, keyarg=0)
-def _refresh_token_cache(original_token, new_token=None):
- return new_token or original_token
+@cache(maxage=100*365*24*3600, keyarg=0)
+def _refresh_token_cache(token):
+ if token and token[0] == "#":
+ return None
+ return token
###############################################################################
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 0c05a97..612c742 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2019 Mike Fährmann
+# Copyright 2014-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,7 @@
from . import booru
from .common import Message
-from .. import text, util
+from .. import text
class GelbooruExtractor(booru.XmlParserMixin,
@@ -31,6 +31,7 @@ class GelbooruExtractor(booru.XmlParserMixin,
else:
self.items = self.items_noapi
self.session.cookies["fringeBenefits"] = "yup"
+ self.per_page = 42
def items_noapi(self):
yield Message.Version, 1
@@ -46,6 +47,19 @@ class GelbooruExtractor(booru.XmlParserMixin,
def get_posts(self):
"""Return an iterable containing all relevant post objects"""
+ url = "https://gelbooru.com/index.php?page=post&s=list"
+ params = {
+ "tags": self.params["tags"],
+ "pid" : self.page_start * self.per_page
+ }
+
+ while True:
+ page = self.request(url, params=params).text
+ ids = list(text.extract_iter(page, '<a id="p', '"'))
+ yield from ids
+ if len(ids) < self.per_page:
+ return
+ params["pid"] += self.per_page
def get_post_data(self, post_id):
"""Extract metadata of a single post"""
@@ -88,34 +102,20 @@ class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor):
}),
)
- def __init__(self, match):
- super().__init__(match)
- if not self.use_api:
- self.per_page = 42
-
- def get_posts(self):
- url = "https://gelbooru.com/index.php?page=post&s=list"
- params = {"tags": self.tags, "pid": self.page_start * self.per_page}
- while True:
- page = self.request(url, params=params).text
- ids = list(text.extract_iter(page, '<a id="p', '"'))
- yield from ids
- if len(ids) < self.per_page:
- return
- params["pid"] += self.per_page
-
-
-class GelbooruPoolExtractor(booru.GelbooruPoolMixin, GelbooruExtractor):
+class GelbooruPoolExtractor(booru.PoolMixin, GelbooruExtractor):
"""Extractor for image-pools from gelbooru.com"""
pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=pool&s=show&id=(?P<pool>\d+)")
- test = ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
- "count": 6,
- })
-
- def get_posts(self):
- return util.advance(self.posts, self.page_start)
+ test = (
+ ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
+ "count": 6,
+ }),
+ ("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
+ "options": (("api", False),),
+ "count": 6,
+ }),
+ )
class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor):
diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py
index ef64942..aa41836 100644
--- a/gallery_dl/extractor/hentainexus.py
+++ b/gallery_dl/extractor/hentainexus.py
@@ -51,20 +51,38 @@ class HentainexusGalleryExtractor(GalleryExtractor):
"description": rmve(extr('viewcolumn">Description</td>', '</td>')),
}
data["lang"] = util.language_to_code(data["language"])
- data["type"] = "Doujinshi" if 'doujin' in data["tags"] else "Manga"
- data["title_conventional"] = self.join_title(
- data["event"],
- data["circle"],
- data["artist"],
- data["title"],
- data["parody"],
- data["book"],
- data["magazine"],
- )
+ if 'doujin' in data['tags']:
+ data['type'] = 'Doujinshi'
+ elif 'illustration' in data['tags']:
+ data['type'] = 'Illustration'
+ else:
+ data['type'] = 'Manga'
+ data["title_conventional"] = self._join_title(data)
return data
+ def images(self, page):
+ url = "{}/read/{}".format(self.root, self.gallery_id)
+ extr = text.extract_from(self.request(url).text)
+ urls = extr("initReader(", "]") + "]"
+ return [(url, None) for url in json.loads(urls)]
+
@staticmethod
- def join_title(event, circle, artist, title, parody, book, magazine):
+ def _join_title(data):
+ event = data['event']
+ artist = data['artist']
+ circle = data['circle']
+ title = data['title']
+ parody = data['parody']
+ book = data['book']
+ magazine = data['magazine']
+
+ # a few galleries have a large number of artists or parodies,
+ # which get replaced with "Various" in the title string
+ if artist.count(',') >= 3:
+ artist = 'Various'
+ if parody.count(',') >= 3:
+ parody = 'Various'
+
jt = ''
if event:
jt += '({}) '.format(event)
@@ -81,12 +99,6 @@ class HentainexusGalleryExtractor(GalleryExtractor):
jt += ' ({})'.format(magazine)
return jt
- def images(self, page):
- url = "{}/read/{}".format(self.root, self.gallery_id)
- extr = text.extract_from(self.request(url).text)
- urls = extr("initReader(", "]") + "]"
- return [(url, None) for url in json.loads(urls)]
-
class HentainexusSearchExtractor(Extractor):
"""Extractor for search results on hentainexus.com"""
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index 3883445..1c53723 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -14,6 +14,9 @@ from ..cache import memcache
import re
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net|info)"
+
+
class HiperdexBase():
"""Base class for hiperdex extractors"""
category = "hiperdex"
@@ -61,11 +64,10 @@ class HiperdexBase():
class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
"""Extractor for manga chapters from hiperdex.com"""
- pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net)"
- r"(/manga/([^/?&#]+)/([^/?&#]+))")
+ pattern = BASE_PATTERN + r"(/manga/([^/?&#]+)/([^/?&#]+))"
test = (
("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", {
- "pattern": r"https://hiperdex.com/wp-content/uploads"
+ "pattern": r"https://hiperdex.(com|net|info)/wp-content/uploads"
r"/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp",
"count": 9,
"keyword": {
@@ -82,6 +84,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
},
}),
("https://hiperdex.net/manga/domestic-na-kanojo/154-5/"),
+ ("https://hiperdex.info/manga/domestic-na-kanojo/154-5/"),
)
def __init__(self, match):
@@ -102,8 +105,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
"""Extractor for manga from hiperdex.com"""
chapterclass = HiperdexChapterExtractor
- pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net)"
- r"(/manga/([^/?&#]+))/?$")
+ pattern = BASE_PATTERN + r"(/manga/([^/?&#]+))/?$"
test = (
("https://hiperdex.com/manga/youre-not-that-special/", {
"count": 51,
@@ -123,6 +125,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
},
}),
("https://hiperdex.net/manga/youre-not-that-special/"),
+ ("https://hiperdex.info/manga/youre-not-that-special/"),
)
def __init__(self, match):
@@ -154,11 +157,11 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
categorytransfer = False
chapterclass = HiperdexMangaExtractor
reverse = False
- pattern = (r"(?:https?://)?(?:www\.)?hiperdex\.(?:com|net)"
- r"(/manga-a(?:rtist|uthor)/([^/?&#]+))")
+ pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/([^/?&#]+))"
test = (
("https://hiperdex.com/manga-artist/beck-ho-an/"),
("https://hiperdex.net/manga-artist/beck-ho-an/"),
+ ("https://hiperdex.info/manga-artist/beck-ho-an/"),
("https://hiperdex.com/manga-author/viagra/", {
"pattern": HiperdexMangaExtractor.pattern,
"count": ">= 6",
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
new file mode 100644
index 0000000..a1ba0c3
--- /dev/null
+++ b/gallery_dl/extractor/imagechest.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Leonid "Bepis" Pavel
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from galleries at https://imgchest.com/"""
+
+from .common import GalleryExtractor
+from .. import text, exception
+
+
+class ImagechestGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from imgchest.com"""
+ category = "imagechest"
+ root = "https://imgchest.com"
+ pattern = r"(?:https?://)?(?:www\.)?imgchest\.com/p/([A-Za-z0-9]{11})"
+ test = (
+ ("https://imgchest.com/p/3na7kr3by8d", {
+ "url": "f095b4f78c051e5a94e7c663814d1e8d4c93c1f7",
+ "content": "076959e65be30249a2c651fbe6090dc30ba85193",
+ "count": 3
+ }),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = self.root + "/p/" + self.gallery_id
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ if "Sorry, but the page you requested could not be found." in page:
+ raise exception.NotFoundError("gallery")
+
+ return {
+ "gallery_id": self.gallery_id,
+ "title": text.unescape(text.extract(
+ page, 'property="og:title" content="', '"')[0].strip())
+ }
+
+ def images(self, page):
+ return [
+ (url, None)
+ for url in text.extract_iter(
+ page, 'property="og:image" content="', '"')
+ ]
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 0813ea9..44fa5f2 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -34,7 +34,11 @@ class ImgurExtractor(Extractor):
except KeyError:
pass
- url = image["mp4"] if image["animated"] and self.mp4 else image["link"]
+ if image["animated"] and self.mp4 and "mp4" in image:
+ url = image["mp4"]
+ else:
+ url = image["link"]
+
image["date"] = text.parse_timestamp(image["datetime"])
text.nameext_from_url(url, image)
@@ -100,6 +104,9 @@ class ImgurImageExtractor(ImgurExtractor):
("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1'
"url": "ec2cf11a2bfb4939feff374781a6e6f3e9af8e8e",
}),
+ ("https://imgur.com/1Nily2P", { # animated png
+ "pattern": "https://i.imgur.com/1Nily2P.png",
+ }),
("https://imgur.com/zzzzzzz", { # not found
"exception": exception.HttpError,
}),
@@ -130,7 +137,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
directory_fmt = ("{category}", "{album[id]}{album[title]:? - //}")
filename_fmt = "{category}_{album[id]}_{num:>03}_{id}.{extension}"
archive_fmt = "{album[id]}_{id}"
- pattern = BASE_PATTERN + r"/(?:a|t/unmuted)/(\w{7}|\w{5})"
+ pattern = BASE_PATTERN + r"/a/(\w{7}|\w{5})"
test = (
("https://imgur.com/a/TcBmP", {
"url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
@@ -192,9 +199,6 @@ class ImgurAlbumExtractor(ImgurExtractor):
("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash
"url": "695ef0c950023362a0163ee5041796300db76674",
}),
- ("https://imgur.com/t/unmuted/YMqBcua", { # unmuted URL
- "url": "86b4747f8147cec7602f0214e267309af73a8655",
- }),
("https://imgur.com/a/TcBmQ", {
"exception": exception.HttpError,
}),
@@ -225,7 +229,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
class ImgurGalleryExtractor(ImgurExtractor):
"""Extractor for imgur galleries"""
subcategory = "gallery"
- pattern = BASE_PATTERN + r"/gallery/(\w{7}|\w{5})"
+ pattern = BASE_PATTERN + r"/(?:gallery|t/unmuted)/(\w{7}|\w{5})"
test = (
("https://imgur.com/gallery/zf2fIms", { # non-album gallery (#380)
"pattern": "https://imgur.com/zf2fIms",
@@ -233,6 +237,9 @@ class ImgurGalleryExtractor(ImgurExtractor):
("https://imgur.com/gallery/eD9CT", {
"pattern": "https://imgur.com/a/eD9CT",
}),
+ ("https://imgur.com/t/unmuted/26sEhNr", { # unmuted URL
+ "pattern": "https://imgur.com/26sEhNr",
+ }),
)
def items(self):
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index ea39cab..3781711 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -14,6 +14,8 @@ from .. import text, exception
from ..cache import cache
import itertools
import json
+import time
+import re
class InstagramExtractor(Extractor):
@@ -26,6 +28,10 @@ class InstagramExtractor(Extractor):
cookiedomain = ".instagram.com"
cookienames = ("sessionid",)
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self._find_tags = re.compile(r'#\w+').findall
+
def get_metadata(self):
return {}
@@ -78,9 +84,10 @@ class InstagramExtractor(Extractor):
url = self.root + "/accounts/login/ajax/"
data = {
"username" : username,
- "password" : password,
+ "enc_password" : "#PWD_INSTAGRAM_BROWSER:0:{}:{}".format(
+ int(time.time()), password),
"queryParams" : "{}",
- "optIntoOneTap": "true",
+ "optIntoOneTap": "false",
}
response = self.request(url, method="POST", headers=headers, data=data)
@@ -133,12 +140,24 @@ class InstagramExtractor(Extractor):
'fullname': media['owner']['full_name'],
'post_id': media['id'],
'post_shortcode': media['shortcode'],
+ 'post_url': url,
'description': text.parse_unicode_escapes('\n'.join(
edge['node']['text']
for edge in media['edge_media_to_caption']['edges']
)),
}
+ tags = self._find_tags(common['description'])
+ if tags:
+ common['tags'] = sorted(set(tags))
+
+ location = media['location']
+ if location:
+ common['location_id'] = location['id']
+ common['location_slug'] = location['slug']
+ common['location_url'] = "{}/explore/locations/{}/{}/".format(
+ self.root, location['id'], location['slug'])
+
medias = []
if media['__typename'] == 'GraphSidecar':
for num, edge in enumerate(
@@ -156,6 +175,7 @@ class InstagramExtractor(Extractor):
'sidecar_media_id': media['id'],
'sidecar_shortcode': media['shortcode'],
}
+ self._extract_tagged_users(children, media_data)
media_data.update(common)
medias.append(media_data)
@@ -169,6 +189,7 @@ class InstagramExtractor(Extractor):
'height': text.parse_int(media['dimensions']['height']),
'width': text.parse_int(media['dimensions']['width']),
}
+ self._extract_tagged_users(media, media_data)
media_data.update(common)
medias.append(media_data)
@@ -189,12 +210,12 @@ class InstagramExtractor(Extractor):
user_id = '"{}"'.format(
shared_data['entry_data']['StoriesPage'][0]['user']['id'])
highlight_id = ''
- query_hash = 'cda12de4f7fd3719c0569ce03589f4c4'
+ query_hash = '0a85e6ea60a4c99edc58ab2f3d17cfdf'
variables = (
'{{'
'"reel_ids":[{}],"tag_names":[],"location_ids":[],'
- '"highlight_reel_ids":[{}],"precomposed_overlay":true,'
+ '"highlight_reel_ids":[{}],"precomposed_overlay":false,'
'"show_story_viewer_list":true,'
'"story_viewer_fetch_count":50,"story_viewer_cursor":"",'
'"stories_video_dash_manifest":false'
@@ -250,7 +271,7 @@ class InstagramExtractor(Extractor):
data = self._request_graphql(
variables,
- 'aec5501414615eca36a9acf075655b1e',
+ 'ad99dd9d3646cc3c0dda65debcd266a7',
shared_data['config']['csrf_token'],
)
@@ -305,6 +326,18 @@ class InstagramExtractor(Extractor):
variables, psdf['query_hash'], csrf,
)
+ def _extract_tagged_users(self, src_media, dest_dict):
+ """Copy the users tagged in 'src_media' into 'dest_dict'
+
+ Reads src_media['edge_media_to_tagged_user']['edges']. If that list
+ is non-empty, dest_dict['tagged_users'] is set to a list of dicts
+ with the keys 'id', 'username' and 'full_name'; otherwise
+ 'dest_dict' is left unchanged.
+ """
+ edges = src_media['edge_media_to_tagged_user']['edges']
+ if edges:
+ # one list object, bound both to the dict key and to a local name
+ dest_dict['tagged_users'] = tagged_users = []
+ for edge in edges:
+ user = edge['node']['user']
+ tagged_users.append({
+ 'id' : user['id'],
+ 'username' : user['username'],
+ 'full_name': user['full_name'],
+ })
+
class InstagramImageExtractor(InstagramExtractor):
"""Extractor for PostPage"""
@@ -321,10 +354,15 @@ class InstagramImageExtractor(InstagramExtractor):
"description": str,
"height": int,
"likes": int,
+ "location_id": "214424288",
+ "location_slug": "hong-kong",
+ "location_url": "re:/explore/locations/214424288/hong-kong/",
"media_id": "1922949326347663701",
"shortcode": "BqvsDleB3lV",
"post_id": "1922949326347663701",
"post_shortcode": "BqvsDleB3lV",
+ "post_url": "https://www.instagram.com/p/BqvsDleB3lV/",
+ "tags": ["#WHPsquares"],
"typename": "GraphImage",
"username": "instagram",
"width": int,
@@ -339,6 +377,7 @@ class InstagramImageExtractor(InstagramExtractor):
"sidecar_shortcode": "BoHk1haB5tM",
"post_id": "1875629777499953996",
"post_shortcode": "BoHk1haB5tM",
+ "post_url": "https://www.instagram.com/p/BoHk1haB5tM/",
"num": int,
"likes": int,
"username": "instagram",
@@ -354,7 +393,9 @@ class InstagramImageExtractor(InstagramExtractor):
"height": int,
"likes": int,
"media_id": "1923502432034620000",
+ "post_url": "https://www.instagram.com/p/Bqxp0VSBgJg/",
"shortcode": "Bqxp0VSBgJg",
+ "tags": ["#ASMR"],
"typename": "GraphVideo",
"username": "instagram",
"width": int,
@@ -370,6 +411,7 @@ class InstagramImageExtractor(InstagramExtractor):
"height": int,
"likes": int,
"media_id": "1806097553666903266",
+ "post_url": "https://www.instagram.com/p/BkQjCfsBIzi/",
"shortcode": "BkQjCfsBIzi",
"typename": "GraphVideo",
"username": "instagram",
@@ -381,11 +423,23 @@ class InstagramImageExtractor(InstagramExtractor):
("https://www.instagram.com/p/BtOvDOfhvRr/", {
"count": 2,
"keyword": {
+ "post_url": "https://www.instagram.com/p/BtOvDOfhvRr/",
"sidecar_media_id": "1967717017113261163",
"sidecar_shortcode": "BtOvDOfhvRr",
"video_url": str,
}
- })
+ }),
+
+ # GraphImage with tagged user
+ ("https://www.instagram.com/p/B_2lf3qAd3y/", {
+ "keyword": {
+ "tagged_users": [{
+ "id": "1246468638",
+ "username": "kaaymbl",
+ "full_name": "Call Me Kay",
+ }]
+ }
+ }),
)
def __init__(self, match):
@@ -476,7 +530,7 @@ class InstagramUserExtractor(InstagramExtractor):
'node_id': 'id',
'variables_id': 'id',
'edge_to_medias': 'edge_owner_to_timeline_media',
- 'query_hash': 'f2405b236d85e8296cf30347c9f08c2a',
+ 'query_hash': '44efc15d3c13342d02df0b5a9fa3d33f',
})
if self.config('highlights'):
@@ -545,5 +599,5 @@ class InstagramTagExtractor(InstagramExtractor):
'node_id': 'name',
'variables_id': 'tag_name',
'edge_to_medias': 'edge_hashtag_to_media',
- 'query_hash': 'f12c9ec5e46a3173b2969c712ad84744',
+ 'query_hash': '7dabc71d3e758b1ec19ffb85639e427b',
})
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 38c90df..72465f7 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -51,7 +51,7 @@ class MangadexChapterExtractor(MangadexExtractor):
test = (
("https://mangadex.org/chapter/122094", {
"keyword": "ef1084c2845825979e150512fed8fdc209baf05a",
- "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f",
+ "content": "50383a4c15124682057b197d40261641a98db514",
}),
# oneshot
("https://mangadex.cc/chapter/138086", {
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 4f0e38d..002c8f7 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -27,11 +27,9 @@ class MastodonExtractor(Extractor):
Extractor.__init__(self, match)
self.api = MastodonAPI(self)
- def config(self, key, default=None, *, sentinel=object()):
+ def config(self, key, default=None, *, sentinel=util.SENTINEL):
value = Extractor.config(self, key, sentinel)
- if value is not sentinel:
- return value
- return config.interpolate(
+ return value if value is not sentinel else config.interpolate(
("extractor", "mastodon", self.instance, self.subcategory),
key, default,
)
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 17fe935..84794ad 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -224,10 +224,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format(
self.user, match.group(3))
else:
- url = match.group(0)
- if not url.startswith("http"):
- url = "https://" + url
- self.post_url = url
+ self.post_url = text.ensure_http_scheme(match.group(0))
def posts(self):
return (self.post_url,)
@@ -414,6 +411,6 @@ class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
@staticmethod
def _extract_favorites(page):
return [
- "https://" + user.rpartition('"')[2].lstrip("/:")
+ text.ensure_http_scheme(user.rpartition('"')[2])
for user in text.extract_iter(page, 'class="item-user', '"><img')
]
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index c06721c..c07c4b7 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -10,9 +10,8 @@
from .common import Extractor, Message
from . import deviantart, flickr, reddit, smugmug, tumblr
-from .. import text, oauth, config, exception
+from .. import text, oauth, util, config, exception
from ..cache import cache
-import os
import urllib.parse
REDIRECT_URI_LOCALHOST = "http://localhost:6414/"
@@ -27,6 +26,7 @@ class OAuthBase(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.client = None
+ self.cache = config.get(("extractor", self.category), "cache", True)
def oauth_config(self, key, default=None):
return config.interpolate(
@@ -42,7 +42,7 @@ class OAuthBase(Extractor):
server.listen(1)
# workaround for ctrl+c not working during server.accept on Windows
- if os.name == "nt":
+ if util.WINDOWS:
server.settimeout(1.0)
while True:
try:
@@ -87,12 +87,20 @@ class OAuthBase(Extractor):
# exchange the request token for an access token
data = self.session.get(access_token_url, params=data).text
-
data = text.parse_query(data)
- self.send(OAUTH1_MSG_TEMPLATE.format(
- category=self.subcategory,
- token=data["oauth_token"],
- token_secret=data["oauth_token_secret"],
+ token = data["oauth_token"]
+ token_secret = data["oauth_token_secret"]
+
+ # write to cache
+ if self.cache:
+ key = (self.subcategory, self.session.auth.consumer_key)
+ oauth._token_cache.update(key, (token, token_secret))
+ self.log.info("Writing tokens to cache")
+
+ # display tokens
+ self.send(self._generate_message(
+ ("access-token", "access-token-secret"),
+ (token, token_secret),
))
def _oauth2_authorization_code_grant(
@@ -149,24 +157,66 @@ class OAuthBase(Extractor):
self.send(data["error"])
return
- # display token
- part = key.partition("_")[0]
- template = message_template or OAUTH2_MSG_TEMPLATE
- self.send(template.format(
- category=self.subcategory,
- key=part,
- Key=part.capitalize(),
- token=data[key],
- instance=getattr(self, "instance", ""),
- client_id=client_id,
- client_secret=client_secret,
- ))
-
# write to cache
- if cache and config.get(("extractor", self.category), "cache"):
+ if self.cache and cache:
cache.update("#" + str(client_id), data[key])
self.log.info("Writing 'refresh-token' to cache")
+ # display token
+ if message_template:
+ msg = message_template.format(
+ category=self.subcategory,
+ key=key.partition("_")[0],
+ token=data[key],
+ instance=getattr(self, "instance", ""),
+ client_id=client_id,
+ client_secret=client_secret,
+ )
+ else:
+ msg = self._generate_message(
+ ("refresh-token",),
+ (data[key],),
+ )
+ self.send(msg)
+
+ def _generate_message(self, names, values):
+ """Build the text shown to the user after tokens were obtained
+
+ 'names' is a sequence of option names (e.g. "access-token") and
+ 'values' the matching token strings. The message lists the values
+ and then, depending on self.cache and the configured value of the
+ first option, tells the user that the values were cached, how to
+ enable using the cached values, or which config keys to set.
+ """
+ # pick singular or plural wording based on the number of names
+ if len(names) == 1:
+ _vh = "This value has"
+ _is = "is"
+ _it = "it"
+ _va = "this value"
+ else:
+ _vh = "These values have"
+ _is = "are"
+ _it = "them"
+ _va = "these values"
+
+ # header: quoted option names, verb, then one value per line
+ msg = "\nYour {} {}\n\n{}\n\n".format(
+ " and ".join("'" + n + "'" for n in names),
+ _is,
+ "\n".join(values),
+ )
+
+ if self.cache:
+ # only the first option name is inspected to decide whether the
+ # cached values will be picked up automatically
+ opt = self.oauth_config(names[0])
+ if opt is None or opt == "cache":
+ msg += _vh + " been cached and will automatically be used."
+ else:
+ msg += (
+ "Set 'extractor.{}.{}' to \"cache\" to use {}.".format(
+ self.subcategory, names[0], _it,
+ )
+ )
+ else:
+ # caching disabled: list the full config key for every name
+ msg += "Put " + _va + " into your configuration file as \n"
+ msg += " and\n".join(
+ "'extractor." + self.subcategory + "." + n + "'"
+ for n in names
+ )
+ msg += "."
+
+ return msg
+
class OAuthDeviantart(OAuthBase):
subcategory = "deviantart"
@@ -224,6 +274,7 @@ class OAuthReddit(OAuthBase):
"https://www.reddit.com/api/v1/authorize",
"https://www.reddit.com/api/v1/access_token",
scope="read history",
+ cache=reddit._refresh_token_cache,
)
@@ -318,49 +369,8 @@ class OAuthMastodon(OAuthBase):
return data
-OAUTH1_MSG_TEMPLATE = """
-Your Access Token and Access Token Secret are
-
-{token}
-{token_secret}
-
-Put these values into your configuration file as
-'extractor.{category}.access-token' and
-'extractor.{category}.access-token-secret'.
-
-Example:
-{{
- "extractor": {{
- "{category}": {{
- "access-token": "{token}",
- "access-token-secret": "{token_secret}"
- }}
- }}
-}}
-"""
-
-
-OAUTH2_MSG_TEMPLATE = """
-Your {Key} Token is
-
-{token}
-
-Put this value into your configuration file as
-'extractor.{category}.{key}-token'.
-
-Example:
-{{
- "extractor": {{
- "{category}": {{
- "{key}-token": "{token}"
- }}
- }}
-}}
-"""
-
-
MASTODON_MSG_TEMPLATE = """
-Your {Key} Token is
+Your 'access-token' is
{token}
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 570bd72..a14ec9c 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -98,8 +98,7 @@ class PatreonExtractor(Extractor):
headers = {"Referer": self.root}
while url:
- if not url.startswith("http"):
- url = "https://" + url.lstrip("/:")
+ url = text.ensure_http_scheme(url)
posts = self.request(url, headers=headers).json()
if "included" in posts:
diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py
index 1a793a0..ead5c35 100644
--- a/gallery_dl/extractor/recursive.py
+++ b/gallery_dl/extractor/recursive.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -28,6 +28,7 @@ class RecursiveExtractor(Extractor):
self.session.mount("file://", FileAdapter())
page = self.request(self.url.partition(":")[2]).text
+ del self.session.adapters["file://"]
yield Message.Version, 1
with extractor.blacklist(blist):
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index d0232cc..2e3864a 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -222,20 +222,25 @@ class RedditAPI():
self.extractor = extractor
self.comments = text.parse_int(extractor.config("comments", 0))
self.morecomments = extractor.config("morecomments", False)
- self.refresh_token = extractor.config("refresh-token")
self.log = extractor.log
client_id = extractor.config("client-id", self.CLIENT_ID)
user_agent = extractor.config("user-agent", self.USER_AGENT)
if (client_id == self.CLIENT_ID) ^ (user_agent == self.USER_AGENT):
- self.client_id = None
- self.log.warning(
+ raise exception.StopExtraction(
"Conflicting values for 'client-id' and 'user-agent': "
"overwrite either both or none of them.")
+
+ self.client_id = client_id
+ self.headers = {"User-Agent": user_agent}
+
+ token = extractor.config("refresh-token")
+ if token is None or token == "cache":
+ key = "#" + self.client_id
+ self.refresh_token = _refresh_token_cache(key)
else:
- self.client_id = client_id
- extractor.session.headers["User-Agent"] = user_agent
+ self.refresh_token = token
def submission(self, submission_id):
"""Fetch the (submission, comments)=-tuple for a submission id"""
@@ -277,13 +282,15 @@ class RedditAPI():
def authenticate(self):
"""Authenticate the application by requesting an access token"""
- access_token = self._authenticate_impl(self.refresh_token)
- self.extractor.session.headers["Authorization"] = access_token
+ self.headers["Authorization"] = \
+ self._authenticate_impl(self.refresh_token)
@cache(maxage=3600, keyarg=1)
def _authenticate_impl(self, refresh_token=None):
"""Actual authenticate implementation"""
url = "https://www.reddit.com/api/v1/access_token"
+ self.headers["Authorization"] = None
+
if refresh_token:
self.log.info("Refreshing private access token")
data = {"grant_type": "refresh_token",
@@ -294,9 +301,9 @@ class RedditAPI():
"grants/installed_client"),
"device_id": "DO_NOT_TRACK_THIS_DEVICE"}
- auth = (self.client_id, "")
response = self.extractor.request(
- url, method="POST", data=data, auth=auth, fatal=False)
+ url, method="POST", headers=self.headers,
+ data=data, auth=(self.client_id, ""), fatal=False)
data = response.json()
if response.status_code != 200:
@@ -307,9 +314,10 @@ class RedditAPI():
def _call(self, endpoint, params):
url = "https://oauth.reddit.com" + endpoint
- params["raw_json"] = 1
+ params["raw_json"] = "1"
self.authenticate()
- response = self.extractor.request(url, params=params, fatal=None)
+ response = self.extractor.request(
+ url, params=params, headers=self.headers, fatal=None)
remaining = response.headers.get("x-ratelimit-remaining")
if remaining and float(remaining) < 2:
@@ -380,3 +388,10 @@ class RedditAPI():
@staticmethod
def _decode(sid):
return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz")
+
+
+# maxage is 100 years expressed in seconds.
+# NOTE(review): 'keyarg=0' presumably keys the cache on the first argument
+# and a stored value is returned without running the body -- confirm
+# against the 'cache' decorator.
+@cache(maxage=100*365*24*3600, keyarg=0)
+def _refresh_token_cache(token):
+ """Return 'token', or None if it starts with '#'
+
+ RedditAPI calls this with "#" + client_id when no explicit
+ refresh-token is configured; for such an argument the function body
+ itself yields None.
+ """
+ if token and token[0] == "#":
+ return None
+ return token
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
new file mode 100644
index 0000000..7855eab
--- /dev/null
+++ b/gallery_dl/extractor/redgifs.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://redgifs.com/"""
+
+from .gfycat import GfycatImageExtractor
+from ..cache import cache
+
+
+class RedgifsImageExtractor(GfycatImageExtractor):
+ """Extractor for individual images from redgifs.com"""
+ category = "redgifs"
+ # the single capture group is the alphabetic ID following '/watch/'
+ pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/watch/([A-Za-z]+)"
+ test = ("https://redgifs.com/watch/foolishforkedabyssiniancat", {
+ "pattern": r"https://\w+.redgifs.com/FoolishForkedAbyssiniancat.mp4",
+ "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533",
+ })
+
+ def _get_info(self, gfycat_id):
+ """Return the data RedgifsAPI.gfycat() provides for 'gfycat_id'"""
+ # NOTE(review): presumably overrides a hook of GfycatImageExtractor
+ # so the inherited logic queries the redgifs API -- confirm in gfycat.py
+ api = RedgifsAPI(self)
+ return api.gfycat(gfycat_id)
+
+
+class RedgifsAPI():
+ """Minimal client for the redgifs.com API"""
+
+ def __init__(self, extractor):
+ # requests are sent through the extractor's request() method
+ self.extractor = extractor
+ # headers passed along with every API call; filled in by _call()
+ self.headers = {}
+
+ def gfycat(self, gfycat_id):
+ """Return the 'gfyItem' object of the 'v1/gfycats/<id>' endpoint"""
+ endpoint = "v1/gfycats/" + gfycat_id
+ return self._call(endpoint)["gfyItem"]
+
+ # NOTE(review): presumably the result is reused for up to 3600 seconds
+ # -- confirm against the 'cache' decorator
+ @cache(maxage=3600)
+ def _authenticate_impl(self):
+ """Request a web token and return it as a 'Bearer ...' header value"""
+ url = "https://weblogin.redgifs.com/oauth/webtoken"
+ headers = {
+ "Referer": "https://www.redgifs.com/",
+ "Origin" : "https://www.redgifs.com",
+ }
+ # the key is a single string literal split over two source lines
+ data = {
+ "access_key": "dBLwVuGn9eq4dtXLs8WSfpjcYFY7bPQe"
+ "AqGPSFgqeW5B9uzj2cMVhF63pTFF4Rg9",
+ }
+
+ # the payload is sent as a JSON body (json=...), not form data
+ response = self.extractor.request(
+ url, method="POST", headers=headers, json=data)
+ return "Bearer " + response.json()["access_token"]
+
+ def _call(self, endpoint):
+ """Send an authorized request to 'endpoint' and return decoded JSON"""
+ self.headers["Authorization"] = self._authenticate_impl()
+ url = "https://napi.redgifs.com/" + endpoint
+ return self.extractor.request(url, headers=self.headers).json()
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index b21ad32..2cef430 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -121,9 +121,9 @@ class SexcomPinExtractor(SexcomExtractor):
},
}),
# gif
- ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", {
- "pattern": "https://cdn.sex.com/images/.+/2014/01/26/4829951.gif",
- "content": "af6726d74d11d819e1c885fe5303f711862eae96",
+ ("https://www.sex.com/pin/55435122-ecchi/", {
+ "pattern": "https://cdn.sex.com/images/.+/2017/12/07/18760842.gif",
+ "content": "176cc63fa05182cb0438c648230c0f324a5965fe",
}),
# video
("https://www.sex.com/pin/55748341/", {
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 7e99823..3e3a5a0 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -108,11 +108,11 @@ class TumblrExtractor(Extractor):
del photo["alt_sizes"]
yield self._prepare_image(photo["url"], post)
- url = post.get("audio_url") # type: "audio"
+ url = post.get("audio_url") # type "audio"
if url and url.startswith("https://a.tumblr.com/"):
yield self._prepare(url, post)
- url = post.get("video_url") # type: "video"
+ url = post.get("video_url") # type "video"
if url:
yield self._prepare(_original_video(url), post)
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index c409f54..4c7b757 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -360,12 +360,13 @@ class TwitterTweetExtractor(TwitterExtractor):
"pattern": r"ytdl:https://video.twimg.com/ext_tw_video/.*.m3u8",
}),
# content with emoji, newlines, hashtags (#338)
- ("https://twitter.com/yumi_san0112/status/1151144618936823808", {
+ ("https://twitter.com/playpokemon/status/1263832915173048321", {
"options": (("content", True),),
"keyword": {"content": (
- "re:晴、お誕生日おめでとう🎉!\n実は下の名前が同じなので結構親近感ある"
- "アイドルです✨\n今年の晴ちゃんめちゃくちゃ可愛い路線攻めてるから、そろ"
- "そろまたかっこいい晴が見たいですねw\n#結城晴生誕祭2019\n#結城晴生誕祭"
+ r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
+ "Gifts! \n\nYou’ll be able to receive four Galarian form "
+ "Pokémon with Hidden Abilities, plus some very useful items. "
+ "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
)},
}),
# Reply to another tweet (#403)
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
new file mode 100644
index 0000000..86ada49
--- /dev/null
+++ b/gallery_dl/extractor/webtoons.py
@@ -0,0 +1,148 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Leonardo Taccari
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://www.webtoons.com/"""
+
+from .common import Extractor, Message
+from .. import exception, text, util
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?webtoons\.com/((en|fr)"
+
+
+class WebtoonsExtractor(Extractor):
+ """Base class for webtoons extractors"""
+ category = "webtoons"
+ root = "https://www.webtoons.com"
+ cookiedomain = "www.webtoons.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ # set the 'ageGatePass' cookie to "true" for the webtoons domain
+ self.session.cookies.set("ageGatePass", "true",
+ domain=self.cookiedomain)
+ # the subclass patterns built on BASE_PATTERN define five groups:
+ # whole path, language code, genre, comic name and query string
+ self.path, self.lang, self.genre , self.comic, self.query = \
+ match.groups()
+
+class WebtoonsEpisodeExtractor(WebtoonsExtractor):
+ """Extractor for an episode on webtoons.com"""
+ subcategory = "episode"
+ directory_fmt = ("{category}", "{comic}")
+ filename_fmt = "{episode}-{num:>02}.{extension}"
+ archive_fmt = "{title_no}_{episode}_{num}"
+ # the path segment after the comic name (the episode slug) is matched
+ # but not captured; the query string after '/viewer?' is required
+ pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+)/(?:[^/?&#]+))"
+ r"/viewer(?:\?([^#]+))")
+ test = (
+ (("https://www.webtoons.com/en/comedy/safely-endangered"
+ "/ep-572-earth/viewer?title_no=352&episode_no=572"), {
+ "url": "11041d71a3f92728305c11a228e77cf0f7aa02ef",
+ "content": "4f7701a750368e377d65900e6e8f64a5f9cb9c86",
+ "count": 5,
+ }),
+ )
+
+ def __init__(self, match):
+ """Read 'title_no' and 'episode_no' from the URL query
+
+ Raises exception.NotFoundError if either value is missing or empty.
+ """
+ WebtoonsExtractor.__init__(self, match)
+ query = text.parse_query(self.query)
+ self.title_no = query.get("title_no")
+ if not self.title_no:
+ raise exception.NotFoundError("title_no")
+ self.episode = query.get("episode_no")
+ if not self.episode:
+ raise exception.NotFoundError("episode_no")
+
+ def items(self):
+ """Yield the version, directory and URL messages for one episode"""
+ url = "{}/{}/viewer?{}".format(self.root, self.path, self.query)
+ # the viewer URL is set as Referer on the session headers
+ self.session.headers["Referer"] = url
+
+ page = self.request(url).text
+ data = self.get_job_metadata(page)
+ imgs = self.get_image_urls(page)
+ data["count"] = len(imgs)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+ # the loop target stores the 1-based image index directly in 'data'
+ for data["num"], url in enumerate(imgs, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def get_job_metadata(self, page):
+ """Collect metadata for extractor-job"""
+ title, pos = text.extract(
+ page, '<meta property="og:title" content="', '"')
+ # the description search continues from the position after the title
+ descr, pos = text.extract(
+ page, '<meta property="og:description" content="', '"', pos)
+
+ return {
+ "genre": self.genre,
+ "comic": self.comic,
+ "title_no": self.title_no,
+ "episode": self.episode,
+ "title": text.unescape(title),
+ "description": text.unescape(descr),
+ "lang": self.lang,
+ "language": util.code_to_language(self.lang),
+ }
+
+ @staticmethod
+ def get_image_urls(page):
+ """Extract and return a list of all image urls"""
+ return list(text.extract_iter(page, 'class="_images" data-url="', '"'))
+
+
+class WebtoonsComicExtractor(WebtoonsExtractor):
+ """Extractor for an entire comic on webtoons.com"""
+ subcategory = "comic"
+ # the query string after '/list?' is required and captured
+ pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+))"
+ r"/list(?:\?([^#]+))")
+ test = (
+ # english
+ (("https://www.webtoons.com/en/comedy/live-with-yourself/"
+ "list?title_no=919"), {
+ "pattern": WebtoonsEpisodeExtractor.pattern,
+ "range": "1-15",
+ "count": ">= 15",
+ }),
+ # french
+ (("https://www.webtoons.com/fr/romance/subzero/"
+ "list?title_no=1845&page=3"), {
+ "count": ">= 15",
+ }),
+ )
+
+ def __init__(self, match):
+ """Read 'title_no' and the starting page from the URL query
+
+ Raises exception.NotFoundError if 'title_no' is missing or empty.
+ """
+ WebtoonsExtractor.__init__(self, match)
+ query = text.parse_query(self.query)
+ self.title_no = query.get("title_no")
+ if not self.title_no:
+ raise exception.NotFoundError("title_no")
+ # a missing, empty or non-numeric 'page' value falls back to the
+ # first page instead of raising ValueError out of int()
+ self.page_no = text.parse_int(query.get("page"), 1)
+
+ def items(self):
+ """Yield a queue message for every episode URL, page by page"""
+ page = None
+ data = {"_extractor": WebtoonsEpisodeExtractor}
+
+ while True:
+ path = "/{}/list?title_no={}&page={}".format(
+ self.path, self.title_no, self.page_no)
+
+ # stop once the previously fetched page contains no link to
+ # the list page that would be requested next
+ if page and path not in page:
+ return
+
+ page = self.request(self.root + path).text
+ data["page"] = self.page_no
+
+ for url in self.get_episode_urls(page):
+ yield Message.Queue, url, data
+
+ self.page_no += 1
+
+ @staticmethod
+ def get_episode_urls(page):
+ """Extract and return all episode urls in 'page'"""
+ # only links after the 'id="_listUl"' marker are considered
+ pos = page.find('id="_listUl"')
+ return text.extract_iter(
+ page, '<a href="', '" class="NPI=a:list', pos)
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
index b614cab..0ada118 100644
--- a/gallery_dl/extractor/wikiart.py
+++ b/gallery_dl/extractor/wikiart.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -69,8 +69,8 @@ class WikiartArtistExtractor(WikiartExtractor):
directory_fmt = ("{category}", "{artist[artistName]}")
pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)"
test = ("https://www.wikiart.org/en/thomas-cole", {
- "url": "9049e52e897b9ae6586df4c2c4f827d0a19dafa3",
- "keyword": "c3168b21a993707c41efb7674e8c90d53a79d483",
+ "url": "5ba2fbe6783fcce34e65014d16e5fbc581490c98",
+ "keyword": "6d92913c55675e05553f000cfee5daff0b4107cf",
})
def __init__(self, match):
diff --git a/gallery_dl/job.py b/gallery_dl/job.py
index 6ba2572..130df58 100644
--- a/gallery_dl/job.py
+++ b/gallery_dl/job.py
@@ -24,20 +24,32 @@ class Job():
extr = extractor.find(extr)
if not extr:
raise exception.NoExtractorError()
-
self.extractor = extr
- extr.log.extractor = extr
- extr.log.job = self
+ self.pathfmt = None
+
+ self._logger_extra = {
+ "job" : self,
+ "extractor": extr,
+ "path" : output.PathfmtProxy(self),
+ "keywords" : output.KwdictProxy(self),
+ }
+ extr.log = self._wrap_logger(extr.log)
extr.log.debug("Using %s for '%s'", extr.__class__.__name__, extr.url)
self.status = 0
self.pred_url = self._prepare_predicates("image", True)
self.pred_queue = self._prepare_predicates("chapter", False)
- if parent and parent.extractor.config(
- "category-transfer", parent.extractor.categorytransfer):
- self.extractor.category = parent.extractor.category
- self.extractor.subcategory = parent.extractor.subcategory
+ if parent:
+ pextr = parent.extractor
+
+ # transfer (sub)category
+ if pextr.config("category-transfer", pextr.categorytransfer):
+ extr.category = pextr.category
+ extr.subcategory = pextr.subcategory
+
+ # reuse connection adapters
+ extr.session.adapters = pextr.session.adapters
# user-supplied metadata
self.userkwds = self.extractor.config("keywords")
@@ -165,6 +177,12 @@ class Job():
return util.build_predicate(predicates)
+ def get_logger(self, name):
+ """Return the logger called 'name', wrapped via _wrap_logger()"""
+ return self._wrap_logger(logging.getLogger(name))
+
+ def _wrap_logger(self, logger):
+ """Wrap 'logger' in an output.LoggerAdapter with this job's extras"""
+ # self._logger_extra holds the 'job', 'extractor', 'path' and
+ # 'keywords' entries built in Job.__init__
+ return output.LoggerAdapter(logger, self._logger_extra)
+
def _write_unsupported(self, url):
if self.ulog:
self.ulog.info(url)
@@ -175,8 +193,7 @@ class DownloadJob(Job):
def __init__(self, url, parent=None):
Job.__init__(self, url, parent)
- self.log = logging.getLogger("download")
- self.pathfmt = None
+ self.log = self.get_logger("download")
self.archive = None
self.sleep = None
self.downloaders = {}
@@ -325,7 +342,7 @@ class DownloadJob(Job):
cls = downloader.find(scheme)
if cls and config.get(("downloader", cls.scheme), "enabled", True):
- instance = cls(self.extractor, self.out)
+ instance = cls(self)
else:
instance = None
self.log.error("'%s:' URLs are not supported/enabled", scheme)
@@ -338,19 +355,20 @@ class DownloadJob(Job):
def initialize(self, kwdict=None):
"""Delayed initialization of PathFormat, etc."""
- self.pathfmt = util.PathFormat(self.extractor)
+ config = self.extractor.config
+ pathfmt = self.pathfmt = util.PathFormat(self.extractor)
if kwdict:
- self.pathfmt.set_directory(kwdict)
+ pathfmt.set_directory(kwdict)
- self.sleep = self.extractor.config("sleep")
- if not self.extractor.config("download", True):
- self.download = self.pathfmt.fix_extension
+ self.sleep = config("sleep")
+ if not config("download", True):
+ self.download = pathfmt.fix_extension
- skip = self.extractor.config("skip", True)
+ skip = config("skip", True)
if skip:
self._skipexc = None
if skip == "enumerate":
- self.pathfmt.check_file = self.pathfmt._enum_file
+ pathfmt.check_file = pathfmt._enum_file
elif isinstance(skip, str):
skip, _, smax = skip.partition(":")
if skip == "abort":
@@ -360,9 +378,9 @@ class DownloadJob(Job):
self._skipcnt = 0
self._skipmax = text.parse_int(smax)
else:
- self.pathfmt.exists = lambda x=None: False
+ pathfmt.exists = lambda x=None: False
- archive = self.extractor.config("archive")
+ archive = config("archive")
if archive:
path = util.expand_path(archive)
try:
@@ -374,27 +392,28 @@ class DownloadJob(Job):
else:
self.extractor.log.debug("Using download archive '%s'", path)
- postprocessors = self.extractor.config("postprocessors")
+ postprocessors = config("postprocessors")
if postprocessors:
+ pp_log = self.get_logger("postprocessor")
pp_list = []
+ category = self.extractor.category
for pp_dict in postprocessors:
whitelist = pp_dict.get("whitelist")
blacklist = pp_dict.get("blacklist")
- if (whitelist and self.extractor.category not in whitelist or
- blacklist and self.extractor.category in blacklist):
+ if (whitelist and category not in whitelist or
+ blacklist and category in blacklist):
continue
name = pp_dict.get("name")
pp_cls = postprocessor.find(name)
if not pp_cls:
- postprocessor.log.warning("module '%s' not found", name)
+ pp_log.warning("module '%s' not found", name)
continue
try:
- pp_obj = pp_cls(self.pathfmt, pp_dict)
+ pp_obj = pp_cls(self, pp_dict)
except Exception as exc:
- postprocessor.log.error(
- "'%s' initialization failed: %s: %s",
- name, exc.__class__.__name__, exc)
+ pp_log.error("'%s' initialization failed: %s: %s",
+ name, exc.__class__.__name__, exc)
else:
pp_list.append(pp_obj)
diff --git a/gallery_dl/oauth.py b/gallery_dl/oauth.py
index 9ceefbf..e9dfff0 100644
--- a/gallery_dl/oauth.py
+++ b/gallery_dl/oauth.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,6 +20,7 @@ import requests
import requests.auth
from . import text
+from .cache import cache
def nonce(size, alphabet=string.ascii_letters):
@@ -117,6 +118,10 @@ class OAuth1API():
token_secret = extractor.config("access-token-secret")
key_type = "default" if api_key == self.API_KEY else "custom"
+ if token is None or token == "cache":
+ key = (extractor.category, api_key)
+ token, token_secret = _token_cache(key)
+
if api_key and api_secret and token and token_secret:
self.log.debug("Using %s OAuth1.0 authentication", key_type)
self.session = OAuth1Session(
@@ -131,3 +136,8 @@ class OAuth1API():
kwargs["fatal"] = None
kwargs["session"] = self.session
return self.extractor.request(url, **kwargs)
+
+
+# maxage is 100 years expressed in seconds.
+# NOTE(review): presumably a value stored for 'key' through the decorator's
+# update() is returned instead of running the body -- confirm against the
+# 'cache' decorator.
+@cache(maxage=100*365*24*3600, keyarg=0)
+def _token_cache(key):
+ """Return the (None, None) pair as (token, token_secret) for 'key'
+
+ OAuth1API calls this with an (extractor category, api_key) tuple.
+ """
+ return None, None
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 34222a2..5b99bee 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2019 Mike Fährmann
+# Copyright 2017-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -173,6 +173,12 @@ def build_parser():
help=("Write URLs, which get emitted by other extractors but cannot "
"be handled, to FILE"),
)
+ output.add_argument(
+ "--write-pages",
+ dest="write-pages", nargs=0, action=ConfigConstAction, const=True,
+ help=("Write downloaded intermediary pages to files "
+ "in the current directory to debug problems"),
+ )
downloader = parser.add_argument_group("Downloader Options")
downloader.add_argument(
@@ -196,7 +202,7 @@ def build_parser():
downloader.add_argument(
"--http-timeout",
dest="timeout", metavar="SECONDS", type=float, action=ConfigAction,
- help="Timeout for HTTP connections (defaut: 30.0)",
+ help="Timeout for HTTP connections (default: 30.0)",
)
downloader.add_argument(
"--sleep",
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index 9e2f8a6..2d3dc17 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -22,34 +22,93 @@ LOG_LEVEL = logging.INFO
class Logger(logging.Logger):
- """Custom logger that includes extractor and job info in log records"""
- extractor = util.NONE
- job = util.NONE
+ """Custom logger that includes extra info in log records"""
def makeRecord(self, name, level, fn, lno, msg, args, exc_info,
func=None, extra=None, sinfo=None,
factory=logging._logRecordFactory):
rv = factory(name, level, fn, lno, msg, args, exc_info, func, sinfo)
- rv.extractor = self.extractor
- rv.job = self.job
+ if extra:
+ rv.__dict__.update(extra)
return rv
+class LoggerAdapter():
+ """Trimmed-down version of logging.LoggerAdapter"""
+ __slots__ = ("logger", "extra")
+
+ def __init__(self, logger, extra):
+ self.logger = logger
+ self.extra = extra
+
+ def debug(self, msg, *args, **kwargs):
+ if self.logger.isEnabledFor(logging.DEBUG):
+ kwargs["extra"] = self.extra
+ self.logger._log(logging.DEBUG, msg, args, **kwargs)
+
+ def info(self, msg, *args, **kwargs):
+ if self.logger.isEnabledFor(logging.INFO):
+ kwargs["extra"] = self.extra
+ self.logger._log(logging.INFO, msg, args, **kwargs)
+
+ def warning(self, msg, *args, **kwargs):
+ if self.logger.isEnabledFor(logging.WARNING):
+ kwargs["extra"] = self.extra
+ self.logger._log(logging.WARNING, msg, args, **kwargs)
+
+ def error(self, msg, *args, **kwargs):
+ if self.logger.isEnabledFor(logging.ERROR):
+ kwargs["extra"] = self.extra
+ self.logger._log(logging.ERROR, msg, args, **kwargs)
+
+
+class PathfmtProxy():
+ __slots__ = ("job",)
+
+ def __init__(self, job):
+ self.job = job
+
+ def __getattribute__(self, name):
+ pathfmt = object.__getattribute__(self, "job").pathfmt
+ return pathfmt.__dict__.get(name) if pathfmt else None
+
+
+class KwdictProxy():
+ __slots__ = ("job",)
+
+ def __init__(self, job):
+ self.job = job
+
+ def __getattribute__(self, name):
+ pathfmt = object.__getattribute__(self, "job").pathfmt
+ return pathfmt.kwdict.get(name) if pathfmt else None
+
+
class Formatter(logging.Formatter):
"""Custom formatter that supports different formats per loglevel"""
def __init__(self, fmt, datefmt):
- if not isinstance(fmt, dict):
+ if isinstance(fmt, dict):
+ for key in ("debug", "info", "warning", "error"):
+ value = fmt[key] if key in fmt else LOG_FORMAT
+ fmt[key] = (util.Formatter(value).format_map,
+ "{asctime" in value)
+ else:
+ if fmt == LOG_FORMAT:
+ fmt = (fmt.format_map, False)
+ else:
+ fmt = (util.Formatter(fmt).format_map, "{asctime" in fmt)
fmt = {"debug": fmt, "info": fmt, "warning": fmt, "error": fmt}
+
self.formats = fmt
self.datefmt = datefmt
def format(self, record):
record.message = record.getMessage()
- fmt = self.formats[record.levelname]
- if "{asctime" in fmt:
+ fmt, asctime = self.formats[record.levelname]
+ if asctime:
record.asctime = self.formatTime(record, self.datefmt)
- msg = fmt.format_map(record.__dict__)
+ msg = fmt(record.__dict__)
if record.exc_info and not record.exc_text:
record.exc_text = self.formatException(record.exc_info)
if record.exc_text:
@@ -244,7 +303,7 @@ class ColorOutput(TerminalOutput):
print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")
-if os.name == "nt":
+if util.WINDOWS:
ANSI = os.environ.get("TERM") == "ANSI"
OFFSET = 1
CHAR_SKIP = "# "
diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py
index 7a3bf23..faa4d6c 100644
--- a/gallery_dl/postprocessor/__init__.py
+++ b/gallery_dl/postprocessor/__init__.py
@@ -9,7 +9,6 @@
"""Post-processing modules"""
import importlib
-import logging
modules = [
"classify",
@@ -21,8 +20,6 @@ modules = [
"zip",
]
-log = logging.getLogger("postprocessor")
-
def find(name):
"""Return a postprocessor class with the given name"""
diff --git a/gallery_dl/postprocessor/classify.py b/gallery_dl/postprocessor/classify.py
index 4a9bde9..0106903 100644
--- a/gallery_dl/postprocessor/classify.py
+++ b/gallery_dl/postprocessor/classify.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -22,8 +22,8 @@ class ClassifyPP(PostProcessor):
"Archives" : ("zip", "rar", "7z", "tar", "gz", "bz2"),
}
- def __init__(self, pathfmt, options):
- PostProcessor.__init__(self)
+ def __init__(self, job, options):
+ PostProcessor.__init__(self, job)
mapping = options.get("mapping", self.DEFAULT_MAPPING)
self.mapping = {
diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py
index 70b0dfb..64f978e 100644
--- a/gallery_dl/postprocessor/common.py
+++ b/gallery_dl/postprocessor/common.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,15 +8,13 @@
"""Common classes and constants used by postprocessor modules."""
-import logging
-
class PostProcessor():
"""Base class for postprocessors"""
- def __init__(self):
+ def __init__(self, job):
name = self.__class__.__name__[:-2].lower()
- self.log = logging.getLogger("postprocessor." + name)
+ self.log = job.get_logger("postprocessor." + name)
@staticmethod
def prepare(pathfmt):
diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py
index ddbcef0..0d11844 100644
--- a/gallery_dl/postprocessor/compare.py
+++ b/gallery_dl/postprocessor/compare.py
@@ -14,8 +14,8 @@ import os
class ComparePP(PostProcessor):
- def __init__(self, pathfmt, options):
- PostProcessor.__init__(self)
+ def __init__(self, job, options):
+ PostProcessor.__init__(self, job)
if options.get("action") == "enumerate":
self.run = self._run_enumerate
if options.get("shallow"):
diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py
index 0a56281..cbe51ae 100644
--- a/gallery_dl/postprocessor/exec.py
+++ b/gallery_dl/postprocessor/exec.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,10 +11,9 @@
from .common import PostProcessor
from .. import util
import subprocess
-import os
-if os.name == "nt":
+if util.WINDOWS:
def quote(s):
return '"' + s.replace('"', '\\"') + '"'
else:
@@ -23,8 +22,8 @@ else:
class ExecPP(PostProcessor):
- def __init__(self, pathfmt, options):
- PostProcessor.__init__(self)
+ def __init__(self, job, options):
+ PostProcessor.__init__(self, job)
args = options["command"]
final = options.get("final", False)
diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py
index aa50dfd..a955ba3 100644
--- a/gallery_dl/postprocessor/metadata.py
+++ b/gallery_dl/postprocessor/metadata.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -15,8 +15,8 @@ import os
class MetadataPP(PostProcessor):
- def __init__(self, pathfmt, options):
- PostProcessor.__init__(self)
+ def __init__(self, job, options):
+ PostProcessor.__init__(self, job)
mode = options.get("mode", "json")
if mode == "custom":
diff --git a/gallery_dl/postprocessor/mtime.py b/gallery_dl/postprocessor/mtime.py
index 7065428..b8a4988 100644
--- a/gallery_dl/postprocessor/mtime.py
+++ b/gallery_dl/postprocessor/mtime.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -14,8 +14,8 @@ from ..text import parse_int
class MtimePP(PostProcessor):
- def __init__(self, pathfmt, options):
- PostProcessor.__init__(self)
+ def __init__(self, job, options):
+ PostProcessor.__init__(self, job)
self.key = options.get("key", "date")
def run(self, pathfmt):
diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py
index 706e706..1afba86 100644
--- a/gallery_dl/postprocessor/ugoira.py
+++ b/gallery_dl/postprocessor/ugoira.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2018 Mike Fährmann
+# Copyright 2018-2020 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Convert pixiv ugoira to webm"""
+"""Convert Pixiv Ugoira to WebM"""
from .common import PostProcessor
from .. import util
@@ -19,8 +19,8 @@ import os
class UgoiraPP(PostProcessor):
- def __init__(self, pathfmt, options):
- PostProcessor.__init__(self)
+ def __init__(self, job, options):
+ PostProcessor.__init__(self, job)
self.extension = options.get("extension") or "webm"
self.args = options.get("ffmpeg-args") or ()
self.twopass = options.get("ffmpeg-twopass", False)
diff --git a/gallery_dl/postprocessor/zip.py b/gallery_dl/postprocessor/zip.py
index a43c43a..6970e95 100644
--- a/gallery_dl/postprocessor/zip.py
+++ b/gallery_dl/postprocessor/zip.py
@@ -22,8 +22,8 @@ class ZipPP(PostProcessor):
"lzma" : zipfile.ZIP_LZMA,
}
- def __init__(self, pathfmt, options):
- PostProcessor.__init__(self)
+ def __init__(self, job, options):
+ PostProcessor.__init__(self, job)
self.delete = not options.get("keep-files", False)
ext = "." + options.get("extension", "zip")
algorithm = options.get("compression", "store")
@@ -33,7 +33,7 @@ class ZipPP(PostProcessor):
algorithm)
algorithm = "store"
- self.path = pathfmt.realdirectory
+ self.path = job.pathfmt.realdirectory
args = (self.path[:-1] + ext, "a",
self.COMPRESSION_ALGORITHMS[algorithm], True)
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
index 3bb6390..4dc0963 100644
--- a/gallery_dl/text.py
+++ b/gallery_dl/text.py
@@ -60,6 +60,13 @@ def split_html(txt, sep=None):
return []
+def ensure_http_scheme(url, scheme="https://"):
+ """Prepend 'scheme' to 'url' if it doesn't have one"""
+ if url and not url.startswith(("https://", "http://")):
+ return scheme + url.lstrip("/:")
+ return url
+
+
def filename_from_url(url):
"""Extract the last part of an URL to use as a filename"""
try:
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index 83cf84b..85b871b 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -113,6 +113,57 @@ def dump_json(obj, fp=sys.stdout, ensure_ascii=True, indent=4):
fp.write("\n")
+def dump_response(response, fp=sys.stdout,
+ headers=True, content=True, hide_auth=True):
+ """Write the contents of 'response' into a file-like object"""
+
+ if headers:
+ request = response.request
+ req_headers = request.headers.copy()
+ outfmt = """\
+{request.method} {request.url}
+Status: {response.status_code} {response.reason}
+
+Request Headers
+---------------
+{request_headers}
+
+Response Headers
+----------------
+{response_headers}
+"""
+ if hide_auth:
+ authorization = req_headers.get("Authorization")
+ if authorization:
+ atype, sep, _ = authorization.partition(" ")
+ req_headers["Authorization"] = atype + " ***" if sep else "***"
+
+ cookies = req_headers.get("Cookie")
+ if cookies:
+ req_headers["Cookie"] = ";".join(
+ cookie.partition("=")[0] + "=***"
+ for cookie in cookies.split(";")
+ )
+
+ fp.write(outfmt.format(
+ request=request,
+ response=response,
+ request_headers="\n".join(
+ name + ": " + value
+ for name, value in req_headers.items()
+ ),
+ response_headers="\n".join(
+ name + ": " + value
+ for name, value in response.headers.items()
+ ),
+ ).encode())
+
+ if content:
+ if headers:
+ fp.write(b"\nContent\n-------\n")
+ fp.write(response.content)
+
+
def expand_path(path):
"""Expand environment variables and tildes (~)"""
if not path:
@@ -270,6 +321,8 @@ class UniversalNone():
NONE = UniversalNone()
+WINDOWS = (os.name == "nt")
+SENTINEL = object()
def build_predicate(predicates):
@@ -672,22 +725,26 @@ class PathFormat():
self.basedirectory = basedir
restrict = extractor.config("path-restrict", "auto")
+ replace = extractor.config("path-replace", "_")
+
if restrict == "auto":
- restrict = "\\\\|/<>:\"?*" if os.name == "nt" else "/"
+ restrict = "\\\\|/<>:\"?*" if WINDOWS else "/"
elif restrict == "unix":
restrict = "/"
elif restrict == "windows":
restrict = "\\\\|/<>:\"?*"
+ self.clean_segment = self._build_cleanfunc(restrict, replace)
remove = extractor.config("path-remove", "\x00-\x1f\x7f")
-
- self.clean_segment = self._build_cleanfunc(restrict, "_")
self.clean_path = self._build_cleanfunc(remove, "")
@staticmethod
def _build_cleanfunc(chars, repl):
if not chars:
return lambda x: x
+ elif isinstance(chars, dict):
+ def func(x, table=str.maketrans(chars)):
+ return x.translate(table)
elif len(chars) == 1:
def func(x, c=chars, r=repl):
return x.replace(c, r)
@@ -726,7 +783,7 @@ class PathFormat():
def set_directory(self, kwdict):
"""Build directory path and create it if necessary"""
- windows = os.name == "nt"
+ self.kwdict = kwdict
# Build path segments by applying 'kwdict' to directory format strings
segments = []
@@ -734,7 +791,7 @@ class PathFormat():
try:
for formatter in self.directory_formatters:
segment = formatter(kwdict).strip()
- if windows:
+ if WINDOWS:
# remove trailing dots and spaces (#647)
segment = segment.rstrip(". ")
if segment:
@@ -751,7 +808,7 @@ class PathFormat():
directory += sep
self.directory = directory
- if windows:
+ if WINDOWS:
# Enable longer-than-260-character paths on Windows
directory = "\\\\?\\" + os.path.abspath(directory)
@@ -772,6 +829,8 @@ class PathFormat():
if self.extension:
self.build_path()
+ else:
+ self.filename = ""
def set_extension(self, extension, real=True):
"""Set filename extension"""
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 40b5c73..dd6f373 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.13.6"
+__version__ = "1.14.0"