Diffstat (limited to 'gallery_dl/extractor')

 gallery_dl/extractor/__init__.py     |  1
 gallery_dl/extractor/common.py       | 14
 gallery_dl/extractor/danbooru.py     |  2
 gallery_dl/extractor/deviantart.py   |  2
 gallery_dl/extractor/exhentai.py     | 73
 gallery_dl/extractor/nijie.py        | 34
 gallery_dl/extractor/oauth.py        |  2
 gallery_dl/extractor/patreon.py      |  7
 gallery_dl/extractor/pixeldrain.py   |  4
 gallery_dl/extractor/reddit.py       |  3
 gallery_dl/extractor/twitter.py      | 37
 gallery_dl/extractor/urlgalleries.py | 55

12 files changed, 179 insertions(+), 55 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 72239d5..d074de2 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -155,6 +155,7 @@ modules = [
     "tumblrgallery",
     "twibooru",
     "twitter",
+    "urlgalleries",
     "unsplash",
     "uploadir",
     "urlshortener",
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index f378427..9b010c5 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -136,6 +136,18 @@ class Extractor():
             kwargs["timeout"] = self._timeout
         if "verify" not in kwargs:
             kwargs["verify"] = self._verify
+
+        if "json" in kwargs:
+            json = kwargs["json"]
+            if json is not None:
+                kwargs["data"] = util.json_dumps(json).encode()
+                del kwargs["json"]
+                headers = kwargs.get("headers")
+                if headers:
+                    headers["Content-Type"] = "application/json"
+                else:
+                    kwargs["headers"] = {"Content-Type": "application/json"}
+
         response = None
         tries = 1
@@ -233,7 +245,7 @@ class Extractor():
         password = None
 
         if username:
-            password = self.config("password")
+            password = self.config("password") or util.LazyPrompt()
         elif self.config("netrc", False):
             try:
                 info = netrc.netrc().authenticators(self.category)
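The common.py hunk above gives Extractor.request() a json parameter of its own: the payload is serialized with util.json_dumps(), sent as the request body, and a "Content-Type: application/json" header is set, merging into any caller-supplied headers. For an extractor author the call site would look like this sketch (endpoint and payload are hypothetical, not part of the commit):

    class ExampleAPI():
        # minimal sketch; 'extractor' is any gallery-dl Extractor instance
        def __init__(self, extractor):
            self.extractor = extractor

        def posts(self, tags):
            payload = {"tags": tags, "limit": 100}    # hypothetical payload
            return self.extractor.request(
                "https://example.org/api/posts",      # hypothetical endpoint
                method="POST",
                json=payload,  # serialized and content-typed by request()
            ).json()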
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 56d81e5..9e6516e 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -36,7 +36,7 @@ class DanbooruExtractor(BaseExtractor):
         username, api_key = self._get_auth_info()
         if username:
             self.log.debug("Using HTTP Basic Auth for user '%s'", username)
-            self.session.auth = (username, api_key)
+            self.session.auth = util.HTTPBasicAuth(username, api_key)
 
     def skip(self, num):
         pages = num // self.per_page
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 2c37ef1..1852dc1 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -1239,7 +1239,7 @@ class DeviantartOAuthAPI():
             self.log.info("Requesting public access token")
             data = {"grant_type": "client_credentials"}
-            auth = (self.client_id, self.client_secret)
+            auth = util.HTTPBasicAuth(self.client_id, self.client_secret)
 
         response = self.extractor.request(
             url, method="POST", data=data, auth=auth, fatal=False)
         data = response.json()
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 5dc498f..a479d00 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -47,14 +47,6 @@ class ExhentaiExtractor(Extractor):
         if self.version != "ex":
             self.cookies.set("nw", "1", domain=self.cookies_domain)
 
-        self.original = self.config("original", True)
-
-        limits = self.config("limits", False)
-        if limits and limits.__class__ is int:
-            self.limits = limits
-            self._remaining = 0
-        else:
-            self.limits = False
 
     def request(self, url, **kwargs):
         response = Extractor.request(self, url, **kwargs)
@@ -85,6 +77,7 @@ class ExhentaiExtractor(Extractor):
     @cache(maxage=90*24*3600, keyarg=1)
     def _login_impl(self, username, password):
         self.log.info("Logging in as %s", username)
+
         url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01"
         headers = {
             "Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1",
@@ -98,10 +91,19 @@ class ExhentaiExtractor(Extractor):
             "ipb_login_submit": "Login!",
         }
 
+        self.cookies.clear()
         response = self.request(url, method="POST", headers=headers, data=data)
         if b"You are now logged in as:" not in response.content:
             raise exception.AuthenticationError()
-        return {c: response.cookies[c] for c in self.cookies_names}
+
+        # collect more cookies
+        url = self.root + "/favorites.php"
+        response = self.request(url)
+        if response.history:
+            self.request(url)
+
+        return self.cookies
 
 
 class ExhentaiGalleryExtractor(ExhentaiExtractor):
@@ -128,6 +130,19 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         if source == "hitomi":
             self.items = self._items_hitomi
 
+        limits = self.config("limits", False)
+        if limits and limits.__class__ is int:
+            self.limits = limits
+            self._remaining = 0
+        else:
+            self.limits = False
+
+        self.fallback_retries = self.config("fallback-retries", 2)
+        if self.fallback_retries < 0:
+            self.fallback_retries = float("inf")
+
+        self.original = self.config("original", True)
+
     def favorite(self, slot="0"):
         url = self.root + "/gallerypopups.php"
         params = {
@@ -301,12 +316,11 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             if self.original and orig:
                 url = self.root + "/fullimg" + text.unescape(orig)
                 data = self._parse_original_info(extr('ownload original', '<'))
-                data["_fallback"] = ("{}?nl={}".format(url, nl),)
+                data["_fallback"] = self._fallback_original(nl, url)
             else:
                 url = iurl
                 data = self._parse_image_info(url)
-                data["_fallback"] = self._fallback(
-                    None, self.image_num, nl)
+                data["_fallback"] = self._fallback_1280(nl, self.image_num)
         except IndexError:
             self.log.debug("Page content:\n%s", page)
             raise exception.StopExtraction(
@@ -315,6 +329,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         data["num"] = self.image_num
         data["image_token"] = self.key_start = extr('var startkey="', '";')
         data["_url_1280"] = iurl
+        data["_nl"] = nl
         self.key_show = extr('var showkey="', '";')
 
         self._check_509(iurl, data)
@@ -351,12 +366,12 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
                     url = text.unescape(origurl)
                     data = self._parse_original_info(text.extract(
                         i6, "ownload original", "<", pos)[0])
-                    data["_fallback"] = ("{}?nl={}".format(url, nl),)
+                    data["_fallback"] = self._fallback_original(nl, url)
                 else:
                     url = imgurl
                     data = self._parse_image_info(url)
-                    data["_fallback"] = self._fallback(
-                        imgkey, request["page"], nl)
+                    data["_fallback"] = self._fallback_1280(
+                        nl, request["page"], imgkey)
             except IndexError:
                 self.log.debug("Page content:\n%s", page)
                 raise exception.StopExtraction(
@@ -365,6 +380,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             data["num"] = request["page"]
            data["image_token"] = imgkey
             data["_url_1280"] = imgurl
+            data["_nl"] = nl
 
             self._check_509(imgurl, data)
             yield url, text.nameext_from_url(url, data)
@@ -431,13 +447,26 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.NotFoundError("image page")
         return page
 
-    def _fallback(self, imgkey, num, nl):
-        url = "{}/s/{}/{}-{}?nl={}".format(
-            self.root, imgkey or self.key_start, self.gallery_id, num, nl)
-        page = self.request(url, fatal=False).text
-        if page.startswith(("Invalid page", "Keep trying")):
-            return
-        yield self.image_from_page(page)[0]
+    def _fallback_original(self, nl, fullimg):
+        url = "{}?nl={}".format(fullimg, nl)
+        for _ in range(self.fallback_retries):
+            yield url
+
+    def _fallback_1280(self, nl, num, token=None):
+        if not token:
+            token = self.key_start
+
+        for _ in range(self.fallback_retries):
+            url = "{}/s/{}/{}-{}?nl={}".format(
+                self.root, token, self.gallery_id, num, nl)
+
+            page = self.request(url, fatal=False).text
+            if page.startswith(("Invalid page", "Keep trying")):
+                return
+            url, data = self.image_from_page(page)
+            yield url
+
+            nl = data["_nl"]
 
     @staticmethod
     def _parse_image_info(url):
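The exhentai.py rewrite replaces the single-shot _fallback() helper with two generators that lazily yield up to 'fallback-retries' alternate URLs; gallery-dl's download job only iterates a kwdict's "_fallback" entry after the primary URL has failed. Reduced to a standalone sketch (download() is a hypothetical stand-in for the real downloader, not part of the commit):

    def fallback_original(fullimg, nl, retries=2):
        # yield the same retry URL up to 'retries' times; nothing is
        # requested until the consumer actually iterates the generator
        url = "{}?nl={}".format(fullimg, nl)
        for _ in range(retries):
            yield url

    def download_with_fallback(url, data, download):
        # 'download' is a hypothetical callable returning True on success
        if download(url):
            return True
        return any(download(u) for u in data.get("_fallback", ()))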
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 76c5404..54f2942 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -57,7 +57,11 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
                 data["user_name"] = data["artist_name"]
             yield Message.Directory, data
 
-            for image in self._extract_images(page):
+            for num, url in enumerate(self._extract_images(image_id, page)):
+                image = text.nameext_from_url(url, {
+                    "num": num,
+                    "url": "https:" + url,
+                })
                 image.update(data)
                 if not image["extension"]:
                     image["extension"] = "jpg"
@@ -72,7 +76,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
         extr = text.extract_from(page)
         keywords = text.unescape(extr(
             'name="keywords" content="', '" />')).split(",")
-        data = {
+        return {
             "title"      : keywords[0].strip(),
             "description": text.unescape(extr(
                 '"description": "', '"').replace("&amp;", "&")),
@@ -82,7 +86,6 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
             "artist_name": keywords[1],
             "tags"       : keywords[2:-1],
         }
-        return data
 
     @staticmethod
     def _extract_data_horne(page):
@@ -90,7 +93,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
         extr = text.extract_from(page)
         keywords = text.unescape(extr(
             'name="keywords" content="', '" />')).split(",")
-        data = {
+        return {
             "title"      : keywords[0].strip(),
             "description": text.unescape(extr(
                 'property="og:description" content="', '"')),
@@ -101,21 +104,16 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
                 "itemprop='datePublished' content=", "<").rpartition(">")[2],
                 "%Y-%m-%d %H:%M:%S", 9),
         }
-        return data
 
-    @staticmethod
-    def _extract_images(page):
-        """Extract image URLs from 'page'"""
-        images = text.extract_iter(page, "/view_popup.php", "</a>")
-        for num, image in enumerate(images):
-            src = text.extr(image, 'src="', '"')
-            if not src:
-                continue
-            url = ("https:" + src).replace("/__rs_l120x120/", "/")
-            yield text.nameext_from_url(url, {
-                "num": num,
-                "url": url,
-            })
+    def _extract_images(self, image_id, page):
+        if '&#diff_1" ' in page:
+            # multiple images
+            url = "{}/view_popup.php?id={}".format(self.root, image_id)
+            page = self.request(url).text
+            yield from text.extract_iter(
+                page, 'href="javascript:void(0);"><img src="', '"')
+        else:
+            yield text.extr(page, 'itemprop="image" src="', '"')
 
     @staticmethod
     def _extract_user_name(page):
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index d1f135d..65db94d 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -183,7 +183,7 @@ class OAuthBase(Extractor):
         }
 
         if auth:
-            auth = (client_id, client_secret)
+            auth = util.HTTPBasicAuth(client_id, client_secret)
         else:
             auth = None
             data["client_id"] = client_id
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 6aef9cb..fb560e9 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -254,6 +254,13 @@ class PatreonExtractor(Extractor):
         if bootstrap:
             return util.json_loads(bootstrap + "}")
 
+        bootstrap = text.extr(
+            page,
+            'window.patreon = wrapInProxy({"bootstrap":',
+            '},"apiServer"')
+        if bootstrap:
+            return util.json_loads(bootstrap + "}")
+
         bootstrap = text.extr(page, "window.patreon.bootstrap,", "});")
         if bootstrap:
             return util.json_loads(bootstrap + "}")
diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py
index 34b4ebf..5cfdc43 100644
--- a/gallery_dl/extractor/pixeldrain.py
+++ b/gallery_dl/extractor/pixeldrain.py
@@ -9,7 +9,7 @@
 """Extractors for https://pixeldrain.com/"""
 
 from .common import Extractor, Message
-from .. import text
+from .. import text, util
 
 BASE_PATTERN = r"(?:https?://)?pixeldrain\.com"
 
@@ -23,7 +23,7 @@ class PixeldrainExtractor(Extractor):
     def _init(self):
         api_key = self.config("api-key")
         if api_key:
-            self.session.auth = ("", api_key)
+            self.session.auth = util.HTTPBasicAuth("", api_key)
 
     def parse_datetime(self, date_string):
         return text.parse_datetime(
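Five extractors in this commit (danbooru, deviantart, oauth, pixeldrain, reddit) switch from requests' (username, password) tuple to util.HTTPBasicAuth. With a plain tuple, requests re-encodes the credentials into an Authorization header on every request; an auth callable can compute the header once and reuse it. A minimal sketch of such a callable, assuming requests' standard auth interface; the real util.HTTPBasicAuth may differ in detail:

    import base64

    class HTTPBasicAuth:
        """Basic Auth with a pre-computed Authorization header."""

        def __init__(self, username, password):
            self.authorization = b"Basic " + base64.b64encode(
                username.encode("latin-1") + b":" +
                password.encode("latin-1"))

        def __call__(self, request):
            # requests invokes auth objects with the PreparedRequest
            request.headers["Authorization"] = self.authorization
            return request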
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index c0bf5b3..feb6d1f 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -423,9 +423,10 @@ class RedditAPI():
                      "grants/installed_client"),
                     "device_id": "DO_NOT_TRACK_THIS_DEVICE"}
 
+        auth = util.HTTPBasicAuth(self.client_id, "")
         response = self.extractor.request(
             url, method="POST", headers=self.headers,
-            data=data, auth=(self.client_id, ""), fatal=False)
+            data=data, auth=auth, fatal=False)
         data = response.json()
 
         if response.status_code != 200:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ca1e906..f874f12 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -1276,8 +1276,21 @@ class TwitterAPI():
                 self.headers["x-csrf-token"] = csrf_token
 
             if response.status_code < 400:
-                # success
-                return response.json()
+                data = response.json()
+                if not data.get("errors") or not any(
+                        (e.get("message") or "").lower().startswith("timeout")
+                        for e in data["errors"]):
+                    return data  # success or non-timeout errors
+
+                msg = data["errors"][0].get("message") or "Unspecified"
+                self.extractor.log.debug("Internal Twitter error: '%s'", msg)
+
+                if self.headers["x-twitter-auth-type"]:
+                    self.extractor.log.debug("Retrying API request")
+                    continue  # retry
+
+                # fall through to "Login Required"
+                response.status_code = 404
 
             if response.status_code == 429:
                 # rate limit exceeded
@@ -1289,11 +1302,9 @@ class TwitterAPI():
                 self.extractor.wait(until=until, seconds=seconds)
                 continue
 
-            if response.status_code == 403 and \
-                    not self.headers["x-twitter-auth-type"] and \
-                    endpoint == "/2/search/adaptive.json":
-                raise exception.AuthorizationError(
-                    "Login required to access search results")
+            if response.status_code in (403, 404) and \
+                    not self.headers["x-twitter-auth-type"]:
+                raise exception.AuthorizationError("Login required")
 
             # error
             try:
@@ -1431,7 +1442,12 @@ class TwitterAPI():
         for instr in instructions:
             instr_type = instr.get("type")
             if instr_type == "TimelineAddEntries":
-                entries = instr["entries"]
+                if entries:
+                    entries.extend(instr["entries"])
+                else:
+                    entries = instr["entries"]
+            elif instr_type == "TimelineAddToModule":
+                entries = instr["moduleItems"]
             elif instr_type == "TimelineReplaceEntry":
                 entry = instr["entry"]
                 if entry["entryId"].startswith("cursor-bottom-"):
@@ -1479,6 +1495,11 @@ class TwitterAPI():
 
                 if esw("tweet-"):
                     tweets.append(entry)
+                elif esw("profile-grid-"):
+                    if "content" in entry:
+                        tweets.extend(entry["content"]["items"])
+                    else:
+                        tweets.append(entry)
                 elif esw(("homeConversation-",
                           "profile-conversation-",
                           "conversationthread-")):
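The twitter.py change means an HTTP 200 response is no longer unconditionally treated as success: a body whose "errors" array begins with a "timeout" message counts as an internal Twitter error, is retried for logged-in sessions, and is converted to a 404 (and therefore "Login required") for anonymous ones. The check from the diff, applied to an error payload of the assumed shape:

    data = {"errors": [{"message": "timeout: Operation timed out"}]}

    is_timeout = bool(data.get("errors")) and any(
        (e.get("message") or "").lower().startswith("timeout")
        for e in data["errors"])
    print(is_timeout)  # True: retry if logged in, else "Login required"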
+ +"""Extractors for https://urlgalleries.net/""" + +from .common import GalleryExtractor, Message +from .. import text + + +class UrlgalleriesGalleryExtractor(GalleryExtractor): + """Base class for Urlgalleries extractors""" + category = "urlgalleries" + root = "urlgalleries.net" + request_interval = (0.5, 1.0) + pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)" + example = "https://blog.urlgalleries.net/gallery-12345/TITLE" + + def __init__(self, match): + self.blog, self.gallery_id = match.groups() + url = "https://{}.urlgalleries.net/porn-gallery-{}/?a=10000".format( + self.blog, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def items(self): + page = self.request(self.gallery_url).text + imgs = self.images(page) + data = self.metadata(page) + data["count"] = len(imgs) + del page + + root = "https://{}.urlgalleries.net".format(self.blog) + yield Message.Directory, data + for data["num"], img in enumerate(imgs, 1): + response = self.request( + root + img, method="HEAD", allow_redirects=False) + yield Message.Queue, response.headers["Location"], data + + def metadata(self, page): + extr = text.extract_from(page) + return { + "gallery_id": self.gallery_id, + "_site": extr(' title="', '"'), # site name + "blog" : text.unescape(extr(' title="', '"')), + "_rprt": extr(' title="', '"'), # report button + "title": text.unescape(extr(' title="', '"').strip()), + "date" : text.parse_datetime( + extr(" images in gallery | ", "<"), "%B %d, %Y %H:%M"), + } + + def images(self, page): + imgs = text.extr(page, 'id="wtf"', "</div>") + return list(text.extract_iter(imgs, " href='", "'")) |
