diff options
| author | 2020-09-28 18:27:46 -0400 | |
|---|---|---|
| committer | 2020-09-28 18:27:46 -0400 | |
| commit | 9074eee175f76b824fbb6695d56426105191c51c (patch) | |
| tree | 2294be463d325d7092e600d88f160027c437086d /gallery_dl/extractor | |
| parent | 261c8c2bc74969e2242a153297895684742b6995 (diff) | |
New upstream version 1.15.0.upstream/1.15.0
Diffstat (limited to 'gallery_dl/extractor')
| -rw-r--r-- | gallery_dl/extractor/500px.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/__init__.py | 19 | ||||
| -rw-r--r-- | gallery_dl/extractor/aryion.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/common.py | 23 | ||||
| -rw-r--r-- | gallery_dl/extractor/danbooru.py | 7 | ||||
| -rw-r--r-- | gallery_dl/extractor/deviantart.py | 78 | ||||
| -rw-r--r-- | gallery_dl/extractor/exhentai.py | 5 | ||||
| -rw-r--r-- | gallery_dl/extractor/foolfuuka.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/foolslide.py | 66 | ||||
| -rw-r--r-- | gallery_dl/extractor/furaffinity.py | 16 | ||||
| -rw-r--r-- | gallery_dl/extractor/hbrowse.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/hitomi.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/imgur.py | 242 | ||||
| -rw-r--r-- | gallery_dl/extractor/myhentaigallery.py | 65 | ||||
| -rw-r--r-- | gallery_dl/extractor/plurk.py | 13 | ||||
| -rw-r--r-- | gallery_dl/extractor/recursive.py | 10 | ||||
| -rw-r--r-- | gallery_dl/extractor/redgifs.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/tumblr.py | 24 | ||||
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 29 | ||||
| -rw-r--r-- | gallery_dl/extractor/wikiart.py | 2 |
20 files changed, 340 insertions, 277 deletions
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py index 96cb021..4dc4f0d 100644 --- a/gallery_dl/extractor/500px.py +++ b/gallery_dl/extractor/500px.py @@ -153,7 +153,7 @@ class _500pxGalleryExtractor(_500pxExtractor): def metadata(self): user = self._request_graphql( "ProfileRendererQuery", {"username": self.user_name}, - "db1dba2cb7b7e94916d1005db16fea1a39d6211437b691c4de2f1a606c21c5fb", + "4d02ff5c13927a3ac73b3eef306490508bc765956940c31051468cf30402a503", )["profile"] self.user_id = str(user["legacyId"]) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py index 6f8867c..53bc726 100644 --- a/gallery_dl/extractor/__init__.py +++ b/gallery_dl/extractor/__init__.py @@ -74,6 +74,7 @@ modules = [ "mangareader", "mangastream", "mangoxo", + "myhentaigallery", "myportfolio", "naver", "newgrounds", @@ -140,7 +141,7 @@ def find(url): """Find a suitable extractor for the given URL""" for cls in _list_classes(): match = cls.pattern.match(url) - if match and cls not in _blacklist: + if match: return cls(match) return None @@ -169,26 +170,10 @@ def extractors(): ) -class blacklist(): - """Context Manager to blacklist extractor modules""" - def __init__(self, categories, extractors=None): - self.extractors = extractors or [] - for cls in _list_classes(): - if cls.category in categories: - self.extractors.append(cls) - - def __enter__(self): - _blacklist.update(self.extractors) - - def __exit__(self, etype, value, traceback): - _blacklist.clear() - - # -------------------------------------------------------------------- # internals _cache = [] -_blacklist = set() _module_iter = iter(modules) diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 2e4c4d4..374a9fc 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -30,6 +30,7 @@ class AryionExtractor(Extractor): Extractor.__init__(self, match) self.user = match.group(1) self.recursive = True + self._needle = "class='gallery-item' id='" def login(self): username, password = self._get_auth_info() @@ -73,7 +74,7 @@ class AryionExtractor(Extractor): while True: page = self.request(url).text yield from text.extract_iter( - page, "class='thumb' href='/g4/view/", "'") + page, self._needle, "'") pos = page.find("Next >>") if pos < 0: @@ -180,6 +181,7 @@ class AryionGalleryExtractor(AryionExtractor): url = "{}/g4/gallery/{}".format(self.root, self.user) return self._pagination(url) else: + self._needle = "class='thumb' href='/g4/view/" url = "{}/g4/latest.php?name={}".format(self.root, self.user) return util.advance(self._pagination(url), self.offset) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index e6c0968..357deac 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -31,6 +31,8 @@ class Extractor(): cookiedomain = "" root = "" test = None + _request_last = 0 + _request_interval = 0 def __init__(self, match): self.session = requests.Session() @@ -40,10 +42,14 @@ class Extractor(): self._cookiefile = None self._cookiejar = self.session.cookies self._parentdir = "" + + self._cfgpath = ("extractor", self.category, self.subcategory) self._write_pages = self.config("write-pages", False) self._retries = self.config("retries", 4) self._timeout = self.config("timeout", 30) self._verify = self.config("verify", True) + self._request_interval = self.config( + "sleep-request", self._request_interval) if self._retries < 0: self._retries = float("inf") @@ -69,8 +75,10 @@ class Extractor(): return 0 def config(self, key, default=None): - return config.interpolate( - ("extractor", self.category, self.subcategory), key, default) + return config.interpolate(self._cfgpath, key, default) + + def config_accumulate(self, key): + return config.accumulate(self._cfgpath, key) def request(self, url, *, method="GET", session=None, retries=None, encoding=None, fatal=True, notfound=None, **kwargs): @@ -81,6 +89,13 @@ class Extractor(): kwargs.setdefault("verify", self._verify) response = None + if self._request_interval: + seconds = (self._request_interval - + (time.time() - Extractor._request_last)) + if seconds > 0: + self.log.debug("Sleeping for %.5s seconds", seconds) + time.sleep(seconds) + while True: try: response = session.request(method, url, **kwargs) @@ -119,11 +134,13 @@ class Extractor(): msg = "'{} {}' for '{}'".format(code, reason, url) if code < 500 and code != 429 and code != 430: break + finally: + Extractor._request_last = time.time() self.log.debug("%s (%s/%s)", msg, tries, retries+1) if tries > retries: break - time.sleep(min(2 ** (tries-1), 1800)) + time.sleep(tries) tries += 1 raise exception.HttpError(msg, response) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py index e0edf89..1ebaf5b 100644 --- a/gallery_dl/extractor/danbooru.py +++ b/gallery_dl/extractor/danbooru.py @@ -93,7 +93,12 @@ class DanbooruExtractor(SharedConfigMixin, Extractor): if pagenum: params["page"] += 1 else: - params["page"] = "b{}".format(posts[-1]["id"]) + for post in reversed(posts): + if "id" in post: + params["page"] = "b{}".format(post["id"]) + break + else: + return class DanbooruTagExtractor(DanbooruExtractor): diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 73ef20d..a0f4d1c 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -14,7 +14,6 @@ from ..cache import cache, memcache import collections import itertools import mimetypes -import math import time import re @@ -55,6 +54,7 @@ class DeviantartExtractor(Extractor): self._update_content = self._update_content_image self.original = True + self._premium_cache = {} self.commit_journal = { "html": self._commit_journal_html, "text": self._commit_journal_text, @@ -66,6 +66,8 @@ class DeviantartExtractor(Extractor): def items(self): self.api = DeviantartOAuthAPI(self) + if not self.api.refresh_token_key: + self._fetch_premium = self._fetch_premium_notoken if self.user: profile = self.api.user_profile(self.user) @@ -83,6 +85,10 @@ class DeviantartExtractor(Extractor): yield Message.Queue, url, data continue + if "premium_folder_data" in deviation: + if not self._fetch_premium(deviation): + continue + self.prepare(deviation) yield Message.Directory, deviation @@ -261,7 +267,9 @@ class DeviantartExtractor(Extractor): return [(url + folder["name"], folder) for folder in folders] def _update_content_default(self, deviation, content): - content.update(self.api.deviation_download(deviation["deviationid"])) + public = "premium_folder_data" not in deviation + data = self.api.deviation_download(deviation["deviationid"], public) + content.update(data) def _update_content_image(self, deviation, content): data = self.api.deviation_download(deviation["deviationid"]) @@ -290,6 +298,41 @@ class DeviantartExtractor(Extractor): return response self.wait(seconds=180) + def _fetch_premium(self, deviation): + cache = self._premium_cache + + if deviation["deviationid"] not in cache: + + # check accessibility + dev = self.api.deviation(deviation["deviationid"], False) + has_access = dev["premium_folder_data"]["has_access"] + + if has_access: + self.log.info("Fetching premium folder data") + else: + self.log.warning("Unable to access premium content (type: %s)", + dev["premium_folder_data"]["type"]) + # fill cache + for dev in self.api.gallery( + deviation["author"]["username"], + deviation["premium_folder_data"]["gallery_id"], + public=False, + ): + cache[dev["deviationid"]] = dev if has_access else None + + data = cache[deviation["deviationid"]] + if data: + deviation.update(data) + return True + return False + + def _fetch_premium_notoken(self, deviation): + if not self._premium_cache: + self.log.warning( + "Unable to access premium content (no refresh-token)") + self._premium_cache = True + return False + class DeviantartUserExtractor(DeviantartExtractor): """Extractor for an artist's user profile""" @@ -837,8 +880,7 @@ class DeviantartOAuthAPI(): self.log = extractor.log self.headers = {} - delay = extractor.config("wait-min", 0) - self.delay = math.ceil(math.log2(delay)) if delay >= 1 else -1 + self.delay = extractor.config("wait-min", 0) self.delay_min = max(2, self.delay) self.mature = extractor.config("mature", "true") @@ -897,27 +939,27 @@ class DeviantartOAuthAPI(): "mature_content": self.mature} return self._pagination_folders(endpoint, params) - def deviation(self, deviation_id): + def deviation(self, deviation_id, public=True): """Query and return info about a single Deviation""" endpoint = "deviation/" + deviation_id - deviation = self._call(endpoint) + deviation = self._call(endpoint, public=public) if self.metadata: self._metadata((deviation,)) if self.folders: self._folders((deviation,)) return deviation - def deviation_content(self, deviation_id): + def deviation_content(self, deviation_id, public=False): """Get extended content of a single Deviation""" endpoint = "deviation/content" params = {"deviationid": deviation_id} - return self._call(endpoint, params, public=False) + return self._call(endpoint, params, public=public) - def deviation_download(self, deviation_id): + def deviation_download(self, deviation_id, public=True): """Get the original file download (if allowed)""" endpoint = "deviation/download/" + deviation_id params = {"mature_content": self.mature} - return self._call(endpoint, params) + return self._call(endpoint, params, public=public) def deviation_metadata(self, deviations): """ Fetch deviation metadata for a set of deviations""" @@ -930,12 +972,12 @@ class DeviantartOAuthAPI(): params = {"mature_content": self.mature} return self._call(endpoint, params)["metadata"] - def gallery(self, username, folder_id="", offset=0, extend=True): + def gallery(self, username, folder_id, offset=0, extend=True, public=True): """Yield all Deviation-objects contained in a gallery folder""" endpoint = "gallery/" + folder_id params = {"username": username, "offset": offset, "limit": 24, "mature_content": self.mature, "mode": "newest"} - return self._pagination(endpoint, params, extend) + return self._pagination(endpoint, params, extend, public) def gallery_all(self, username, offset=0): """Yield all Deviation-objects of a specific user""" @@ -993,8 +1035,8 @@ class DeviantartOAuthAPI(): """Call an API endpoint""" url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint while True: - if self.delay >= 0: - time.sleep(2 ** self.delay) + if self.delay: + time.sleep(self.delay) self.authenticate(None if public else self.refresh_token_key) response = self.extractor.request( @@ -1015,15 +1057,15 @@ class DeviantartOAuthAPI(): msg = "API responded with {} {}".format( status, response.reason) if status == 429: - if self.delay < 9: + if self.delay < 30: self.delay += 1 - self.log.warning("%s. Using %ds delay.", msg, 2 ** self.delay) + self.log.warning("%s. Using %ds delay.", msg, self.delay) else: self.log.error(msg) return data - def _pagination(self, endpoint, params, extend=True): - public = warn = True + def _pagination(self, endpoint, params, extend=True, public=True): + warn = True while True: data = self._call(endpoint, params, public=public) if "results" not in data: diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index 80c7187..cb4df11 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -34,6 +34,9 @@ class ExhentaiExtractor(Extractor): LIMIT = False def __init__(self, match): + # allow calling 'self.config()' before 'Extractor.__init__()' + self._cfgpath = ("extractor", self.category, self.subcategory) + version = match.group(1) domain = self.config("domain", "auto") if domain == "auto": @@ -193,7 +196,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self._check_limits(data) if "/fullimg.php" in url: data["extension"] = "" - self.wait(1.5) + self.wait(self.wait_max / 4) yield Message.Url, url, data def get_metadata(self, page): diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 4af9d4a..f2019ca 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -109,7 +109,7 @@ EXTRACTORS = { "root": "https://arch.b4k.co", "extra": {"external": "direct"}, "test-thread": ("https://arch.b4k.co/meta/thread/196/", { - "url": "9b0ae01292133268fe9178b71332da1ee25b7704", + "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a", }), }, "desuarchive": { diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index e624a65..0ab42db 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -17,9 +17,7 @@ from .common import ( generate_extractors, ) from .. import text, util -import base64 import json -import re class FoolslideBase(SharedConfigMixin): @@ -83,25 +81,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): }) def images(self, page): - data = None - - if self.decode == "base64": - pos = page.find("'fromCharCode'") - if pos >= 0: - blob = text.extract(page, "'", "'", pos+15)[0] - base64_data = re.sub(r"[a-zA-Z]", _decode_jaiminisbox, blob) - else: - base64_data = text.extract(page, 'atob("', '"')[0] - if base64_data: - data = base64.b64decode(base64_data.encode()).decode() - elif self.decode == "double": - pos = page.find("[{") - if pos >= 0: - data = text.extract(page, " = ", ";", pos)[0] - - if not data: - data = text.extract(page, "var pages = ", ";")[0] - return json.loads(data) + return json.loads(text.extract(page, "var pages = ", ";")[0]) class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): @@ -126,16 +106,6 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): }))) -def _decode_jaiminisbox(match): - c = match.group(0) - - # ord("Z") == 90, ord("z") == 122 - N = 90 if c <= "Z" else 122 - C = ord(c) + 13 - - return chr(C if N >= C else (C - 26)) - - EXTRACTORS = { "dokireader": { "root": "https://kobato.hologfx.com/reader", @@ -151,19 +121,6 @@ EXTRACTORS = { "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995", }), }, - "jaiminisbox": { - "root": "https://jaiminisbox.com/reader", - "pattern": r"(?:www\.)?jaiminisbox\.com/reader", - "extra": {"decode": "base64"}, - "test-chapter": - ("https://jaiminisbox.com/reader/read/oshi-no-ko/en/0/1/", { - "keyword": "d6435cfc1522293a42517a4aadda95a8631da0b3", - }), - "test-manga": - ("https://jaiminisbox.com/reader/series/oshi-no-ko/", { - "count": ">= 10", - }), - }, "kireicake": { "root": "https://reader.kireicake.com", "test-chapter": @@ -220,27 +177,6 @@ EXTRACTORS = { "keyword": "562fb5a7362a4cb43d59d5c8a6ea8080fc65cf99", }), }, - "worldthree": { - "root": "http://www.slide.world-three.org", - "pattern": r"(?:www\.)?slide\.world-three\.org", - "test-chapter": ( - (("http://www.slide.world-three.org" - "/read/black_bullet/en/2/7/page/1"), { - "url": "be2f04f6e2d311b35188094cfd3e768583271584", - "keyword": "967d536a65de4d52478d5b666a1760b181eddb6e", - }), - (("http://www.slide.world-three.org" - "/read/idolmster_cg_shuffle/en/0/4/2/"), { - "url": "6028ea5ca282744f925dfad92eeb98509f9cc78c", - "keyword": "f3cfe2ad3388991f1d045c85d0fa94795a7694dc", - }), - ), - "test-manga": - ("http://www.slide.world-three.org/series/black_bullet/", { - "url": "5743b93512d26e6b540d90a7a5d69208b6d4a738", - "keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120", - }), - }, "_ckey": "chapterclass", } diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 6dfd75d..950a174 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -55,7 +55,7 @@ class FuraffinityExtractor(Extractor): title, _, artist = text.unescape(extr( 'property="og:title" content="', '"')).rpartition(" by ") artist_url = artist.replace("_", "").lower() - path = extr('href="//d.facdn.net/', '"') + path = extr('href="//d', '"') if not path: self.log.warning( @@ -76,7 +76,7 @@ class FuraffinityExtractor(Extractor): "artist" : artist, "artist_url": artist_url, "user" : self.user or artist_url, - "url" : "https://d.facdn.net/" + path + "url" : "https://d" + path }) tags = extr('class="tags-row">', '</section>') @@ -179,7 +179,7 @@ class FuraffinityGalleryExtractor(FuraffinityExtractor): subcategory = "gallery" pattern = BASE_PATTERN + r"/gallery/([^/?&#]+)" test = ("https://www.furaffinity.net/gallery/mirlinthloth/", { - "pattern": r"https://d.facdn.net/art/mirlinthloth/\d+/\d+.\w+\.\w+", + "pattern": r"https://d\d?.facdn.net/art/mirlinthloth/\d+/\d+.\w+\.\w+", "range": "45-50", "count": 6, }) @@ -191,7 +191,7 @@ class FuraffinityScrapsExtractor(FuraffinityExtractor): directory_fmt = ("{category}", "{user!l}", "Scraps") pattern = BASE_PATTERN + r"/scraps/([^/?&#]+)" test = ("https://www.furaffinity.net/scraps/mirlinthloth/", { - "pattern": r"https://d.facdn.net/art/[^/]+(/stories)?/\d+/\d+.\w+.\w+", + "pattern": r"https://d\d?.facdn.net/art/[^/]+(/stories)?/\d+/\d+.\w+.", "count": ">= 3", }) @@ -202,7 +202,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): directory_fmt = ("{category}", "{user!l}", "Favorites") pattern = BASE_PATTERN + r"/favorites/([^/?&#]+)" test = ("https://www.furaffinity.net/favorites/mirlinthloth/", { - "pattern": r"https://d.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+", + "pattern": r"https://d\d?.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+", "range": "45-50", "count": 6, }) @@ -217,7 +217,7 @@ class FuraffinitySearchExtractor(FuraffinityExtractor): directory_fmt = ("{category}", "Search", "{search}") pattern = BASE_PATTERN + r"/search/?\?([^#]+)" test = ("https://www.furaffinity.net/search/?q=cute", { - "pattern": r"https://d.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+", + "pattern": r"https://d\d?.facdn.net/art/[^/]+/\d+/\d+.\w+\.\w+", "range": "45-50", "count": 6, }) @@ -236,7 +236,7 @@ class FuraffinityPostExtractor(FuraffinityExtractor): pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)" test = ( ("https://www.furaffinity.net/view/21835115/", { - "url": "eae4ef93d99365c69b31a37561bd800c03d336ad", + "url": "d80254eb4fba654597b4df8320d55916e11ba375", "keyword": { "artist" : "mirlinthloth", "artist_url" : "mirlinthloth", @@ -247,7 +247,7 @@ class FuraffinityPostExtractor(FuraffinityExtractor): "id" : 21835115, "tags" : list, "title" : "Bude's 4 Ever", - "url" : "re:https://d.facdn.net/art/mirlinthloth/music", + "url" : r"re:https://d\d?.facdn.net/art/mirlinthloth/m", "user" : "mirlinthloth", "views" : int, "favorites" : int, diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py index 181db9a..43479c6 100644 --- a/gallery_dl/extractor/hbrowse.py +++ b/gallery_dl/extractor/hbrowse.py @@ -50,7 +50,7 @@ class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor): pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))" test = ("https://www.hbrowse.com/10363/c00000", { "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6", - "keyword": "6c1136522a25de013a6579ffa34dadc1eb0d4d1b", + "keyword": "274996f6c809e5250b6ff3abbc5147e29f89d9a5", "content": "44578ebbe176c2c27434966aef22945787e2781e", }) @@ -78,7 +78,7 @@ class HbrowseMangaExtractor(HbrowseBase, MangaExtractor): pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$" test = ("https://www.hbrowse.com/10363", { "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6", - "keyword": "08f5935a4411d2c19ac1786bd4ca552c3785fcae", + "keyword": "4b15fda1858a69de1fbf5afddfe47dd893397312", }) def chapters(self, page): diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index 209a4f2..f341c47 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -25,7 +25,7 @@ class HitomiGalleryExtractor(GalleryExtractor): test = ( ("https://hitomi.la/galleries/867789.html", { "pattern": r"https://[a-c]a.hitomi.la/images/./../[0-9a-f]+.jpg", - "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2", + "keyword": "4873ef9a523621fc857b114e0b2820ba4066e9ae", "count": 16, }), # download test diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 190a4ff..4391e64 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -27,19 +27,17 @@ class ImgurExtractor(Extractor): self.mp4 = self.config("mp4", True) def _prepare(self, image): - try: - del image["ad_url"] - del image["ad_type"] - del image["ad_config"] - except KeyError: - pass + image.update(image["metadata"]) + del image["metadata"] - if image["animated"] and self.mp4 and "mp4" in image: - url = image["mp4"] - else: - url = image["link"] + if image["ext"] == "jpeg": + image["ext"] = "jpg" + elif image["is_animated"] and self.mp4 and image["ext"] == "gif": + image["ext"] = "mp4" - image["date"] = text.parse_timestamp(image["datetime"]) + image["url"] = url = "https://i.imgur.com/{}.{}".format( + image["id"], image["ext"]) + image["date"] = text.parse_datetime(image["created_at"]) text.nameext_from_url(url, image) return url @@ -65,33 +63,38 @@ class ImgurImageExtractor(ImgurExtractor): "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { - "account_id" : None, - "account_url" : None, - "animated" : False, - "bandwidth" : int, - "date" : "dt:2016-11-10 14:24:35", - "datetime" : 1478787875, - "description" : None, - "edited" : "0", - "extension" : "png", - "favorite" : False, - "filename" : "21yMxCS", - "has_sound" : False, - "height" : 32, - "id" : "21yMxCS", - "in_gallery" : False, - "in_most_viral": False, - "is_ad" : False, - "link" : "https://i.imgur.com/21yMxCS.png", - "nsfw" : False, - "section" : None, - "size" : 182, - "tags" : [], - "title" : "Test", - "type" : "image/png", - "views" : int, - "vote" : None, - "width" : 64, + "account_id" : 0, + "comment_count" : int, + "cover_id" : "21yMxCS", + "date" : "dt:2016-11-10 14:24:35", + "description" : "", + "downvote_count": int, + "duration" : 0, + "ext" : "png", + "favorite" : False, + "favorite_count": 0, + "has_sound" : False, + "height" : 32, + "id" : "21yMxCS", + "image_count" : 1, + "in_most_viral" : False, + "is_ad" : False, + "is_album" : False, + "is_animated" : False, + "is_looping" : False, + "is_mature" : False, + "is_pending" : False, + "mime_type" : "image/png", + "name" : "test-テスト", + "point_count" : int, + "privacy" : "", + "score" : int, + "size" : 182, + "title" : "Test", + "upvote_count" : int, + "url" : "https://i.imgur.com/21yMxCS.png", + "view_count" : int, + "width" : 64, }, }), ("http://imgur.com/0gybAXR", { # gifv/mp4 video @@ -101,30 +104,32 @@ class ImgurImageExtractor(ImgurExtractor): ("https://imgur.com/XFfsmuC", { # missing title in API response (#467) "keyword": {"title": "Tears are a natural response to irritants"}, }), - ("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1' - "url": "ec2cf11a2bfb4939feff374781a6e6f3e9af8e8e", - }), ("https://imgur.com/1Nily2P", { # animated png "pattern": "https://i.imgur.com/1Nily2P.png", }), ("https://imgur.com/zzzzzzz", { # not found "exception": exception.HttpError, }), - ("https://www.imgur.com/21yMxCS"), # www - ("https://m.imgur.com/21yMxCS"), # mobile - ("https://imgur.com/zxaY6"), # 5 character key - ("https://i.imgur.com/21yMxCS.png"), # direct link + ("https://www.imgur.com/21yMxCS"), # www + ("https://m.imgur.com/21yMxCS"), # mobile + ("https://imgur.com/zxaY6"), # 5 character key + ("https://i.imgur.com/21yMxCS.png"), # direct link ("https://i.imgur.com/21yMxCSh.png"), # direct link thumbnail - ("https://i.imgur.com/zxaY6.gif"), # direct link (short) - ("https://i.imgur.com/zxaY6s.gif"), # direct link (short; thumb) + ("https://i.imgur.com/zxaY6.gif"), # direct link (short) + ("https://i.imgur.com/zxaY6s.gif"), # direct link (short; thumb) ) def items(self): image = self.api.image(self.key) - if not image["title"]: - page = self.request(self.root + "/" + self.key, fatal=False).text - title = text.extract(page, "<title>", "<")[0] or "" - image["title"] = text.unescape(title.rpartition(" - ")[0].strip()) + + try: + del image["ad_url"] + del image["ad_type"] + except KeyError: + pass + + image.update(image["media"][0]) + del image["media"] url = self._prepare(image) yield Message.Version, 1 yield Message.Directory, image @@ -143,53 +148,49 @@ class ImgurAlbumExtractor(ImgurExtractor): "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", "keyword": { "album": { - "account_id" : None, - "account_url" : None, - "cover" : "693j2Kr", - "cover_edited": None, - "cover_height": 1400, - "cover_width" : 951, - "date" : "dt:2015-10-09 10:37:50", - "datetime" : 1444387070, - "description" : None, - "favorite" : False, - "id" : "TcBmP", - "images_count": 19, - "in_gallery" : False, - "is_ad" : False, - "is_album" : True, - "layout" : "blog", - "link" : "https://imgur.com/a/TcBmP", - "nsfw" : bool, - "privacy" : "hidden", - "section" : None, - "title" : "138", - "views" : int, + "account_id" : 0, + "comment_count" : int, + "cover_id" : "693j2Kr", + "date" : "dt:2015-10-09 10:37:50", + "description" : "", + "downvote_count": 0, + "favorite" : False, + "favorite_count": 0, + "id" : "TcBmP", + "image_count" : 19, + "in_most_viral" : False, + "is_ad" : False, + "is_album" : True, + "is_mature" : False, + "is_pending" : False, + "privacy" : "private", + "score" : int, + "title" : "138", + "topic" : "", + "topic_id" : 0, + "upvote_count" : int, + "url" : "https://imgur.com/a/TcBmP", + "view_count" : int, + "virality" : int, }, - "account_id" : None, - "account_url": None, - "animated" : bool, - "bandwidth" : int, + "account_id" : 0, + "count" : 19, "date" : "type:datetime", - "datetime" : int, - "description": None, - "edited" : "0", - "favorite" : False, + "description": "", + "ext" : "jpg", "has_sound" : False, "height" : int, "id" : str, - "in_gallery" : False, - "is_ad" : False, - "link" : r"re:https://i\.imgur\.com/\w+\.jpg", - "nsfw" : None, + "is_animated": False, + "is_looping" : False, + "mime_type" : "image/jpeg", + "name" : str, "num" : int, - "section" : None, "size" : int, - "tags" : list, - "title" : None, - "type" : "image/jpeg", - "views" : int, - "vote" : None, + "title" : str, + "type" : "image", + "updated_at" : None, + "url" : str, "width" : int, }, }), @@ -208,13 +209,15 @@ class ImgurAlbumExtractor(ImgurExtractor): def items(self): album = self.api.album(self.key) - album["date"] = text.parse_timestamp(album["datetime"]) - images = album["images"] + album["date"] = text.parse_datetime(album["created_at"]) + + images = album["media"] + del album["media"] count = len(images) try: - del album["images"] - del album["ad_config"] + del album["ad_url"] + del album["ad_type"] except KeyError: pass @@ -239,22 +242,17 @@ class ImgurGalleryExtractor(ImgurExtractor): ("https://imgur.com/gallery/eD9CT", { "pattern": "https://imgur.com/a/eD9CT", }), - ("https://imgur.com/t/unmuted/26sEhNr", { # unmuted URL - "pattern": "https://imgur.com/26sEhNr", - }), + ("https://imgur.com/t/unmuted/26sEhNr"), ("https://imgur.com/t/cat/qSB8NbN"), ) def items(self): - url = self.root + "/a/" + self.key - with self.request(url, method="HEAD", fatal=False) as response: - if response.status_code < 400: - extr = ImgurAlbumExtractor - else: - extr = ImgurImageExtractor - url = self.root + "/" + self.key - - yield Message.Version, 1 + if self.api.gallery(self.key)["is_album"]: + url = "{}/a/{}".format(self.root, self.key) + extr = ImgurAlbumExtractor + else: + url = "{}/{}".format(self.root, self.key) + extr = ImgurImageExtractor yield Message.Queue, url, {"_extractor": extr} @@ -346,38 +344,46 @@ class ImgurAPI(): } def account_favorites(self, account): - endpoint = "account/{}/gallery_favorites".format(account) + endpoint = "/3/account/{}/gallery_favorites".format(account) return self._pagination(endpoint) def gallery_search(self, query): - endpoint = "gallery/search" + endpoint = "/3/gallery/search" params = {"q": query} return self._pagination(endpoint, params) def account_submissions(self, account): - endpoint = "account/{}/submissions".format(account) + endpoint = "/3/account/{}/submissions".format(account) return self._pagination(endpoint) def gallery_subreddit(self, subreddit): - endpoint = "gallery/r/{}".format(subreddit) + endpoint = "/3/gallery/r/{}".format(subreddit) return self._pagination(endpoint) def gallery_tag(self, tag): - endpoint = "gallery/t/{}".format(tag) + endpoint = "/3/gallery/t/{}".format(tag) return self._pagination(endpoint, key="items") + def image(self, image_hash): + endpoint = "/post/v1/media/" + image_hash + params = {"include": "media,tags,account"} + return self._call(endpoint, params) + def album(self, album_hash): - return self._call("album/" + album_hash) + endpoint = "/post/v1/albums/" + album_hash + params = {"include": "media,tags,account"} + return self._call(endpoint, params) - def image(self, image_hash): - return self._call("image/" + image_hash) + def gallery(self, gallery_hash): + endpoint = "/post/v1/posts/" + gallery_hash + return self._call(endpoint) def _call(self, endpoint, params=None): try: return self.extractor.request( - "https://api.imgur.com/3/" + endpoint, + "https://api.imgur.com" + endpoint, params=params, headers=self.headers, - ).json()["data"] + ).json() except exception.HttpError as exc: if exc.status != 403 or b"capacity" not in exc.response.content: raise @@ -388,7 +394,7 @@ class ImgurAPI(): num = 0 while True: - data = self._call("{}/{}".format(endpoint, num), params) + data = self._call("{}/{}".format(endpoint, num), params)["data"] if key: data = data[key] if not data: diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py new file mode 100644 index 0000000..4a43d57 --- /dev/null +++ b/gallery_dl/extractor/myhentaigallery.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract hentai-gallery from https://myhentaigallery.com/""" + +from .common import GalleryExtractor +from .. import text, exception + + +class MyhentaigalleryGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from myhentaigallery.com""" + category = "myhentaigallery" + directory_fmt = ("{category}", "{gallery_id} {artist:?[/] /J, }{title}") + pattern = (r"(?:https?://)?myhentaigallery\.com" + r"/gallery/(?:thumbnails|show)/(\d+)") + test = ( + ("https://myhentaigallery.com/gallery/thumbnails/16247", { + "pattern": r"https://images.myhentaigrid.com/imagesgallery/images" + r"/[^/]+/original/\d+\.jpg", + "keyword": { + "artist" : list, + "count" : 11, + "gallery_id": 16247, + "group" : list, + "parodies" : list, + "tags" : ["Giantess"], + "title" : "Attack Of The 50ft Woman 1", + }, + }), + ("https://myhentaigallery.com/gallery/show/16247/1"), + ) + root = "https://myhentaigallery.com" + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + self.session.headers["Referer"] = url + + def metadata(self, page): + extr = text.extract_from(page) + split = text.split_html + + title = extr('<div class="comic-description">\n<h1>', '</h1>') + if not title: + raise exception.NotFoundError("gallery") + + return { + "title" : text.unescape(title), + "gallery_id": text.parse_int(self.gallery_id), + "tags" : split(extr('<div>\nCategories:', '</div>')), + "artist" : split(extr('<div>\nArtists:' , '</div>')), + "group" : split(extr('<div>\nGroups:' , '</div>')), + "parodies" : split(extr('<div>\nParodies:' , '</div>')), + } + + def images(self, page): + return [ + (text.unescape(text.extract(url, 'src="', '"')[0]).replace( + "/thumbnail/", "/original/"), None) + for url in text.extract_iter(page, 'class="comic-thumb"', '</div>') + ] diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py index 6862559..60ca1fb 100644 --- a/gallery_dl/extractor/plurk.py +++ b/gallery_dl/extractor/plurk.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019 Mike Fährmann +# Copyright 2019-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,7 +9,7 @@ """Extractors for https://www.plurk.com/""" from .common import Extractor, Message -from .. import text, extractor, exception +from .. import text, exception import datetime import time import json @@ -23,12 +23,9 @@ class PlurkExtractor(Extractor): def items(self): urls = self._urls_ex if self.config("comments", False) else self._urls - - yield Message.Version, 1 - with extractor.blacklist(("plurk",)): - for plurk in self.plurks(): - for url in urls(plurk): - yield Message.Queue, url, plurk + for plurk in self.plurks(): + for url in urls(plurk): + yield Message.Queue, url, plurk def plurks(self): """Return an iterable with all relevant 'plurk' objects""" diff --git a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py index ead5c35..4dd9d5c 100644 --- a/gallery_dl/extractor/recursive.py +++ b/gallery_dl/extractor/recursive.py @@ -9,7 +9,6 @@ """Recursive extractor""" from .common import Extractor, Message -from .. import extractor, util import requests import re @@ -23,17 +22,12 @@ class RecursiveExtractor(Extractor): }) def items(self): - blist = self.config( - "blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS) - self.session.mount("file://", FileAdapter()) page = self.request(self.url.partition(":")[2]).text del self.session.adapters["file://"] - yield Message.Version, 1 - with extractor.blacklist(blist): - for match in re.finditer(r"https?://[^\s\"']+", page): - yield Message.Queue, match.group(0), {} + for match in re.finditer(r"https?://[^\s\"']+", page): + yield Message.Queue, match.group(0), {} class FileAdapter(requests.adapters.BaseAdapter): diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 0f02e8b..96be3d8 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -24,7 +24,7 @@ class RedgifsUserExtractor(RedgifsExtractor): directory_fmt = ("{category}", "{userName}") pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?&#]+)" test = ("https://www.redgifs.com/users/Natalifiction", { - "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4", + "pattern": r"https://\w+\.(redgifs|gfycat)\.com/[A-Za-z]+\.mp4", "count": ">= 100", }) @@ -38,7 +38,7 @@ class RedgifsSearchExtractor(RedgifsExtractor): directory_fmt = ("{category}", "Search", "{search}") pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/gifs/browse/([^/?&#]+)" test = ("https://www.redgifs.com/gifs/browse/jav", { - "pattern": r"https://thcf\d+\.redgifs\.com/[A-Za-z]+\.mp4", + "pattern": r"https://\w+\.(redgifs|gfycat)\.com/[A-Za-z]+\.mp4", "range": "100-300", "count": "> 200", }) diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 4d51851..185f33a 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -9,7 +9,7 @@ """Extract images from https://www.tumblr.com/""" from .common import Extractor, Message -from .. import text, oauth, extractor, exception +from .. import text, oauth, exception from datetime import datetime, timedelta import re @@ -41,7 +41,7 @@ BASE_PATTERN = ( class TumblrExtractor(Extractor): """Base class for tumblr extractors""" category = "tumblr" - directory_fmt = ("{category}", "{name}") + directory_fmt = ("{category}", "{blog_name}") filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" cookiedomain = None @@ -69,7 +69,6 @@ class TumblrExtractor(Extractor): def items(self): blog = None - yield Message.Version, 1 for post in self.posts(): if self.date_min > post["timestamp"]: @@ -79,10 +78,10 @@ class TumblrExtractor(Extractor): if not blog: blog = self.api.info(self.blog) blog["uuid"] = self.blog - yield Message.Directory, blog.copy() if self.avatar: url = self.api.avatar(self.blog) + yield Message.Directory, {"blog": blog} yield self._prepare_avatar(url, post.copy(), blog) reblog = "reblogged_from_id" in post @@ -90,13 +89,13 @@ class TumblrExtractor(Extractor): continue post["reblogged"] = reblog + if "trail" in post: + del post["trail"] post["blog"] = blog post["date"] = text.parse_timestamp(post["timestamp"]) + yield Message.Directory, post post["num"] = 0 - if "trail" in post: - del post["trail"] - if "photos" in post: # type "photo" or "link" photos = post["photos"] del post["photos"] @@ -129,12 +128,9 @@ class TumblrExtractor(Extractor): if self.external: # external links post["extension"] = None - with extractor.blacklist(("tumblr",)): - for key in ("permalink_url", "url"): - url = post.get(key) - if url: - yield Message.Queue, url, post - break + url = post.get("permalink_url") or post.get("url") + if url: + yield Message.Queue, url, post def posts(self): """Return an iterable containing all relevant posts""" @@ -316,7 +312,7 @@ class TumblrTagExtractor(TumblrExtractor): class TumblrLikesExtractor(TumblrExtractor): """Extractor for images from a tumblr-user's liked posts""" subcategory = "likes" - directory_fmt = ("{category}", "{name}", "likes") + directory_fmt = ("{category}", "{blog_name}", "likes") archive_fmt = "f_{blog[name]}_{id}_{num}" pattern = BASE_PATTERN + r"/likes" test = ("http://mikf123.tumblr.com/likes", { diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 71f14dc..236a001 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -245,15 +245,24 @@ class TwitterExtractor(Extractor): class TwitterTimelineExtractor(TwitterExtractor): """Extractor for all images from a user's timeline""" subcategory = "timeline" - pattern = BASE_PATTERN + r"/(?!search)([^/?&#]+)/?(?:$|[?#])" + pattern = BASE_PATTERN + \ + r"/(?!search)(?:([^/?&#]+)/?(?:$|[?#])|intent/user\?user_id=(\d+))" test = ( ("https://twitter.com/supernaturepics", { "range": "1-40", "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", }), ("https://mobile.twitter.com/supernaturepics?p=i"), + ("https://www.twitter.com/id:2976459548"), + ("https://twitter.com/intent/user?user_id=2976459548"), ) + def __init__(self, match): + TwitterExtractor.__init__(self, match) + uid = match.group(2) + if uid: + self.user = "id:" + uid + def tweets(self): return TwitterAPI(self).timeline_profile(self.user) @@ -268,6 +277,7 @@ class TwitterMediaExtractor(TwitterExtractor): "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", }), ("https://mobile.twitter.com/supernaturepics/media#t"), + ("https://www.twitter.com/id:2976459548/media"), ) def tweets(self): @@ -450,18 +460,18 @@ class TwitterAPI(): return tweets def timeline_profile(self, screen_name): - user = self.user_by_screen_name(screen_name) - endpoint = "2/timeline/profile/{}.json".format(user["rest_id"]) + user_id = self._user_id_by_screen_name(screen_name) + endpoint = "2/timeline/profile/{}.json".format(user_id) return self._pagination(endpoint) def timeline_media(self, screen_name): - user = self.user_by_screen_name(screen_name) - endpoint = "2/timeline/media/{}.json".format(user["rest_id"]) + user_id = self._user_id_by_screen_name(screen_name) + endpoint = "2/timeline/media/{}.json".format(user_id) return self._pagination(endpoint) def timeline_favorites(self, screen_name): - user = self.user_by_screen_name(screen_name) - endpoint = "2/timeline/favorites/{}.json".format(user["rest_id"]) + user_id = self._user_id_by_screen_name(screen_name) + endpoint = "2/timeline/favorites/{}.json".format(user_id) return self._pagination(endpoint) def timeline_bookmark(self): @@ -490,6 +500,11 @@ class TwitterAPI(): except KeyError: raise exception.NotFoundError("user") + def _user_id_by_screen_name(self, screen_name): + if screen_name.startswith("id:"): + return screen_name[3:] + return self.user_by_screen_name(screen_name)["rest_id"] + @cache(maxage=3600) def _guest_token(self): endpoint = "1.1/guest/activate.json" diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index 0ada118..4efc92c 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -93,7 +93,7 @@ class WikiartArtworksExtractor(WikiartExtractor): directory_fmt = ("{category}", "Artworks by {group!c}", "{type}") pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)" test = ("https://www.wikiart.org/en/paintings-by-media/grisaille", { - "url": "228426a9d32b5bba9d659944c6b0ba73883af33f", + "url": "36e054fcb3363b7f085c81f4778e6db3994e56a3", }) def __init__(self, match): |
