Diffstat (limited to 'gallery_dl/extractor')
27 files changed, 695 insertions, 253 deletions
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 5675081..e686c70 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -20,6 +20,7 @@ class ArtstationExtractor(Extractor):
     filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}"
     directory_fmt = ("{category}", "{userinfo[username]}")
     archive_fmt = "{asset[id]}"
+    browser = "firefox"
     root = "https://www.artstation.com"
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index abb352c..cac8c2d 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -306,23 +306,29 @@ class Extractor():
             cookiefile = util.expand_path(cookies)
             try:
                 with open(cookiefile) as fp:
-                    cookies = util.load_cookiestxt(fp)
+                    util.cookiestxt_load(fp, self._cookiejar)
             except Exception as exc:
                 self.log.warning("cookies: %s", exc)
             else:
-                self._update_cookies(cookies)
                 self._cookiefile = cookiefile
+
+        elif isinstance(cookies, (list, tuple)):
+            from ..cookies import load_cookies
+            try:
+                load_cookies(self._cookiejar, cookies)
+            except Exception as exc:
+                self.log.warning("cookies: %s", exc)
+
         else:
             self.log.warning(
-                "expected 'dict' or 'str' value for 'cookies' option, "
-                "got '%s' (%s)", cookies.__class__.__name__, cookies)
+                "Expected 'dict', 'list', or 'str' value for 'cookies' "
+                "option, got '%s' (%s)",
+                cookies.__class__.__name__, cookies)
 
     def _store_cookies(self):
         """Store the session's cookiejar in a cookies.txt file"""
         if self._cookiefile and self.config("cookies-update", True):
             try:
                 with open(self._cookiefile, "w") as fp:
-                    util.save_cookiestxt(fp, self._cookiejar)
+                    util.cookiestxt_store(fp, self._cookiejar)
             except OSError as exc:
                 self.log.warning("cookies: %s", exc)
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 04e5926..093113d 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for 4chan archives based on FoolFuuka"""
+"""Extractors for FoolFuuka 4chan archives"""
 
 from .common import BaseExtractor, Message
 from .. import text
@@ -16,6 +16,7 @@ import itertools
 
 class FoolfuukaExtractor(BaseExtractor):
     """Base extractor for FoolFuuka based boards/archives"""
     basecategory = "foolfuuka"
+    filename_fmt = "{timestamp_ms} {filename_media}.{extension}"
     archive_fmt = "{board[shortname]}_{num}_{timestamp}"
     external = "default"
 
@@ -40,6 +41,9 @@ class FoolfuukaExtractor(BaseExtractor):
             post["filename"], _, post["extension"] = \
                 media["media"].rpartition(".")
+            post["filename_media"] = media["media_filename"].rpartition(".")[0]
+            post["timestamp_ms"] = text.parse_int(
+                media["media_orig"].rpartition(".")[0])
             yield Message.Url, url, post
 
     def metadata(self):
@@ -66,6 +70,7 @@ BASE_PATTERN = FoolfuukaExtractor.update({
     },
     "archivedmoe": {
         "root": "https://archived.moe",
+        "pattern": r"archived\.moe",
     },
     "archiveofsins": {
         "root": "https://archiveofsins.com",
@@ -73,12 +78,15 @@ BASE_PATTERN = FoolfuukaExtractor.update({
     },
     "b4k": {
         "root": "https://arch.b4k.co",
+        "pattern": r"arch\.b4k\.co",
     },
     "desuarchive": {
         "root": "https://desuarchive.org",
+        "pattern": r"desuarchive\.org",
     },
     "fireden": {
         "root": "https://boards.fireden.net",
+        "pattern": r"boards\.fireden\.net",
     },
     "nyafuu": {
         "root": "https://archive.nyafuu.org",
@@ -90,9 +98,11 @@ BASE_PATTERN = FoolfuukaExtractor.update({
     },
     "thebarchive": {
         "root": "https://thebarchive.com",
+        "pattern": r"thebarchive\.com",
     },
     "wakarimasen": {
         "root": "https://archive.wakarimasen.moe",
+        "pattern": r"archive\.wakarimasen\.moe",
     },
 })
@@ -101,7 +111,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
     """Base extractor for threads on FoolFuuka based boards/archives"""
     subcategory = "thread"
     directory_fmt = ("{category}", "{board[shortname]}",
-                     "{thread_num}{title:? - //}")
+                     "{thread_num} {title|comment[:50]}")
     pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)"
     test = (
         ("https://archive.4plebs.org/tg/thread/54059290", {
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index c09eb96..382cc25 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -41,6 +41,7 @@ class FoolslideExtractor(BaseExtractor):
 BASE_PATTERN = FoolslideExtractor.update({
     "kireicake": {
         "root": "https://reader.kireicake.com",
+        "pattern": r"reader\.kireicake\.com",
     },
     "powermanga": {
         "root": "https://read.powermanga.org",
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index 541f454..9c19664 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -1,12 +1,12 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for Gelbooru v0.1 sites"""
+"""Extractors for Gelbooru Beta 0.1.11 sites"""
 
 from . import booru
 from .. import text
@@ -42,14 +42,43 @@ class GelbooruV01Extractor(booru.BooruExtractor):
 
         return post
 
+    def _pagination(self, url, begin, end):
+        pid = self.page_start
+
+        while True:
+            page = self.request(url + str(pid)).text
+
+            cnt = 0
+            for post_id in text.extract_iter(page, begin, end):
+                yield self._parse_post(post_id)
+                cnt += 1
+
+            if cnt < self.per_page:
+                return
+            pid += self.per_page
+
 
 BASE_PATTERN = GelbooruV01Extractor.update({
-    "thecollection"     : {"root": "https://the-collection.booru.org"},
-    "illusioncardsbooru": {"root": "https://illusioncards.booru.org"},
-    "allgirlbooru"      : {"root": "https://allgirl.booru.org"},
-    "drawfriends"       : {"root": "https://drawfriends.booru.org"},
-    "vidyart"           : {"root": "https://vidyart.booru.org"},
-    "theloudbooru"      : {"root": "https://tlb.booru.org"},
+    "thecollection": {
+        "root": "https://the-collection.booru.org",
+        "pattern": r"the-collection\.booru\.org",
+    },
+    "illusioncardsbooru": {
+        "root": "https://illusioncards.booru.org",
+        "pattern": r"illusioncards\.booru\.org",
+    },
+    "allgirlbooru": {
+        "root": "https://allgirl.booru.org",
+        "pattern": r"allgirl\.booru\.org",
+    },
+    "drawfriends": {
+        "root": "https://drawfriends.booru.org",
+        "pattern": r"drawfriends\.booru\.org",
+    },
+    "vidyart": {
+        "root": "https://vidyart.booru.org",
+        "pattern": r"vidyart\.booru\.org",
+    },
 })
@@ -75,7 +104,6 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
         }),
         ("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"),
         ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"),
-        ("https://tlb.booru.org/index.php?page=post&s=list&tags=all"),
     )
 
     def __init__(self, match):
@@ -88,20 +116,42 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
     def posts(self):
         url = "{}/index.php?page=post&s=list&tags={}&pid=".format(
             self.root, self.tags)
-        pid = self.page_start
+        return self._pagination(url, 'class="thumb"><a id="p', '"')
 
-        while True:
-            page = self.request(url + str(pid)).text
 
-            cnt = 0
-            for post_id in text.extract_iter(
-                    page, 'class="thumb"><a id="p', '"'):
-                yield self._parse_post(post_id)
-                cnt += 1
+class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
+    subcategory = "favorite"
+    directory_fmt = ("{category}", "favorites", "{favorite_id}")
+    archive_fmt = "f_{favorite_id}_{id}"
+    per_page = 50
+    pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
+    test = (
+        (("https://the-collection.booru.org"
+          "/index.php?page=favorites&s=view&id=1166"), {
+            "count": 2,
+        }),
+        (("https://illusioncards.booru.org"
+          "/index.php?page=favorites&s=view&id=84887"), {
+            "count": 2,
+        }),
+        ("https://allgirl.booru.org/index.php?page=favorites&s=view&id=380", {
+            "count": 4,
+        }),
+        ("https://drawfriends.booru.org/index.php?page=favorites&s=view&id=1"),
+        ("https://vidyart.booru.org/index.php?page=favorites&s=view&id=1"),
+    )
 
-            if cnt < self.per_page:
-                return
-            pid += self.per_page
+    def __init__(self, match):
+        GelbooruV01Extractor.__init__(self, match)
+        self.favorite_id = match.group(match.lastindex)
+
+    def metadata(self):
+        return {"favorite_id": text.parse_int(self.favorite_id)}
+
+    def posts(self):
+        url = "{}/index.php?page=favorites&s=view&id={}&pid=".format(
+            self.root, self.favorite_id)
+        return self._pagination(url, "posts[", "]")
 
 
 class GelbooruV01PostExtractor(GelbooruV01Extractor):
@@ -141,7 +191,6 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor):
         }),
         ("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"),
         ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"),
-        ("https://tlb.booru.org/index.php?page=post&s=view&id=127223"),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 7e16a51..2dd0c0c 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extractors for Gelbooru v0.2 sites"""
+"""Extractors for Gelbooru Beta 0.2 sites"""
 
 from . import booru
 from .. import text, util, exception
@@ -26,6 +26,9 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         except KeyError:
             self.api_root = self.root
 
+        if self.category == "realbooru":
+            self._file_url = self._file_url_realbooru
+
     def _api_request(self, params):
         url = self.api_root + "/index.php?page=dapi&s=post&q=index"
         return ElementTree.fromstring(self.request(url, params=params).text)
@@ -61,6 +64,14 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         post["date"] = text.parse_datetime(
             post["created_at"], "%a %b %d %H:%M:%S %z %Y")
 
+    def _file_url_realbooru(self, post):
+        url = post["file_url"]
+        if url.count("/") == 5:
+            md5 = post["md5"]
+            url = "{}/images/{}/{}/{}.{}".format(
+                self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
+        return url
+
     def _extended_tags(self, post, page=None):
         if not page:
             url = "{}/index.php?page=post&s=view&id={}".format(
@@ -105,11 +116,23 @@ class GelbooruV02Extractor(booru.BooruExtractor):
 
 
 INSTANCES = {
-    "realbooru": {"root": "https://realbooru.com"},
-    "rule34"   : {"root": "https://rule34.xxx",
-                  "api_root": "https://api.rule34.xxx"},
-    "safebooru": {"root": "https://safebooru.org"},
-    "tbib"     : {"root": "https://tbib.org"},
+    "realbooru": {
+        "root": "https://realbooru.com",
+        "pattern": r"realbooru\.com",
+    },
+    "rule34": {
+        "root": "https://rule34.xxx",
+        "pattern": r"rule34\.xxx",
+        "api_root": "https://api.rule34.xxx",
+    },
+    "safebooru": {
+        "root": "https://safebooru.org",
+        "pattern": r"safebooru\.org",
+    },
+    "tbib": {
+        "root": "https://tbib.org",
+        "pattern": r"tbib\.org",
+    },
 }
 
 BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
@@ -147,7 +170,7 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
         return {"search_tags": self.tags}
 
     def posts(self):
-        return self._pagination({"tags" : self.tags})
+        return self._pagination({"tags": self.tags})
 
 
 class GelbooruV02PoolExtractor(GelbooruV02Extractor):
@@ -213,7 +236,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
             "count": 2,
         }),
         ("https://realbooru.com/index.php?page=favorites&s=view&id=274", {
-            "count": 4,
+            "count": 2,
         }),
         ("https://tbib.org/index.php?page=favorites&s=view&id=7881", {
             "count": 3,
@@ -279,7 +302,8 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
             },
         }),
         ("https://realbooru.com/index.php?page=post&s=view&id=668483", {
-            "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
+            "pattern": r"https://realbooru\.com/images/dc/b5"
+                       r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
             "content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
         }),
         ("https://tbib.org/index.php?page=post&s=view&id=9233957", {
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 2035655..fd78ce2 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -201,17 +201,24 @@ class ImgurAlbumExtractor(ImgurExtractor):
         ("https://imgur.com/a/TcBmQ", {
             "exception": exception.HttpError,
         }),
+        ("https://imgur.com/a/pjOnJA0", {  # empty, no 'media' (#2557)
+            "count": 0,
+        }),
         ("https://www.imgur.com/a/TcBmP"),  # www
         ("https://m.imgur.com/a/TcBmP"),  # mobile
     )
 
     def items(self):
         album = self.api.album(self.key)
-        album["date"] = text.parse_datetime(album["created_at"])
-        images = album["media"]
+        try:
+            images = album["media"]
+        except KeyError:
+            return
+        del album["media"]
 
         count = len(images)
+        album["date"] = text.parse_datetime(album["created_at"])
 
         try:
             del album["ad_url"]
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index e07b64e..82c9858 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -80,12 +80,22 @@ class InstagramExtractor(Extractor):
     def request(self, url, **kwargs):
         response = Extractor.request(self, url, **kwargs)
 
-        if response.history and "/accounts/login/" in response.request.url:
-            if self._cursor:
-                self.log.info("Use '-o cursor=%s' to continue downloading "
-                              "from the current position", self._cursor)
-            raise exception.StopExtraction(
-                "HTTP redirect to login page (%s)", response.request.url)
+        if response.history:
+
+            url = response.request.url
+            if "/accounts/login/" in url:
+                page = "login"
+            elif "/challenge/" in url:
+                page = "challenge"
+            else:
+                page = None
+
+            if page:
+                if self._cursor:
+                    self.log.info("Use '-o cursor=%s' to continue downloading "
+                                  "from the current position", self._cursor)
+                raise exception.StopExtraction("HTTP redirect to %s page (%s)",
+                                               page, url.partition("?")[0])
 
         www_claim = response.headers.get("x-ig-set-www-claim")
         if www_claim is not None:
@@ -298,7 +308,7 @@ class InstagramExtractor(Extractor):
                 video = None
                 media = image
 
-            files.append({
+            media = {
                 "num"       : num,
                 "date"      : text.parse_timestamp(item.get("taken_at") or
                                                    media.get("taken_at")),
@@ -309,7 +319,9 @@ class InstagramExtractor(Extractor):
                 "video_url" : video["url"] if video else None,
                 "width"     : media["width"],
                 "height"    : media["height"],
-            })
+            }
+            self._extract_tagged_users(item, media)
+            files.append(media)
 
         return data
@@ -321,22 +333,45 @@ class InstagramExtractor(Extractor):
         "abcdefghijklmnopqrstuvwxyz"
         "0123456789-_")
 
-    def _extract_tagged_users(self, src, dest):
-        if "edge_media_to_tagged_user" not in src:
-            return
-        edges = src["edge_media_to_tagged_user"]["edges"]
+    @staticmethod
+    def _extract_tagged_users(src, dest):
+        dest["tagged_users"] = tagged_users = []
+
+        edges = src.get("edge_media_to_tagged_user")
         if edges:
-            dest["tagged_users"] = tagged_users = []
-            for edge in edges:
+            for edge in edges["edges"]:
                 user = edge["node"]["user"]
-                tagged_users.append({
-                    "id"       : user["id"],
-                    "username" : user["username"],
-                    "full_name": user["full_name"],
-                })
-
-    def _extract_shared_data(self, url):
-        page = self.request(url).text
+                tagged_users.append({"id"       : user["id"],
+                                     "username" : user["username"],
+                                     "full_name": user["full_name"]})
+
+        usertags = src.get("usertags")
+        if usertags:
+            for tag in usertags["in"]:
+                user = tag["user"]
+                tagged_users.append({"id"       : user["pk"],
+                                     "username" : user["username"],
+                                     "full_name": user["full_name"]})
+
+        mentions = src.get("reel_mentions")
+        if mentions:
+            for mention in mentions:
+                user = mention["user"]
+                tagged_users.append({"id"       : user.get("pk"),
+                                     "username" : user["username"],
+                                     "full_name": user["full_name"]})
+
+        stickers = src.get("story_bloks_stickers")
+        if stickers:
+            for sticker in stickers:
+                sticker = sticker["bloks_sticker"]
+                if sticker["bloks_sticker_type"] == "mention":
+                    user = sticker["sticker_data"]["ig_mention"]
+                    tagged_users.append({"id"       : user["account_id"],
+                                         "username" : user["username"],
+                                         "full_name": user["full_name"]})
+
+    def _extract_shared_data(self, page):
         shared_data, pos = text.extract(
             page, "window._sharedData =", ";</script>")
         additional_data, pos = text.extract(
@@ -349,13 +384,15 @@ class InstagramExtractor(Extractor):
         return data
 
     def _extract_profile_page(self, url):
-        data = self._extract_shared_data(url)["entry_data"]
+        page = self.request(url).text
+        data = self._extract_shared_data(page)["entry_data"]
         if "HttpErrorPage" in data:
             raise exception.NotFoundError("user")
         return data["ProfilePage"][0]["graphql"]["user"]
 
     def _extract_post_page(self, url):
-        data = self._extract_shared_data(url)["entry_data"]
+        page = self.request(url).text
+        data = self._extract_shared_data(page)["entry_data"]
         if "HttpErrorPage" in data:
             raise exception.NotFoundError("post")
         return data["PostPage"][0]
@@ -524,7 +561,8 @@ class InstagramTagExtractor(InstagramExtractor):
 
     def posts(self):
         url = "{}/explore/tags/{}/".format(self.root, self.item)
-        page = self._extract_shared_data(url)["entry_data"]["TagPage"][0]
+        page = self._extract_shared_data(
+            self.request(url).text)["entry_data"]["TagPage"][0]
         if "data" in page:
             return self._pagination_sections(page["data"]["recent"])
@@ -718,8 +756,12 @@ class InstagramStoriesExtractor(InstagramExtractor):
             reel_id = "highlight:" + self.highlight_id
         else:
             url = "{}/stories/{}/".format(self.root, self.user)
+            with self.request(url, allow_redirects=False) as response:
+                if 300 <= response.status_code < 400:
+                    return ()
+                page = response.text
 
         try:
-            data = self._extract_shared_data(url)["entry_data"]
+            data = self._extract_shared_data(page)["entry_data"]
             user = data["StoriesPage"][0]["user"]
         except KeyError:
             return ()
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index 67a1a95..e7827b1 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2016-2020 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,18 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
         "pattern": r"https?://vgm(site|downloads).com"
                    r"/soundtracks/horizon-riders-wii/[^/]+"
                    r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3",
-        "keyword": "12ca70e0709ea15250e577ea388cf2b5b0c65630",
+        "keyword": {
+            "album": {
+                "count": 1,
+                "date": "Sep 18th, 2016",
+                "name": "Horizon Riders (Wii)",
+                "size": 26214400,
+                "type": "Gamerip",
+            },
+            "extension": "mp3",
+            "filename": "Horizon Riders Wii - Full Soundtrack",
+        },
+        "count": 1,
     })
 
     def __init__(self, match):
@@ -48,10 +59,10 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
     def metadata(self, page):
         extr = text.extract_from(page)
         return {"album": {
-            "name" : text.unescape(extr("Album name: <b>", "<")),
+            "name" : text.unescape(extr("<h2>", "<")),
             "count": text.parse_int(extr("Number of Files: <b>", "<")),
             "size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
-            "date" : extr("Date added: <b>", "<"),
+            "date" : extr("Date Added: <b>", "<"),
             "type" : extr("Album type: <b>", "<"),
         }}
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index ad7cd1d..b6a508d 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -63,6 +63,12 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
         LolisafeExtractor.__init__(self, match)
         self.album_id = match.group(match.lastindex)
 
+        domain = self.config("domain")
+        if domain is None or domain == "auto":
+            self.root = text.root_from_url(match.group(0))
+        else:
+            self.root = text.ensure_http_scheme(domain)
+
     def items(self):
         files, data = self.fetch_album(self.album_id)
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index f6514ca..4808105 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -34,6 +34,7 @@ class MangafoxChapterExtractor(ChapterExtractor):
         base, self.cstr, self.volume, self.chapter, self.minor = match.groups()
         self.urlbase = self.root + base
         ChapterExtractor.__init__(self, match, self.urlbase + "/1.html")
+        self.session.headers["Referer"] = self.root + "/"
 
     def metadata(self, page):
         manga, pos = text.extract(page, "<title>", "</title>")
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
index f655f94..461c92d 100644
--- a/gallery_dl/extractor/mangahere.py
+++ b/gallery_dl/extractor/mangahere.py
@@ -17,8 +17,8 @@ class MangahereBase():
     """Base class for mangahere extractors"""
     category = "mangahere"
     root = "https://www.mangahere.cc"
-    mobile_root = "https://m.mangahere.cc"
-    url_fmt = mobile_root + "/manga/{}/{}.html"
+    root_mobile = "https://m.mangahere.cc"
+    url_fmt = root_mobile + "/manga/{}/{}.html"
 
 
 class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
@@ -42,6 +42,7 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
         self.part, self.volume, self.chapter = match.groups()
         url = self.url_fmt.format(self.part, 1)
         ChapterExtractor.__init__(self, match, url)
+        self.session.headers["Referer"] = self.root_mobile + "/"
 
     def metadata(self, page):
         pos = page.index("</select>")
diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py
index 0b0da65..2bd11ef 100644
--- a/gallery_dl/extractor/mangasee.py
+++ b/gallery_dl/extractor/mangasee.py
@@ -9,7 +9,7 @@
 """Extractors for https://mangasee123.com/"""
 
 from .common import ChapterExtractor, MangaExtractor
-from .. import text
+from .. import text, util
 import json
@@ -57,6 +57,15 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
         },
     })
 
+    def __init__(self, match):
+        ChapterExtractor.__init__(self, match)
+        self.session.headers["Referer"] = self.gallery_url
+
+        domain = "mangasee123.com"
+        cookies = self.session.cookies
+        if not cookies.get("PHPSESSID", domain=domain):
+            cookies.set("PHPSESSID", util.generate_token(13), domain=domain)
+
     def metadata(self, page):
         extr = text.extract_from(page)
         self.chapter = data = json.loads(extr("vm.CurChapter =", ";\r\n"))
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index cd7cabb..6e780e8 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -60,12 +60,14 @@ class MastodonExtractor(BaseExtractor):
 INSTANCES = {
     "mastodon.social": {
         "root"         : "https://mastodon.social",
+        "pattern"      : r"mastodon\.social",
         "access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
         "client-id"    : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
         "client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
     },
     "pawoo": {
         "root"         : "https://pawoo.net",
+        "pattern"      : r"pawoo\.net",
         "access-token" : "c12c9d275050bce0dc92169a28db09d7"
                          "0d62d0a75a8525953098c167eacd3668",
         "client-id"    : "978a25f843ec01e53d09be2c290cd75c"
@@ -75,6 +77,7 @@ INSTANCES = {
     },
     "baraag": {
         "root"         : "https://baraag.net",
+        "pattern"      : r"baraag\.net",
         "access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
         "client-id"    : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
         "client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 604d65c..65b9a83 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -54,6 +54,7 @@ class MoebooruExtractor(BooruExtractor):
 BASE_PATTERN = MoebooruExtractor.update({
     "yandere": {
         "root": "https://yande.re",
+        "pattern": r"yande\.re",
     },
     "konachan": {
         "root": "https://konachan.com",
@@ -61,6 +62,7 @@ BASE_PATTERN = MoebooruExtractor.update({
     },
     "hypnohub": {
         "root": "https://hypnohub.net",
+        "pattern": r"hypnohub\.net",
     },
     "sakugabooru": {
         "root": "https://www.sakugabooru.com",
@@ -68,6 +70,7 @@ BASE_PATTERN = MoebooruExtractor.update({
     },
     "lolibooru": {
         "root": "https://lolibooru.moe",
+        "pattern": r"lolibooru\.moe",
     },
 })
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index 348f6a1..eadd460 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 
 # Copyright 2021 Seonghyeon Cho
+# Copyright 2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,8 +11,10 @@
 
 from .common import GalleryExtractor, Extractor, Message
 from .. import text
+import re
 
-BASE_PATTERN = r"(?:https?://)?comic\.naver\.com/webtoon"
+BASE_PATTERN = (r"(?:https?://)?comic\.naver\.com"
+                r"/(webtoon|challenge|bestChallenge)")
 
 
 class NaverwebtoonBase():
@@ -25,19 +28,33 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
     directory_fmt = ("{category}", "{comic}")
     filename_fmt = "{episode:>03}-{num:>02}.{extension}"
     archive_fmt = "{title_id}_{episode}_{num}"
-    pattern = BASE_PATTERN + r"/detail\.nhn\?([^#]+)"
+    pattern = BASE_PATTERN + r"/detail(?:\.nhn)?\?([^#]+)"
     test = (
-        (("https://comic.naver.com/webtoon/detail.nhn?"
-          "titleId=26458&no=1&weekday=tue"), {
+        (("https://comic.naver.com/webtoon/detail"
+          "?titleId=26458&no=1&weekday=tue"), {
             "url": "47a956ba8c7a837213d5985f50c569fcff986f75",
             "content": "3806b6e8befbb1920048de9888dfce6220f69a60",
             "count": 14
         }),
+        (("https://comic.naver.com/challenge/detail"
+          "?titleId=765124&no=1"), {
+            "pattern": r"https://image-comic\.pstatic\.net/nas"
+                       r"/user_contents_data/challenge_comic/2021/01/19"
+                       r"/342586/upload_7149856273586337846\.jpeg",
+            "count": 1,
+        }),
+        (("https://comic.naver.com/bestChallenge/detail.nhn"
+          "?titleId=771467&no=3"), {
+            "pattern": r"https://image-comic\.pstatic\.net/nas"
+                       r"/user_contents_data/challenge_comic/2021/04/28"
+                       r"/345534/upload_3617293622396203109\.jpeg",
+            "count": 1,
+        }),
     )
 
     def __init__(self, match):
-        query = match.group(1)
-        url = "{}/webtoon/detail.nhn?{}".format(self.root, query)
+        path, query = match.groups()
+        url = "{}/{}/detail?{}".format(self.root, path, query)
         GalleryExtractor.__init__(self, match, url)
 
         query = text.parse_query(query)
@@ -70,22 +87,31 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
 class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor):
     subcategory = "comic"
     categorytransfer = True
-    pattern = (BASE_PATTERN + r"/list\.nhn\?([^#]+)")
+    pattern = (BASE_PATTERN + r"/list(?:\.nhn)?\?([^#]+)")
     test = (
-        ("https://comic.naver.com/webtoon/list.nhn?titleId=22073", {
+        ("https://comic.naver.com/webtoon/list?titleId=22073", {
             "pattern": NaverwebtoonEpisodeExtractor.pattern,
             "count": 32,
         }),
+        ("https://comic.naver.com/challenge/list?titleId=765124", {
+            "pattern": NaverwebtoonEpisodeExtractor.pattern,
+            "count": 25,
+        }),
+        ("https://comic.naver.com/bestChallenge/list.nhn?titleId=789786", {
+            "pattern": NaverwebtoonEpisodeExtractor.pattern,
+            "count": ">= 12",
+        }),
     )
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        query = text.parse_query(match.group(1))
+        self.path, query = match.groups()
+        query = text.parse_query(query)
         self.title_id = query.get("titleId")
         self.page_no = text.parse_int(query.get("page"), 1)
 
     def items(self):
-        url = self.root + "/webtoon/list.nhn"
+        url = "{}/{}/list".format(self.root, self.path)
         params = {"titleId": self.title_id, "page": self.page_no}
         data = {"_extractor": NaverwebtoonEpisodeExtractor}
@@ -103,7 +129,8 @@ class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor):
     def get_episode_urls(self, page):
         """Extract and return all episode urls in page"""
         return [
-            self.root + "/webtoon/detail.nhn?" + query
-            for query in text.extract_iter(
-                page, '<a href="/webtoon/detail?', '"')
+            self.root + path
+            for path in re.findall(
+                r'<a href="(/(?:webtoon|challenge|bestChallenge)'
+                r'/detail\?[^"]+)', page)
         ][::2]
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 90ca01d..832831f 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -91,6 +91,10 @@ class NijieExtractor(AsynchronousMixin, Extractor):
                 "url": url,
             })
 
+    @staticmethod
+    def _extract_user_name(page):
+        return text.unescape(text.extract(page, "<br />", "<")[0] or "")
+
     def login(self):
         """Login and obtain session cookies"""
         if not self._check_cookies(self.cookienames):
@@ -119,9 +123,8 @@ class NijieExtractor(AsynchronousMixin, Extractor):
         while True:
             page = self.request(url, params=params, notfound="artist").text
 
-            if not self.user_name:
-                self.user_name = text.unescape(text.extract(
-                    page, '<br />', '<')[0] or "")
+            if self.user_name is None:
+                self.user_name = self._extract_user_name(page)
 
             yield from text.extract_iter(page, 'illust_id="', '"')
 
             if '<a rel="next"' not in page:
@@ -137,11 +140,12 @@ class NijieUserExtractor(NijieExtractor):
     test = ("https://nijie.info/members.php?id=44",)
 
     def items(self):
-        base = "{}/{{}}.php?id={}".format(self.root, self.user_id)
+        fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format
         return self._dispatch_extractors((
-            (NijieIllustrationExtractor, base.format("members_illust")),
-            (NijieDoujinExtractor      , base.format("members_dojin")),
-            (NijieFavoriteExtractor    , base.format("user_like_illust_view")),
+            (NijieIllustrationExtractor, fmt("members_illust")),
+            (NijieDoujinExtractor      , fmt("members_dojin")),
+            (NijieFavoriteExtractor    , fmt("user_like_illust_view")),
+            (NijieNuitaExtractor       , fmt("history_nuita")),
         ), ("illustration", "doujin"))
@@ -217,6 +221,36 @@ class NijieFavoriteExtractor(NijieExtractor):
         return data
 
 
+class NijieNuitaExtractor(NijieExtractor):
+    """Extractor for a nijie user's 抜いた list"""
+    subcategory = "nuita"
+    directory_fmt = ("{category}", "nuita", "{user_id}")
+    archive_fmt = "n_{user_id}_{image_id}_{num}"
+    pattern = BASE_PATTERN + r"/history_nuita\.php\?id=(\d+)"
+    test = ("https://nijie.info/history_nuita.php?id=728995", {
+        "range": "1-10",
+        "count": 10,
+        "keyword": {
+            "user_id"  : 728995,
+            "user_name": "莚",
+        },
+    })
+
+    def image_ids(self):
+        return self._pagination("history_nuita")
+
+    def _extract_data(self, page):
+        data = NijieExtractor._extract_data(page)
+        data["user_id"] = self.user_id
+        data["user_name"] = self.user_name
+        return data
+
+    @staticmethod
+    def _extract_user_name(page):
+        return text.unescape(text.extract(
+            page, "<title>", "さんの抜いた")[0] or "")
+
+
 class NijieImageExtractor(NijieExtractor):
     """Extractor for a work/image from nijie.info"""
     subcategory = "image"
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 428f772..653822f 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -11,6 +11,7 @@
 from .common import Extractor, Message
 from . import deviantart, flickr, mastodon, pixiv, reddit, smugmug, tumblr
 from .. import text, oauth, util, config, exception
+from ..output import stdout_write
 from ..cache import cache
 import urllib.parse
 import hashlib
@@ -37,7 +38,7 @@ class OAuthBase(Extractor):
     def recv(self):
         """Open local HTTP server and recv callback parameters"""
         import socket
-        print("Waiting for response. (Cancel with Ctrl+c)")
+        stdout_write("Waiting for response. (Cancel with Ctrl+c)\n")
         server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
         server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
         server.bind(("localhost", self.config("port", 6414)))
@@ -60,7 +61,7 @@ class OAuthBase(Extractor):
 
     def send(self, msg):
         """Send 'msg' to the socket opened in 'recv()'"""
-        print(msg)
+        stdout_write(msg)
         self.client.send(b"HTTP/1.1 200 OK\r\n\r\n" + msg.encode())
         self.client.close()
 
@@ -69,12 +70,13 @@ class OAuthBase(Extractor):
         import webbrowser
         url += "?" + urllib.parse.urlencode(params)
         if not self.config("browser", True) or not webbrowser.open(url):
-            print("Please open this URL in your browser:")
-            print(url, end="\n\n", flush=True)
+            stdout_write(
+                "Please open this URL in your browser:\n\n" + url + "\n\n")
         return (recv or self.recv)()
 
     def error(self, msg):
-        return self.send("Remote server reported an error:\n\n" + str(msg))
+        return self.send(
+            "Remote server reported an error:\n\n{}\n".format(msg))
 
     def _oauth1_authorization_flow(
             self, request_token_url, authorize_url, access_token_url):
@@ -133,7 +135,7 @@ class OAuthBase(Extractor):
 
         # check authorization response
         if state != params.get("state"):
-            self.send("'state' mismatch: expected {}, got {}.".format(
+            self.send("'state' mismatch: expected {}, got {}.\n".format(
                 state, params.get("state")
             ))
             return
@@ -188,7 +190,7 @@ class OAuthBase(Extractor):
 
         opt = self.oauth_config(names[0])
         if self.cache and (opt is None or opt == "cache"):
-            msg += _vh + " been cached and will automatically be used."
+            msg += _vh + " been cached and will automatically be used.\n"
         else:
             msg += "Put " + _va + " into your configuration file as \n"
             msg += " and\n".join(
@@ -200,7 +202,7 @@ class OAuthBase(Extractor):
                 "\nor set\n'extractor.{}.{}' to \"cache\""
                 .format(self.subcategory, names[0])
             )
-        msg += "\nto use {}.".format(_it)
+        msg += "\nto use {}.\n".format(_it)
 
         return msg
@@ -398,9 +400,9 @@ class OAuthPixiv(OAuthBase):
         data = self.session.post(url, headers=headers, data=data).json()
 
         if "error" in data:
-            print(data)
+            stdout_write("\n{}\n".format(data))
             if data["error"] in ("invalid_request", "invalid_grant"):
-                print("'code' expired, try again")
+                stdout_write("'code' expired, try again\n\n")
             return
 
         token = data["refresh_token"]
@@ -409,10 +411,10 @@ class OAuthPixiv(OAuthBase):
             pixiv._refresh_token_cache.update(username, token)
             self.log.info("Writing 'refresh-token' to cache")
 
-        print(self._generate_message(("refresh-token",), (token,)))
+        stdout_write(self._generate_message(("refresh-token",), (token,)))
 
     def _input(self):
-        print("""
+        stdout_write("""\
 1) Open your browser's Developer Tools (F12) and switch to the Network tab
 2) Login
 3) Select the last network monitor entry ('callback?state=...')
 4) Copy its 'code' query parameter, paste it below, and press Enter
 - This 'code' will expire 30 seconds after logging in.
 - Copy-pasting more than just the 'code' value will work as well,
   like the entire URL or several query parameters.
+
 """)
         code = input("code: ")
         return code.rpartition("=")[2].strip()
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 92b8113..951b34d 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -58,12 +58,21 @@ class PhilomenaExtractor(BooruExtractor):
 
 
 INSTANCES = {
-    "derpibooru": {"root": "https://derpibooru.org",
-                   "filter_id": "56027"},
-    "ponybooru" : {"root": "https://ponybooru.org",
-                   "filter_id": "2"},
-    "furbooru"  : {"root": "https://furbooru.org",
-                   "filter_id": "2"},
+    "derpibooru": {
+        "root": "https://derpibooru.org",
+        "pattern": r"derpibooru\.org",
+        "filter_id": "56027",
+    },
+    "ponybooru": {
+        "root": "https://ponybooru.org",
+        "pattern": r"ponybooru\.org",
+        "filter_id": "2",
+    },
+    "furbooru": {
+        "root": "https://furbooru.org",
+        "pattern": r"furbooru\.org",
+        "filter_id": "2",
+    },
 }
 
 BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
@@ -239,5 +248,5 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor):
     def posts(self):
         gallery_id = "gallery_id:" + self.gallery_id
         url = self.root + "/api/v1/json/search/images"
-        params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id}
+        params = {"sd": "desc", "sf": gallery_id, "q": gallery_id}
         return self._pagination(url, params)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index a33df42..9b35e42 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2014-2021 Mike Fährmann
+# Copyright 2014-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -10,16 +10,16 @@
 
 from .common import Extractor, Message
 from .. import text, util, exception
-from ..cache import cache
+from ..cache import cache, memcache
 from datetime import datetime, timedelta
 import itertools
 import hashlib
-import time
 
 
 class PixivExtractor(Extractor):
     """Base class for pixiv extractors"""
     category = "pixiv"
+    root = "https://www.pixiv.net"
     directory_fmt = ("{category}", "{user[id]} {user[account]}")
     filename_fmt = "{id}_p{num}.{extension}"
     archive_fmt = "{id}{suffix}.{extension}"
@@ -73,7 +73,14 @@ class PixivExtractor(Extractor):
             if work["type"] == "ugoira":
                 if not self.load_ugoira:
                     continue
-                ugoira = self.api.ugoira_metadata(work["id"])
+
+                try:
+                    ugoira = self.api.ugoira_metadata(work["id"])
+                except exception.StopExtraction as exc:
+                    self.log.warning(
+                        "Unable to retrieve Ugoira metatdata (%s - %s)",
+                        work.get("id"), exc.message)
+                    continue
 
                 url = ugoira["zip_urls"]["medium"].replace(
                     "_ugoira600x600", "_ugoira1920x1080")
@@ -91,22 +98,70 @@ class PixivExtractor(Extractor):
                 work["suffix"] = "_p{:02}".format(work["num"])
                 yield Message.Url, url, text.nameext_from_url(url, work)
 
+    @staticmethod
+    def _make_work(kind, url, user):
+        p = url.split("/")
+        return {
+            "create_date"     : "{}-{}-{}T{}:{}:{}+09:00".format(
+                p[5], p[6], p[7], p[8], p[9], p[10]) if len(p) > 9 else None,
+            "height"          : 0,
+            "id"              : kind,
+            "image_urls"      : None,
+            "meta_pages"      : (),
+            "meta_single_page": {"original_image_url": url},
+            "page_count"      : 1,
+            "sanity_level"    : 0,
+            "tags"            : (),
+            "title"           : kind,
+            "type"            : kind,
+            "user"            : user,
+            "width"           : 0,
+            "x_restrict"      : 0,
+        }
+
     def works(self):
-        """Return an iterable containing all relevant 'work'-objects"""
+        """Return an iterable containing all relevant 'work' objects"""
 
     def metadata(self):
-        """Collect metadata for extractor-job"""
+        """Collect metadata for extractor job"""
         return {}
 
 
 class PixivUserExtractor(PixivExtractor):
-    """Extractor for works of a pixiv user"""
+    """Extractor for a pixiv user profile"""
     subcategory = "user"
     pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:"
-               r"(?:en/)?users/(\d+)(?:/(?:artworks|illustrations|manga)"
-               r"(?:/([^/?#]+))?)?/?(?:$|[?#])"
-               r"|member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?"
-               r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))")
+               r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id="
+               r")(\d+)(?:$|[?#])")
+    test = (
+        ("https://www.pixiv.net/en/users/173530"),
+        ("https://www.pixiv.net/u/173530"),
+        ("https://www.pixiv.net/member.php?id=173530"),
+        ("https://www.pixiv.net/mypage.php#id=173530"),
+        ("https://www.pixiv.net/#id=173530"),
+    )
+
+    def __init__(self, match):
+        PixivExtractor.__init__(self, match)
+        self.user_id = match.group(1)
+
+    def items(self):
+        base = "{}/users/{}/".format(self.root, self.user_id)
+        return self._dispatch_extractors((
+            (PixivAvatarExtractor    , base + "avatar"),
+            (PixivBackgroundExtractor, base + "background"),
+            (PixivArtworksExtractor  , base + "artworks"),
+            (PixivFavoriteExtractor  , base + "bookmarks/artworks"),
+        ), ("artworks",))
+
+
+class PixivArtworksExtractor(PixivExtractor):
+    """Extractor for artworks of a pixiv user"""
+    subcategory = "artworks"
+    pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:"
+               r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)"
+               r"(?:/([^/?#]+))?/?(?:$|[?#])"
+               r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)")
     test = (
         ("https://www.pixiv.net/en/users/173530/artworks", {
             "url": "852c31ad83b6840bacbce824d85f2a997889efb7",
@@ -120,47 +175,30 @@ class PixivUserExtractor(PixivExtractor):
           "&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), {
             "url": "25b1cd81153a8ff82eec440dd9f20a4a22079658",
         }),
-        # avatar (#595, #623, #1124)
-        ("https://www.pixiv.net/en/users/173530", {
-            "options": (("avatar", True),),
-            "content": "4e57544480cc2036ea9608103e8f024fa737fe66",
-            "range": "1",
-        }),
-        # background (#623, #1124, #2495)
-        ("https://www.pixiv.net/en/users/194921", {
-            "options": (("background", True),),
-            "content": "aeda3536003ea3002f70657cb93c5053f26f5843",
-            "range": "1",
-        }),
         # deleted account
         ("http://www.pixiv.net/member_illust.php?id=173531", {
            "options": (("metadata", True),),
            "exception": exception.NotFoundError,
         }),
-        ("https://www.pixiv.net/en/users/173530"),
         ("https://www.pixiv.net/en/users/173530/manga"),
         ("https://www.pixiv.net/en/users/173530/illustrations"),
         ("https://www.pixiv.net/member_illust.php?id=173530"),
-        ("https://www.pixiv.net/u/173530"),
-        ("https://www.pixiv.net/user/173530"),
-        ("https://www.pixiv.net/mypage.php#id=173530"),
-        ("https://www.pixiv.net/#id=173530"),
         ("https://touch.pixiv.net/member_illust.php?id=173530"),
     )
 
     def __init__(self, match):
         PixivExtractor.__init__(self, match)
-        u1, t1, u2, t2, u3 = match.groups()
+        u1, t1, u2, t2 = match.groups()
         if t1:
             t1 = text.unquote(t1)
         elif t2:
             t2 = text.parse_query(t2).get("tag")
-        self.user_id = u1 or u2 or u3
+        self.user_id = u1 or u2
         self.tag = t1 or t2
 
     def metadata(self):
         if self.config("metadata"):
-            return {"user": self.api.user_detail(self.user_id)["user"]}
+            return self.api.user_detail(self.user_id)
         return {}
 
     def works(self):
@@ -173,54 +211,60 @@ class PixivUserExtractor(PixivExtractor):
                 if tag in [t["name"].lower() for t in work["tags"]]
             )
 
-        avatar = self.config("avatar")
-        background = self.config("background")
-        if avatar or background:
-            work_list = []
-            detail = self.api.user_detail(self.user_id)
-            user = detail["user"]
-
-            if avatar:
-                url = user["profile_image_urls"]["medium"]
-                work_list.append((self._make_work(
-                    "avatar", url.replace("_170.", "."), user),))
-
-            if background:
-                url = detail["profile"]["background_image_url"]
-                if url:
-                    if "/c/" in url:
-                        parts = url.split("/")
-                        del parts[3:5]
-                        url = "/".join(parts)
-                    url = url.replace("_master1200.", ".")
-                    work = self._make_work("background", url, user)
-                    if url.endswith(".jpg"):
-                        work["_fallback"] = (url[:-4] + ".png",)
-                    work_list.append((work,))
-
-            work_list.append(works)
-            works = itertools.chain.from_iterable(work_list)
-
         return works
 
-    @staticmethod
-    def _make_work(kind, url, user):
-        return {
-            "create_date"     : None,
-            "height"          : 0,
-            "id"              : kind,
-            "image_urls"      : None,
-            "meta_pages"      : (),
-            "meta_single_page": {"original_image_url": url},
-            "page_count"      : 1,
-            "sanity_level"    : 0,
-            "tags"            : (),
-            "title"           : kind,
-            "type"            : kind,
-            "user"            : user,
-            "width"           : 0,
-            "x_restrict"      : 0,
-        }
+
+class PixivAvatarExtractor(PixivExtractor):
+    """Extractor for pixiv avatars"""
+    subcategory = "avatar"
+    filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}"
+    archive_fmt = "avatar_{user[id]}_{date}"
+    pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
+               r"/(?:en/)?users/(\d+)/avatar")
+    test = ("https://www.pixiv.net/en/users/173530/avatar", {
+        "content": "4e57544480cc2036ea9608103e8f024fa737fe66",
+    })
+
+    def __init__(self, match):
+        PixivExtractor.__init__(self, match)
+        self.user_id = match.group(1)
+
+    def works(self):
+        user = self.api.user_detail(self.user_id)["user"]
+        url = user["profile_image_urls"]["medium"].replace("_170.", ".")
+        return (self._make_work("avatar", url, user),)
+
+
+class PixivBackgroundExtractor(PixivExtractor):
+    """Extractor for pixiv background banners"""
+    subcategory = "background"
+    filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}"
+    archive_fmt = "background_{user[id]}_{date}"
+    pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
+               r"/(?:en/)?users/(\d+)/background")
+    test = ("https://www.pixiv.net/en/users/194921/background", {
+        "pattern": r"https://i\.pximg\.net/background/img/2021/01/30/16/12/02"
+                   r"/194921_af1f71e557a42f499213d4b9eaccc0f8\.jpg",
+    })
+
+    def __init__(self, match):
+        PixivExtractor.__init__(self, match)
+        self.user_id = match.group(1)
+
+    def works(self):
+        detail = self.api.user_detail(self.user_id)
+        url = detail["profile"]["background_image_url"]
+        if not url:
+            return ()
+        if "/c/" in url:
+            parts = url.split("/")
+            del parts[3:5]
+            url = "/".join(parts)
+        url = url.replace("_master1200.", ".")
+        work = self._make_work("background", url, detail["user"])
+        if url.endswith(".jpg"):
+            work["_fallback"] = (url[:-4] + ".png",)
+        return (work,)
 
 
 class PixivMeExtractor(PixivExtractor):
@@ -312,10 +356,10 @@ class PixivFavoriteExtractor(PixivExtractor):
                r"|bookmark\.php)(?:\?([^#]*))?")
     test = (
         ("https://www.pixiv.net/en/users/173530/bookmarks/artworks", {
-            "url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
+            "url": "85a3104eaaaf003c7b3947117ca2f1f0b1cfc949",
         }),
         ("https://www.pixiv.net/bookmark.php?id=173530", {
-            "url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
+            "url": "85a3104eaaaf003c7b3947117ca2f1f0b1cfc949",
         }),
         # bookmarks with specific tag
         (("https://www.pixiv.net/en/users/3137110"
@@ -735,66 +779,70 @@ class PixivAppAPI():
 
     def illust_detail(self, illust_id):
         params = {"illust_id": illust_id}
-        return self._call("v1/illust/detail", params)["illust"]
+        return self._call("/v1/illust/detail", params)["illust"]
 
     def illust_follow(self, restrict="all"):
         params = {"restrict": restrict}
-        return self._pagination("v2/illust/follow", params)
+        return self._pagination("/v2/illust/follow", params)
 
     def illust_ranking(self, mode="day", date=None):
         params = {"mode": mode, "date": date}
-        return self._pagination("v1/illust/ranking", params)
+        return self._pagination("/v1/illust/ranking", params)
 
     def illust_related(self, illust_id):
         params = {"illust_id": illust_id}
-        return self._pagination("v2/illust/related", params)
+        return self._pagination("/v2/illust/related", params)
 
     def search_illust(self, word, sort=None, target=None, duration=None,
                       date_start=None, date_end=None):
         params = {"word": word, "search_target": target,
                   "sort": sort, "duration": duration,
                   "start_date": date_start, "end_date": date_end}
-        return self._pagination("v1/search/illust", params)
+        return self._pagination("/v1/search/illust", params)
 
     def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
         params = {"user_id": user_id, "tag": tag, "restrict": restrict}
-        return self._pagination("v1/user/bookmarks/illust", params)
+        return self._pagination("/v1/user/bookmarks/illust", params)
 
+    @memcache(keyarg=1)
     def user_detail(self, user_id):
         params = {"user_id": user_id}
-        return self._call("v1/user/detail", params)
+        return self._call("/v1/user/detail", params)
 
     def user_following(self, user_id, restrict="public"):
         params = {"user_id": user_id, "restrict": restrict}
-        return self._pagination("v1/user/following", params, "user_previews")
+        return self._pagination("/v1/user/following", params, "user_previews")
 
     def user_illusts(self, user_id):
         params = {"user_id": user_id}
-        return self._pagination("v1/user/illusts", params)
+        return self._pagination("/v1/user/illusts", params)
 
     def ugoira_metadata(self, illust_id):
         params = {"illust_id": illust_id}
-        return self._call("v1/ugoira/metadata", params)["ugoira_metadata"]
+        return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"]
 
     def _call(self, endpoint, params=None):
-        url = "https://app-api.pixiv.net/" + endpoint
+        url = "https://app-api.pixiv.net" + endpoint
+
+        while True:
+            self.login()
+            response = self.extractor.request(url, params=params, fatal=False)
+            data = response.json()
 
-        self.login()
-        response = self.extractor.request(url, params=params, fatal=False)
-        data = response.json()
+            if "error" not in data:
+                return data
+
+            self.log.debug(data)
 
-        if "error" in data:
             if response.status_code == 404:
                 raise exception.NotFoundError()
 
             error = data["error"]
             if "rate limit" in (error.get("message") or "").lower():
-                self.log.info("Waiting two minutes for API rate limit reset.")
-                time.sleep(120)
-                return self._call(endpoint, params)
+                self.extractor.wait(seconds=300)
+                continue
 
-            raise exception.StopExtraction("API request failed: %s", error)
-
-        return data
+            raise exception.StopExtraction("API request failed: %s", error)
 
     def _pagination(self, endpoint, params, key="illusts"):
         while True:
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index b3a620a..db8d700 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -160,6 +160,7 @@ BASE_PATTERN = ReactorExtractor.update({
     },
     "thatpervert": {
         "root": "http://thatpervert.com",
+        "pattern": r"thatpervert\.com",
     },
 })
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index c8b8c9a..16b9191 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -24,6 +24,7 @@ class ReadcomiconlineBase():
     archive_fmt = "{issue_id}_{page}"
     root = "https://readcomiconline.li"
     browser = "firefox"
+    request_interval = (1, 9)
 
     def request(self, url, **kwargs):
         """Detect and handle redirects to CAPTCHA pages"""
pages""" @@ -85,7 +86,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): return [ (beau(url), None) for url in text.extract_iter( - page, 'lstImages.push("', '"' + page, "lstImages.push('", "'", ) ] @@ -129,10 +130,13 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): def beau(url): - """https://readcomiconline.li/Scripts/rguard.min.js?v=1.1""" + """https://readcomiconline.li/Scripts/rguard.min.js""" if url.startswith("https"): return url + url = url.replace("_x236", "d") + url = url.replace("_x945", "g") + containsS0 = "=s0" in url url = url[:-3 if containsS0 else -6] url = url[4:22] + url[25:] diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py index f276e84..f2bf3cb 100644 --- a/gallery_dl/extractor/shopify.py +++ b/gallery_dl/extractor/shopify.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2022 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -43,19 +43,45 @@ class ShopifyExtractor(BaseExtractor): BASE_PATTERN = ShopifyExtractor.update({ + "chelseacrew": { + "root": "https://chelseacrew.com", + "pattern": r"(?:www\.)?chelseacrew\.com", + }, "fashionnova": { "root": "https://www.fashionnova.com", "pattern": r"(?:www\.)?fashionnova\.com", }, + "loungeunderwear": { + "root": "https://loungeunderwear.com", + "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com", + }, + "michaelscameras": { + "root": "https://michaels.com.au", + "pattern": r"michaels\.com\.au", + }, + "modcloth": { + "root": "https://modcloth.com", + "pattern": r"modcloth\.com", + }, "omgmiamiswimwear": { "root": "https://www.omgmiamiswimwear.com", + "pattern": r"(?:www\.)?omgmiamiswimwear\.com", + }, + "pinupgirlclothing": { + "root": "https://pinupgirlclothing.com", + "pattern": r"pinupgirlclothing\.com", + }, + "raidlondon": { + "root": "https://www.raidlondon.com", + "pattern": r"(?:www\.)?raidlondon\.com", + }, + "unique-vintage": { + "root": "https://www.unique-vintage.com", + "pattern": r"(?:www\.)?unique\-vintage\.com", }, "windsorstore": { "root": "https://www.windsorstore.com", - }, - "loungeunderwear": { - "root": "https://loungeunderwear.com", - "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com", + "pattern": r"(?:www\.)?windsorstore\.com", }, }) @@ -66,15 +92,21 @@ class ShopifyCollectionExtractor(ShopifyExtractor): directory_fmt = ("{category}", "{collection[title]}") pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])" test = ( + ("https://chelseacrew.com/collections/flats"), ("https://www.fashionnova.com/collections/mini-dresses", { "range": "1-20", "count": 20, }), ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), ("https://www.fashionnova.com/collections/mini-dresses#1"), + ("https://loungeunderwear.com/collections/apparel"), + ("https://michaels.com.au/collections/microphones"), + ("https://modcloth.com/collections/shoes"), ("https://www.omgmiamiswimwear.com/collections/fajas"), + ("https://pinupgirlclothing.com/collections/evening"), + ("https://www.raidlondon.com/collections/flats"), + ("https://www.unique-vintage.com/collections/flapper-1920s"), ("https://www.windsorstore.com/collections/dresses-ball-gowns"), - ("https://loungeunderwear.com/collections/apparel"), ) def metadata(self): @@ -99,18 +131,28 @@ class ShopifyProductExtractor(ShopifyExtractor): directory_fmt = ("{category}", "Products") pattern = BASE_PATTERN + 
r"((?:/collections/[\w-]+)?/products/[\w-]+)" test = ( + ("https://chelseacrew.com/collections/flats/products/dora"), ("https://www.fashionnova.com/products/essential-slide-red", { "pattern": r"https?://cdn\d*\.shopify.com/", "count": 3, }), + ("https://www.fashionnova.com/collections/flats/products/name"), + ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"), + ("https://michaels.com.au/collections/audio/products" + "/boya-by-wm4-pro-k5-2-4ghz-mic-android-1-1-101281"), + ("https://modcloth.com/collections/shoes/products/heidii-brn"), ("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", { "pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/", "count": 5, }), - ("https://www.fashionnova.com/collections/flats/products/name"), + ("https://pinupgirlclothing.com/collections/evening/products" + "/clarice-coat-dress-in-olive-green-poly-crepe-laura-byrnes-design"), + ("https://www.raidlondon.com/collections/flats/products" + "/raid-addyson-chunky-flat-shoe-in-white"), + ("https://www.unique-vintage.com/collections/flapper-1920s/products" + "/unique-vintage-plus-size-black-silver-beaded-troyes-flapper-dress"), ("https://www.windsorstore.com/collections/accessories-belts/products" "/rhine-buckle-dbl-o-ring-pu-strap-belt-073010158001"), - ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"), ) def products(self): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 4c947e7..2737d34 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -15,7 +15,7 @@ import json BASE_PATTERN = ( r"(?:https?://)?(?:www\.|mobile\.)?" - r"(?:(?:fx)?twitter\.com|nitter\.net)" + r"(?:(?:[fv]x)?twitter\.com|nitter\.net)" ) @@ -39,7 +39,7 @@ class TwitterExtractor(Extractor): self.pinned = self.config("pinned", False) self.quoted = self.config("quoted", False) self.videos = self.config("videos", True) - self.cards = self.config("cards", True) + self.cards = self.config("cards", False) self._user_cache = {} self._init_sizes() @@ -104,6 +104,7 @@ class TwitterExtractor(Extractor): def _extract_media(self, tweet, entities, files): for media in entities: + descr = media.get("ext_alt_text") width = media["original_info"].get("width", 0) height = media["original_info"].get("height", 0) @@ -112,9 +113,10 @@ class TwitterExtractor(Extractor): files.append({ "url": "ytdl:{}/i/web/status/{}".format( self.root, tweet["id_str"]), - "width" : width, - "height" : height, - "extension": None, + "width" : width, + "height" : height, + "extension" : None, + "description": descr, }) elif self.videos: video_info = media["video_info"] @@ -123,22 +125,24 @@ class TwitterExtractor(Extractor): key=lambda v: v.get("bitrate", 0), ) files.append({ - "url" : variant["url"], - "width" : width, - "height" : height, - "bitrate" : variant.get("bitrate", 0), - "duration": video_info.get( + "url" : variant["url"], + "width" : width, + "height" : height, + "bitrate" : variant.get("bitrate", 0), + "duration" : video_info.get( "duration_millis", 0) / 1000, + "description": descr, }) elif "media_url_https" in media: url = media["media_url_https"] base, _, fmt = url.rpartition(".") base += "?format=" + fmt + "&name=" files.append(text.nameext_from_url(url, { - "url" : base + self._size_image, - "width" : width, - "height" : height, - "_fallback": self._image_fallback(base), + "url" : base + self._size_image, + "width" : width, + "height" : height, + "_fallback" : self._image_fallback(base), + "description": descr, })) else: files.append({"url": 
media["media_url"]}) @@ -323,6 +327,9 @@ class TwitterExtractor(Extractor): elif userfmt == "media": cls = TwitterMediaExtractor fmt = (self.root + "/id:{rest_id}/media").format_map + elif userfmt == "tweets": + cls = TwitterTweetsExtractor + fmt = (self.root + "/id:{rest_id}/tweets").format_map else: cls = None fmt = userfmt.format_map @@ -383,7 +390,7 @@ class TwitterExtractor(Extractor): class TwitterTimelineExtractor(TwitterExtractor): - """Extractor for Tweets from a user's timeline""" + """Extractor for a Twitter user timeline""" subcategory = "timeline" pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])" r"|i(?:/user/|ntent/user\?user_id=)(\d+))") @@ -400,6 +407,8 @@ class TwitterTimelineExtractor(TwitterExtractor): ("https://www.twitter.com/id:2976459548"), ("https://twitter.com/i/user/2976459548"), ("https://twitter.com/intent/user?user_id=2976459548"), + ("https://fxtwitter.com/supernaturepics"), + ("https://vxtwitter.com/supernaturepics"), ) def __init__(self, match): @@ -409,6 +418,52 @@ class TwitterTimelineExtractor(TwitterExtractor): self.user = "id:" + user_id def tweets(self): + tweets = (self.api.user_tweets(self.user) if self.retweets else + self.api.user_media(self.user)) + + # yield initial batch of (media) tweets + tweet = None + for tweet in tweets: + yield tweet + + if tweet is None: + return + + # get username + if not self.user.startswith("id:"): + username = self.user + elif "core" in tweet: + username = (tweet["core"]["user_results"]["result"] + ["legacy"]["screen_name"]) + else: + username = tweet["user"]["screen_name"] + + # get tweet data + if "legacy" in tweet: + tweet = tweet["legacy"] + + # yield search results starting from last tweet id + yield from self.api.search_adaptive( + "from:{} include:retweets include:nativeretweets max_id:{} " + "filter:images OR card_name:animated_gif OR filter:native_video" + .format(username, tweet["id_str"]) + ) + + +class TwitterTweetsExtractor(TwitterExtractor): + """Extractor for Tweets from a user's Tweets timeline""" + subcategory = "tweets" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)" + test = ( + ("https://twitter.com/supernaturepics/tweets", { + "range": "1-40", + "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40", + }), + ("https://mobile.twitter.com/supernaturepics/tweets#t"), + ("https://www.twitter.com/id:2976459548/tweets"), + ) + + def tweets(self): return self.api.user_tweets(self.user) @@ -662,6 +717,10 @@ class TwitterTweetExtractor(TwitterExtractor): "options": (("syndication", True),), "count": 1, }), + # media alt texts / descriptions (#2617) + ("https://twitter.com/my0nruri/status/1528379296041299968", { + "keyword": {"description": "oc"} + }), ) def __init__(self, match): diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py index 8fb9bbf..23f6ea2 100644 --- a/gallery_dl/extractor/vk.py +++ b/gallery_dl/extractor/vk.py @@ -9,7 +9,7 @@ """Extractors for https://vk.com/""" from .common import Extractor, Message -from .. import text +from .. 
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index 8fb9bbf..23f6ea2 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -9,7 +9,7 @@
 """Extractors for https://vk.com/"""

 from .common import Extractor, Message
-from .. import text
+from .. import text, exception

 BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com

@@ -39,9 +39,15 @@ class VkExtractor(Extractor):
                 self.log.warning("no photo URL found (%s)", photo.get("id"))
                 continue

-            photo.update(data)
-            photo["url"], photo["width"], photo["height"] = photo[size]
+            try:
+                photo["url"], photo["width"], photo["height"] = photo[size]
+            except ValueError:
+                # photo without width/height entries (#2535)
+                photo["url"] = photo[size + "src"]
+                photo["width"] = photo["height"] = 0
+
             photo["id"] = photo["id"].rpartition("_")[2]
+            photo.update(data)
             text.nameext_from_url(photo["url"], photo)
             yield Message.Url, photo["url"], photo

@@ -66,6 +72,10 @@ class VkExtractor(Extractor):
             url, method="POST", headers=headers, data=data,
         ).json()["payload"][1]

+        if len(payload) < 4:
+            self.log.debug(payload)
+            raise exception.AuthorizationError(payload[0])
+
         total = payload[1]
         photos = payload[3]

@@ -105,7 +115,7 @@ class VkPhotosExtractor(VkExtractor):
             },
         }),
         ("https://vk.com/cosplayinrussia", {
-            "range": "25-35",
+            "range": "15-25",
             "keywords": {
                 "id": r"re:\d+",
                 "user": {
@@ -117,6 +127,12 @@ class VkPhotosExtractor(VkExtractor):
                 },
             },
         }),
+        # photos without width/height (#2535)
+        ("https://vk.com/id76957806", {
+            "pattern": r"https://sun\d+-\d+\.userapi\.com/",
+            "range": "1-9",
+            "count": 9,
+        }),
         ("https://m.vk.com/albums398982326"),
         ("https://www.vk.com/id398982326?profile=1"),
         ("https://vk.com/albums-165740836"),
@@ -150,7 +166,8 @@ class VkPhotosExtractor(VkExtractor):
                 '<h1 class="page_name">', "<")).replace("  ", " "),
             "info": text.unescape(text.remove_html(extr(
                 '<span class="current_text">', '</span'))),
-            "id"  : extr('<a href="/albums', '"'),
+            "id"  : (extr('<a href="/albums', '"') or
+                     extr('data-from-id="', '"')),
         }}

@@ -166,6 +183,10 @@ class VkAlbumExtractor(VkExtractor):
         ("https://vk.com/album-165740836_281339889", {
             "count": 12,
         }),
+        # "Access denied" (#2556)
+        ("https://vk.com/album-53775183_00", {
+            "exception": exception.AuthorizationError,
+        }),
     )

     def __init__(self, match):
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index 75b78c5..599a175 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -47,6 +47,7 @@ class WeasylExtractor(Extractor):
         return data

     def submissions(self, owner_login, folderid=None):
+        metadata = self.config("metadata")
         url = "{}/api/users/{}/gallery".format(self.root, owner_login)
         params = {
             "nextid" : None,
@@ -56,6 +57,9 @@ class WeasylExtractor(Extractor):
         while True:
             data = self.request(url, params=params).json()
             for submission in data["submissions"]:
+                if metadata:
+                    submission = self.request_submission(
+                        submission["submitid"])
                 if self.populate_submission(submission):
                     submission["folderid"] = folderid
                     # Do any submissions have more than one url? If so
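The new Weasyl "metadata" option trades one extra API request per submission for the full submission record, since the gallery listing only returns a subset of fields. A sketch of that per-submission fetch, assuming direct use of requests; the /api/submissions/<id>/view endpoint matches Weasyl's public API, the rest is illustrative:

import requests

# Fetch the full record for one submission (error handling and
# session reuse omitted for brevity).
def request_submission(submitid):
    url = "https://www.weasyl.com/api/submissions/{}/view".format(submitid)
    return requests.get(url, timeout=30).json()

# With "metadata" set, each cheap gallery entry is swapped for the
# full record before populate_submission() runs:
#     submission = request_submission(submission["submitid"])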
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index cf5b192..59f46f0 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-

 # Copyright 2020 Leonardo Taccari
+# Copyright 2021-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -41,8 +42,8 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
     """Extractor for an episode on webtoons.com"""
     subcategory = "episode"
     directory_fmt = ("{category}", "{comic}")
-    filename_fmt = "{episode}-{num:>02}.{extension}"
-    archive_fmt = "{title_no}_{episode}_{num}"
+    filename_fmt = "{episode_no}-{num:>02}.{extension}"
+    archive_fmt = "{title_no}_{episode_no}_{num}"
     pattern = (BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)/(?:[^/?#]+)"
                r"/viewer(?:\?([^#'\"]+))")
     test = (
@@ -54,6 +55,18 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
                        "49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9"),
             "count": 5,
         }),
+        (("https://www.webtoons.com/en/challenge/punderworld"
+          "/happy-earth-day-/viewer?title_no=312584&episode_no=40"), {
+            "keyword": {
+                "comic": "punderworld",
+                "description": str,
+                "episode": "36",
+                "episode_no": "40",
+                "genre": "challenge",
+                "title": r"re:^Punderworld - .+",
+                "title_no": "312584",
+            },
+        }),
     )

     def __init__(self, match):
@@ -65,11 +78,13 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):

         query = text.parse_query(query)
         self.title_no = query.get("title_no")
-        self.episode = query.get("episode_no")
+        self.episode_no = query.get("episode_no")

     def metadata(self, page):
+        keywords, pos = text.extract(
+            page, '<meta name="keywords" content="', '"')
         title, pos = text.extract(
-            page, '<meta property="og:title" content="', '"')
+            page, '<meta property="og:title" content="', '"', pos)
         descr, pos = text.extract(
             page, '<meta property="og:description" content="', '"', pos)

@@ -77,8 +92,9 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
             "genre"      : self.genre,
             "comic"      : self.comic,
             "title_no"   : self.title_no,
-            "episode"    : self.episode,
+            "episode_no" : self.episode_no,
             "title"      : text.unescape(title),
+            "episode"    : keywords.split(", ")[1],
             "description": text.unescape(descr),
             "lang"       : self.lang,
             "language"   : util.code_to_language(self.lang),
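The webtoons.py change separates the episode_no URL parameter from the comic's own episode number, which it now reads as the second comma-separated entry of the page's keywords meta tag. A toy version of that lookup; the page snippet is fabricated, and gallery-dl uses text.extract() rather than str.partition():

# grab the keywords meta tag's content and split out the episode number
page = '<meta name="keywords" content="Punderworld, 36, challenge comic"/>'

keywords = page.partition('content="')[2].partition('"')[0]
episode = keywords.split(", ")[1]
print(episode)  # '36' -- may differ from the episode_no URL parameter (40)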
