| author | 2021-12-01 14:44:00 -0500 |
|---|---|
| committer | 2021-12-01 14:44:00 -0500 |
| commit | a5aecc343fd2886e7ae09bb3e2afeec38f175755 (patch) |
| tree | 06a284b3d73700bd38116423e2480afa516255c2 /gallery_dl/extractor |
| parent | fc8c5e642017e2b4e5299e2093e72b316479690d (diff) |
New upstream version 1.19.3 (tag: upstream/1.19.3)
Diffstat (limited to 'gallery_dl/extractor')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | gallery_dl/extractor/__init__.py | 1 |
| -rw-r--r-- | gallery_dl/extractor/dynastyscans.py | 25 |
| -rw-r--r-- | gallery_dl/extractor/exhentai.py | 8 |
| -rw-r--r-- | gallery_dl/extractor/foolfuuka.py | 6 |
| -rw-r--r-- | gallery_dl/extractor/gelbooru_v02.py | 15 |
| -rw-r--r-- | gallery_dl/extractor/instagram.py | 20 |
| -rw-r--r-- | gallery_dl/extractor/kemonoparty.py | 125 |
| -rw-r--r-- | gallery_dl/extractor/mangadex.py | 42 |
| -rw-r--r-- | gallery_dl/extractor/mangoxo.py | 12 |
| -rw-r--r-- | gallery_dl/extractor/philomena.py | 12 |
| -rw-r--r-- | gallery_dl/extractor/reactor.py | 228 |
| -rw-r--r-- | gallery_dl/extractor/seisoparty.py | 201 |
| -rw-r--r-- | gallery_dl/extractor/shopify.py | 6 |
| -rw-r--r-- | gallery_dl/extractor/skeb.py | 3 |
| -rw-r--r-- | gallery_dl/extractor/subscribestar.py | 14 |
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 37 |
| -rw-r--r-- | gallery_dl/extractor/webtoons.py | 5 |
| -rw-r--r-- | gallery_dl/extractor/xvideos.py | 4 |
| -rw-r--r-- | gallery_dl/extractor/ytdl.py | 79 |
19 files changed, 369 insertions(+), 474 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 79fe971..dd9da01 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -111,7 +111,6 @@ modules = [
     "sankaku",
     "sankakucomplex",
     "seiga",
-    "seisoparty",
    "senmanga",
    "sexcom",
    "simplyhentai",
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 4541d25..ab1044f 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -8,7 +8,7 @@
 
 """Extractors for https://dynasty-scans.com/"""
 
-from .common import ChapterExtractor, Extractor, Message
+from .common import ChapterExtractor, MangaExtractor, Extractor, Message
 from .. import text
 import json
 import re
@@ -48,12 +48,12 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
         (("http://dynasty-scans.com/chapters/"
           "hitoribocchi_no_oo_seikatsu_ch33"), {
             "url": "dce64e8c504118f1ab4135c00245ea12413896cb",
-            "keyword": "1564965671ac69bb7fbc340538397f6bd0aa269b",
+            "keyword": "b67599703c27316a2fe4f11c3232130a1904e032",
         }),
         (("http://dynasty-scans.com/chapters/"
           "new_game_the_spinoff_special_13"), {
             "url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538",
-            "keyword": "22b35029bc65d6d95db2e2c147b0a37f2d290f29",
+            "keyword": "6b674eb3a274999153f6be044973b195008ced2f",
         }),
     )
@@ -76,7 +76,8 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
             "author"  : text.remove_html(author),
             "group"   : (text.remove_html(group) or
                          text.extract(group, ' alt="', '"')[0] or ""),
-            "date"    : extr('"icon-calendar"></i> ', '<'),
+            "date"    : text.parse_datetime(extr(
+                '"icon-calendar"></i> ', '<'), "%b %d, %Y"),
             "lang"    : "en",
             "language": "English",
         }
@@ -89,6 +90,22 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
         ]
 
 
+class DynastyscansMangaExtractor(DynastyscansBase, MangaExtractor):
+    chapterclass = DynastyscansChapterExtractor
+    reverse = False
+    pattern = BASE_PATTERN + r"(/series/[^/?#]+)"
+    test = ("https://dynasty-scans.com/series/hitoribocchi_no_oo_seikatsu", {
+        "pattern": DynastyscansChapterExtractor.pattern,
+        "count": ">= 100",
+    })
+
+    def chapters(self, page):
+        return [
+            (self.root + path, {})
+            for path in text.extract_iter(page, '<dd>\n<a href="', '"')
+        ]
+
+
 class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
     """Extrator for image search results on dynasty-scans.com"""
     subcategory = "search"
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index aabfe6b..7ffb214 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -122,7 +122,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             "date": "dt:2018-03-18 20:15:00",
             "eh_category": "Non-H",
             "expunged": False,
-            "favorites": "18",
+            "favorites": "19",
             "filecount": "4",
             "filesize": 1488978,
             "gid": 1200119,
@@ -239,7 +239,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             "title_jpn"   : text.unescape(extr('<h1 id="gj">', '</h1>')),
             "_"           : extr('<div id="gdc"><div class="cs ct', '"'),
             "eh_category" : extr('>', '<'),
-            "uploader"    : text.unquote(extr('/uploader/', '"')),
+            "uploader"    : extr('<div id="gdn">', '</div>'),
             "date"        : text.parse_datetime(extr(
                 '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
             "parent"      : extr(
@@ -255,6 +255,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             "torrentcount" : extr('>Torrent Download (', ')'),
         }
 
+        if data["uploader"].startswith("<"):
+            data["uploader"] = text.unescape(text.extract(
+                data["uploader"], ">", "<")[0])
+
         f = data["favorites"][0]
         if f == "N":
             data["favorites"] = "0"
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index d2c5e8f..6ddd689 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -122,7 +122,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
             "url": "d309713d2f838797096b3e9cb44fe514a9c9d07a",
         }),
         ("https://desuarchive.org/a/thread/159542679/", {
-            "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
+            "url": "2bddbe03b01b4630337f6916f6df36d1d443b7b8",
         }),
         ("https://boards.fireden.net/sci/thread/11264294/", {
             "url": "61cab625c95584a12a30049d054931d64f8d20aa",
@@ -131,10 +131,10 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
             "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
         }),
         ("https://rbt.asia/g/thread/61487650/", {
-            "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+            "url": "b4692707cddb4ad1c9ba1cde77c4703025cb86e5",
         }),
         ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
-            "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+            "url": "b4692707cddb4ad1c9ba1cde77c4703025cb86e5",
         }),
         ("https://thebarchive.com/b/thread/739772332/", {
             "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index e09e190..a42a202 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -27,8 +27,21 @@ class GelbooruV02Extractor(booru.BooruExtractor):
         params["pid"] = self.page_start
         params["limit"] = self.per_page
 
+        post = None
         while True:
-            root = self._api_request(params)
+            try:
+                root = self._api_request(params)
+            except ElementTree.ParseError:
+                if "tags" not in params or post is None:
+                    raise
+                taglist = [tag for tag in params["tags"].split()
+                           if not tag.startswith("id:<")]
+                taglist.append("id:<" + str(post.attrib["id"]))
+                params["tags"] = " ".join(taglist)
+                params["pid"] = 0
+                continue
+
+            post = None
             for post in root:
                 yield post.attrib
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index bf479ab..a1dd465 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -439,15 +439,27 @@ class InstagramTaggedExtractor(InstagramExtractor):
     test = ("https://www.instagram.com/instagram/tagged/", {
         "range": "1-16",
         "count": ">= 16",
+        "keyword": {
+            "tagged_owner_id" : "25025320",
+            "tagged_username" : "instagram",
+            "tagged_full_name": "Instagram",
+        },
     })
 
-    def posts(self):
+    def metadata(self):
         url = "{}/{}/".format(self.root, self.item)
-        user = self._extract_profile_page(url)
+        self.user = user = self._extract_profile_page(url)
+
+        return {
+            "tagged_owner_id" : user["id"],
+            "tagged_username" : user["username"],
+            "tagged_full_name": user["full_name"],
+        }
 
+    def posts(self):
         query_hash = "be13233562af2d229b008d2976b998b5"
-        variables = {"id": user["id"], "first": 50}
-        edge = self._get_edge_data(user, None)
+        variables = {"id": self.user["id"], "first": 50}
+        edge = self._get_edge_data(self.user, None)
         return self._pagination_graphql(query_hash, variables, edge)
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 2e1d0b2..6483278 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -14,7 +14,7 @@ from ..cache import cache
 import itertools
 import re
 
-BASE_PATTERN = r"(?:https?://)?kemono\.party"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?kemono\.party"
 USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
@@ -30,19 +30,20 @@ class KemonopartyExtractor(Extractor):
     def items(self):
         self._prepare_ddosguard_cookies()
 
-        find_inline = re.compile(
+        self._find_inline = re.compile(
             r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
             r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
-        skip_service = \
-            "patreon" if self.config("patreon-skip-file", True) else None
+        find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
+        generators = self._build_file_generators(self.config("files"))
         comments = self.config("comments")
+        username = dms = None
         if self.config("metadata"):
             username = text.unescape(text.extract(
                 self.request(self.user_url).text,
                 '<meta name="artist_name" content="', '"')[0])
-        else:
-            username = None
+        if self.config("dms"):
+            dms = True
 
         posts = self.posts()
         max_posts = self.config("max-posts")
@@ -51,31 +52,38 @@ class KemonopartyExtractor(Extractor):
         for post in posts:
 
-            files = []
-            append = files.append
-            file = post["file"]
-
-            if file:
-                file["type"] = "file"
-                if post["service"] != skip_service or not post["attachments"]:
-                    append(file)
-            for attachment in post["attachments"]:
-                attachment["type"] = "attachment"
-                append(attachment)
-            for path in find_inline(post["content"] or ""):
-                append({"path": path, "name": path, "type": "inline"})
-
             post["date"] = text.parse_datetime(
-                post["published"], "%a, %d %b %Y %H:%M:%S %Z")
+                post["published"] or post["added"],
+                "%a, %d %b %Y %H:%M:%S %Z")
             if username:
                 post["username"] = username
             if comments:
                 post["comments"] = self._extract_comments(post)
+            if dms is not None:
+                if dms is True:
+                    dms = self._extract_dms(post)
+                post["dms"] = dms
             yield Message.Directory, post
 
-            for post["num"], file in enumerate(files, 1):
-                post["type"] = file["type"]
+            hashes = set()
+            post["num"] = 0
+            for file in itertools.chain.from_iterable(
+                    g(post) for g in generators):
                 url = file["path"]
+
+                match = find_hash(url)
+                if match:
+                    post["hash"] = hash = match.group(1)
+                    if hash in hashes:
+                        self.log.debug("Skipping %s (duplicate)", url)
+                        continue
+                    hashes.add(hash)
+                else:
+                    post["hash"] = ""
+
+                post["type"] = file["type"]
+                post["num"] += 1
+
                 if url[0] == "/":
                     url = self.root + "/data" + url
                 elif url.startswith("https://kemono.party"):
@@ -103,6 +111,34 @@ class KemonopartyExtractor(Extractor):
 
         return {c.name: c.value for c in response.history[0].cookies}
 
+    def _file(self, post):
+        file = post["file"]
+        if not file:
+            return ()
+        file["type"] = "file"
+        return (file,)
+
+    def _attachments(self, post):
+        for attachment in post["attachments"]:
+            attachment["type"] = "attachment"
+        return post["attachments"]
+
+    def _inline(self, post):
+        for path in self._find_inline(post["content"] or ""):
+            yield {"path": path, "name": path, "type": "inline"}
+
+    def _build_file_generators(self, filetypes):
+        if filetypes is None:
+            return (self._file, self._attachments, self._inline)
+        genmap = {
+            "file"       : self._file,
+            "attachments": self._attachments,
+            "inline"     : self._inline,
+        }
+        if isinstance(filetypes, str):
+            filetypes = filetypes.split(",")
+        return [genmap[ft] for ft in filetypes]
+
     def _extract_comments(self, post):
         url = "{}/{}/user/{}/post/{}".format(
             self.root, post["service"], post["user"], post["id"])
@@ -121,6 +157,21 @@ class KemonopartyExtractor(Extractor):
             })
         return comments
 
+    def _extract_dms(self, post):
+        url = "{}/{}/user/{}/dms".format(
+            self.root, post["service"], post["user"])
+        page = self.request(url).text
+
+        dms = []
+        for dm in text.extract_iter(page, "<article", "</article>"):
+            dms.append({
+                "body": text.unescape(text.extract(
+                    dm, '<div class="dm-card__content">', '</div>',
+                )[0].strip()),
+                "date": text.extract(dm, 'datetime="', '"')[0],
+            })
+        return dms
+
 
 class KemonopartyUserExtractor(KemonopartyExtractor):
     """Extractor for all posts from a kemono.party user listing"""
@@ -175,6 +226,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
             "embed": dict,
             "extension": "jpeg",
             "filename": "P058kDFYus7DbqAkGlfWTlOr",
+            "hash": "210f35388e28bbcf756db18dd516e2d8"
+                    "2ce758e0d32881eeee76d43e1716d382",
             "id": "506575",
             "num": 1,
             "published": "Sun, 11 Aug 2019 02:09:04 GMT",
@@ -188,25 +241,39 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
         }),
         # inline image (#1286)
         ("https://kemono.party/fanbox/user/7356311/post/802343", {
-            "pattern": r"https://kemono\.party/data/inline/fanbox"
-                       r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",
+            "pattern": r"https://kemono\.party/data/47/b5/47b5c014ecdcfabdf2c8"
+                       r"5eec53f1133a76336997ae8596f332e97d956a460ad2\.jpg",
+            "keyword": {"hash": "47b5c014ecdcfabdf2c85eec53f1133a"
+                                "76336997ae8596f332e97d956a460ad2"},
         }),
         # kemono.party -> data.kemono.party
         ("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
-            "pattern": r"https://kemono\.party/data/(file|attachment)s"
-                       r"/gumroad/trylsc/IURjT/",
+            "pattern": r"https://kemono\.party/data/("
+                       r"files/gumroad/trylsc/IURjT/reward8\.jpg|"
+                       r"c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)",
         }),
         # username (#1548, #1652)
         ("https://kemono.party/gumroad/user/3252870377455/post/aJnAH", {
             "options": (("metadata", True),),
             "keyword": {"username": "Kudalyn's Creations"},
         }),
-        # skip patreon main file (#1667, #1689)
+        # skip patreon duplicates
         ("https://kemono.party/patreon/user/4158582/post/32099982", {
             "count": 2,
-            "keyword": {"type": "attachment"},
+        }),
+        # DMs (#2008)
+        ("https://kemono.party/patreon/user/34134344/post/38129255", {
+            "options": (("dms", True),),
+            "keyword": {"dms": [{
+                "body": r"re:Hi! Thank you very much for supporting the work I"
+                        r" did in May. Here's your reward pack! I hope you fin"
+                        r"d something you enjoy in it. :\)\n\nhttps://www.medi"
+                        r"afire.com/file/\w+/Set13_tier_2.zip/file",
+                "date": "2021-07-31 02:47:51.327865",
+            }]},
         }),
         ("https://kemono.party/subscribestar/user/alcorart/post/184330"),
+        ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index ff1d7c3..393f4e2 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -46,10 +46,10 @@ class MangadexExtractor(Extractor):
     def _transform(self, chapter):
         relationships = defaultdict(list)
         for item in chapter["relationships"]:
-            relationships[item["type"]].append(item["id"])
-        manga = self.api.manga(relationships["manga"][0])
+            relationships[item["type"]].append(item)
+        manga = self.api.manga(relationships["manga"][0]["id"])
         for item in manga["relationships"]:
-            relationships[item["type"]].append(item["id"])
+            relationships[item["type"]].append(item)
 
         cattributes = chapter["attributes"]
         mattributes = manga["attributes"]
@@ -75,16 +75,12 @@ class MangadexExtractor(Extractor):
             "count"   : len(cattributes["data"]),
         }
 
-        if self.config("metadata"):
-            data["artist"] = [
-                self.api.author(uuid)["attributes"]["name"]
-                for uuid in relationships["artist"]]
-            data["author"] = [
-                self.api.author(uuid)["attributes"]["name"]
-                for uuid in relationships["author"]]
-            data["group"] = [
-                self.api.group(uuid)["attributes"]["name"]
-                for uuid in relationships["scanlation_group"]]
+        data["artist"] = [artist["attributes"]["name"]
+                          for artist in relationships["artist"]]
+        data["author"] = [author["attributes"]["name"]
+                          for author in relationships["author"]]
+        data["group"] = [group["attributes"]["name"]
+                         for group in relationships["scanlation_group"]]
 
         return data
@@ -95,12 +91,11 @@ class MangadexChapterExtractor(MangadexExtractor):
     pattern = BASE_PATTERN + r"/chapter/([0-9a-f-]+)"
     test = (
        ("https://mangadex.org/chapter/f946ac53-0b71-4b5d-aeb2-7931b13c4aaa", {
-            "keyword": "f6c2b908df06eb834d56193dfe1fa1f7c2c4dccd",
+            "keyword": "86fb262cf767dac6d965cd904ad499adba466404",
             # "content": "50383a4c15124682057b197d40261641a98db514",
         }),
         # oneshot
         ("https://mangadex.org/chapter/61a88817-9c29-4281-bdf1-77b3c1be9831", {
-            "options": (("metadata", True),),
             "count": 64,
             "keyword": "6abcbe1e24eeb1049dc931958853cd767ee483fb",
         }),
@@ -147,6 +142,8 @@ class MangadexMangaExtractor(MangadexExtractor):
             "date"    : "type:datetime",
             "lang"    : str,
             "language": str,
+            "artist"  : ["Arakawa Hiromu"],
+            "author"  : ["Arakawa Hiromu"],
         },
     }),
     ("https://mangadex.cc/manga/d0c88e3b-ea64-4e07-9841-c1d2ac982f4a/", {
@@ -193,20 +190,14 @@ class MangadexAPI():
     def athome_server(self, uuid):
         return self._call("/at-home/server/" + uuid)
 
-    @memcache(keyarg=1)
-    def author(self, uuid):
-        return self._call("/author/" + uuid)["data"]
-
     def chapter(self, uuid):
-        return self._call("/chapter/" + uuid)["data"]
-
-    @memcache(keyarg=1)
-    def group(self, uuid):
-        return self._call("/group/" + uuid)["data"]
+        params = {"includes[]": ("scanlation_group",)}
+        return self._call("/chapter/" + uuid, params)["data"]
 
     @memcache(keyarg=1)
     def manga(self, uuid):
-        return self._call("/manga/" + uuid)["data"]
+        params = {"includes[]": ("artist", "author")}
+        return self._call("/manga/" + uuid, params)["data"]
 
     def manga_feed(self, uuid):
         order = "desc" if self.extractor.config("chapter-reverse") else "asc"
@@ -275,6 +266,7 @@ class MangadexAPI():
 
         ratings = ("safe", "suggestive", "erotica", "pornographic")
         params["contentRating[]"] = ratings
+        params["includes[]"] = ("scanlation_group",)
         params["translatedLanguage[]"] = config("lang")
         params["offset"] = 0
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index d45fbc9..1486057 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -122,18 +122,18 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
     def metadata(self, page):
         """Return general metadata"""
         extr = text.extract_from(page)
-        title = extr('<title>', '</title>')
-        count = extr('id="pic-count">', '<')
-        cid = extr('<img alt="', '"')
+        title = extr('<img id="cover-img" alt="', '"')
+        cid = extr('href="https://www.mangoxo.com/user/', '"')
+        cname = extr('<img alt="', '"')
         cover = extr(' src="', '"')
-        cname = extr('target="_blank">', '<')
-        date = extr('</i>', '<')
+        count = extr('id="pic-count">', '<')
+        date = extr('class="fa fa-calendar"></i>', '<')
         descr = extr('<pre>', '</pre>')
 
         return {
             "channel": {
                 "id": cid,
-                "name": text.unescape(cname.strip()),
+                "name": text.unescape(cname),
                 "cover": cover,
             },
             "album": {
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index d3b3bb1..51a0d38 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -62,6 +62,8 @@ INSTANCES = {
                    "filter_id": "56027"},
     "ponybooru" : {"root": "https://ponybooru.org",
                    "filter_id": "2"},
+    "furbooru"  : {"root": "https://furbooru.org",
+                   "filter_id": "2"},
 }
 
 BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
@@ -124,6 +126,9 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
         ("https://ponybooru.org/images/1", {
             "content": "bca26f58fafd791fe07adcd2a28efd7751824605",
         }),
+        ("https://furbooru.org/images/1", {
+            "content": "9eaa1e1b32fa0f16520912257dbefaff238d5fd2",
+        }),
     )
 
     def __init__(self, match):
@@ -157,6 +162,10 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
             "range": "40-60",
             "count": 21,
         }),
+        ("https://furbooru.org/search?q=cute", {
+            "range": "40-60",
+            "count": 21,
+        }),
     )
 
     def __init__(self, match):
@@ -210,6 +219,9 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor):
         ("https://ponybooru.org/galleries/27", {
             "count": ">= 24",
         }),
+        ("https://furbooru.org/galleries/27", {
+            "count": ">= 13",
+        }),
     )
 
     def __init__(self, match):
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 04fe581..b3a620a 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -8,29 +8,29 @@
 
 """Generic extractors for *reactor sites"""
 
-from .common import Extractor, Message
+from .common import BaseExtractor, Message
 from .. import text
 import urllib.parse
 import json
 
-BASE_PATTERN = r"(?:https?://)?((?:[^/.]+\.)?reactor\.cc)"
-
 
-class ReactorExtractor(Extractor):
+class ReactorExtractor(BaseExtractor):
     """Base class for *reactor.cc extractors"""
     basecategory = "reactor"
     filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
     archive_fmt = "{post_id}_{num}"
-    instances = ()
     request_interval = 5.0
 
     def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.root = "http://" + match.group(1)
+        BaseExtractor.__init__(self, match)
+        url = text.ensure_http_scheme(match.group(0), "http://")
+        pos = url.index("/", 10)
+
+        self.root, self.path = url[:pos], url[pos:]
         self.session.headers["Referer"] = self.root
         self.gif = self.config("gif", False)
 
-        if not self.category:
+        if self.category == "reactor":
             # set category based on domain name
             netloc = urllib.parse.urlsplit(self.root).netloc
             self.category = netloc.rpartition(".")[0]
@@ -50,7 +50,7 @@ class ReactorExtractor(Extractor):
 
     def posts(self):
         """Return all relevant post-objects"""
-        return self._pagination(self.url)
+        return self._pagination(self.root + self.path)
 
     def _pagination(self, url):
         while True:
@@ -145,91 +145,63 @@ class ReactorExtractor(Extractor):
         }
 
 
+BASE_PATTERN = ReactorExtractor.update({
+    "reactor"    : {
+        "root": "http://reactor.cc",
+        "pattern": r"(?:[^/.]+\.)?reactor\.cc",
+    },
+    "joyreactor" : {
+        "root": "http://joyreactor.cc",
+        "pattern": r"(?:www\.)?joyreactor\.c(?:c|om)",
+    },
+    "pornreactor": {
+        "root": "http://pornreactor.cc",
+        "pattern": r"(?:www\.)?(?:pornreactor\.cc|fapreactor.com)",
+    },
+    "thatpervert": {
+        "root": "http://thatpervert.com",
+    },
+})
+
+
 class ReactorTagExtractor(ReactorExtractor):
     """Extractor for tag searches on *reactor.cc sites"""
     subcategory = "tag"
     directory_fmt = ("{category}", "{search_tags}")
     archive_fmt = "{search_tags}_{post_id}_{num}"
     pattern = BASE_PATTERN + r"/tag/([^/?#]+)"
-    test = ("http://anime.reactor.cc/tag/Anime+Art",)
+    test = (
+        ("http://reactor.cc/tag/gif"),
+        ("http://anime.reactor.cc/tag/Anime+Art"),
+        ("http://joyreactor.cc/tag/Advent+Cirno", {
+            "count": ">= 15",
+        }),
+        ("http://joyreactor.com/tag/Cirno", {
+            "url": "aa59090590b26f4654881301fe8fe748a51625a8",
+        }),
+        ("http://pornreactor.cc/tag/RiceGnat", {
+            "range": "1-25",
+            "count": ">= 25",
+        }),
+        ("http://fapreactor.com/tag/RiceGnat"),
+    )
 
     def __init__(self, match):
         ReactorExtractor.__init__(self, match)
-        self.tag = match.group(2)
+        self.tag = match.group(match.lastindex)
 
     def metadata(self):
         return {"search_tags": text.unescape(self.tag).replace("+", " ")}
 
 
-class ReactorSearchExtractor(ReactorTagExtractor):
+class ReactorSearchExtractor(ReactorExtractor):
     """Extractor for search results on *reactor.cc sites"""
     subcategory = "search"
     directory_fmt = ("{category}", "search", "{search_tags}")
     archive_fmt = "s_{search_tags}_{post_id}_{num}"
     pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
-    test = ("http://anime.reactor.cc/search?q=Art",)
-
-
-class ReactorUserExtractor(ReactorExtractor):
-    """Extractor for all posts of a user on *reactor.cc sites"""
-    subcategory = "user"
-    directory_fmt = ("{category}", "user", "{user}")
-    pattern = BASE_PATTERN + r"/user/([^/?#]+)"
-    test = ("http://anime.reactor.cc/user/Shuster",)
-
-    def __init__(self, match):
-        ReactorExtractor.__init__(self, match)
-        self.user = match.group(2)
-
-    def metadata(self):
-        return {"user": text.unescape(self.user).replace("+", " ")}
-
-
-class ReactorPostExtractor(ReactorExtractor):
-    """Extractor for single posts on *reactor.cc sites"""
-    subcategory = "post"
-    pattern = BASE_PATTERN + r"/post/(\d+)"
-    test = ("http://anime.reactor.cc/post/3576250",)
-
-    def __init__(self, match):
-        ReactorExtractor.__init__(self, match)
-        self.post_id = match.group(2)
-
-    def items(self):
-        post = self.request(self.url).text
-        pos = post.find('class="uhead">')
-        for image in self._parse_post(post[pos:]):
-            if image["num"] == 1:
-                yield Message.Directory, image
-            url = image["url"]
-            yield Message.Url, url, text.nameext_from_url(url, image)
-
-
-# --------------------------------------------------------------------
-# JoyReactor
-
-JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))"
-
-
-class JoyreactorTagExtractor(ReactorTagExtractor):
-    """Extractor for tag searches on joyreactor.cc"""
-    category = "joyreactor"
-    pattern = JR_BASE_PATTERN + r"/tag/([^/?#]+)"
-    test = (
-        ("http://joyreactor.cc/tag/Advent+Cirno", {
-            "count": ">= 15",
-        }),
-        ("http://joyreactor.com/tag/Cirno", {
-            "url": "aa59090590b26f4654881301fe8fe748a51625a8",
-        }),
-    )
-
-
-class JoyreactorSearchExtractor(ReactorSearchExtractor):
-    """Extractor for search results on joyreactor.cc"""
-    category = "joyreactor"
-    pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
     test = (
+        ("http://reactor.cc/search?q=Art"),
         ("http://joyreactor.cc/search/Nature", {
             "range": "1-25",
             "count": ">= 20",
@@ -238,26 +210,54 @@ class JoyreactorSearchExtractor(ReactorSearchExtractor):
             "range": "1-25",
             "count": ">= 20",
         }),
+        ("http://pornreactor.cc/search?q=ecchi+hentai"),
+        ("http://fapreactor.com/search/ecchi+hentai"),
     )
 
+    def __init__(self, match):
+        ReactorExtractor.__init__(self, match)
+        self.tag = match.group(match.lastindex)
+
+    def metadata(self):
+        return {"search_tags": text.unescape(self.tag).replace("+", " ")}
+
 
-class JoyreactorUserExtractor(ReactorUserExtractor):
-    """Extractor for all posts of a user on joyreactor.cc"""
-    category = "joyreactor"
-    pattern = JR_BASE_PATTERN + r"/user/([^/?#]+)"
+class ReactorUserExtractor(ReactorExtractor):
+    """Extractor for all posts of a user on *reactor.cc sites"""
+    subcategory = "user"
+    directory_fmt = ("{category}", "user", "{user}")
+    pattern = BASE_PATTERN + r"/user/([^/?#]+)"
     test = (
+        ("http://reactor.cc/user/Dioklet"),
+        ("http://anime.reactor.cc/user/Shuster"),
         ("http://joyreactor.cc/user/hemantic"),
         ("http://joyreactor.com/user/Tacoman123", {
             "url": "60ce9a3e3db791a0899f7fb7643b5b87d09ae3b5",
         }),
+        ("http://pornreactor.cc/user/Disillusion", {
+            "range": "1-25",
+            "count": ">= 20",
+        }),
+        ("http://fapreactor.com/user/Disillusion"),
     )
 
+    def __init__(self, match):
+        ReactorExtractor.__init__(self, match)
+        self.user = match.group(match.lastindex)
+
+    def metadata(self):
+        return {"user": text.unescape(self.user).replace("+", " ")}
+
 
-class JoyreactorPostExtractor(ReactorPostExtractor):
-    """Extractor for single posts on joyreactor.cc"""
-    category = "joyreactor"
-    pattern = JR_BASE_PATTERN + r"/post/(\d+)"
+class ReactorPostExtractor(ReactorExtractor):
+    """Extractor for single posts on *reactor.cc sites"""
+    subcategory = "post"
+    pattern = BASE_PATTERN + r"/post/(\d+)"
     test = (
+        ("http://reactor.cc/post/4999736", {
+            "url": "dfc74d150d7267384d8c229c4b82aa210755daa0",
+        }),
+        ("http://anime.reactor.cc/post/3576250"),
         ("http://joyreactor.com/post/3721876", {  # single image
             "pattern": r"http://img\d\.joyreactor\.com/pics/post/full"
                        r"/cartoon-painting-monster-lake-4841316.jpeg",
@@ -281,57 +281,6 @@ class JoyreactorPostExtractor(ReactorPostExtractor):
         ("http://joyreactor.cc/post/1299", {  # "malformed" JSON
             "url": "ab02c6eb7b4035ad961b29ee0770ee41be2fcc39",
         }),
-    )
-
-
-# --------------------------------------------------------------------
-# PornReactor
-
-PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)"
-
-
-class PornreactorTagExtractor(ReactorTagExtractor):
-    """Extractor for tag searches on pornreactor.cc"""
-    category = "pornreactor"
-    pattern = PR_BASE_PATTERN + r"/tag/([^/?#]+)"
-    test = (
-        ("http://pornreactor.cc/tag/RiceGnat", {
-            "range": "1-25",
-            "count": ">= 25",
-        }),
-        ("http://fapreactor.com/tag/RiceGnat"),
-    )
-
-
-class PornreactorSearchExtractor(ReactorSearchExtractor):
-    """Extractor for search results on pornreactor.cc"""
-    category = "pornreactor"
-    pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
-    test = (
-        ("http://pornreactor.cc/search?q=ecchi+hentai"),
-        ("http://fapreactor.com/search/ecchi+hentai"),
-    )
-
-
-class PornreactorUserExtractor(ReactorUserExtractor):
-    """Extractor for all posts of a user on pornreactor.cc"""
-    category = "pornreactor"
-    pattern = PR_BASE_PATTERN + r"/user/([^/?#]+)"
-    test = (
-        ("http://pornreactor.cc/user/Disillusion", {
-            "range": "1-25",
-            "count": ">= 20",
-        }),
-        ("http://fapreactor.com/user/Disillusion"),
-    )
-
-
-class PornreactorPostExtractor(ReactorPostExtractor):
-    """Extractor for single posts on pornreactor.cc"""
-    category = "pornreactor"
-    subcategory = "post"
-    pattern = PR_BASE_PATTERN + r"/post/(\d+)"
-    test = (
         ("http://pornreactor.cc/post/863166", {
             "url": "a09fb0577489e1f9564c25d0ad576f81b19c2ef3",
             "content": "ec6b0568bfb1803648744077da082d14de844340",
@@ -340,3 +289,16 @@ class PornreactorPostExtractor(ReactorPostExtractor):
             "url": "2a956ce0c90e8bc47b4392db4fa25ad1342f3e54",
         }),
     )
+
+    def __init__(self, match):
+        ReactorExtractor.__init__(self, match)
+        self.post_id = match.group(match.lastindex)
+
+    def items(self):
+        post = self.request(self.root + self.path).text
+        pos = post.find('class="uhead">')
+        for image in self._parse_post(post[pos:]):
+            if image["num"] == 1:
+                yield Message.Directory, image
+            url = image["url"]
+            yield Message.Url, url, text.nameext_from_url(url, image)
diff --git a/gallery_dl/extractor/seisoparty.py b/gallery_dl/extractor/seisoparty.py
deleted file mode 100644
index a2a24e0..0000000
--- a/gallery_dl/extractor/seisoparty.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://seiso.party/"""
-
-from .common import Extractor, Message
-from .. import text, exception
-from ..cache import cache
-import re
-
-
-class SeisopartyExtractor(Extractor):
-    """Base class for seisoparty extractors"""
-    category = "seisoparty"
-    root = "https://seiso.party"
-    directory_fmt = ("{category}", "{service}", "{username}")
-    filename_fmt = "{id}_{title}_{num:>02}_{filename}.{extension}"
-    archive_fmt = "{service}_{user}_{id}_{num}"
-    cookiedomain = ".seiso.party"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.user_name = None
-        self._find_files = re.compile(
-            r'href="(https://cdn(?:-\d)?\.seiso\.party/files/[^"]+)').findall
-
-    def items(self):
-        self._prepare_ddosguard_cookies()
-
-        for post in self.posts():
-            files = post.pop("files")
-            yield Message.Directory, post
-            for post["num"], url in enumerate(files, 1):
-                yield Message.Url, url, text.nameext_from_url(url, post)
-
-    def _parse_post(self, page, post_id):
-        extr = text.extract_from(page)
-        return {
-            "service" : self.service,
-            "user"    : self.user_id,
-            "username": self.user_name,
-            "id"      : post_id,
-            "date"    : text.parse_datetime(extr(
-                '<div class="margin-bottom-15 minor-text">', '<'),
-                "%Y-%m-%d %H:%M:%S %Z"),
-            "title"   : text.unescape(extr('class="post-title">', '<')),
-            "content" : text.unescape(extr("\n<p>\n", "\n</p>\n").strip()),
-            "files"   : self._find_files(page),
-        }
-
-    def login(self):
-        username, password = self._get_auth_info()
-        if username:
-            self._update_cookies(self._login_impl(username, password))
-
-    @cache(maxage=28*24*3600, keyarg=1)
-    def _login_impl(self, username, password):
-        self.log.info("Logging in as %s", username)
-
-        url = self.root + "/account/login"
-        data = {"username": username, "password": password}
-
-        response = self.request(url, method="POST", data=data)
-        if response.url.endswith("/account/login") and \
-                "Username or password is incorrect" in response.text:
-            raise exception.AuthenticationError()
-
-        return {c.name: c.value for c in response.history[0].cookies}
-
-
-class SeisopartyUserExtractor(SeisopartyExtractor):
-    """Extractor for all posts from a seiso.party user listing"""
-    subcategory = "user"
-    pattern = r"(?:https?://)?seiso\.party/artists/([^/?#]+)/([^/?#]+)"
-    test = (
-        ("https://seiso.party/artists/fanbox/21", {
-            "pattern": r"https://cdn\.seiso\.party/files/fanbox/\d+/",
-            "count": ">=15",
-            "keyword": {
-                "content": str,
-                "date": "type:datetime",
-                "id": r"re:\d+",
-                "num": int,
-                "service": "fanbox",
-                "title": str,
-                "user": "21",
-                "username": "雨",
-            },
-        }),
-    )
-
-    def __init__(self, match):
-        SeisopartyExtractor.__init__(self, match)
-        self.service, self.user_id = match.groups()
-
-    def posts(self):
-        url = "{}/artists/{}/{}".format(self.root, self.service, self.user_id)
-        page = self.request(url).text
-        self.user_name, pos = text.extract(page, '<span class="title">', '<')
-
-        url = self.root + text.extract(
-            page, 'href="', '"', page.index('id="content"', pos))[0]
-        response = self.request(url)
-        headers = {"Referer": url}
-
-        while True:
-            yield self._parse_post(response.text, url.rpartition("/")[2])
-            response = self.request(url + "/next", headers=headers)
-            if url == response.url:
-                return
-            url = headers["Referer"] = response.url
-
-
-class SeisopartyPostExtractor(SeisopartyExtractor):
-    """Extractor for a single seiso.party post"""
-    subcategory = "post"
-    pattern = r"(?:https?://)?seiso\.party/post/([^/?#]+)/([^/?#]+)/([^/?#]+)"
-    test = (
-        ("https://seiso.party/post/fanbox/21/371", {
-            "url": "75f13b92de0ce399b6163c3de18f1f36011c2366",
-            "count": 2,
-            "keyword": {
-                "content": "この前描いためぐるちゃんのPSDファイルです。<br/>"
-                           "どうぞよろしくお願いします。",
-                "date": "dt:2021-05-06 12:38:31",
-                "extension": "re:psd|jpg",
-                "filename": "re:backcourt|ffb2ccb7a3586d05f9a4620329dd131e",
-                "id": "371",
-                "num": int,
-                "service": "fanbox",
-                "title": "MEGURU.PSD",
-                "user": "21",
-                "username": "雨",
-            },
-        }),
-        ("https://seiso.party/post/patreon/429/95949", {
-            "pattern": r"https://cdn-2\.seiso\.party/files/patreon/95949/",
-            "count": 2,
-        }),
-    )
-
-    def __init__(self, match):
-        SeisopartyExtractor.__init__(self, match)
-        self.service, self.user_id, self.post_id = match.groups()
-
-    def posts(self):
-        url = "{}/artists/{}/{}".format(self.root, self.service, self.user_id)
-        page = self.request(url).text
-        self.user_name, pos = text.extract(page, '<span class="title">', '<')
-
-        url = "{}/post/{}/{}/{}".format(
-            self.root, self.service, self.user_id, self.post_id)
-        return (self._parse_post(self.request(url).text, self.post_id),)
-
-
-class SeisopartyFavoriteExtractor(SeisopartyExtractor):
-    """Extractor for seiso.party favorites"""
-    subcategory = "favorite"
-    pattern = r"(?:https?://)?seiso\.party/favorites/artists/?(?:\?([^#]+))?"
-    test = (
-        ("https://seiso.party/favorites/artists", {
-            "pattern": SeisopartyUserExtractor.pattern,
-            "url": "0c862434bc3bbbe84cbf41c3a6152473a8cde683",
-            "count": 3,
-        }),
-        ("https://seiso.party/favorites/artists?sort=id&sort_direction=asc", {
-            "url": "629a8b9c6d3a8a64f521908bdb3d7426ac03f8d3",
-        }),
-    )
-
-    def __init__(self, match):
-        SeisopartyExtractor.__init__(self, match)
-        self.query = match.group(1)
-
-    def items(self):
-        self._prepare_ddosguard_cookies()
-        self.login()
-
-        url = self.root + "/favorites/artists"
-        data = {"_extractor": SeisopartyUserExtractor}
-        params = text.parse_query(self.query)
-        params["page"] = text.parse_int(params.get("page"), 1)
-
-        while True:
-            page = self.request(url, params=params).text
-
-            cnt = 0
-            for card in text.extract_iter(
-                    page, '<div class="artist-card', '</a>'):
-                path = text.extract(card, '<a href="', '"')[0]
-                yield Message.Queue, self.root + path, data
-                cnt += 1
-
-            if cnt < 25:
-                return
-            params["page"] += 1
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index 6d924de..f276e84 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -53,6 +53,10 @@ BASE_PATTERN = ShopifyExtractor.update({
     "windsorstore": {
         "root": "https://www.windsorstore.com",
     },
+    "loungeunderwear": {
+        "root": "https://loungeunderwear.com",
+        "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com",
+    },
 })
@@ -70,6 +74,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
         ("https://www.fashionnova.com/collections/mini-dresses#1"),
         ("https://www.omgmiamiswimwear.com/collections/fajas"),
         ("https://www.windsorstore.com/collections/dresses-ball-gowns"),
+        ("https://loungeunderwear.com/collections/apparel"),
     )
 
     def metadata(self):
@@ -105,6 +110,7 @@ class ShopifyProductExtractor(ShopifyExtractor):
         ("https://www.fashionnova.com/collections/flats/products/name"),
         ("https://www.windsorstore.com/collections/accessories-belts/products"
          "/rhine-buckle-dbl-o-ring-pu-strap-belt-073010158001"),
+        ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"),
     )
 
     def products(self):
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index c1a8878..2c806ad 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -21,6 +21,7 @@ class SkebExtractor(Extractor):
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.user_name = match.group(1)
+        self.thumbnails = self.config("thumbnails", False)
 
     def items(self):
         for post_num in self.posts():
@@ -94,7 +95,7 @@ class SkebExtractor(Extractor):
         return resp, post
 
     def _get_urls_from_post(self, resp, post):
-        if "og_image_url" in resp:
+        if self.thumbnails and "og_image_url" in resp:
             post["content_category"] = "thumb"
             post["file_id"] = "thumb"
             post["file_url"] = resp["og_image_url"]
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index ae8b58d..69e3854 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -38,12 +38,11 @@ class SubscribestarExtractor(Extractor):
         self.login()
         for post_html in self.posts():
             media = self._media_from_post(post_html)
-            if not media:
-                continue
             data = self._data_from_post(post_html)
             yield Message.Directory, data
-            for item in media:
+            for num, item in enumerate(media, 1):
                 item.update(data)
+                item["num"] = num
                 text.nameext_from_url(item.get("name") or item["url"], item)
                 yield Message.Url, item["url"], item
@@ -140,8 +139,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
     test = (
         ("https://www.subscribestar.com/subscribestar", {
             "count": ">= 20",
-            "pattern": r"https://(star-uploads|ss-uploads-prod)\.s\d+-us-west-"
-                       r"\d+\.amazonaws\.com/uploads(_v2)?/users/11/",
+            "pattern": r"https://\w+\.cloudfront\.net/uploads(_v2)?/users/11/",
             "keyword": {
                 "author_id": 11,
                 "author_name": "subscribestar",
@@ -149,6 +147,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
                 "content": str,
                 "date"   : "type:datetime",
                 "id"     : int,
+                "num"    : int,
                 "post_id": int,
                 "type"   : "re:image|video|attachment",
                 "url"    : str,
@@ -190,7 +189,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
     pattern = BASE_PATTERN + r"/posts/(\d+)"
     test = (
         ("https://www.subscribestar.com/posts/102468", {
-            "url": "612da5a98af056dd78dc846fbcfa705e721f6675",
+            "count": 1,
             "keyword": {
                 "author_id": 11,
                 "author_name": "subscribestar",
@@ -202,6 +201,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
                 "group": "imgs_and_videos",
                 "height": 291,
                 "id": 203885,
+                "num": 1,
                 "pinned": False,
                 "post_id": 102468,
                 "type": "image",
@@ -209,7 +209,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
             },
         }),
         ("https://subscribestar.adult/posts/22950", {
-            "url": "440d745a368e6b3e218415f593a5045f384afa0d",
+            "count": 1,
             "keyword": {"date": "dt:2019-04-28 07:32:00"},
         }),
     )
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 00f3b04..f1c392d 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -41,7 +41,9 @@ class TwitterExtractor(Extractor):
         self.videos = self.config("videos", True)
         self.cards = self.config("cards", False)
         self._user_cache = {}
+        self._init_sizes()
 
+    def _init_sizes(self):
         size = self.config("size")
         if size is None:
             self._size_image = "orig"
@@ -580,13 +582,17 @@ class TwitterImageExtractor(Extractor):
     subcategory = "image"
     pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
     test = (
-        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg%name=orig"),
+        ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg&name=orig", {
+            "options": (("size", "4096x4096,orig"),),
+            "url": "cb3042a6f6826923da98f0d2b66c427e9385114c",
+        }),
         ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
     )
 
     def __init__(self, match):
         Extractor.__init__(self, match)
         self.id, self.fmt = match.groups()
+        TwitterExtractor._init_sizes(self)
 
     def items(self):
         base = "https://pbs.twimg.com/media/{}?format={}&name=".format(
@@ -595,11 +601,11 @@ class TwitterImageExtractor(Extractor):
         data = {
             "filename": self.id,
             "extension": self.fmt,
-            "_fallback": TwitterExtractor._image_fallback(base),
+            "_fallback": TwitterExtractor._image_fallback(self, base),
         }
 
         yield Message.Directory, data
-        yield Message.Url, base + "orig", data
+        yield Message.Url, base + self._size_image, data
 
 
 class TwitterAPI():
@@ -793,16 +799,21 @@ class TwitterAPI():
         data = response.json()
         if "errors" in data:
             try:
-                msg = ", ".join(
-                    '"' + error["message"] + '"'
-                    for error in data["errors"]
-                )
+                errors, warnings = [], []
+                for error in data["errors"]:
+                    if error.get("kind") == "NonFatal":
+                        warnings.append(error["message"])
+                    else:
+                        errors.append(error["message"])
+                errors = ", ".join(errors)
             except Exception:
-                msg = data["errors"]
-            if msg and response.status_code < 400:
-                raise exception.StopExtraction(msg)
+                errors = data["errors"]
+            if warnings:
+                self.extractor.log.warning(", ".join(warnings))
+            if errors and response.status_code < 400:
+                raise exception.StopExtraction(errors)
         else:
-            msg = ""
+            errors = ""
 
         if response.status_code < 400:
             # success
@@ -816,7 +827,7 @@ class TwitterAPI():
                 continue
 
             if response.status_code == 401 and \
-                    "have been blocked from viewing" in msg:
+                    "have been blocked from viewing" in errors:
                 # account blocked
                 extr = self.extractor
                 if self.headers["x-twitter-auth-type"] and \
@@ -833,7 +844,7 @@ class TwitterAPI():
 
             # error
             raise exception.StopExtraction(
-                "%s %s (%s)", response.status_code, response.reason, msg)
+                "%s %s (%s)", response.status_code, response.reason, errors)
 
     def _pagination(self, endpoint, params=None):
         if params is None:
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index e2474c9..cf5b192 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -48,7 +48,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
     test = (
         (("https://www.webtoons.com/en/comedy/safely-endangered"
           "/ep-572-earth/viewer?title_no=352&episode_no=572"), {
-            "url": "11041d71a3f92728305c11a228e77cf0f7aa02ef",
+            "url": "55bec5d7c42aba19e3d0d56db25fdf0b0b13be38",
             "content": ("1748c7e82b6db910fa179f6dc7c4281b0f680fa7",
                         "42055e44659f6ffc410b3fb6557346dfbb993df3",
                         "49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9"),
@@ -62,7 +62,6 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
         url = "{}/{}/viewer?{}".format(self.root, self.path, query)
         GalleryExtractor.__init__(self, match, url)
         self.setup_agegate_cookies()
-        self.session.headers["Referer"] = url
 
         query = text.parse_query(query)
         self.title_no = query.get("title_no")
@@ -88,7 +87,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
     @staticmethod
     def images(page):
         return [
-            (url, None)
+            (url.replace("://webtoon-phinf.", "://swebtoon-phinf."), None)
             for url in text.extract_iter(
                 page, 'class="_images" data-url="', '"')
         ]
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 0922c7c..0a55532 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -32,8 +32,8 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor):
     test = (
         ("https://www.xvideos.com/profiles/pervertedcouple/photos/751031", {
             "count": 8,
-            "pattern": r"https://profile-pics-l3\.xvideos-cdn\.com"
-                       r"/[0-9a-f]{40}-\d+/videos/profiles/galleries/84/ca/37"
+            "pattern": r"https://profile-pics-cdn\d+\.xvideos-cdn\.com"
+                       r"/[^/]+\,\d+/videos/profiles/galleries/84/ca/37"
                       r"/pervertedcouple/gal751031/pic_\d+_big\.jpg",
             "keyword": {
                 "gallery": {
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index d380dab..8eb0c83 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -9,7 +9,7 @@
 """Extractors for sites supported by youtube-dl"""
 
 from .common import Extractor, Message
-from .. import text, config, exception
+from .. import ytdl, config, exception
 
 
 class YoutubeDLExtractor(Extractor):
@@ -54,52 +54,45 @@ class YoutubeDLExtractor(Extractor):
         self.log.debug("Using %s", ytdl_module)
 
         # construct YoutubeDL object
-        options = {
-            "format"             : self.config("format"),
+        extr_opts = {
+            "extract_flat"           : "in_playlist",
+            "force_generic_extractor": self.force_generic_extractor,
+        }
+        user_opts = {
             "retries"            : self._retries,
             "socket_timeout"     : self._timeout,
             "nocheckcertificate" : not self._verify,
-            "proxy"              : self.session.proxies.get("http"),
-            "force_generic_extractor": self.force_generic_extractor,
-            "nopart"             : not self.config("part", True),
-            "updatetime"         : self.config("mtime", True),
-            "ratelimit"          : text.parse_bytes(
-                self.config("rate"), None),
-            "min_filesize"       : text.parse_bytes(
-                self.config("filesize-min"), None),
-            "max_filesize"       : text.parse_bytes(
-                self.config("filesize-max"), None),
         }
 
-        raw_options = self.config("raw-options")
-        if raw_options:
-            options.update(raw_options)
-        if self.config("logging", True):
-            options["logger"] = self.log
-        options["extract_flat"] = "in_playlist"
-
         username, password = self._get_auth_info()
         if username:
-            options["username"], options["password"] = username, password
+            user_opts["username"], user_opts["password"] = username, password
         del username, password
 
-        ytdl = ytdl_module.YoutubeDL(options)
+        ytdl_instance = ytdl.construct_YoutubeDL(
+            ytdl_module, self, user_opts, extr_opts)
 
         # transfer cookies to ytdl
         cookies = self.session.cookies
         if cookies:
-            set_cookie = self.ytdl.cookiejar.set_cookie
-            for cookie in self.session.cookies:
+            set_cookie = ytdl_instance.cookiejar.set_cookie
+            for cookie in cookies:
                 set_cookie(cookie)
 
         # extract youtube_dl info_dict
-        info_dict = ytdl._YoutubeDL__extract_info(
-            self.ytdl_url,
-            ytdl.get_info_extractor(self.ytdl_ie_key),
-            False, {}, True)
-
-        if "entries" in info_dict:
-            results = self._process_entries(ytdl, info_dict["entries"])
+        try:
+            info_dict = ytdl_instance._YoutubeDL__extract_info(
+                self.ytdl_url,
+                ytdl_instance.get_info_extractor(self.ytdl_ie_key),
+                False, {}, True)
+        except ytdl_module.utils.YoutubeDLError:
+            raise exception.StopExtraction("Failed to extract video data")
+
+        if not info_dict:
+            return
+        elif "entries" in info_dict:
+            results = self._process_entries(
+                ytdl_module, ytdl_instance, info_dict["entries"])
         else:
             results = (info_dict,)
@@ -107,7 +100,7 @@ class YoutubeDLExtractor(Extractor):
         for info_dict in results:
             info_dict["extension"] = None
             info_dict["_ytdl_info_dict"] = info_dict
-            info_dict["_ytdl_instance"] = ytdl
+            info_dict["_ytdl_instance"] = ytdl_instance
 
             url = "ytdl:" + (info_dict.get("url") or
                              info_dict.get("webpage_url") or
@@ -116,15 +109,23 @@ class YoutubeDLExtractor(Extractor):
             yield Message.Directory, info_dict
             yield Message.Url, url, info_dict
 
-    def _process_entries(self, ytdl, entries):
+    def _process_entries(self, ytdl_module, ytdl_instance, entries):
         for entry in entries:
-            if entry.get("_type") in ("url", "url_transparent"):
-                info_dict = ytdl.extract_info(
-                    entry["url"], False,
-                    ie_key=entry.get("ie_key"))
-                if "entries" in info_dict:
+            if not entry:
+                continue
+            elif entry.get("_type") in ("url", "url_transparent"):
+                try:
+                    info_dict = ytdl_instance.extract_info(
+                        entry["url"], False,
+                        ie_key=entry.get("ie_key"))
+                except ytdl_module.utils.YoutubeDLError:
+                    continue
+
+                if not info_dict:
+                    continue
+                elif "entries" in info_dict:
                     yield from self._process_entries(
-                        ytdl, info_dict["entries"])
+                        ytdl_module, ytdl_instance, info_dict["entries"])
                 else:
                     yield info_dict
             else:
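The two sketches below restate, outside the diff, the more reusable techniques this release introduces. They are illustrations only: `unique_files`, `fetch_page`, and `PageDepthError` are hypothetical stand-ins, not gallery-dl APIs.

The kemonoparty rewrite above deduplicates downloads by the SHA-256 hash that kemono.party embeds in its CDN paths (see `find_hash` in the diff). A minimal standalone sketch of that idea:

```python
import re

# Matches the 64-hex-digit hash in CDN paths such as
# /aa/bb/<64 hex chars>.jpg (same regex as in the diff).
find_hash = re.compile(r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match


def unique_files(paths):
    """Yield paths, skipping any whose embedded hash was already seen."""
    seen = set()
    for path in paths:
        match = find_hash(path)
        if match:
            digest = match.group(1)
            if digest in seen:
                continue  # same content served under another name
            seen.add(digest)
        yield path  # paths without a hash are always kept


if __name__ == "__main__":
    sample = [
        "/21/0f/" + "ab" * 32 + ".jpg",
        "/21/0f/" + "ab" * 32 + ".jpg",  # duplicate hash -> skipped
        "/inline/some-image.png",        # no hash -> kept
    ]
    print(list(unique_files(sample)))
```

The gelbooru_v02 change sidesteps the API's pagination depth limit by narrowing the query instead of paging deeper: on a failed request it appends an `id:<last-seen-id` tag and restarts from page 0 (this works because results arrive in descending id order). A sketch of the same pattern, assuming a `fetch_page` callable that raises `PageDepthError` when the backend refuses a deep page:

```python
class PageDepthError(Exception):
    """Raised by the hypothetical fetch_page() on a too-deep page request."""


def paginate_by_id(fetch_page, tags):
    """Yield every post for `tags`, narrowing the query with an id:<N
    filter and restarting whenever deep pagination fails."""
    params = {"tags": tags, "pid": 0}
    last_post = None
    while True:
        try:
            posts = fetch_page(params)
        except PageDepthError:
            if last_post is None:
                raise  # failed on the very first page; nothing to narrow
            # replace any previous id:< filter with one below the
            # smallest id seen so far, then start over from page 0
            taglist = [t for t in params["tags"].split()
                       if not t.startswith("id:<")]
            taglist.append("id:<" + str(last_post["id"]))
            params["tags"] = " ".join(taglist)
            params["pid"] = 0
            continue
        if not posts:
            return
        for last_post in posts:
            yield last_post
        params["pid"] += 1
```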
