diff options
Diffstat (limited to 'gallery_dl/extractor/kemonoparty.py')
| -rw-r--r-- | gallery_dl/extractor/kemonoparty.py | 125 |
1 files changed, 96 insertions, 29 deletions
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 2e1d0b2..6483278 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -14,7 +14,7 @@ from ..cache import cache
 import itertools
 import re
 
-BASE_PATTERN = r"(?:https?://)?kemono\.party"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?kemono\.party"
 USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
 
 
@@ -30,19 +30,20 @@ class KemonopartyExtractor(Extractor):
     def items(self):
         self._prepare_ddosguard_cookies()
 
-        find_inline = re.compile(
+        self._find_inline = re.compile(
             r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
             r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
-        skip_service = \
-            "patreon" if self.config("patreon-skip-file", True) else None
+        find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
+        generators = self._build_file_generators(self.config("files"))
         comments = self.config("comments")
+        username = dms = None
 
         if self.config("metadata"):
             username = text.unescape(text.extract(
                 self.request(self.user_url).text,
                 '<meta name="artist_name" content="', '"')[0])
-        else:
-            username = None
+        if self.config("dms"):
+            dms = True
 
         posts = self.posts()
         max_posts = self.config("max-posts")
@@ -51,31 +52,38 @@ class KemonopartyExtractor(Extractor):
 
         for post in posts:
 
-            files = []
-            append = files.append
-            file = post["file"]
-
-            if file:
-                file["type"] = "file"
-                if post["service"] != skip_service or not post["attachments"]:
-                    append(file)
-            for attachment in post["attachments"]:
-                attachment["type"] = "attachment"
-                append(attachment)
-            for path in find_inline(post["content"] or ""):
-                append({"path": path, "name": path, "type": "inline"})
-
             post["date"] = text.parse_datetime(
-                post["published"], "%a, %d %b %Y %H:%M:%S %Z")
+                post["published"] or post["added"],
+                "%a, %d %b %Y %H:%M:%S %Z")
             if username:
                 post["username"] = username
             if comments:
                 post["comments"] = self._extract_comments(post)
+            if dms is not None:
+                if dms is True:
+                    dms = self._extract_dms(post)
+                post["dms"] = dms
             yield Message.Directory, post
 
-            for post["num"], file in enumerate(files, 1):
-                post["type"] = file["type"]
+            hashes = set()
+            post["num"] = 0
+            for file in itertools.chain.from_iterable(
+                    g(post) for g in generators):
                 url = file["path"]
+
+                match = find_hash(url)
+                if match:
+                    post["hash"] = hash = match.group(1)
+                    if hash in hashes:
+                        self.log.debug("Skipping %s (duplicate)", url)
+                        continue
+                    hashes.add(hash)
+                else:
+                    post["hash"] = ""
+
+                post["type"] = file["type"]
+                post["num"] += 1
+
                 if url[0] == "/":
                     url = self.root + "/data" + url
                 elif url.startswith("https://kemono.party"):
@@ -103,6 +111,34 @@ class KemonopartyExtractor(Extractor):
 
         return {c.name: c.value for c in response.history[0].cookies}
 
+    def _file(self, post):
+        file = post["file"]
+        if not file:
+            return ()
+        file["type"] = "file"
+        return (file,)
+
+    def _attachments(self, post):
+        for attachment in post["attachments"]:
+            attachment["type"] = "attachment"
+        return post["attachments"]
+
+    def _inline(self, post):
+        for path in self._find_inline(post["content"] or ""):
+            yield {"path": path, "name": path, "type": "inline"}
+
+    def _build_file_generators(self, filetypes):
+        if filetypes is None:
+            return (self._file, self._attachments, self._inline)
+        genmap = {
+            "file"       : self._file,
+            "attachments": self._attachments,
+            "inline"     : self._inline,
+        }
+        if isinstance(filetypes, str):
+            filetypes = filetypes.split(",")
+        return [genmap[ft] for ft in filetypes]
+
     def _extract_comments(self, post):
         url = "{}/{}/user/{}/post/{}".format(
             self.root, post["service"], post["user"], post["id"])
@@ -121,6 +157,21 @@ class KemonopartyExtractor(Extractor):
             })
         return comments
 
+    def _extract_dms(self, post):
+        url = "{}/{}/user/{}/dms".format(
+            self.root, post["service"], post["user"])
+        page = self.request(url).text
+
+        dms = []
+        for dm in text.extract_iter(page, "<article", "</article>"):
+            dms.append({
+                "body": text.unescape(text.extract(
+                    dm, '<div class="dm-card__content">', '</div>',
+                )[0].strip()),
+                "date": text.extract(dm, 'datetime="', '"')[0],
+            })
+        return dms
+
 
 class KemonopartyUserExtractor(KemonopartyExtractor):
     """Extractor for all posts from a kemono.party user listing"""
@@ -175,6 +226,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
                 "embed": dict,
                 "extension": "jpeg",
                 "filename": "P058kDFYus7DbqAkGlfWTlOr",
+                "hash": "210f35388e28bbcf756db18dd516e2d8"
+                        "2ce758e0d32881eeee76d43e1716d382",
                 "id": "506575",
                 "num": 1,
                 "published": "Sun, 11 Aug 2019 02:09:04 GMT",
@@ -188,25 +241,39 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
         }),
         # inline image (#1286)
         ("https://kemono.party/fanbox/user/7356311/post/802343", {
-            "pattern": r"https://kemono\.party/data/inline/fanbox"
-                       r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",
+            "pattern": r"https://kemono\.party/data/47/b5/47b5c014ecdcfabdf2c8"
+                       r"5eec53f1133a76336997ae8596f332e97d956a460ad2\.jpg",
+            "keyword": {"hash": "47b5c014ecdcfabdf2c85eec53f1133a"
+                                "76336997ae8596f332e97d956a460ad2"},
         }),
         # kemono.party -> data.kemono.party
         ("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
-            "pattern": r"https://kemono\.party/data/(file|attachment)s"
-                       r"/gumroad/trylsc/IURjT/",
+            "pattern": r"https://kemono\.party/data/("
+                       r"files/gumroad/trylsc/IURjT/reward8\.jpg|"
+                       r"c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)",
         }),
         # username (#1548, #1652)
         ("https://kemono.party/gumroad/user/3252870377455/post/aJnAH", {
             "options": (("metadata", True),),
             "keyword": {"username": "Kudalyn's Creations"},
         }),
-        # skip patreon main file (#1667, #1689)
+        # skip patreon duplicates
         ("https://kemono.party/patreon/user/4158582/post/32099982", {
             "count": 2,
-            "keyword": {"type": "attachment"},
+        }),
+        # DMs (#2008)
+        ("https://kemono.party/patreon/user/34134344/post/38129255", {
+            "options": (("dms", True),),
+            "keyword": {"dms": [{
+                "body": r"re:Hi! Thank you very much for supporting the work I"
+                        r" did in May. Here's your reward pack! I hope you fin"
+                        r"d something you enjoy in it. :\)\n\nhttps://www.medi"
+                        r"afire.com/file/\w+/Set13_tier_2.zip/file",
+                "date": "2021-07-31 02:47:51.327865",
+            }]},
         }),
         ("https://kemono.party/subscribestar/user/alcorart/post/184330"),
+        ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"),
     )
 
     def __init__(self, match):
