New upstream version 1.19.3. (upstream/1.19.3)

author: Unit 193 <unit193@unit193.net> 2021-12-01 14:44:00 -0500
committer: Unit 193 <unit193@unit193.net> 2021-12-01 14:44:00 -0500
commit: a5aecc343fd2886e7ae09bb3e2afeec38f175755 (patch)
tree: 06a284b3d73700bd38116423e2480afa516255c2 /gallery_dl/extractor/kemonoparty.py
parent: fc8c5e642017e2b4e5299e2093e72b316479690d (diff)
1 files changed, 96 insertions, 29 deletions
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 2e1d0b2..6483278 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -14,7 +14,7 @@ from ..cache import cache
 import itertools
 import re
 
-BASE_PATTERN = r"(?:https?://)?kemono\.party"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?kemono\.party"
 USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
 
 
@@ -30,19 +30,20 @@ class KemonopartyExtractor(Extractor):
     def items(self):
         self._prepare_ddosguard_cookies()
 
-        find_inline = re.compile(
+        self._find_inline = re.compile(
             r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
             r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
-        skip_service = \
-            "patreon" if self.config("patreon-skip-file", True) else None
+        find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
+        generators = self._build_file_generators(self.config("files"))
         comments = self.config("comments")
+        username = dms = None
 
         if self.config("metadata"):
             username = text.unescape(text.extract(
                 self.request(self.user_url).text,
                 '<meta name="artist_name" content="', '"')[0])
-        else:
-            username = None
+        if self.config("dms"):
+            dms = True
 
         posts = self.posts()
         max_posts = self.config("max-posts")
@@ -51,31 +52,38 @@ class KemonopartyExtractor(Extractor):
 
         for post in posts:
 
-            files = []
-            append = files.append
-            file = post["file"]
-
-            if file:
-                file["type"] = "file"
-                if post["service"] != skip_service or not post["attachments"]:
-                    append(file)
-            for attachment in post["attachments"]:
-                attachment["type"] = "attachment"
-                append(attachment)
-            for path in find_inline(post["content"] or ""):
-                append({"path": path, "name": path, "type": "inline"})
-
             post["date"] = text.parse_datetime(
-                post["published"], "%a, %d %b %Y %H:%M:%S %Z")
+                post["published"] or post["added"],
+                "%a, %d %b %Y %H:%M:%S %Z")
             if username:
                 post["username"] = username
             if comments:
                 post["comments"] = self._extract_comments(post)
+            if dms is not None:
+                if dms is True:
+                    dms = self._extract_dms(post)
+                post["dms"] = dms
             yield Message.Directory, post
 
-            for post["num"], file in enumerate(files, 1):
-                post["type"] = file["type"]
+            hashes = set()
+            post["num"] = 0
+            for file in itertools.chain.from_iterable(
+                    g(post) for g in generators):
                 url = file["path"]
+
+                match = find_hash(url)
+                if match:
+                    post["hash"] = hash = match.group(1)
+                    if hash in hashes:
+                        self.log.debug("Skipping %s (duplicate)", url)
+                        continue
+                    hashes.add(hash)
+                else:
+                    post["hash"] = ""
+
+                post["type"] = file["type"]
+                post["num"] += 1
+
                 if url[0] == "/":
                     url = self.root + "/data" + url
                 elif url.startswith("https://kemono.party"):
@@ -103,6 +111,34 @@ class KemonopartyExtractor(Extractor):
 
         return {c.name: c.value for c in response.history[0].cookies}
 
+    def _file(self, post):
+        file = post["file"]
+        if not file:
+            return ()
+        file["type"] = "file"
+        return (file,)
+
+    def _attachments(self, post):
+        for attachment in post["attachments"]:
+            attachment["type"] = "attachment"
+        return post["attachments"]
+
+    def _inline(self, post):
+        for path in self._find_inline(post["content"] or ""):
+            yield {"path": path, "name": path, "type": "inline"}
+
+    def _build_file_generators(self, filetypes):
+        if filetypes is None:
+            return (self._file, self._attachments, self._inline)
+        genmap = {
+            "file"       : self._file,
+            "attachments": self._attachments,
+            "inline"     : self._inline,
+        }
+        if isinstance(filetypes, str):
+            filetypes = filetypes.split(",")
+        return [genmap[ft] for ft in filetypes]
+
     def _extract_comments(self, post):
         url = "{}/{}/user/{}/post/{}".format(
             self.root, post["service"], post["user"], post["id"])
@@ -121,6 +157,21 @@ class KemonopartyExtractor(Extractor):
             })
         return comments
 
+    def _extract_dms(self, post):
+        url = "{}/{}/user/{}/dms".format(
+            self.root, post["service"], post["user"])
+        page = self.request(url).text
+
+        dms = []
+        for dm in text.extract_iter(page, "<article", "</article>"):
+            dms.append({
+                "body": text.unescape(text.extract(
+                    dm, '<div class="dm-card__content">', '</div>',
+                )[0].strip()),
+                "date": text.extract(dm, 'datetime="', '"')[0],
+            })
+        return dms
+
 
 class KemonopartyUserExtractor(KemonopartyExtractor):
     """Extractor for all posts from a kemono.party user listing"""
@@ -175,6 +226,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
                 "embed": dict,
                 "extension": "jpeg",
                 "filename": "P058kDFYus7DbqAkGlfWTlOr",
+                "hash": "210f35388e28bbcf756db18dd516e2d8"
+                        "2ce758e0d32881eeee76d43e1716d382",
                 "id": "506575",
                 "num": 1,
                 "published": "Sun, 11 Aug 2019 02:09:04 GMT",
@@ -188,25 +241,39 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
         }),
         # inline image (#1286)
         ("https://kemono.party/fanbox/user/7356311/post/802343", {
-            "pattern": r"https://kemono\.party/data/inline/fanbox"
-                       r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",
+            "pattern": r"https://kemono\.party/data/47/b5/47b5c014ecdcfabdf2c8"
+                       r"5eec53f1133a76336997ae8596f332e97d956a460ad2\.jpg",
+            "keyword": {"hash": "47b5c014ecdcfabdf2c85eec53f1133a"
+                                "76336997ae8596f332e97d956a460ad2"},
         }),
         # kemono.party -> data.kemono.party
         ("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
-            "pattern": r"https://kemono\.party/data/(file|attachment)s"
-                       r"/gumroad/trylsc/IURjT/",
+            "pattern": r"https://kemono\.party/data/("
+                       r"files/gumroad/trylsc/IURjT/reward8\.jpg|"
+                       r"c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)",
         }),
         # username (#1548, #1652)
         ("https://kemono.party/gumroad/user/3252870377455/post/aJnAH", {
             "options": (("metadata", True),),
             "keyword": {"username": "Kudalyn's Creations"},
         }),
-        # skip patreon main file (#1667, #1689)
+        # skip patreon duplicates
         ("https://kemono.party/patreon/user/4158582/post/32099982", {
             "count": 2,
-            "keyword": {"type": "attachment"},
+        }),
+        # DMs (#2008)
+        ("https://kemono.party/patreon/user/34134344/post/38129255", {
+            "options": (("dms", True),),
+            "keyword": {"dms": [{
+                "body": r"re:Hi! Thank you very much for supporting the work I"
+                        r" did in May. Here's your reward pack! I hope you fin"
+                        r"d something you enjoy in it. :\)\n\nhttps://www.medi"
+                        r"afire.com/file/\w+/Set13_tier_2.zip/file",
+                "date": "2021-07-31 02:47:51.327865",
+            }]},
         }),
         ("https://kemono.party/subscribestar/user/alcorart/post/184330"),
+        ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"),
     )
 
     def __init__(self, match):
author	Unit 193 <unit193@unit193.net>	2021-12-01 14:44:00 -0500
committer	Unit 193 <unit193@unit193.net>	2021-12-01 14:44:00 -0500
commit	a5aecc343fd2886e7ae09bb3e2afeec38f175755 (patch)
tree	06a284b3d73700bd38116423e2480afa516255c2 /gallery_dl/extractor/kemonoparty.py
parent	fc8c5e642017e2b4e5299e2093e72b316479690d (diff)