New upstream version 1.20.0.upstream/1.20.0

author: Unit 193 <unit193@unit193.net> 2021-12-30 01:56:41 -0500
committer: Unit 193 <unit193@unit193.net> 2021-12-30 01:56:41 -0500
commit: 7bc30b43b70556630b4a93c03fefc0d888e3d19f (patch)
tree: fb0e96762ab8137d23f248ef303538d8d6ff4368 /gallery_dl/extractor/rule34us.py
parent: a5aecc343fd2886e7ae09bb3e2afeec38f175755 (diff)
1 files changed, 130 insertions, 0 deletions
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
new file mode 100644
index 0000000..00b6972
--- /dev/null
+++ b/gallery_dl/extractor/rule34us.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://rule34.us/"""
+
+from .booru import BooruExtractor
+from .. import text
+import re
+import collections
+
+
+class Rule34usExtractor(BooruExtractor):
+    category = "rule34us"
+    root = "https://rule34.us"
+    per_page = 42
+
+    def __init__(self, match):
+        BooruExtractor.__init__(self, match)
+        self._find_tags = re.compile(
+            r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall
+
+    def _parse_post(self, post_id):
+        url = "{}/index.php?r=posts/view&id={}".format(self.root, post_id)
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        post = {
+            "id"      : post_id,
+            "tags"    : text.unescape(extr(
+                'name="keywords" content="', '"').rstrip(", ")),
+            "uploader": text.extract(extr('Added by: ', '</li>'), ">", "<")[0],
+            "score"   : text.extract(extr('Score: ', '> - <'), ">", "<")[0],
+            "width"   : extr('Size: ', 'w'),
+            "height"  : extr(' x ', 'h'),
+            "file_url": extr(' src="', '"'),
+        }
+        post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+
+        tags = collections.defaultdict(list)
+        for tag_type, tag_name in self._find_tags(page):
+            tags[tag_type].append(text.unquote(tag_name))
+        for key, value in tags.items():
+            post["tags_" + key] = " ".join(value)
+
+        return post
+
+
+class Rule34usTagExtractor(Rule34usExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}"
+    pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]+)"
+    test = ("https://rule34.us/index.php?r=posts/index&q=[terios]_elysion", {
+        "pattern": r"https://img\d*\.rule34\.us"
+                   r"/images/../../[0-9a-f]{32}\.\w+",
+        "count": 10,
+    })
+
+    def __init__(self, match):
+        Rule34usExtractor.__init__(self, match)
+        self.tags = text.unquote(match.group(1).replace("+", " "))
+
+    def metadata(self):
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        url = self.root + "/index.php"
+        params = {
+            "r"   : "posts/index",
+            "q"   : self.tags,
+            "page": self.page_start,
+        }
+
+        while True:
+            page = self.request(url, params=params).text
+
+            cnt = 0
+            for post_id in text.extract_iter(page, '><a id="', '"'):
+                yield self._parse_post(post_id)
+                cnt += 1
+
+            if cnt < self.per_page:
+                return
+
+            if "page" in params:
+                del params["page"]
+            params["q"] = self.tags + " id:<" + post_id
+
+
+class Rule34usPostExtractor(Rule34usExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/view&id=(\d+)"
+    test = (
+        ("https://rule34.us/index.php?r=posts/view&id=3709005", {
+            "pattern": r"https://img\d*\.rule34\.us/images/14/7b"
+                       r"/147bee6fc2e13f73f5f9bac9d4930b13\.png",
+            "content": "d714342ea84050f82dda5f0c194d677337abafc5",
+        }),
+        ("https://rule34.us/index.php?r=posts/view&id=4576310", {
+            "pattern": r"https://video\.rule34\.us/images/a2/94"
+                       r"/a294ff8e1f8e0efa041e5dc9d1480011\.mp4",
+            "keyword": {
+                "extension": "mp4",
+                "file_url": str,
+                "filename": "a294ff8e1f8e0efa041e5dc9d1480011",
+                "height": "3982",
+                "id": "4576310",
+                "md5": "a294ff8e1f8e0efa041e5dc9d1480011",
+                "score": r"re:\d+",
+                "tags": "tagme, video",
+                "tags_general": "video",
+                "tags_metadata": "tagme",
+                "uploader": "Anonymous",
+                "width": "3184",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        Rule34usExtractor.__init__(self, match)
+        self.post_id = match.group(1)
+
+    def posts(self):
+        return (self._parse_post(self.post_id),)
author	Unit 193 <unit193@unit193.net>	2021-12-30 01:56:41 -0500
committer	Unit 193 <unit193@unit193.net>	2021-12-30 01:56:41 -0500
commit	7bc30b43b70556630b4a93c03fefc0d888e3d19f (patch)
tree	fb0e96762ab8137d23f248ef303538d8d6ff4368 /gallery_dl/extractor/rule34us.py
parent	a5aecc343fd2886e7ae09bb3e2afeec38f175755 (diff)