summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/postmill.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/postmill.py')
-rw-r--r--gallery_dl/extractor/postmill.py203
1 files changed, 203 insertions, 0 deletions
diff --git a/gallery_dl/extractor/postmill.py b/gallery_dl/extractor/postmill.py
new file mode 100644
index 0000000..29b351b
--- /dev/null
+++ b/gallery_dl/extractor/postmill.py
@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Postmill instances"""
+
+import re
+from .common import BaseExtractor, Message
+from .. import text, exception
+
+
+class PostmillExtractor(BaseExtractor):
+ """Base class for Postmill extractors"""
+ basecategory = "postmill"
+ directory_fmt = ("{category}", "{instance}", "{forum}")
+ filename_fmt = "{id}_{title[:220]}.{extension}"
+ archive_fmt = "{filename}"
+
+ def _init(self):
+ self.instance = self.root.partition("://")[2]
+ self.save_link_post_body = self.config("save-link-post-body", False)
+ self._search_canonical_url = re.compile(r"/f/([\w\d_]+)/(\d+)/").search
+ self._search_image_tag = re.compile(
+ r'<a href="[^"]+"\n +class="submission__image-link"').search
+
+ def items(self):
+ for post_url in self.post_urls():
+ page = self.request(post_url).text
+ extr = text.extract_from(page)
+
+ title = text.unescape(extr(
+ '<meta property="og:title" content="', '">'))
+ date = text.parse_datetime(extr(
+ '<meta property="og:article:published_time" content="', '">'))
+ username = extr(
+ '<meta property="og:article:author" content="', '">')
+ post_canonical_url = text.unescape(extr(
+ '<link rel="canonical" href="', '">'))
+
+ url = text.unescape(extr(
+ '<h1 class="submission__title unheaderize inline"><a href="',
+ '"'))
+ body = extr(
+ '<div class="submission__body break-text text-flow">',
+ '</div>')
+
+ match = self._search_canonical_url(post_canonical_url)
+ forum = match.group(1)
+ id = int(match.group(2))
+
+ is_text_post = url.startswith("/")
+ is_image_post = self._search_image_tag(page) is not None
+ data = {
+ "title": title,
+ "date": date,
+ "username": username,
+ "forum": forum,
+ "id": id,
+ "flair": [text.unescape(i) for i in text.extract_iter(
+ page, '<span class="flair__label">', '</span>')],
+ "instance": self.instance,
+ }
+
+ urls = []
+ if is_text_post or self.save_link_post_body:
+ urls.append((Message.Url, "text:" + body))
+
+ if is_image_post:
+ urls.append((Message.Url, url))
+ elif not is_text_post:
+ urls.append((Message.Queue, url))
+
+ data["count"] = len(urls)
+ yield Message.Directory, data
+ for data["num"], (msg, url) in enumerate(urls, 1):
+ if url.startswith("text:"):
+ data["filename"], data["extension"] = "", "htm"
+ else:
+ data = text.nameext_from_url(url, data)
+
+ yield msg, url, data
+
+
+class PostmillSubmissionsExtractor(PostmillExtractor):
+ """Base class for Postmill submissions extractors"""
+ whitelisted_parameters = ()
+
+ def __init__(self, match):
+ PostmillExtractor.__init__(self, match)
+ groups = match.groups()
+ self.base = groups[-3]
+ self.sorting_path = groups[-2] or ""
+ self.query = {key: value for key, value in text.parse_query(
+ groups[-1]).items() if self.acceptable_query(key)}
+
+ def items(self):
+ url = self.root + self.base + self.sorting_path
+
+ while url:
+ response = self.request(url, params=self.query)
+ if response.history:
+ redirect_url = response.url
+ if redirect_url == self.root + "/login":
+ raise exception.StopExtraction(
+ "HTTP redirect to login page (%s)", redirect_url)
+ page = response.text
+
+ for nav in text.extract_iter(page,
+ '<nav class="submission__nav">',
+ '</nav>'):
+ post_url = text.unescape(text.extr(nav, '<a href="', '"'))
+ yield Message.Queue, text.urljoin(url, post_url), \
+ {"_extractor": PostmillPostExtractor}
+
+ url = text.unescape(text.extr(page,
+ '<link rel="next" href="', '">'))
+
+ def acceptable_query(self, key):
+ return key in self.whitelisted_parameters or key == "t" or \
+ (key.startswith("next[") and key.endswith("]"))
+
+
+BASE_PATTERN = PostmillExtractor.update({
+ "raddle": {
+ "root" : None,
+ "pattern": (r"(?:raddle\.me|"
+ r"c32zjeghcp5tj3kb72pltz56piei66drc63vkhn5yixiyk4cmerrjtid"
+ r"\.onion)"),
+ }
+})
+QUERY_RE = r"(?:\?([^#]+))?$"
+SORTING_RE = r"(/(?:hot|new|active|top|controversial|most_commented))?" + \
+ QUERY_RE
+
+
+class PostmillPostExtractor(PostmillExtractor):
+ """Extractor for a single submission URL"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/f/(\w+)/(\d+)"
+ example = "https://raddle.me/f/FORUM/123/TITLE"
+
+ def __init__(self, match):
+ PostmillExtractor.__init__(self, match)
+ self.forum = match.group(3)
+ self.post_id = match.group(4)
+
+ def post_urls(self):
+ return (self.root + "/f/" + self.forum + "/" + self.post_id,)
+
+
+class PostmillShortURLExtractor(PostmillExtractor):
+ """Extractor for short submission URLs"""
+ subcategory = "shorturl"
+ pattern = BASE_PATTERN + r"/(\d+)$"
+ example = "https://raddle.me/123"
+
+ def __init__(self, match):
+ PostmillExtractor.__init__(self, match)
+ self.post_id = match.group(3)
+
+ def items(self):
+ url = self.root + "/" + self.post_id
+ response = self.request(url, method="HEAD", allow_redirects=False)
+ full_url = text.urljoin(url, response.headers["Location"])
+ yield Message.Queue, full_url, {"_extractor": PostmillPostExtractor}
+
+
+class PostmillHomeExtractor(PostmillSubmissionsExtractor):
+ """Extractor for the home page"""
+ subcategory = "home"
+ pattern = BASE_PATTERN + r"(/(?:featured|subscribed|all)?)" + SORTING_RE
+ example = "https://raddle.me/"
+
+
+class PostmillForumExtractor(PostmillSubmissionsExtractor):
+ """Extractor for submissions on a forum"""
+ subcategory = "forum"
+ pattern = BASE_PATTERN + r"(/f/\w+)" + SORTING_RE
+ example = "https://raddle.me/f/FORUM"
+
+
+class PostmillUserSubmissionsExtractor(PostmillSubmissionsExtractor):
+ """Extractor for submissions made by a user"""
+ subcategory = "usersubmissions"
+ pattern = BASE_PATTERN + r"(/user/\w+/submissions)()" + QUERY_RE
+ example = "https://raddle.me/user/USER/submissions"
+
+
+class PostmillTagExtractor(PostmillSubmissionsExtractor):
+ """Extractor for submissions on a forum with a specific tag"""
+ subcategory = "tag"
+ pattern = BASE_PATTERN + r"(/tag/\w+)" + SORTING_RE
+ example = "https://raddle.me/tag/TAG"
+
+
+class PostmillSearchExtractor(PostmillSubmissionsExtractor):
+ """Extractor for search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"(/search)()\?(q=[^#]+)$"
+ example = "https://raddle.me/search?q=QUERY"
+ whitelisted_parameters = ("q",)