diff options
Diffstat (limited to 'gallery_dl/extractor/8chan.py')
| -rw-r--r-- | gallery_dl/extractor/8chan.py | 172 |
1 files changed, 172 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://8chan.moe/"""

from .common import Extractor, Message
from .. import text
from ..cache import memcache
from datetime import datetime, timedelta, timezone
import itertools

# The single capture group is the top-level domain of the mirror in use.
BASE_PATTERN = r"(?:https?://)?8chan\.(moe|se|cc)"


class _8chanExtractor(Extractor):
    """Base class for 8chan extractors"""
    category = "8chan"
    root = "https://8chan.moe"

    def __init__(self, match):
        # Rebuild the root URL from the TLD captured by BASE_PATTERN so that
        # all requests target the same mirror as the input URL.
        # This is set before calling the parent initializer, as in the
        # original code.
        self.root = "https://8chan." + match.group(1)
        Extractor.__init__(self, match)

    @memcache()
    def _prepare_cookies(self):
        """Fetch captcha cookies and return the session's cookie jar

        Requests '/captcha.js' so the server sets its captcha cookies on
        the session, then rewrites those cookies in place before returning
        'self.session.cookies'.
        """
        # fetch captcha cookies
        # (necessary to download without getting interrupted)
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the strftime() results below are the same.
        now = datetime.now(timezone.utc)
        url = self.root + "/captcha.js"
        params = {"d": now.strftime("%a %b %d %Y %H:%M:%S GMT+0000 (UTC)")}
        # the response body is read and discarded
        self.request(url, params=params).content

        # adjust cookies
        # - remove 'expires' timestamp
        # - move 'captchaexpiration' value forward by 1 month
        domain = self.root.rpartition("/")[2]
        expiration = now + timedelta(days=30, seconds=300)
        for cookie in self.session.cookies:
            if cookie.domain.endswith(domain):
                cookie.expires = None
                if cookie.name == "captchaexpiration":
                    cookie.value = expiration.strftime(
                        "%a, %d %b %Y %H:%M:%S GMT")

        return self.session.cookies


class _8chanThreadExtractor(_8chanExtractor):
    """Extractor for 8chan threads"""
    subcategory = "thread"
    directory_fmt = ("{category}", "{boardUri}",
                     "{threadId} {subject[:50]}")
    filename_fmt = "{postId}{num:?-//} {filename[:200]}.{extension}"
    archive_fmt = "{boardUri}_{postId}_{num}"
    pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
    test = (
        ("https://8chan.moe/vhs/res/4.html", {
            "pattern": r"https://8chan\.moe/\.media/[0-9a-f]{64}\.\w+$",
            "count": 14,
            "keyword": {
                "archived": False,
                "autoSage": False,
                "boardDescription": "Film and Cinema",
                "boardMarkdown": None,
                "boardName": "Movies",
                "boardUri": "vhs",
                "creation": r"re:\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.\d{3}Z",
                "cyclic": False,
                "email": None,
                "id": "re:^[0-9a-f]{6}$",
                "locked": False,
                "markdown": str,
                "maxFileCount": 5,
                "maxFileSize": "32.00 MB",
                "maxMessageLength": 8001,
                "message": str,
                "mime": str,
                "name": "Anonymous",
                "num": int,
                "originalName": str,
                "path": r"re:/.media/[0-9a-f]{64}\.\w+$",
                "pinned": False,
                "postId": int,
                "signedRole": None,
                "size": int,
                "threadId": 4,
                "thumb": r"re:/.media/t_[0-9a-f]{64}$",
                "uniquePosters": 9,
                "usesCustomCss": True,
                "usesCustomJs": False,
                "wsPort": 8880,
                "wssPort": 2087,
            },
        }),
        ("https://8chan.se/vhs/res/4.html"),
        ("https://8chan.cc/vhs/res/4.html"),
    )

    def __init__(self, match):
        _8chanExtractor.__init__(self, match)
        # groups: (TLD, board name, thread number); the TLD was already
        # consumed by the base class
        _, self.board, self.thread = match.groups()

    def items(self):
        """Yield one Directory message, then a Url message per file

        The thread's JSON document is fetched first; the opening post
        (the thread object itself) and every entry of its 'posts' list
        are then scanned for attached files.
        """
        # fetch thread data
        # 'url' ends with a dot so that "html" / "json" can be appended
        url = "{}/{}/res/{}.".format(self.root, self.board, self.thread)
        self.session.headers["Referer"] = url + "html"
        thread = self.request(url + "json").json()
        thread["postId"] = thread["threadId"]
        thread["_http_headers"] = {"Referer": url + "html"}

        # Cookie preparation is best-effort: a failure is logged at debug
        # level and extraction continues without the adjusted cookies.
        try:
            self.session.cookies = self._prepare_cookies()
        except Exception as exc:
            self.log.debug("Failed to fetch captcha cookies: %s: %s",
                           exc.__class__.__name__, exc, exc_info=True)

        # download files
        posts = thread.pop("posts", ())
        yield Message.Directory, thread
        for post in itertools.chain((thread,), posts):
            files = post.pop("files", ())
            if not files:
                continue
            # merge the current post's fields into the thread metadata,
            # which is then copied into each file's metadata
            thread.update(post)
            for num, file in enumerate(files):
                file.update(thread)
                file["num"] = num
                text.nameext_from_url(file["originalName"], file)
                yield Message.Url, self.root + file["path"], file


class _8chanBoardExtractor(_8chanExtractor):
    """Extractor for 8chan boards"""
    subcategory = "board"
    pattern = BASE_PATTERN + r"/([^/?#]+)/(?:(\d+)\.html)?$"
    test = (
        ("https://8chan.moe/vhs/"),
        ("https://8chan.moe/vhs/2.html", {
            "pattern": _8chanThreadExtractor.pattern,
            "count": 23,
        }),
        ("https://8chan.se/vhs/"),
        ("https://8chan.cc/vhs/"),
    )

    def __init__(self, match):
        _8chanExtractor.__init__(self, match)
        # groups: (TLD, board name, optional page number)
        _, self.board, self.page = match.groups()
        self.session.headers["Referer"] = self.root + "/"

    def items(self):
        """Yield a Queue message for every thread from the start page on

        Iteration begins at the page number given in the URL (1 when the
        URL has none) and continues until the 'pageCount' value reported
        by the first fetched page is exceeded.
        """
        page = text.parse_int(self.page, 1)
        url = "{}/{}/{}.json".format(self.root, self.board, page)
        board = self.request(url).json()
        threads = board["threads"]

        while True:
            for thread in threads:
                # hand each thread over to the thread extractor
                thread["_extractor"] = _8chanThreadExtractor
                url = "{}/{}/res/{}.html".format(
                    self.root, self.board, thread["threadId"])
                yield Message.Queue, url, thread

            page += 1
            if page > board["pageCount"]:
                return
            url = "{}/{}/{}.json".format(self.root, self.board, page)
            threads = self.request(url).json()["threads"]
