diff options
| author | 2024-01-23 23:35:00 -0500 | |
|---|---|---|
| committer | 2024-01-23 23:35:00 -0500 | |
| commit | 12e23f1195164dcb740d6d4a4287e762c9e5e534 (patch) | |
| tree | e6b13483475c510ea2f685c21363271f23745c56 /gallery_dl/extractor/2ch.py | |
| parent | e949aaf6f6ac93896947d5b736e48e7911926efb (diff) | |
New upstream version 1.26.7.upstream/1.26.7
Diffstat (limited to 'gallery_dl/extractor/2ch.py')
| -rw-r--r-- | gallery_dl/extractor/2ch.py | 91 |
1 files changed, 91 insertions, 0 deletions
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py new file mode 100644 index 0000000..dbbf21b --- /dev/null +++ b/gallery_dl/extractor/2ch.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://2ch.hk/""" + +from .common import Extractor, Message +from .. import text, util + + +class _2chThreadExtractor(Extractor): + """Extractor for 2ch threads""" + category = "2ch" + subcategory = "thread" + root = "https://2ch.hk" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{tim}{filename:? //}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)" + example = "https://2ch.hk/a/res/12345.html" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread) + posts = self.request(url).json()["threads"][0]["posts"] + + op = posts[0] + title = op.get("subject") or text.remove_html(op["comment"]) + + thread = { + "board" : self.board, + "thread": self.thread, + "title" : text.unescape(title)[:50], + } + + yield Message.Directory, thread + for post in posts: + files = post.get("files") + if files: + post["post_name"] = post["name"] + post["date"] = text.parse_timestamp(post["timestamp"]) + del post["files"] + del post["name"] + + for file in files: + file.update(thread) + file.update(post) + + file["filename"] = file["fullname"].rpartition(".")[0] + file["tim"], _, file["extension"] = \ + file["name"].rpartition(".") + + yield Message.Url, self.root + file["path"], file + + +class _2chBoardExtractor(Extractor): + """Extractor for 2ch boards""" + category = "2ch" + subcategory = "board" + root = "https://2ch.hk" + pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$" + example = "https://2ch.hk/a/" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board = match.group(1) + + def items(self): + # index page + url = "{}/{}/index.json".format(self.root, self.board) + index = self.request(url).json() + index["_extractor"] = _2chThreadExtractor + for thread in index["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, index + + # pages 1..n + for n in util.advance(index["pages"], 1): + url = "{}/{}/{}.json".format(self.root, self.board, n) + page = self.request(url).json() + page["_extractor"] = _2chThreadExtractor + for thread in page["threads"]: + url = "{}/{}/res/{}.html".format( + self.root, self.board, thread["thread_num"]) + yield Message.Queue, url, page |
