From 78e2d1672e4301497f786cd03637de9ddbc717ac Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Mon, 17 Oct 2022 03:44:04 -0400 Subject: New upstream version 1.23.3. --- gallery_dl/extractor/2chen.py | 99 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 gallery_dl/extractor/2chen.py (limited to 'gallery_dl/extractor/2chen.py') diff --git a/gallery_dl/extractor/2chen.py b/gallery_dl/extractor/2chen.py new file mode 100644 index 0000000..8fffeb0 --- /dev/null +++ b/gallery_dl/extractor/2chen.py @@ -0,0 +1,99 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://2chen.moe/""" + +from .common import Extractor, Message +from .. import text + + +class _2chenThreadExtractor(Extractor): + """Extractor for 2chen threads""" + category = "2chen" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} {title}") + filename_fmt = "{time} {filename}.{extension}" + archive_fmt = "{board}_{thread}_{hash}" + root = "https://2chen.moe" + pattern = r"(?:https?://)?2chen\.moe/([^/?#]+)/(\d+)" + test = ( + ("https://2chen.moe/jp/303786", { + "count": ">= 10", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/{}".format(self.root, self.board, self.thread) + page = self.request(url, encoding="utf-8").text + data = self.metadata(page) + yield Message.Directory, data + for post in self.posts(page): + if not post["url"]: + continue + post.update(data) + post["url"] = self.root + post["url"] + post["time"] = text.parse_int(post["date"].timestamp()) + yield Message.Url, post["url"], text.nameext_from_url( + post["filename"], post) + + def metadata(self, page): + board, pos = text.extract(page, 'class="board">/', '/<') + title = text.extract(page, "

", "

", pos)[0] + return { + "board" : board, + "thread": self.thread, + "title" : text.unescape(title), + } + + def posts(self, page): + """Return iterable with relevant posts""" + return map(self.parse, text.extract_iter( + page, 'class="glass media', '')) + + def parse(self, post): + extr = text.extract_from(post) + return { + "name" : text.unescape(extr("", "")), + "date" : text.parse_datetime( + extr("")[2], + "%d %b %Y (%a) %H:%M:%S" + ), + "no" : extr('href="#p', '"'), + "url" : extr('