diff options
| author | 2022-12-24 17:14:46 -0500 | |
|---|---|---|
| committer | 2022-12-24 17:14:46 -0500 | |
| commit | ebdfcd3cd3f76534a590ba08933ff7ea54813316 (patch) | |
| tree | 35db6003766dff695cf8a5aa24f47629b602b7c0 /gallery_dl/extractor/warosu.py | |
| parent | 3338dfce719c999467ffe08fd45663be8190057a (diff) | |
New upstream version 1.24.2 (upstream/1.24.2)
Diffstat (limited to 'gallery_dl/extractor/warosu.py')
| -rw-r--r-- | gallery_dl/extractor/warosu.py | 62 |
1 files changed, 30 insertions, 32 deletions
```diff
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index 677680f..bdedfcb 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -1,21 +1,22 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2017-2019 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from https://warosu.org/"""
+"""Extractors for https://warosu.org/"""
 
 from .common import Extractor, Message
 from .. import text
 
 
 class WarosuThreadExtractor(Extractor):
-    """Extractor for images from threads on warosu.org"""
+    """Extractor for threads on warosu.org"""
     category = "warosu"
     subcategory = "thread"
+    root = "https://warosu.org"
     directory_fmt = ("{category}", "{board}", "{thread} - {title}")
     filename_fmt = "{tim}-{filename}.{extension}"
     archive_fmt = "{board}_{thread}_{tim}"
@@ -31,7 +32,6 @@ class WarosuThreadExtractor(Extractor):
             "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c",
         }),
     )
-    root = "https://warosu.org"
 
     def __init__(self, match):
         Extractor.__init__(self, match)
@@ -40,12 +40,12 @@ class WarosuThreadExtractor(Extractor):
     def items(self):
         url = "{}/{}/thread/{}".format(self.root, self.board, self.thread)
         page = self.request(url).text
-        data = self.get_metadata(page)
+        data = self.metadata(page)
         posts = self.posts(page)
 
         if not data["title"]:
-            title = text.remove_html(posts[0]["com"])
-            data["title"] = text.unescape(title)[:50]
+            data["title"] = text.unescape(text.remove_html(
+                posts[0]["com"]))[:50]
 
         yield Message.Directory, data
         for post in posts:
@@ -55,25 +55,24 @@ class WarosuThreadExtractor(Extractor):
                 post.update(data)
                 yield Message.Url, post["image"], post
 
-    def get_metadata(self, page):
-        """Collect metadata for extractor-job"""
+    def metadata(self, page):
         boardname = text.extr(page, "<title>", "</title>")
         title = text.extr(page, 'filetitle" itemprop="name">', '<')
         return {
-            "board": self.board,
+            "board"     : self.board,
             "board_name": boardname.rpartition(" - ")[2],
-            "thread": self.thread,
-            "title": title,
+            "thread"    : self.thread,
+            "title"     : title,
         }
 
     def posts(self, page):
-        """Build a list of all post-objects"""
+        """Build a list of all post objects"""
         page = text.extr(page, '<div class="content">', '<table>')
         needle = '<table itemscope itemtype="http://schema.org/Comment">'
         return [self.parse(post) for post in page.split(needle)]
 
     def parse(self, post):
-        """Build post-object by extracting data from an HTML post"""
+        """Build post object by extracting data from an HTML post"""
         data = self._extract_post(post)
         if "<span>File:" in post:
             self._extract_image(post, data)
@@ -84,24 +83,23 @@ class WarosuThreadExtractor(Extractor):
 
     @staticmethod
     def _extract_post(post):
-        data = text.extract_all(post, (
-            ("no"  , 'id="p', '"'),
-            ("name", '<span itemprop="name">', '</span>'),
-            ("time", '<span class="posttime" title="', '000">'),
-            ("now" , '', '<'),
-            ("com" , '<blockquote><p itemprop="text">', '</p></blockquote>'),
-        ))[0]
-        data["com"] = text.unescape(text.remove_html(data["com"].strip()))
-        return data
+        extr = text.extract_from(post)
+        return {
+            "no"  : extr('id="p', '"'),
+            "name": extr('<span itemprop="name">', "</span>"),
+            "time": extr('<span class="posttime" title="', '000">'),
+            "now" : extr("", "<"),
+            "com" : text.unescape(text.remove_html(extr(
+                '<blockquote><p itemprop="text">', '</p></blockquote>'
+            ).strip())),
+        }
 
     @staticmethod
     def _extract_image(post, data):
-        text.extract_all(post, (
-            ("fsize"   , '<span>File: ', ', '),
-            ("w"       , '', 'x'),
-            ("h"       , '', ', '),
-            ("filename", '', '<'),
-            ("image"   , '<br />\n<a href="', '"'),
-        ), 0, data)
-        data["filename"] = text.unquote(data["filename"].rpartition(".")[0])
-        data["image"] = "https:" + data["image"]
+        extr = text.extract_from(post)
+        data["fsize"] = extr("<span>File: ", ", ")
+        data["w"] = extr("", "x")
+        data["h"] = extr("", ", ")
+        data["filename"] = text.unquote(extr("", "<").rpartition(".")[0])
+        extr("<br />", "")
+        data["image"] = "https:" + extr('<a href="', '"')
```
