From e052f3b9e1d9703a5a466daeaf37bacf476c2daf Mon Sep 17 00:00:00 2001
From: Unit 193
Date: Sun, 22 Oct 2023 01:00:14 -0400
Subject: New upstream version 1.26.1.

---
 gallery_dl/extractor/4chanarchives.py |   1 +
 gallery_dl/extractor/__init__.py      |   2 +-
 gallery_dl/extractor/bunkr.py         |  44 +++++++++--
 gallery_dl/extractor/chevereto.py     | 113 ++++++++++++++++++++++
 gallery_dl/extractor/deviantart.py    |  64 ++++++++++----
 gallery_dl/extractor/fantia.py        |   9 ++-
 gallery_dl/extractor/hentaifoundry.py |  22 ++++--
 gallery_dl/extractor/imgbb.py         |  39 +++++---
 gallery_dl/extractor/jpgfish.py       | 105 --------------------
 gallery_dl/extractor/kemonoparty.py   | 134 +++++++++++++++++----------
 gallery_dl/extractor/moebooru.py      |   5 ++
 gallery_dl/extractor/newgrounds.py    |  56 ++++++++++--
 gallery_dl/extractor/patreon.py       |   2 +-
 gallery_dl/extractor/reddit.py        |   3 +-
 gallery_dl/extractor/redgifs.py       |  15 +++-
 gallery_dl/extractor/sankaku.py       |   2 +-
 gallery_dl/extractor/twitter.py       |   2 +
 gallery_dl/extractor/warosu.py        |  34 ++++-----
 18 files changed, 434 insertions(+), 218 deletions(-)
 create mode 100644 gallery_dl/extractor/chevereto.py
 delete mode 100644 gallery_dl/extractor/jpgfish.py

diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py
index f018d3e..27ac7c5 100644
--- a/gallery_dl/extractor/4chanarchives.py
+++ b/gallery_dl/extractor/4chanarchives.py
@@ -20,6 +20,7 @@ class _4chanarchivesThreadExtractor(Extractor):
     directory_fmt = ("{category}", "{board}", "{thread} - {title}")
     filename_fmt = "{no}-{filename}.{extension}"
     archive_fmt = "{board}_{thread}_{no}"
+    referer = False
     pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)"
     example = "https://4chanarchives.com/board/a/thread/12345/"

diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 3abe74b..1c1473a 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -28,6 +28,7 @@ modules = [
     "blogger",
     "bunkr",
     "catbox",
+    "chevereto",
     "comicvine",
     "cyberdrop",
     "danbooru",
@@ -73,7 +74,6 @@ modules = [
     "issuu",
     "itaku",
     "itchio",
-    "jpgfish",
     "jschan",
     "kabeuchi",
     "keenspot",

diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 5509f5a..26123b8 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -12,6 +12,8 @@
 from .lolisafe import LolisafeAlbumExtractor
 from .. import text
 from urllib.parse import urlsplit, urlunsplit
 
+BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)"
+
 MEDIA_DOMAIN_OVERRIDES = {
     "cdn9.bunkr.ru" : "c9.bunkr.ru",
     "cdn12.bunkr.ru": "media-files12.bunkr.la",
@@ -28,7 +30,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
     """Extractor for bunkrr.su albums"""
     category = "bunkr"
     root = "https://bunkrr.su"
-    pattern = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)/a/([^/?#]+)"
+    pattern = BASE_PATTERN + r"/a/([^/?#]+)"
     example = "https://bunkrr.su/a/ID"
 
     def fetch_album(self, album_id):
@@ -53,11 +55,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
         for url in urls:
             if url.startswith("/"):
                 try:
-                    page = self.request(self.root + text.unescape(url)).text
-                    if url[1] == "v":
-                        url = text.extr(page, '<source src="', '"')
-                    else:
-                        url = text.extr(page, '<img src="', '"')
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
new file mode 100644
--- /dev/null
+++ b/gallery_dl/extractor/chevereto.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Chevereto galleries"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class CheveretoExtractor(BaseExtractor):
+    """Base class for chevereto extractors"""
+    basecategory = "chevereto"
+    directory_fmt = ("{category}", "{user}", "{album}",)
+    archive_fmt = "{id}"
+
+    def __init__(self, match):
+        BaseExtractor.__init__(self, match)
+        self.path = match.group(match.lastindex)
+
+    def _pagination(self, url):
+        while url:
+            page = self.request(url).text
+
+            for item in text.extract_iter(
+                    page, '<div class="list-item-image ', 'image-container'):
+                yield text.extr(item, '<a href="', '"')
+
+            url = text.extr(page, '<a data-pagination="next" href="', '" ><')
+
+
+BASE_PATTERN = CheveretoExtractor.update({
+    "jpgfish": {
+        "root": "https://jpg2.su",
+        "pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
+    },
+    "pixl": {
+        "root": "https://pixl.li",
+        "pattern": r"pixl\.(?:li|is)",
+    },
+    "imgkiwi": {
+        "root": "https://img.kiwi",
+        "pattern": r"img\.kiwi",
+    },
+    "deltaporno": {
+        "root": "https://gallery.deltaporno.com",
+        "pattern": r"gallery\.deltaporno\.com",
+    },
+})
+
+
+class CheveretoImageExtractor(CheveretoExtractor):
+    """Extractor for chevereto Images"""
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"(/im(?:g|age)/[^/?#]+)"
+    example = "https://jpg2.su/img/TITLE.ID"
+
+    def items(self):
+        url = self.root + self.path
+        extr = text.extract_from(self.request(url).text)
+
+        image = {
+            "id"   : self.path.rpartition(".")[2],
+            "url"  : extr('<meta property="og:image" content="', '"'),
+            "album": text.extr(
+                extr("Added to <", "/a>"), ">", "<"),
+            "user" : extr('username: "', '"'),
+        }
+
+        text.nameext_from_url(image["url"], image)
+        yield Message.Directory, image
+        yield Message.Url, image["url"], image
+
+
+class CheveretoAlbumExtractor(CheveretoExtractor):
+    """Extractor for chevereto Albums"""
+    subcategory = "album"
+    pattern = BASE_PATTERN + r"(/a(?:lbum)?/[^/?#]+(?:/sub)?)"
+    example = "https://jpg2.su/album/TITLE.ID"
+
+    def items(self):
+        url = self.root + self.path
+        data = {"_extractor": CheveretoImageExtractor}
+
+        if self.path.endswith("/sub"):
+            albums = self._pagination(url)
+        else:
+            albums = (url,)
+
+        for album in albums:
+            for image in self._pagination(album):
+                yield Message.Queue, image, data
+
+
+class CheveretoUserExtractor(CheveretoExtractor):
+    """Extractor for chevereto Users"""
+    subcategory = "user"
+    pattern = BASE_PATTERN + r"(/(?!img|image|a(?:lbum)?)[^/?#]+(?:/albums)?)"
+    example = "https://jpg2.su/USER"
+
+    def items(self):
+        url = self.root + self.path
+
+        if self.path.endswith("/albums"):
+            data = {"_extractor": CheveretoAlbumExtractor}
+        else:
+            data = {"_extractor": CheveretoImageExtractor}
+
+        for url in self._pagination(url):
+            yield Message.Queue, url, data
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 9421096..2c37ef1 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -42,7 +42,7 @@ class DeviantartExtractor(Extractor):
         self.offset = 0
 
     def _init(self):
-        self.jwt = self.config("jwt", True)
+        self.jwt = self.config("jwt", False)
         self.flat = self.config("flat", True)
         self.extra = self.config("extra", False)
         self.quality = self.config("quality", "100")
@@ -91,14 +91,20 @@ class DeviantartExtractor(Extractor):
         return True
 
     def items(self):
-        if self.user and self.config("group", True):
-            profile = self.api.user_profile(self.user)
-            self.group = not profile
-            if self.group:
-                self.subcategory = "group-" + self.subcategory
-                self.user = self.user.lower()
-            else:
-                self.user = profile["user"]["username"]
+        if self.user:
+            group = self.config("group", True)
+            if group:
+                profile = self.api.user_profile(self.user)
+                if profile:
+                    self.user = profile["user"]["username"]
+                    self.group = False
+                elif group == "skip":
+                    self.log.info("Skipping group '%s'", self.user)
+                    raise exception.StopExtraction()
+                else:
+                    self.subcategory = "group-" + self.subcategory
+                    self.user = self.user.lower()
+                    self.group = True
 
         for deviation in self.deviations():
             if isinstance(deviation, tuple):
@@ -228,7 +234,7 @@ class DeviantartExtractor(Extractor):
 
         if self.comments:
             deviation["comments"] = (
-                self.api.comments(deviation["deviationid"], target="deviation")
+                self._extract_comments(deviation["deviationid"], "deviation")
                 if deviation["stats"]["comments"] else ()
             )
@@ -395,6 +401,28 @@ class DeviantartExtractor(Extractor):
             binascii.b2a_base64(payload).rstrip(b"=\n").decode())
         )
 
+    def _extract_comments(self, target_id, target_type="deviation"):
+        results = None
+        comment_ids = [None]
+
+        while comment_ids:
+            comments = self.api.comments(
+                target_id, target_type, comment_ids.pop())
+
+            if results:
+                results.extend(comments)
+            else:
+                results = comments
+
+            # parent comments, i.e. nodes with at least one child
+            parents = {c["parentid"] for c in comments}
+            # comments with more than one reply
+            replies = {c["commentid"] for c in comments if c["replies"]}
+            # add comment UUIDs with replies that are not parent to any node
+            comment_ids.extend(replies - parents)
+
+        return results
+
     def _limited_request(self, url, **kwargs):
         """Limits HTTP requests to one every 2 seconds"""
         kwargs["fatal"] = None
@@ -698,7 +726,7 @@ class DeviantartStatusExtractor(DeviantartExtractor):
             deviation["stats"] = {"comments": comments_count}
         if self.comments:
             deviation["comments"] = (
-                self.api.comments(deviation["statusid"], target="status")
+                self._extract_comments(deviation["statusid"], "status")
                 if comments_count else ()
             )
@@ -1072,11 +1100,17 @@ class DeviantartOAuthAPI():
                   "mature_content": self.mature}
         return self._pagination_list(endpoint, params)
 
-    def comments(self, id, target, offset=0):
+    def comments(self, target_id, target_type="deviation",
+                 comment_id=None, offset=0):
         """Fetch comments posted on a target"""
-        endpoint = "/comments/{}/{}".format(target, id)
-        params = {"maxdepth": "5", "offset": offset, "limit": 50,
-                  "mature_content": self.mature}
+        endpoint = "/comments/{}/{}".format(target_type, target_id)
+        params = {
+            "commentid"     : comment_id,
+            "maxdepth"      : "5",
+            "offset"        : offset,
+            "limit"         : 50,
+            "mature_content": self.mature,
+        }
         return self._pagination_list(endpoint, params=params, key="thread")
 
     def deviation(self, deviation_id, public=None):
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index f1d51e2..4a67695 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -42,7 +42,11 @@ class FantiaExtractor(Extractor):
         post = self._get_post_data(post_id)
         post["num"] = 0
 
-        for content in self._get_post_contents(post):
+        contents = self._get_post_contents(post)
+        post["content_count"] = len(contents)
+        post["content_num"] = 0
+
+        for content in contents:
             files = self._process_content(post, content)
             yield Message.Directory, post
@@ -59,6 +63,8 @@
                     post["content_filename"] or file["file_url"], post)
                 yield Message.Url, file["file_url"], post
 
+            post["content_num"] += 1
+
     def posts(self):
         """Return post IDs"""
@@ -131,6 +137,7 @@ class FantiaExtractor(Extractor):
         post["content_filename"] = content.get("filename") or ""
         post["content_id"] = content["id"]
         post["content_comment"] = content.get("comment") or ""
+        post["content_num"] += 1
         post["plan"] = content["plan"] or self._empty_plan
 
         files = []
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 4c02000..8ba23c2 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -72,13 +72,11 @@ class HentaifoundryExtractor(Extractor):
         extr = text.extract_from(page, page.index('id="picBox"'))
 
         data = {
+            "index"      : text.parse_int(path.rsplit("/", 2)[1]),
             "title"      : text.unescape(extr('class="imageTitle">', '<')),
             "artist"     : text.unescape(extr('/profile">', '<')),
-            "width"      : text.parse_int(extr('width="', '"')),
-            "height"     : text.parse_int(extr('height="', '"')),
-            "index"      : text.parse_int(path.rsplit("/", 2)[1]),
-            "src"        : text.urljoin(self.root, text.unescape(extr(
-                'src="', '"'))),
+            "_body"      : extr(
+                '<section id="picBox"', '<section id="descriptionBox"'),
             "description": text.unescape(text.remove_html(extr(
                 '>Description</div>', '</section>')
                 .replace("\r\n", "\n"), "", "")),
@@ -92,6 +90,20 @@ class HentaifoundryExtractor(Extractor):
                 ">Tags </div>", "</div>")),
         }
 
+        body = data["_body"]
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
-        album, pos = text.extract(page, '"og:title" content="', '"')
-        user , pos = text.extract(page, 'rel="author">', '<', pos)
+        album = text.extr(page, '"og:title" content="', '"')
+        user  = self._extract_user(page)
 
         return {
-            "album_id"  : self.album_id,
-            "album_name": text.unescape(album),
-            "user"      : user.lower() if user else "",
+            "album_id"   : self.album_id,
+            "album_name" : text.unescape(album),
+            "user"       : user.get("username") or "",
+            "user_id"    : user.get("id") or "",
+            "displayname": user.get("name") or "",
         }
 
     def images(self, page):
@@ -158,7 +167,12 @@ class ImgbbUserExtractor(ImgbbExtractor):
         self.page_url = "https://{}.imgbb.com/".format(self.user)
 
     def metadata(self, page):
-        return {"user": self.user}
+        user = self._extract_user(page)
+        return {
+            "user"       : user.get("username") or self.user,
+            "user_id"    : user.get("id") or "",
+            "displayname": user.get("name") or "",
+        }
 
     def images(self, page):
         user = text.extr(page, '.obj.resource={"id":"', '"')
@@ -181,15 +195,20 @@ class ImgbbImageExtractor(ImgbbExtractor):
 
     def items(self):
         url = "https://ibb.co/" + self.image_id
-        extr = text.extract_from(self.request(url).text)
+        page = self.request(url).text
+        extr = text.extract_from(page)
+        user = self._extract_user(page)
 
         image = {
             "id"    : self.image_id,
-            "title" : text.unescape(extr('"og:title" content="', '"')),
+            "title" : text.unescape(extr(
+                '"og:title" content="', ' hosted at ImgBB"')),
             "url"   : extr('"og:image" content="', '"'),
             "width" : text.parse_int(extr('"og:image:width" content="', '"')),
             "height": text.parse_int(extr('"og:image:height" content="', '"')),
-            "user"  : extr('rel="author">', '<').lower(),
+            "user"       : user.get("username") or "",
+            "user_id"    : user.get("id") or "",
+            "displayname": user.get("name") or "",
         }
         image["extension"] = text.ext_from_url(image["url"])
diff --git a/gallery_dl/extractor/jpgfish.py b/gallery_dl/extractor/jpgfish.py
deleted file mode 100644
index 8862a7b..0000000
--- a/gallery_dl/extractor/jpgfish.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://jpg1.su/"""
-
-from .common import Extractor, Message
-from .. import text
-
-BASE_PATTERN = r"(?:https?://)?jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)"
-
-
-class JpgfishExtractor(Extractor):
-    """Base class for jpgfish extractors"""
-    category = "jpgfish"
-    root = "https://jpg1.su"
-    directory_fmt = ("{category}", "{user}", "{album}",)
-    archive_fmt = "{id}"
-
-    def _pagination(self, url):
-        while url:
-            page = self.request(url).text
-
-            for item in text.extract_iter(
-                    page, '<div class="list-item-image ', 'image-container'):
-                yield text.extr(item, '<a href="', '"')
-
-            url = text.extract(
-                page, '<a data-pagination="next" href="', '" ><')[0]
-
-
-class JpgfishImageExtractor(JpgfishExtractor):
-    """Extractor for jpgfish Images"""
-    subcategory = "image"
-    pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))"
-    example = "https://jpg1.su/img/TITLE.ID"
-
-    def __init__(self, match):
-        JpgfishExtractor.__init__(self, match)
-        self.path, self.image_id = match.groups()
-
-    def items(self):
-        url = "{}/img/{}".format(self.root, self.path)
-        extr = text.extract_from(self.request(url).text)
-
-        image = {
-            "id"   : self.image_id,
-            "url"  : extr('<meta property="og:image" content="', '"'),
-            "album": text.extract(
-                extr("Added to <", "/a>"), ">", "<")[0] or "",
-            "user" : extr('username: "', '"'),
-        }
-
-        text.nameext_from_url(image["url"], image)
-        yield Message.Directory, image
-        yield Message.Url, image["url"], image
-
-
-class JpgfishAlbumExtractor(JpgfishExtractor):
-    """Extractor for jpgfish Albums"""
-    subcategory = "album"
-    pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?"
-    example = "https://jpg1.su/album/TITLE.ID"
-
-    def __init__(self, match):
-        JpgfishExtractor.__init__(self, match)
-        self.album, self.sub_albums = match.groups()
-
-    def items(self):
-        url = "{}/a/{}".format(self.root, self.album)
-        data = {"_extractor": JpgfishImageExtractor}
-
-        if self.sub_albums:
-            albums = self._pagination(url + "/sub")
-        else:
-            albums = (url,)
-
-        for album in albums:
-            for image in self._pagination(album):
-                yield Message.Queue, image, data
-
-
-class JpgfishUserExtractor(JpgfishExtractor):
-    """Extractor for jpgfish Users"""
-    subcategory = "user"
-    pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?"
-    example = "https://jpg1.su/USER"
-
-    def __init__(self, match):
-        JpgfishExtractor.__init__(self, match)
-        self.user, self.albums = match.groups()
-
-    def items(self):
-        url = "{}/{}".format(self.root, self.user)
-
-        if self.albums:
-            url += "/albums"
-            data = {"_extractor": JpgfishAlbumExtractor}
-        else:
-            data = {"_extractor": JpgfishImageExtractor}
-
-        for url in self._pagination(url):
-            yield Message.Queue, url, data
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 894c671..1596cfb 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -10,7 +10,7 @@
 
 from .common import Extractor, Message
 from .. import text, exception
-from ..cache import cache
+from ..cache import cache, memcache
 import itertools
 import re
 
@@ -70,8 +70,7 @@ class KemonopartyExtractor(Extractor):
                 self.root, post["service"], post["user"], post["id"])
             post["_http_headers"] = headers
             post["date"] = text.parse_datetime(
-                post["published"] or post["added"],
-                "%a, %d %b %Y %H:%M:%S %Z")
+                post["published"] or post["added"], "%Y-%m-%dT%H:%M:%S")
             if username:
                 post["username"] = username
             if comments:
@@ -197,14 +196,25 @@ class KemonopartyExtractor(Extractor):
         dms = []
         for dm in text.extract_iter(page, "<article", "</article>"):
+            footer = text.extr(dm, "<footer", "</footer>")
             dms.append({
-                "body": text.unescape(text.extract(
+                "body": text.unescape(text.extr(
                     dm, "<pre>", "</pre></",
-        posts = self.request(self.api_url).json()
-        return (posts[0],) if len(posts) > 1 else posts
+        if not self.revision:
+            post = self.request(self.api_url).json()
+            if self.config("revisions"):
+                post["revision_id"] = 0
+                try:
+                    revs = self._post_revisions(self.api_url)
+                except exception.HttpError:
+                    pass
+                else:
+                    return itertools.chain((post,), revs)
+            return (post,)
+
+        revs = self._post_revisions(self.api_url)
+        if not self.revision_id:
+            return revs
+
+        for rev in revs:
+            if str(rev["revision_id"]) == self.revision_id:
+                return (rev,)
+
+        raise exception.NotFoundError("revision")
 
 
 class KemonopartyDiscordExtractor(KemonopartyExtractor):
@@ -270,11 +314,29 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
 
     def __init__(self, match):
         KemonopartyExtractor.__init__(self, match)
-        _, _, self.server, self.channel, self.channel_name = match.groups()
+        _, _, self.server, self.channel_id, self.channel = match.groups()
+        self.channel_name = ""
 
     def items(self):
         self._prepare_ddosguard_cookies()
 
+        if self.channel_id:
+            self.channel_name = self.channel
+        else:
+            if self.channel.isdecimal() and len(self.channel) >= 16:
+                key = "id"
+            else:
+                key = "name"
+
+            for channel in self._discord_channels(self.server):
+                if channel[key] == self.channel:
+                    break
+            else:
+                raise exception.NotFoundError("channel")
+
+            self.channel_id = channel["id"]
+            self.channel_name = channel["name"]
+
         find_inline = re.compile(
             r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
             r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
@@ -299,7 +361,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
 
         post["channel_name"] = self.channel_name
         post["date"] = text.parse_datetime(
-            post["published"], "%a, %d %b %Y %H:%M:%S %Z")
+            post["published"], "%Y-%m-%dT%H:%M:%S.%f")
         post["count"] = len(files)
         yield Message.Directory, post
@@ -319,27 +381,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
             yield Message.Url, url, post
 
     def posts(self):
-        if self.channel is None:
-            url = "{}/api/discord/channels/lookup?q={}".format(
-                self.root, self.server)
-            for channel in self.request(url).json():
-                if channel["name"] == self.channel_name:
-                    self.channel = channel["id"]
-                    break
-            else:
-                raise exception.NotFoundError("channel")
-
-        url = "{}/api/discord/channel/{}".format(self.root, self.channel)
-        params = {"skip": 0}
+        url = "{}/api/v1/discord/channel/{}".format(
+            self.root, self.channel_id)
+        params = {"o": 0}
 
         while True:
             posts = self.request(url, params=params).json()
             yield from posts
 
-            cnt = len(posts)
-            if cnt < 25:
+            if len(posts) < 150:
                 break
-            params["skip"] += cnt
+            params["o"] += 150
 
 
 class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
@@ -352,11 +404,7 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
         self.server = match.group(3)
 
     def items(self):
-        url = "{}/api/discord/channels/lookup?q={}".format(
-            self.root, self.server)
-        channels = self.request(url).json()
-
-        for channel in channels:
+        for channel in self._discord_channels(self.server):
             url = "{}/discord/server/{}/channel/{}#{}".format(
                 self.root, self.server, channel["id"], channel["name"])
             channel["_extractor"] = KemonopartyDiscordExtractor
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 145dd51..e97d273 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -124,6 +124,11 @@ class MoebooruPoolExtractor(MoebooruExtractor):
         self.pool_id = match.group(match.lastindex)
 
     def metadata(self):
+        if self.config("metadata"):
+            url = "{}/pool/show/{}.json".format(self.root, self.pool_id)
+            pool = self.request(url).json()
+            pool.pop("posts", None)
+            return {"pool": pool}
         return {"pool": text.parse_int(self.pool_id)}
 
     def posts(self):
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 1bcc915..a6971e8 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -54,14 +54,31 @@ class NewgroundsExtractor(Extractor):
         if metadata:
             post.update(metadata)
         yield Message.Directory, post
+        post["num"] = 0
         yield Message.Url, url, text.nameext_from_url(url, post)
 
-        for num, url in enumerate(text.extract_iter(
-                post["_comment"], 'data-smartload-src="', '"'), 1):
-            post["num"] = num
-            post["_index"] = "{}_{:>02}".format(post["index"], num)
+        if "_multi" in post:
+            for data in post["_multi"]:
+                post["num"] += 1
+                post["_index"] = "{}_{:>02}".format(
+                    post["index"], post["num"])
+                post.update(data)
+                url = data["image"]
+
+                text.nameext_from_url(url, post)
+                yield Message.Url, url, post
+
+                if "_fallback" in post:
+                    del post["_fallback"]
+
+        for url in text.extract_iter(
+                post["_comment"], 'data-smartload-src="', '"'):
+            post["num"] += 1
+            post["_index"] = "{}_{:>02}".format(
+                post["index"], post["num"])
             url = text.ensure_http_scheme(url)
-            yield Message.Url, url, text.nameext_from_url(url, post)
+            text.nameext_from_url(url, post)
+            yield Message.Url, url, post
     else:
         self.log.warning(
             "Unable to get download URL for '%s'", post_url)
@@ -153,8 +170,7 @@ class NewgroundsExtractor(Extractor):
         data["post_url"] = post_url
         return data
 
-    @staticmethod
-    def _extract_image_data(extr, url):
+    def _extract_image_data(self, extr, url):
         full = text.extract_from(util.json_loads(extr(
             '"full_image_text":', '});')))
         data = {
@@ -172,8 +188,34 @@ class NewgroundsExtractor(Extractor):
         index = data["url"].rpartition("/")[2].partition("_")[0]
         data["index"] = text.parse_int(index)
         data["_index"] = index
+
+        image_data = extr("let imageData =", "\n];")
+        if image_data:
+            data["_multi"] = self._extract_images_multi(image_data)
+        else:
+            art_images = extr('
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
+            "source"        : text.extr(tweet["source"], ">", "<"),
             "sensitive"     : tget("possibly_sensitive"),
             "favorite_count": tget("favorite_count"),
             "quote_count"   : tget("quote_count"),
             "reply_count"   : tget("reply_count"),
@@ -451,6 +452,7 @@ class TwitterExtractor(Extractor):
             "id_str": id_str,
             "lang": None,
             "user": user,
+            "source": "><",
             "entities": {},
             "extended_entities": {
                 "media": [
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index 6f152ed..8e6b842 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -47,7 +47,7 @@ class WarosuThreadExtractor(Extractor):
 
     def metadata(self, page):
         boardname = text.extr(page, "<title>", "</title>")
-        title = text.extr(page, 'filetitle" itemprop="name">', '<')
+        title = text.unescape(text.extr(page, "class=filetitle>", "<"))
         return {
             "board"     : self.board,
             "board_name": boardname.rpartition(" - ")[2],
@@ -57,39 +57,37 @@ class WarosuThreadExtractor(Extractor):
 
     def posts(self, page):
         """Build a list of all post objects"""
-        page = text.extr(page, '<div class="content">', '</form>')
-        needle = '<table itemscope itemtype="http://schema.org/Comment">'
+        page = text.extr(page, "<div class=content", "</form>")
+        needle = "<table itemscope itemtype=http://schema.org/Comment>"
         return [self.parse(post) for post in page.split(needle)]
 
     def parse(self, post):
         """Build post object by extracting data from an HTML post"""
         data = self._extract_post(post)
-        if "File:" in post:
+        if " File:" in post:
             self._extract_image(post, data)
             part = data["image"].rpartition("/")[2]
             data["tim"], _, data["extension"] = part.partition(".")
             data["ext"] = "." + data["extension"]
         return data
 
-    @staticmethod
-    def _extract_post(post):
+    def _extract_post(self, post):
         extr = text.extract_from(post)
         return {
-            "no"  : extr('id="p', '"'),
-            "name": extr('<span itemprop="name">', "</span>"),
-            "time": extr('<span class="posttime" title="', '000">'),
-            "now" : extr("", "<"),
+            "no"  : extr("id=p", ">"),
+            "name": extr("class=postername>", "<").strip(),
+            "time": extr("class=posttime title=", "000>"),
+            "now" : extr("", "<").strip(),
             "com" : text.unescape(text.remove_html(extr(
-                '<blockquote><p itemprop="text">', '</p></blockquote>'
-            ).strip())),
+                "<blockquote>", "</blockquote>").strip())),
         }
 
-    @staticmethod
-    def _extract_image(post, data):
+    def _extract_image(self, post, data):
         extr = text.extract_from(post)
-        data["fsize"] = extr("File: ", ", ")
+        data["fsize"] = extr(" File: ", ", ")
         data["w"] = extr("", "x")
         data["h"] = extr("", ", ")
-        data["filename"] = text.unquote(extr("", "<").rpartition(".")[0])
-        extr("<br>", "")
-        data["image"] = "https:" + extr('<a href="', '"')
+        data["image"] = self.root + extr("<a href=", ">")
--
cgit v1.2.3