# -*- coding: utf-8 -*- # Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://www.zerochan.net/""" from .booru import BooruExtractor from ..cache import cache from .. import text, util, exception import collections import re BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" class ZerochanExtractor(BooruExtractor): """Base class for zerochan extractors""" category = "zerochan" root = "https://www.zerochan.net" filename_fmt = "{id}.{extension}" archive_fmt = "{id}" page_start = 1 per_page = 250 cookies_domain = ".zerochan.net" cookies_names = ("z_id", "z_hash") request_interval = (0.5, 1.5) def login(self): self._logged_in = True if self.cookies_check(self.cookies_names): return username, password = self._get_auth_info() if username: return self.cookies_update(self._login_impl(username, password)) self._logged_in = False @cache(maxage=90*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) url = self.root + "/login" headers = { "Origin" : self.root, "Referer" : url, } data = { "ref" : "/", "name" : username, "password": password, "login" : "Login", } response = self.request(url, method="POST", headers=headers, data=data) if not response.history: raise exception.AuthenticationError() return response.cookies def _parse_entry_html(self, entry_id): url = "{}/{}".format(self.root, entry_id) page = self.request(url).text try: jsonld = self._extract_jsonld(page) except Exception: return {"id": entry_id} extr = text.extract_from(page) data = { "id" : text.parse_int(entry_id), "file_url": jsonld["contentUrl"], "date" : text.parse_datetime(jsonld["datePublished"]), "width" : text.parse_int(jsonld["width"][:-3]), "height" : text.parse_int(jsonld["height"][:-3]), "size" : text.parse_bytes(jsonld["contentSize"][:-1]), "path" : text.split_html(extr( 'class="breadcrumbs', ''))[2:], "uploader": extr('href="/user/', '"'), "tags" : extr('

') if not post: break if metadata: entry_id = extr('href="/', '"') post = self._parse_entry_html(entry_id) post.update(self._parse_entry_api(entry_id)) yield post else: yield { "id" : extr('href="/', '"'), "name" : extr('alt="', '"'), "width" : extr('title="', '✕'), "height": extr('', ' '), "size" : extr('', 'b'), "file_url": "https://static." + extr( '= 300: url = text.urljoin(self.root, response.headers["location"]) self.log.warning("HTTP redirect to %s", url) if self.config("redirects"): continue raise exception.StopExtraction() data = response.json() try: posts = data["items"] except Exception: self.log.debug("Server response: %s", data) return if metadata: for post in posts: post_id = post["id"] post.update(self._parse_entry_html(post_id)) post.update(self._parse_entry_api(post_id)) yield post else: for post in posts: urls = self._urls(post) post["file_url"] = next(urls) post["_fallback"] = urls yield post if not data.get("next"): return params["p"] += 1 def _urls(self, post, static="https://static.zerochan.net/.full."): base = static + str(post["id"]) + "." for ext in self.exts: yield base + ext class ZerochanImageExtractor(ZerochanExtractor): subcategory = "image" pattern = BASE_PATTERN + r"/(\d+)" example = "https://www.zerochan.net/12345" def __init__(self, match): ZerochanExtractor.__init__(self, match) self.image_id = match.group(1) def posts(self): post = self._parse_entry_html(self.image_id) if self.config("metadata"): post.update(self._parse_entry_api(self.image_id)) return (post,)