# -*- coding: utf-8 -*-

# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://xfolio.jp/"""

from .common import Extractor, Message
from .. import text, exception

BASE_PATTERN = r"(?:https?://)?xfolio\.jp(?:/[^/?#]+)?"


class XfolioExtractor(Extractor):
    """Base class for xfolio extractors"""
    category = "xfolio"
    root = "https://xfolio.jp"
    cookies_domain = ".xfolio.jp"
    directory_fmt = ("{category}", "{creator_slug}", "{work_id}")
    filename_fmt = "{work_id}_{image_id}.{extension}"
    archive_fmt = "{work_id}_{image_id}"
    request_interval = (0.5, 1.5)

    def _init(self):
        # Perform the session-cookie check only once per run: afterwards,
        # rebind _init to the plain base-class implementation so later
        # extractor instances skip it.
        XfolioExtractor._init = Extractor._init
        if not self.cookies_check(("xfolio_session",)):
            self.log.error("'xfolio_session' cookie required")

    def items(self):
        # Default item generator for listing extractors: subclasses
        # implement works() yielding work-page URLs, which are queued
        # for processing by XfolioWorkExtractor.
        data = {"_extractor": XfolioWorkExtractor}
        for work in self.works():
            yield Message.Queue, work, data

    def request(self, url, **kwargs):
        # Wrap Extractor.request() to detect a redirect to the site's
        # bot-check page and abort instead of parsing CAPTCHA HTML.
        response = Extractor.request(self, url, **kwargs)
        if "/system/recaptcha" in response.url:
            raise exception.AbortExtraction("Bot check / CAPTCHA page")
        return response


class XfolioWorkExtractor(XfolioExtractor):
    """Extractor for individual works on xfolio.jp"""
    subcategory = "work"
    pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/works/(\d+)"
    example = "https://xfolio.jp/portfolio/USER/works/12345"

    def items(self):
        creator, work_id = self.groups
        url = f"{self.root}/portfolio/{creator}/works/{work_id}"
        html = self.request(url).text

        work = self._extract_data(html)
        files = self._extract_files(html, work)
        work["count"] = len(files)

        # NOTE(review): the empty second element looks unusual for a
        # Directory message — verify against gallery-dl's Message format.
        yield Message.Directory, "", work
        for work["num"], file in enumerate(files, 1):
            file.update(work)
            yield Message.Url, file["url"], file

    def _extract_data(self, html):
        """Collect a work's metadata from its HTML page"""
        creator, work_id = self.groups
        extr = text.extract_from(html)
        return {
            "title"          : text.unescape(extr(
                'property="og:title" content="', '"').rpartition(" - ")[0]),
            "description"    : text.unescape(extr(
                'property="og:description" content="', '"')),
            "creator_id"     : extr(' data-creator-id="', '"'),
            "creator_userid" : extr(' data-creator-user-id="', '"'),
            "creator_name"   : extr(' data-creator-name="', '"'),
            "creator_profile": text.unescape(extr(
                ' data-creator-profile="', '"')),
            "series_id"      : extr("/series/", '"'),
            "creator_slug"   : creator,
            "work_id"        : work_id,
        }

    def _extract_files(self, html, work):
        """Collect a work's downloadable image files"""
        files = []

        work_id = work["work_id"]
        # NOTE(review): the end delimiter of this extract_iter() was lost
        # when the file got mangled (it was reduced to ""); "</div>" is a
        # reconstruction — confirm against the site's actual markup.
        for img in text.extract_iter(
                html, 'class="article__wrap_img', "</div>"):
            image_id = text.extr(img, "/fullscale_image?image_id=", "&")
            if not image_id:
                # Work images without a fullscale link cannot be downloaded
                self.log.warning(
                    "%s: 'fullscale_image' not available", work_id)
                continue
            files.append({
                "image_id" : image_id,
                "extension": "jpg",
                # user_asset.php serves the full-resolution image, but only
                # with a matching fullscale_image Referer header
                "url": (f"{self.root}/user_asset.php?id={image_id}"
                        f"&work_id={work_id}"
                        f"&work_image_id={image_id}&type=work_image"),
                "_http_headers": {"Referer": (
                    f"{self.root}/fullscale_image"
                    f"?image_id={image_id}&work_id={work_id}")},
            })
        return files


class XfolioUserExtractor(XfolioExtractor):
    """Extractor for all works of a user on xfolio.jp"""
    subcategory = "user"
    pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)(?:/works)?/?(?:$|\?|#)"
    example = "https://xfolio.jp/portfolio/USER"

    def works(self):
        url = f"{self.root}/portfolio/{self.groups[0]}/works"

        while True:
            html = self.request(url).text

            # NOTE(review): both delimiters of this extract_iter() were
            # destroyed when the file got mangled (only the opening quotes
            # survived); these values are a best-effort reconstruction —
            # confirm against the site's actual works-list markup.
            for item in text.extract_iter(
                    html, '<a class="works', "</a>"):
                yield text.extr(item, ' href="', '"')

            # NOTE(review): end delimiter reconstructed ("</li>" for a
            # pager list item; it was reduced to "") — confirm.
            pager = text.extr(html, ' class="pager__list_next', "</li>")
            url = text.extr(pager, ' href="', '"')
            if not url:
                # no "next" link -> last page reached
                return
            url = text.unescape(url)


class XfolioSeriesExtractor(XfolioExtractor):
    """Extractor for a series of works on xfolio.jp"""
    subcategory = "series"
    pattern = BASE_PATTERN + r"/portfolio/([^/?#]+)/series/(\d+)"
    example = "https://xfolio.jp/portfolio/USER/series/12345"

    def works(self):
        creator, series_id = self.groups
        url = f"{self.root}/portfolio/{creator}/series/{series_id}"
        html = self.request(url).text

        # NOTE(review): end delimiter reconstructed ("</a>"; it was
        # reduced to "") — confirm against the series-page markup.
        return [
            text.extr(item, ' href="', '"')
            for item in text.extract_iter(
                html, 'class="listWrap--title">', "</a>")
        ]