# -*- coding: utf-8 -*- # Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. """Extractors for https://archiveofourown.org/""" from .common import Extractor, Message, Dispatch from .. import text, util, exception from ..cache import cache BASE_PATTERN = (r"(?:https?://)?(?:www\.)?" r"a(?:rchiveofourown|o3)\.(?:org|com|net)") class Ao3Extractor(Extractor): """Base class for ao3 extractors""" category = "ao3" root = "https://archiveofourown.org" categorytransfer = True cookies_domain = ".archiveofourown.org" cookies_names = ("remember_user_token",) request_interval = (0.5, 1.5) def items(self): self.login() base = self.root + "/works/" data = {"_extractor": Ao3WorkExtractor, "type": "work"} for work_id in self.works(): yield Message.Queue, base + work_id, data def items_list(self, type, needle, part=True): self.login() base = self.root + "/" data_work = {"_extractor": Ao3WorkExtractor, "type": "work"} data_series = {"_extractor": Ao3SeriesExtractor, "type": "series"} data_user = {"_extractor": Ao3UserExtractor, "type": "user"} for item in self._pagination(self.groups[0], needle): path = item.rpartition("/")[0] if part else item url = base + path if item.startswith("works/"): yield Message.Queue, url, data_work elif item.startswith("series/"): yield Message.Queue, url, data_series elif item.startswith("users/"): yield Message.Queue, url, data_user else: self.log.warning("Unsupported %s type '%s'", type, path) def works(self): return self._pagination(self.groups[0]) def login(self): if self.cookies_check(self.cookies_names): return username, password = self._get_auth_info() if username: return self.cookies_update(self._login_impl(username, password)) @cache(maxage=90*86400, keyarg=1) def _login_impl(self, username, password): self.log.info("Logging in as %s", username) url = self.root + "/users/login" page = self.request(url).text pos = page.find('id="loginform"') token = text.extract( page, ' name="authenticity_token" value="', '"', pos)[0] if not token: self.log.error("Unable to extract 'authenticity_token'") data = { "authenticity_token": text.unescape(token), "user[login]" : username, "user[password]" : password, "user[remember_me]" : "1", "commit" : "Log In", } response = self.request(url, method="POST", data=data) if not response.history: raise exception.AuthenticationError() remember = response.history[0].cookies.get("remember_user_token") if not remember: raise exception.AuthenticationError() return { "remember_user_token": remember, "user_credentials" : "1", } def _pagination(self, path, needle='