diff options
| author | 2024-09-28 20:01:25 -0400 | |
|---|---|---|
| committer | 2024-09-28 20:01:25 -0400 | |
| commit | 1a457ed68769880ab7760e0746f0cbbd9ca00487 (patch) | |
| tree | a5e2f36fa6537e24a7a8851dab900ea03efdbd00 /gallery_dl/extractor/ao3.py | |
| parent | 1f3ffe32342852fd9ea9e7704022488f3a1222bd (diff) | |
New upstream version 1.27.5. upstream/1.27.5
Diffstat (limited to 'gallery_dl/extractor/ao3.py')
| -rw-r--r-- | gallery_dl/extractor/ao3.py | 302 |
1 files changed, 302 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

# Provenance (header of the patch this module was reconstructed from):
#   diff --git a/gallery_dl/extractor/ao3.py b/gallery_dl/extractor/ao3.py
#   new file mode 100644
#   index 0000000..1f570e8
#   --- /dev/null
#   +++ b/gallery_dl/extractor/ao3.py
#   @@ -0,0 +1,302 @@

"""Extractors for https://archiveofourown.org/"""

from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache

# Matches the site's host name, with optional scheme and "www." prefix:
# "archiveofourown" or "ao3", under the .org, .com or .net TLDs.
BASE_PATTERN = (r"(?:https?://)?(?:www\.)?"
                r"a(?:rchiveofourown|o3)\.(?:org|com|net)")


class Ao3Extractor(Extractor):
    """Base class for ao3 extractors

    Subclasses that only define a 'pattern' inherit items(), which treats
    the first regex group as a listing path and queues every work found
    on it (and on all following pages) for Ao3WorkExtractor.
    """
    category = "ao3"
    root = "https://archiveofourown.org"
    categorytransfer = True
    cookies_domain = ".archiveofourown.org"
    # Cookie whose presence login() takes to mean "already logged in"
    cookies_names = ("remember_user_token",)
    # NOTE(review): presumably a (min, max) delay in seconds between
    # requests -- confirm against the Extractor base class
    request_interval = (0.5, 1.5)

    def items(self):
        """Log in if possible, then queue one work URL per listed work"""
        self.login()

        base = self.root + "/works/"
        data = {"_extractor": Ao3WorkExtractor}

        for work_id in self.works():
            yield Message.Queue, base + work_id, data

    def works(self):
        """Return an iterator over the work IDs of the matched listing"""
        return self._pagination(self.groups[0])

    def login(self):
        """Install session cookies unless the login cookie is already set

        Does nothing when no username is configured.
        """
        if self.cookies_check(self.cookies_names):
            return

        username, password = self._get_auth_info()
        if username:
            return self.cookies_update(self._login_impl(username, password))

    # 90*86400 == 90 days expressed in seconds.
    # NOTE(review): keyarg=1 presumably keys the cache on 'username' --
    # confirm against the cache decorator
    @cache(maxage=90*86400, keyarg=1)
    def _login_impl(self, username, password):
        """Submit the login form and return the resulting cookies

        Returns a dict with the 'remember_user_token' cookie value and a
        'user_credentials' marker.

        Raises exception.AuthenticationError when the form token cannot
        be found, when the POST is not answered with a redirect, or when
        the redirect does not set 'remember_user_token'.
        """
        self.log.info("Logging in as %s", username)

        url = self.root + "/users/login"
        page = self.request(url).text

        # Only look for the token after the login form starts
        pos = page.find('id="loginform"')
        token = text.extract(
            page, ' name="authenticity_token" value="', '"', pos)[0]
        if not token:
            self.log.error("Unable to extract 'authenticity_token'")
            # Without the form token there is nothing valid to submit;
            # fail here instead of handing a missing value to
            # text.unescape() and posting the form anyway.
            raise exception.AuthenticationError()

        data = {
            "authenticity_token": text.unescape(token),
            "user[login]"       : username,
            "user[password]"    : password,
            "user[remember_me]" : "1",
            "commit"            : "Log In",
        }

        response = self.request(url, method="POST", data=data)
        # No redirect history is treated as a failed login
        if not response.history:
            raise exception.AuthenticationError()

        # The cookie is read from the first (redirecting) response
        remember = response.history[0].cookies.get("remember_user_token")
        if not remember:
            raise exception.AuthenticationError()

        return {
            "remember_user_token": remember,
            "user_credentials"   : "1",
        }

    def _pagination(self, path, needle='<li id="work_'):
        """Yield the text between 'needle' and the next '"' on every page

        Starts at 'path' (relative to the site root) and follows the
        rel="next" link until a page has none.
        """
        while True:
            page = self.request(self.root + path).text
            yield from text.extract_iter(page, needle, '"')
            path = text.extr(page, '<a rel="next" href="', '"')
            if not path:
                return
            # The href is taken from HTML, so unescape its entities
            path = text.unescape(path)


class Ao3WorkExtractor(Ao3Extractor):
    """Extractor for an AO3 work"""
    subcategory = "work"
    directory_fmt = ("{category}", "{author}")
    filename_fmt = "{id} {title}.{extension}"
    archive_fmt = "{id}.{extension}"
    pattern = BASE_PATTERN + r"/works/(\d+)"
    example = "https://archiveofourown.org/works/12345"

    def _init(self):
        """Read the 'formats' option and set the adult-content cookie

        'formats' unset -> ("pdf",); any other falsy value -> nothing is
        downloaded; a string is lowercased, stripped of spaces and split
        on commas; anything else is used as given.
        """
        formats = self.config("formats")
        if formats is None:
            self.formats = ("pdf",)
        elif not formats:
            self.formats = ()
        elif isinstance(formats, str):
            self.formats = formats.lower().replace(" ", "").split(",")
        else:
            self.formats = formats

        self.cookies.set("view_adult", "true", domain="archiveofourown.org")

    def items(self):
        """Yield the work's metadata and one URL per requested format

        Raises exception.AuthorizationError when the request ends up on
        the restricted-work login page, and exception.StopExtraction when
        the adult content warning page is returned instead of the work.
        """
        self.login()

        work_id = self.groups[0]
        url = "{}/works/{}".format(self.root, work_id)
        response = self.request(url, notfound="work")

        if response.url.endswith("/users/login?restricted=true"):
            raise exception.AuthorizationError(
                "Login required to access member-only works")
        page = response.text
        # Only short pages are checked for the warning heading
        if len(page) < 20000 and \
                '<h2 class="landmark heading">Adult Content Warning</' in page:
            raise exception.StopExtraction("Adult Content")

        # NOTE(review): extract_from appears to scan forward through the
        # page, so the extr() calls below are kept in their original
        # order -- confirm before reordering any of them
        extr = text.extract_from(page)

        # chapter ID -> chapter name, from the chapter <select> options
        chapters = {}
        cindex = extr(' id="chapter_index"', "</ul>")
        for ch in text.extract_iter(cindex, ' value="', "</option>"):
            cid, _, cname = ch.partition('">')
            chapters[cid] = text.unescape(cname)

        # lowercased link label -> download path
        fmts = {}
        # 'path' keeps the last download href after the loop; its
        # "updated_at=" value is used for 'date_updated' below
        path = ""
        download = extr(' class="download"', "</ul>")
        for dl in text.extract_iter(download, ' href="', "</"):
            path, _, label = dl.rpartition('">')
            fmts[label.lower()] = path

        data = {
            "id"           : text.parse_int(work_id),
            "rating"       : text.split_html(
                extr('<dd class="rating tags">', "</dd>")),
            "warnings"     : text.split_html(
                extr('<dd class="warning tags">', "</dd>")),
            "categories"   : text.split_html(
                extr('<dd class="category tags">', "</dd>")),
            "fandom"       : text.split_html(
                extr('<dd class="fandom tags">', "</dd>")),
            "relationships": text.split_html(
                extr('<dd class="relationship tags">', "</dd>")),
            "characters"   : text.split_html(
                extr('<dd class="character tags">', "</dd>")),
            "tags"         : text.split_html(
                extr('<dd class="freeform tags">', "</dd>")),
            "lang"         : extr('<dd class="language" lang="', '"'),
            "series"       : extr('<dd class="series">', "</dd>"),
            "date"         : text.parse_datetime(
                extr('<dd class="published">', "<"), "%Y-%m-%d"),
            "date_completed": text.parse_datetime(
                extr('>Completed:</dt><dd class="status">', "<"), "%Y-%m-%d"),
            "date_updated" : text.parse_timestamp(
                path.rpartition("updated_at=")[2]),
            "words"        : text.parse_int(
                extr('<dd class="words">', "<").replace(",", "")),
            "chapters"     : chapters,
            "comments"     : text.parse_int(
                extr('<dd class="comments">', "<").replace(",", "")),
            "likes"        : text.parse_int(
                extr('<dd class="kudos">', "<").replace(",", "")),
            "bookmarks"    : text.parse_int(text.remove_html(
                extr('<dd class="bookmarks">', "</dd>")).replace(",", "")),
            "views"        : text.parse_int(
                extr('<dd class="hits">', "<").replace(",", "")),
            "title"        : text.unescape(text.remove_html(
                extr(' class="title heading">', "</h2>")).strip()),
            "author"       : text.unescape(text.remove_html(
                extr(' class="byline heading">', "</h3>"))),
            "summary"      : text.split_html(
                extr(' class="heading">Summary:</h3>', "</div>")),
        }
        data["language"] = util.code_to_language(data["lang"])

        # Replace the raw series HTML with a structured dict (or None)
        series = data["series"]
        if series:
            extr = text.extract_from(series)
            data["series"] = {
                "prev" : extr(' class="previous" href="/works/', '"'),
                "index": extr(' class="position">Part ', " "),
                "id"   : extr(' href="/series/', '"'),
                "name" : text.unescape(extr(">", "<")),
                "next" : extr(' class="next" href="/works/', '"'),
            }
        else:
            data["series"] = None

        yield Message.Directory, data
        for fmt in self.formats:
            try:
                url = text.urljoin(self.root, fmts[fmt])
            except KeyError:
                self.log.warning("%s: Format '%s' not available", work_id, fmt)
            else:
                yield Message.Url, url, text.nameext_from_url(url, data)


class Ao3SeriesExtractor(Ao3Extractor):
    """Extractor for AO3 works of a series"""
    subcategory = "series"
    pattern = BASE_PATTERN + r"(/series/(\d+))"
    example = "https://archiveofourown.org/series/12345"


class Ao3TagExtractor(Ao3Extractor):
    """Extractor for AO3 works by tag"""
    subcategory = "tag"
    pattern = BASE_PATTERN + r"(/tags/([^/?#]+)/works(?:/?\?.+)?)"
    example = "https://archiveofourown.org/tags/TAG/works"


class Ao3SearchExtractor(Ao3Extractor):
    """Extractor for AO3 search results"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"(/works/search/?\?.+)"
    example = "https://archiveofourown.org/works/search?work_search[query]=air"


class Ao3UserExtractor(Ao3Extractor):
    """Extractor for an AO3 user profile"""
    subcategory = "user"
    pattern = (BASE_PATTERN + r"/users/([^/?#]+(?:/pseuds/[^/?#]+)?)"
               r"(?:/profile)?/?(?:$|\?|#)")
    example = "https://archiveofourown.org/users/USER"

    def initialize(self):
        """Do nothing; this extractor only dispatches to others"""
        pass

    def items(self):
        """Dispatch to the user's works, series and bookmarks extractors"""
        base = "{}/users/{}/".format(self.root, self.groups[0])
        return self._dispatch_extractors((
            (Ao3UserWorksExtractor   , base + "works"),
            (Ao3UserSeriesExtractor  , base + "series"),
            (Ao3UserBookmarkExtractor, base + "bookmarks"),
        ), ("user-works", "user-series"))


class Ao3UserWorksExtractor(Ao3Extractor):
    """Extractor for works of an AO3 user"""
    subcategory = "user-works"
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"works(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/works"


class Ao3UserSeriesExtractor(Ao3Extractor):
    """Extractor for series of an AO3 user"""
    subcategory = "user-series"
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"series(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/series"

    def items(self):
        """Log in if possible, then queue one series URL per listed series"""
        self.login()

        base = self.root + "/series/"
        data = {"_extractor": Ao3SeriesExtractor}

        for series_id in self.series():
            yield Message.Queue, base + series_id, data

    def series(self):
        """Return an iterator over the series IDs of the matched listing"""
        return self._pagination(self.groups[0], '<li id="series_')


class Ao3UserBookmarkExtractor(Ao3Extractor):
    """Extractor for bookmarked works of an AO3 user"""
    subcategory = "user-bookmark"
    pattern = (BASE_PATTERN + r"(/users/([^/?#]+)/(?:pseuds/([^/?#]+)/)?"
               r"bookmarks(?:/?\?.+)?)")
    example = "https://archiveofourown.org/users/USER/bookmarks"

    def items(self):
        """Queue every bookmarked work or series; warn about other types"""
        self.login()

        base = self.root + "/"
        data_work = {"_extractor": Ao3WorkExtractor}
        data_series = {"_extractor": Ao3SeriesExtractor}

        for item in self._pagination(
                self.groups[0], '<span class="count"><a href="/'):
            # Drop the last path segment of the bookmark-count link
            path = item.rpartition("/")[0]
            url = base + path
            if item.startswith("works/"):
                yield Message.Queue, url, data_work
            elif item.startswith("series/"):
                yield Message.Queue, url, data_series
            else:
                self.log.warning("Unsupported bookmark type '%s'", path)
