diff options
Diffstat (limited to 'gallery_dl/extractor/bilibili.py')
| -rw-r--r-- | gallery_dl/extractor/bilibili.py | 116 |
1 files changed, 116 insertions, 0 deletions
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py new file mode 100644 index 0000000..d5c419e --- /dev/null +++ b/gallery_dl/extractor/bilibili.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.bilibili.com/""" + +from .common import Extractor, Message +from .. import text, util, exception + + +class BilibiliExtractor(Extractor): + """Base class for bilibili extractors""" + category = "bilibili" + root = "https://www.bilibili.com" + request_interval = (3.0, 6.0) + + def _init(self): + self.api = BilibiliAPI(self) + + +class BilibiliUserArticlesExtractor(BilibiliExtractor): + """Extractor for a bilibili user's articles""" + subcategory = "user-articles" + pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article" + example = "https://space.bilibili.com/12345/article" + + def items(self): + for article in self.api.user_articles(self.groups[0]): + article["_extractor"] = BilibiliArticleExtractor + url = "{}/opus/{}".format(self.root, article["opus_id"]) + yield Message.Queue, url, article + + +class BilibiliArticleExtractor(BilibiliExtractor): + """Extractor for a bilibili article""" + subcategory = "article" + pattern = (r"(?:https?://)?" + r"(?:t\.bilibili\.com|(?:www\.)?bilibili.com/opus)/(\d+)") + example = "https://www.bilibili.com/opus/12345" + directory_fmt = ("{category}", "{username}") + filename_fmt = "{id}_{num}.{extension}" + archive_fmt = "{id}_{num}" + + def items(self): + article = self.api.article(self.groups[0]) + + # Flatten modules list + modules = {} + for module in article["detail"]["modules"]: + del module['module_type'] + modules.update(module) + article["detail"]["modules"] = modules + + article["username"] = modules["module_author"]["name"] + + pics = [] + for paragraph in modules['module_content']['paragraphs']: + if "pic" not in paragraph: + continue + + try: + pics.extend(paragraph["pic"]["pics"]) + except Exception: + pass + + article["count"] = len(pics) + yield Message.Directory, article + for article["num"], pic in enumerate(pics, 1): + url = pic["url"] + article.update(pic) + yield Message.Url, url, text.nameext_from_url(url, article) + + +class BilibiliAPI(): + def __init__(self, extractor): + self.extractor = extractor + + def _call(self, endpoint, params): + url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint + data = self.extractor.request(url, params=params).json() + + if data["code"] != 0: + self.extractor.log.debug("Server response: %s", data) + raise exception.StopExtraction("API request failed") + + return data + + def user_articles(self, user_id): + endpoint = "/opus/feed/space" + params = {"host_mid": user_id} + + while True: + data = self._call(endpoint, params) + + for item in data["data"]["items"]: + params["offset"] = item["opus_id"] + yield item + + if not data["data"]["has_more"]: + break + + def article(self, article_id): + url = "https://www.bilibili.com/opus/" + article_id + + while True: + page = self.extractor.request(url).text + try: + return util.json_loads(text.extr( + page, "window.__INITIAL_STATE__=", "};") + "}") + except Exception: + if "window._riskdata_" not in page: + raise exception.StopExtraction( + "%s: Unable to extract INITIAL_STATE data", article_id) + self.extractor.wait(seconds=300) |
