diff options
Diffstat (limited to 'gallery_dl/extractor/weibo.py')
| -rw-r--r-- | gallery_dl/extractor/weibo.py | 137 |
1 files changed, 137 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://www.weibo.com/"""

from .common import Extractor, Message
from .. import text
import json


class WeiboExtractor(Extractor):
    """Base class for weibo extractors

    Subclasses implement statuses(); items() turns the returned status
    objects into Version/Directory/Url messages.
    """
    category = "weibo"
    directory_fmt = ("{category}", "{user[screen_name]}")
    filename_fmt = "{status[id]}_{num:>02}.{extension}"
    archive_fmt = "{status[id]}_{num}"
    root = "https://m.weibo.cn"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # follow 'retweeted_status' objects unless disabled by the user
        self.retweets = self.config("retweets", True)

    def items(self):
        """Yield messages for all images and videos of all statuses

        For each status from statuses() this walks the status itself
        and, while the 'retweets' option is enabled, the chain of
        nested 'retweeted_status' objects. 'num' is shared across that
        chain and incremented after every yielded file, so each file
        belonging to one top-level status gets a distinct number.
        """
        first = True

        for status in self.statuses():

            if first:
                # Version and Directory are sent once, and only if
                # there is at least one status
                yield Message.Version, 1
                yield Message.Directory, status
                first = False

            obj = status
            num = 1

            while True:

                for image in obj.get("pics") or ():
                    pid = image["pid"]
                    if "large" in image:
                        image = image["large"]
                    # tolerate a missing or empty 'geo' entry instead
                    # of raising; parse_int() then falls back to its
                    # default value
                    geo = image.get("geo") or {}
                    data = text.nameext_from_url(image["url"], {
                        "num": num,
                        "pid": pid,
                        "width": text.parse_int(geo.get("width")),
                        "height": text.parse_int(geo.get("height")),
                        "status": status,
                    })
                    yield Message.Url, image["url"], data
                    num += 1

                info = (obj.get("page_info") or {}).get("media_info")
                if info:
                    # prefer the HD stream; skip the video entirely
                    # when no stream URL is present
                    url = info.get("stream_url_hd") or info.get("stream_url")
                    if url:
                        data = text.nameext_from_url(url, {
                            "num": num,
                            "url": url,
                            "width": 0,
                            "height": 0,
                            "status": status,
                        })
                        yield Message.Url, url, data
                        # keep 'num' unique for files of a following
                        # retweeted status
                        num += 1

                if self.retweets and "retweeted_status" in obj:
                    obj = obj["retweeted_status"]
                else:
                    break

    def statuses(self):
        """Returns an iterable containing all relevant 'status' objects"""


class WeiboUserExtractor(WeiboExtractor):
    """Extractor for all images of a user on weibo.cn"""
    subcategory = "user"
    pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
               r"/(?:u|p(?:rofile)?)/(\d+)")
    test = (
        ("https://m.weibo.cn/u/2314621010", {
            "range": "1-30",
        }),
        ("https://m.weibo.cn/profile/2314621010"),
        ("https://m.weibo.cn/p/2304132314621010_-_WEIBO_SECOND_PROFILE_WEIBO"),
        ("https://www.weibo.com/p/1003062314621010/home"),
    )

    def __init__(self, match):
        WeiboExtractor.__init__(self, match)
        self.user_id = match.group(1)

    def statuses(self):
        """Yield the 'mblog' object of every card, page by page"""
        url = self.root + "/api/container/getIndex"
        # only the last 10 digits of the matched number are used to
        # build the container ID
        params = {"page": 1, "containerid": "107603" + self.user_id[-10:]}

        while True:
            data = self.request(url, params=params).json()
            cards = data["data"]["cards"]

            for card in cards:
                if "mblog" in card:
                    yield card["mblog"]

            # a page with fewer than 5 cards is treated as the last one
            if len(cards) < 5:
                return
            params["page"] += 1


class WeiboStatusExtractor(WeiboExtractor):
    """Extractor for images from a status on weibo.cn"""
    subcategory = "status"
    pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
               r"/(?:detail|status|\d+)/(\d+)")
    test = (
        ("https://m.weibo.cn/detail/4323047042991618", {
            "pattern": r"https?://wx\d+.sinaimg.cn/large/\w+.jpg",
        }),
        ("https://m.weibo.cn/detail/4339748116375525", {
            "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_hd",
        }),
        ("https://m.weibo.cn/status/4339748116375525"),
        ("https://m.weibo.cn/5746766133/4339748116375525"),
    )

    def __init__(self, match):
        WeiboExtractor.__init__(self, match)
        self.status_id = match.group(1)

    def statuses(self):
        """Return a tuple with the status embedded in the detail page

        The status is read from the JSON assigned to '$render_data' in
        the page source. An empty tuple is returned when that marker
        cannot be found, so items() simply yields nothing.
        """
        url = "{}/detail/{}".format(self.root, self.status_id)
        page = self.request(url).text
        render_data = text.extract(
            page, " var $render_data = [", "][0] || {};")[0]
        if not render_data:
            return ()
        return (json.loads(render_data)["status"],)
