diff options
Diffstat (limited to 'gallery_dl/extractor/nitter.py')
| -rw-r--r-- | gallery_dl/extractor/nitter.py | 256 |
1 files changed, 256 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for Nitter instances"""

from .common import BaseExtractor, Message
from .. import text


class NitterExtractor(BaseExtractor):
    """Base class for nitter extractors"""
    basecategory = "nitter"
    directory_fmt = ("{category}", "{user[name]}")
    filename_fmt = "{tweet_id}_{num}.{extension}"
    archive_fmt = "{tweet_id}_{num}"

    def __init__(self, match):
        BaseExtractor.__init__(self, match)
        # The last capture group of the subclass pattern: a user name for
        # the timeline extractors, a tweet ID for NitterTweetExtractor.
        self.user = match.group(match.lastindex)

    def items(self):
        """Yield one Directory message per tweet and one Url message
        for each of its attachments.

        Subclasses provide the raw HTML of every tweet via 'tweets()'.
        """
        for tweet_html in self.tweets():
            tweet = self._tweet_from_html(tweet_html)

            # '_attach' is internal scratch data and must not end up
            # in the metadata handed to consumers.
            attachments_html = tweet.pop("_attach", "")
            attachments = []
            if attachments_html:
                # Same ordering as before: all 'href' values first,
                # then all 'data-url' values.
                for marker in ('href="', 'data-url="'):
                    attachments.extend(
                        url for url in text.extract_iter(
                            attachments_html, marker, '"')
                        # An empty attribute value is not a downloadable
                        # URL; it would also break the prefix test below.
                        if url
                    )
            tweet["count"] = len(attachments)

            yield Message.Directory, tweet
            # 'num' is written directly into the metadata dict, 1-based.
            for tweet["num"], url in enumerate(attachments, 1):
                if url.startswith("/"):
                    # Instance-relative path -> absolute URL
                    url = self.root + url
                if "/video/" in url:
                    url = "ytdl:" + url
                    # File name: text after the last "%2F",
                    # cut at the first "."
                    tweet["filename"] = url.rpartition(
                        "%2F")[2].partition(".")[0]
                    tweet["extension"] = "mp4"
                else:
                    text.nameext_from_url(url, tweet)
                yield Message.Url, url, tweet

    def _tweet_from_html(self, html):
        """Build a metadata dict from the HTML of a single tweet.

        The 'extr' calls below are kept in exactly this order;
        NOTE(review): 'text.extract_from' presumably advances through
        'html' with every call, so reordering them would change the
        extracted values -- confirm against the 'text' module.
        """
        extr = text.extract_from(html)
        user = {
            "name": extr('class="fullname" href="/', '"'),
            "nick": extr('title="', '"'),
        }
        # Result is discarded on purpose; the call appears to only move
        # past the tweet-date <span> opening.
        extr('<span class="tweet-date', '')
        link = extr('href="', '"')
        return {
            "user": user,
            "date": text.parse_datetime(
                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
            # Last path segment of the tweet link without its fragment
            "tweet_id": link.rpartition("/")[2].partition("#")[0],
            "content": extr('class="tweet-content', "</div").partition(">")[2],
            "_attach": extr('class="attachments', 'class="tweet-stats'),
            "comments": text.parse_int(extr(
                'class="icon-comment', '</div>').rpartition(">")[2]),
            "retweets": text.parse_int(extr(
                'class="icon-retweet', '</div>').rpartition(">")[2]),
            "quotes"  : text.parse_int(extr(
                'class="icon-quote', '</div>').rpartition(">")[2]),
            "likes"   : text.parse_int(extr(
                'class="icon-heart', '</div>').rpartition(">")[2]),
        }

    def _pagination(self, path):
        """Yield the HTML of every timeline item found under 'path',
        following the "show-more" link until a page has none.
        """
        base_url = url = self.root + path

        while True:
            page = self.request(url).text

            # Element 0 is everything before the first timeline item
            yield from page.split('<div class="timeline-item')[1:]

            more = text.extr(page, '<div class="show-more"><a href="?', '"')
            if not more:
                return
            url = base_url + "?" + text.unescape(more)


# Known instances; the value returned by 'update()' is used as the
# prefix of every subclass URL pattern below.
BASE_PATTERN = NitterExtractor.update({
    "nitter.net": {
        "root": "https://nitter.net",
        "pattern": r"nitter\.net",
    },
    "nitter.lacontrevoie.fr": {
        "root": "https://nitter.lacontrevoie.fr",
        "pattern": r"nitter\.lacontrevoie\.fr",
    },
    "nitter.pussthecat.org": {
        "root": "https://nitter.pussthecat.org",
        "pattern": r"nitter\.pussthecat\.org",
    },
    "nitter.1d4.us": {
        "root": "https://nitter.1d4.us",
        "pattern": r"nitter\.1d4\.us",
    },
    "nitter.kavin.rocks": {
        "root": "https://nitter.kavin.rocks",
        "pattern": r"nitter\.kavin\.rocks",
    },
    "nitter.unixfox.eu": {
        "root": "https://nitter.unixfox.eu",
        "pattern": r"nitter\.unixfox\.eu",
    },
})


class NitterTweetsExtractor(NitterExtractor):
    """Extractor for the tweets timeline of a nitter user"""
    subcategory = "tweets"
    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/tweets)?(?:$|\?|#)"
    test = (
        ("https://nitter.net/supernaturepics", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
            "count": 20,
            "keyword": {
                "comments": int,
                "content": str,
                "count": 1,
                "date": "type:datetime",
                "likes": int,
                "quotes": int,
                "retweets": int,
                "tweet_id": r"re:\d+",
                "user": {
                    "name": "supernaturepics",
                    "nick": "Nature Pictures"
                },
            },
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics"),
        ("https://nitter.pussthecat.org/supernaturepics"),
        ("https://nitter.1d4.us/supernaturepics"),
        ("https://nitter.kavin.rocks/supernaturepics"),
        ("https://nitter.unixfox.eu/supernaturepics"),
    )

    def tweets(self):
        return self._pagination("/" + self.user)


class NitterRepliesExtractor(NitterExtractor):
    """Extractor for the 'with_replies' timeline of a nitter user"""
    subcategory = "replies"
    pattern = BASE_PATTERN + r"/([^/?#]+)/with_replies"
    test = (
        ("https://nitter.net/supernaturepics/with_replies", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"),
        ("https://nitter.pussthecat.org/supernaturepics/with_replies"),
        ("https://nitter.1d4.us/supernaturepics/with_replies"),
        ("https://nitter.kavin.rocks/supernaturepics/with_replies"),
        ("https://nitter.unixfox.eu/supernaturepics/with_replies"),
    )

    def tweets(self):
        return self._pagination("/" + self.user + "/with_replies")


class NitterMediaExtractor(NitterExtractor):
    """Extractor for the 'media' timeline of a nitter user"""
    subcategory = "media"
    pattern = BASE_PATTERN + r"/([^/?#]+)/media"
    test = (
        ("https://nitter.net/supernaturepics/media", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics/media"),
        ("https://nitter.pussthecat.org/supernaturepics/media"),
        ("https://nitter.1d4.us/supernaturepics/media"),
        ("https://nitter.kavin.rocks/supernaturepics/media"),
        ("https://nitter.unixfox.eu/supernaturepics/media"),
    )

    def tweets(self):
        return self._pagination("/" + self.user + "/media")


class NitterSearchExtractor(NitterExtractor):
    """Extractor for the 'search' timeline of a nitter user"""
    subcategory = "search"
    pattern = BASE_PATTERN + r"/([^/?#]+)/search"
    test = (
        ("https://nitter.net/supernaturepics/search", {
            "pattern": r"https://nitter\.net/pic/orig"
                       r"/media%2F[\w-]+\.(jpg|png)$",
            "range": "1-20",
        }),
        ("https://nitter.lacontrevoie.fr/supernaturepics/search"),
        ("https://nitter.pussthecat.org/supernaturepics/search"),
        ("https://nitter.1d4.us/supernaturepics/search"),
        ("https://nitter.kavin.rocks/supernaturepics/search"),
        ("https://nitter.unixfox.eu/supernaturepics/search"),
    )

    def tweets(self):
        return self._pagination("/" + self.user + "/search")


class NitterTweetExtractor(NitterExtractor):
    """Extractor for nitter tweets"""
    subcategory = "tweet"
    # directory_fmt / filename_fmt / archive_fmt are inherited unchanged
    # from NitterExtractor.
    pattern = BASE_PATTERN + r"/[^/?#]+/status/(\d+)"
    test = (
        ("https://nitter.net/supernaturepics/status/604341487988576256", {
            "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a",
            "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
        }),
        # 4 images
        ("https://nitter.lacontrevoie.fr/i/status/894001459754180609", {
            "url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff",
        }),
        # video
        ("https://nitter.pussthecat.org/i/status/1065692031626829824", {
            "pattern": r"ytdl:https://nitter.pussthecat.org/video"
                       r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F"
                       r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F"
                       r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5",
        }),
        # content with emoji, newlines, hashtags (#338)
        ("https://nitter.1d4.us/playpokemon/status/1263832915173048321", {
            "keyword": {"content": (
                r"re:Gear up for #PokemonSwordShieldEX with special Mystery "
                "Gifts! \n\nYou’ll be able to receive four Galarian form "
                "Pokémon with Hidden Abilities, plus some very useful items. "
                "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ "
            )},
        }),
        # Nitter tweet (#890)
        ("https://nitter.kavin.rocks/ed1conf/status/1163841619336007680", {
            "url": "e115bd1c86c660064e392b05269bbcafcd8c8b7a",
            "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
        }),
    )

    def tweets(self):
        """Return a 1-tuple holding the HTML of the tweet's status page."""
        # 'self.user' holds the tweet ID here: it is the only (and thus
        # last) capture group of this class's pattern.
        url = "{}/i/status/{}".format(self.root, self.user)
        return (self.request(url).text,)
