diff options
| author | 2022-12-04 23:25:06 -0500 | |
|---|---|---|
| committer | 2022-12-04 23:25:06 -0500 | |
| commit | 3338dfce719c999467ffe08fd45663be8190057a (patch) | |
| tree | fd3235a1379c19508bbb47b8e8b95d5d9164b0d3 /gallery_dl/extractor/nitter.py | |
| parent | 7af5cc29d1c02d20a6890b7b7ba78ab41532a763 (diff) | |
New upstream version 1.24.1.upstream/1.24.1
Diffstat (limited to 'gallery_dl/extractor/nitter.py')
| -rw-r--r-- | gallery_dl/extractor/nitter.py | 314 |
1 files changed, 264 insertions, 50 deletions
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index 1ba8253..dfe78ae 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -10,6 +10,7 @@ from .common import BaseExtractor, Message from .. import text +import binascii class NitterExtractor(BaseExtractor): @@ -20,51 +21,102 @@ class NitterExtractor(BaseExtractor): archive_fmt = "{tweet_id}_{num}" def __init__(self, match): + self.cookiedomain = self.root.partition("://")[2] BaseExtractor.__init__(self, match) - self.user = match.group(match.lastindex) + + lastindex = match.lastindex + self.user = match.group(lastindex) + self.user_id = match.group(lastindex + 1) + self.user_obj = None def items(self): - for tweet_html in self.tweets(): - tweet = self._tweet_from_html(tweet_html) - - attachments_html = tweet.pop("_attach", "") - if attachments_html: - attachments = list(text.extract_iter( - attachments_html, 'href="', '"')) - attachments.extend(text.extract_iter( - attachments_html, 'data-url="', '"')) + retweets = self.config("retweets", False) + videos = self.config("videos", True) + if videos: + ytdl = (videos == "ytdl") + videos = True + self._cookiejar.set("hlsPlayback", "on", domain=self.cookiedomain) + + for tweet in self.tweets(): + + if not retweets and tweet["retweet"]: + self.log.debug("Skipping %s (retweet)", tweet["tweet_id"]) + continue + + attachments = tweet.pop("_attach", "") + if attachments: + files = [] + append = files.append + + for url in text.extract_iter( + attachments, 'href="', '"'): + + if "/enc/" in url: + name = binascii.a2b_base64(url.rpartition( + "/")[2]).decode().rpartition("/")[2] + else: + name = url.rpartition("%2F")[2] + + if url[0] == "/": + url = self.root + url + file = { + "url": url, + "_http_retry_codes": (404,), + } + file["filename"], _, file["extension"] = \ + name.rpartition(".") + append(file) + + if videos and not files: + if ytdl: + append({ + "url": "ytdl:{}/i/status/{}".format( + self.root, tweet["tweet_id"]), + "extension": None, + }) + else: + for url in text.extract_iter( + attachments, 'data-url="', '"'): + + if "/enc/" in url: + name = binascii.a2b_base64(url.rpartition( + "/")[2]).decode().rpartition("/")[2] + else: + name = url.rpartition("%2F")[2] + + if url[0] == "/": + url = self.root + url + append({ + "url" : "ytdl:" + url, + "filename" : name.rpartition(".")[0], + "extension": "mp4", + }) else: - attachments = () - tweet["count"] = len(attachments) + files = () + tweet["count"] = len(files) yield Message.Directory, tweet - for tweet["num"], url in enumerate(attachments, 1): - if url[0] == "/": - url = self.root + url - if "/video/" in url: - url = "ytdl:" + url - tweet["filename"] = url.rpartition( - "%2F")[2].partition(".")[0] - tweet["extension"] = "mp4" - else: - text.nameext_from_url(url, tweet) - yield Message.Url, url, tweet + for tweet["num"], file in enumerate(files, 1): + url = file["url"] + file.update(tweet) + yield Message.Url, url, file def _tweet_from_html(self, html): extr = text.extract_from(html) - user = { + author = { "name": extr('class="fullname" href="/', '"'), "nick": extr('title="', '"'), } extr('<span class="tweet-date', '') link = extr('href="', '"') return { - "user": user, - "date": text.parse_datetime( + "author" : author, + "user" : self.user_obj or author, + "date" : text.parse_datetime( extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"), "tweet_id": link.rpartition("/")[2].partition("#")[0], "content": extr('class="tweet-content', "</div").partition(">")[2], - "_attach": extr('class="attachments', 'class="tweet-stats'), + "_attach" : extr('class="attachments', 'class="tweet-stats'), "comments": text.parse_int(extr( 'class="icon-comment', '</div>').rpartition(">")[2]), "retweets": text.parse_int(extr( @@ -73,17 +125,87 @@ class NitterExtractor(BaseExtractor): 'class="icon-quote', '</div>').rpartition(">")[2]), "likes" : text.parse_int(extr( 'class="icon-heart', '</div>').rpartition(">")[2]), + "retweet" : 'class="retweet-header' in html, + "quoted": False, + } + + def _tweet_from_quote(self, html): + extr = text.extract_from(html) + author = { + "name": extr('class="fullname" href="/', '"'), + "nick": extr('title="', '"'), } + extr('<span class="tweet-date', '') + link = extr('href="', '"') + return { + "author" : author, + "user" : self.user_obj or author, + "date" : text.parse_datetime( + extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"), + "tweet_id": link.rpartition("/")[2].partition("#")[0], + "content": extr('class="quote-text', "</div").partition(">")[2], + "_attach" : extr('class="attachments', ''' + </div>'''), + "retweet" : False, + "quoted": True, + } + + def _user_from_html(self, html): + extr = text.extract_from(html, html.index('class="profile-tabs')) + banner = extr('class="profile-banner"><a href="', '"') + return { + "id" : banner.split("%2F")[4] if banner else None, + "profile_banner" : self.root + banner if banner else "", + "profile_image" : self.root + extr( + 'class="profile-card-avatar" href="', '"'), + "nick" : extr('title="', '"'), + "name" : extr('title="@', '"'), + "description" : extr('<p dir="auto">', '<'), + "date" : text.parse_datetime( + extr('class="profile-joindate"><span title="', '"'), + "%I:%M %p - %d %b %Y"), + "statuses_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "friends_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "followers_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "favourites_count": text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "verified" : 'title="Verified account"' in html, + } + + def _extract_quote(self, html): + html, _, quote = html.partition('class="quote') + if quote: + quote, _, tail = quote.partition('class="tweet-published') + return (html + tail, quote) + return (html, None) def _pagination(self, path): - base_url = url = self.root + path + quoted = self.config("quoted", False) + + if self.user_id: + self.user = self.request( + "{}/i/user/{}".format(self.root, self.user_id), + allow_redirects=False, + ).headers["location"].rpartition("/")[2] + base_url = url = "{}/{}{}".format(self.root, self.user, path) while True: - page = self.request(url).text + tweets_html = self.request(url).text.split( + '<div class="timeline-item') - yield from page.split('<div class="timeline-item')[1:] + if self.user_obj is None: + self.user_obj = self._user_from_html(tweets_html[0]) - more = text.extr(page, '<div class="show-more"><a href="?', '"') + for html, quote in map(self._extract_quote, tweets_html[1:]): + yield self._tweet_from_html(html) + if quoted and quote: + yield self._tweet_from_quote(quote) + + more = text.extr( + tweets_html[-1], '<div class="show-more"><a href="?', '"') if not more: return url = base_url + "?" + text.unescape(more) @@ -116,10 +238,12 @@ BASE_PATTERN = NitterExtractor.update({ }, }) +USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)" + class NitterTweetsExtractor(NitterExtractor): subcategory = "tweets" - pattern = BASE_PATTERN + r"/([^/?#]+)(?:/tweets)?(?:$|\?|#)" + pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)" test = ( ("https://nitter.net/supernaturepics", { "pattern": r"https://nitter\.net/pic/orig" @@ -127,6 +251,10 @@ class NitterTweetsExtractor(NitterExtractor): "range": "1-20", "count": 20, "keyword": { + "author": { + "name": "supernaturepics", + "nick": "Nature Pictures" + }, "comments": int, "content": str, "count": 1, @@ -136,25 +264,44 @@ class NitterTweetsExtractor(NitterExtractor): "retweets": int, "tweet_id": r"re:\d+", "user": { + "date": "dt:2015-01-12 10:25:00", + "description": "The very best nature pictures.", + "favourites_count": int, + "followers_count": int, + "friends_count": int, + "id": "2976459548", "name": "supernaturepics", - "nick": "Nature Pictures" + "nick": "Nature Pictures", + "profile_banner": "https://nitter.net/pic/https%3A%2F%2Fpb" + "s.twimg.com%2Fprofile_banners%2F2976459" + "548%2F1421058583%2F1500x500", + "profile_image": "https://nitter.net/pic/pbs.twimg.com%2Fp" + "rofile_images%2F554585280938659841%2FFLV" + "AlX18.jpeg", + "statuses_count": 1568, + "verified": False, }, }, }), + ("https://nitter.pussthecat.org/i/user/2976459548", { + "url": "c740a2683db2c8ed2f350afc0494475c4444025b", + "pattern": r"https://nitter.pussthecat\.org/pic/orig" + r"/media%2FCGMNYZvW0AIVoom\.jpg", + "range": "1", + }), ("https://nitter.lacontrevoie.fr/supernaturepics"), - ("https://nitter.pussthecat.org/supernaturepics"), ("https://nitter.1d4.us/supernaturepics"), - ("https://nitter.kavin.rocks/supernaturepics"), + ("https://nitter.kavin.rocks/id:2976459548"), ("https://nitter.unixfox.eu/supernaturepics"), ) def tweets(self): - return self._pagination("/" + self.user) + return self._pagination("") class NitterRepliesExtractor(NitterExtractor): subcategory = "replies" - pattern = BASE_PATTERN + r"/([^/?#]+)/with_replies" + pattern = USER_PATTERN + r"/with_replies" test = ( ("https://nitter.net/supernaturepics/with_replies", { "pattern": r"https://nitter\.net/pic/orig" @@ -164,37 +311,41 @@ class NitterRepliesExtractor(NitterExtractor): ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"), ("https://nitter.pussthecat.org/supernaturepics/with_replies"), ("https://nitter.1d4.us/supernaturepics/with_replies"), - ("https://nitter.kavin.rocks/supernaturepics/with_replies"), - ("https://nitter.unixfox.eu/supernaturepics/with_replies"), + ("https://nitter.kavin.rocks/id:2976459548/with_replies"), + ("https://nitter.unixfox.eu/i/user/2976459548/with_replies"), ) def tweets(self): - return self._pagination("/" + self.user + "/with_replies") + return self._pagination("/with_replies") class NitterMediaExtractor(NitterExtractor): subcategory = "media" - pattern = BASE_PATTERN + r"/([^/?#]+)/media" + pattern = USER_PATTERN + r"/media" test = ( ("https://nitter.net/supernaturepics/media", { "pattern": r"https://nitter\.net/pic/orig" r"/media%2F[\w-]+\.(jpg|png)$", "range": "1-20", }), + ("https://nitter.kavin.rocks/id:2976459548/media", { + "pattern": r"https://nitter\.kavin\.rocks/pic/orig" + r"/media%2F[\w-]+\.(jpg|png)$", + "range": "1-20", + }), ("https://nitter.lacontrevoie.fr/supernaturepics/media"), ("https://nitter.pussthecat.org/supernaturepics/media"), ("https://nitter.1d4.us/supernaturepics/media"), - ("https://nitter.kavin.rocks/supernaturepics/media"), - ("https://nitter.unixfox.eu/supernaturepics/media"), + ("https://nitter.unixfox.eu/i/user/2976459548/media"), ) def tweets(self): - return self._pagination("/" + self.user + "/media") + return self._pagination("/media") class NitterSearchExtractor(NitterExtractor): subcategory = "search" - pattern = BASE_PATTERN + r"/([^/?#]+)/search" + pattern = USER_PATTERN + r"/search" test = ( ("https://nitter.net/supernaturepics/search", { "pattern": r"https://nitter\.net/pic/orig" @@ -204,12 +355,12 @@ class NitterSearchExtractor(NitterExtractor): ("https://nitter.lacontrevoie.fr/supernaturepics/search"), ("https://nitter.pussthecat.org/supernaturepics/search"), ("https://nitter.1d4.us/supernaturepics/search"), - ("https://nitter.kavin.rocks/supernaturepics/search"), - ("https://nitter.unixfox.eu/supernaturepics/search"), + ("https://nitter.kavin.rocks/id:2976459548/search"), + ("https://nitter.unixfox.eu/i/user/2976459548/search"), ) def tweets(self): - return self._pagination("/" + self.user + "/search") + return self._pagination("/search") class NitterTweetExtractor(NitterExtractor): @@ -218,11 +369,30 @@ class NitterTweetExtractor(NitterExtractor): directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{num}" - pattern = BASE_PATTERN + r"/[^/?#]+/status/(\d+)" + pattern = BASE_PATTERN + r"/(i/web|[^/?#]+)/status/(\d+())" test = ( ("https://nitter.net/supernaturepics/status/604341487988576256", { "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a", "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", + "keyword": { + "comments": 16, + "content": "Big Wedeene River, Canada", + "count": 1, + "date": "dt:2015-05-29 17:40:00", + "extension": "jpg", + "filename": "CGMNYZvW0AIVoom", + "likes": int, + "num": 1, + "quotes": 10, + "retweets": int, + "tweet_id": "604341487988576256", + "url": "https://nitter.net/pic/orig" + "/media%2FCGMNYZvW0AIVoom.jpg", + "user": { + "name": "supernaturepics", + "nick": "Nature Pictures", + }, + }, }), # 4 images ("https://nitter.lacontrevoie.fr/i/status/894001459754180609", { @@ -234,6 +404,10 @@ class NitterTweetExtractor(NitterExtractor): r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F" r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F" r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5", + "keyword": { + "extension": "mp4", + "filename": "nv8hUQC1R0SjhzcZ", + }, }), # content with emoji, newlines, hashtags (#338) ("https://nitter.1d4.us/playpokemon/status/1263832915173048321", { @@ -249,8 +423,48 @@ class NitterTweetExtractor(NitterExtractor): "url": "e115bd1c86c660064e392b05269bbcafcd8c8b7a", "content": "f29501e44d88437fe460f5c927b7543fda0f6e34", }), + # Reply to deleted tweet (#403, #838) + ("https://nitter.unixfox.eu/i/web/status/1170041925560258560", { + "pattern": r"https://nitter\.unixfox\.eu/pic/orig" + r"/media%2FEDzS7VrU0AAFL4_\.jpg", + }), + # "quoted" option (#854) + ("https://nitter.net/StobiesGalaxy/status/1270755918330896395", { + "options": (("quoted", True),), + "pattern": r"https://nitter\.net/pic/orig/media%2FEa[KG].+\.jpg", + "count": 8, + }), + # quoted tweet (#526, #854) + ("https://nitter.1d4.us/StobiesGalaxy/status/1270755918330896395", { + "pattern": r"https://nitter\.1d4\.us/pic/orig" + r"/enc/bWVkaWEvRWFL\w+LmpwZw==", + "keyword": {"filename": r"re:EaK.{12}"}, + "count": 4, + }), + # deleted quote tweet (#2225) + ("https://nitter.lacontrevoie.fr/i/status/1460044411165888515", { + "count": 0, + }), + # "Misleading" content + ("https://nitter.pussthecat.org/i/status/1486373748911575046", { + "count": 4, + }), + # age-restricted (#2354) + ("https://nitter.unixfox.eu/mightbecurse/status/1492954264909479936", { + "keywords": {"date": "dt:2022-02-13 20:10:09"}, + "count": 1, + }), ) def tweets(self): url = "{}/i/status/{}".format(self.root, self.user) - return (self.request(url).text,) + html = text.extr(self.request(url).text, 'class="main-tweet', '''\ + </div> + </div></div></div>''') + html, quote = self._extract_quote(html) + tweet = self._tweet_from_html(html) + if quote and self.config("quoted", False): + quoted = self._tweet_from_quote(quote) + quoted["user"] = tweet["user"] + return (tweet, quoted) + return (tweet,) |
