From 3338dfce719c999467ffe08fd45663be8190057a Mon Sep 17 00:00:00 2001 From: Unit 193 Date: Sun, 4 Dec 2022 23:25:06 -0500 Subject: New upstream version 1.24.1. --- gallery_dl/extractor/nitter.py | 314 ++++++++++++++++++++++++++++++++++------- 1 file changed, 264 insertions(+), 50 deletions(-) (limited to 'gallery_dl/extractor/nitter.py') diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index 1ba8253..dfe78ae 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -10,6 +10,7 @@ from .common import BaseExtractor, Message from .. import text +import binascii class NitterExtractor(BaseExtractor): @@ -20,51 +21,102 @@ class NitterExtractor(BaseExtractor): archive_fmt = "{tweet_id}_{num}" def __init__(self, match): + self.cookiedomain = self.root.partition("://")[2] BaseExtractor.__init__(self, match) - self.user = match.group(match.lastindex) + + lastindex = match.lastindex + self.user = match.group(lastindex) + self.user_id = match.group(lastindex + 1) + self.user_obj = None def items(self): - for tweet_html in self.tweets(): - tweet = self._tweet_from_html(tweet_html) - - attachments_html = tweet.pop("_attach", "") - if attachments_html: - attachments = list(text.extract_iter( - attachments_html, 'href="', '"')) - attachments.extend(text.extract_iter( - attachments_html, 'data-url="', '"')) + retweets = self.config("retweets", False) + videos = self.config("videos", True) + if videos: + ytdl = (videos == "ytdl") + videos = True + self._cookiejar.set("hlsPlayback", "on", domain=self.cookiedomain) + + for tweet in self.tweets(): + + if not retweets and tweet["retweet"]: + self.log.debug("Skipping %s (retweet)", tweet["tweet_id"]) + continue + + attachments = tweet.pop("_attach", "") + if attachments: + files = [] + append = files.append + + for url in text.extract_iter( + attachments, 'href="', '"'): + + if "/enc/" in url: + name = binascii.a2b_base64(url.rpartition( + "/")[2]).decode().rpartition("/")[2] + else: + name = url.rpartition("%2F")[2] + + if url[0] == "/": + url = self.root + url + file = { + "url": url, + "_http_retry_codes": (404,), + } + file["filename"], _, file["extension"] = \ + name.rpartition(".") + append(file) + + if videos and not files: + if ytdl: + append({ + "url": "ytdl:{}/i/status/{}".format( + self.root, tweet["tweet_id"]), + "extension": None, + }) + else: + for url in text.extract_iter( + attachments, 'data-url="', '"'): + + if "/enc/" in url: + name = binascii.a2b_base64(url.rpartition( + "/")[2]).decode().rpartition("/")[2] + else: + name = url.rpartition("%2F")[2] + + if url[0] == "/": + url = self.root + url + append({ + "url" : "ytdl:" + url, + "filename" : name.rpartition(".")[0], + "extension": "mp4", + }) else: - attachments = () - tweet["count"] = len(attachments) + files = () + tweet["count"] = len(files) yield Message.Directory, tweet - for tweet["num"], url in enumerate(attachments, 1): - if url[0] == "/": - url = self.root + url - if "/video/" in url: - url = "ytdl:" + url - tweet["filename"] = url.rpartition( - "%2F")[2].partition(".")[0] - tweet["extension"] = "mp4" - else: - text.nameext_from_url(url, tweet) - yield Message.Url, url, tweet + for tweet["num"], file in enumerate(files, 1): + url = file["url"] + file.update(tweet) + yield Message.Url, url, file def _tweet_from_html(self, html): extr = text.extract_from(html) - user = { + author = { "name": extr('class="fullname" href="/', '"'), "nick": extr('title="', '"'), } extr('")[2], - "_attach": extr('class="attachments', 'class="tweet-stats'), + "_attach" : extr('class="attachments', 'class="tweet-stats'), "comments": text.parse_int(extr( 'class="icon-comment', '').rpartition(">")[2]), "retweets": text.parse_int(extr( @@ -73,17 +125,87 @@ class NitterExtractor(BaseExtractor): 'class="icon-quote', '').rpartition(">")[2]), "likes" : text.parse_int(extr( 'class="icon-heart', '').rpartition(">")[2]), + "retweet" : 'class="retweet-header' in html, + "quoted": False, + } + + def _tweet_from_quote(self, html): + extr = text.extract_from(html) + author = { + "name": extr('class="fullname" href="/', '"'), + "nick": extr('title="', '"'), } + extr('")[2], + "_attach" : extr('class="attachments', ''' + '''), + "retweet" : False, + "quoted": True, + } + + def _user_from_html(self, html): + extr = text.extract_from(html, html.index('class="profile-tabs')) + banner = extr('class="profile-banner">', '<'), + "date" : text.parse_datetime( + extr('class="profile-joindate">', '<').replace(",", "")), + "friends_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "followers_count" : text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "favourites_count": text.parse_int(extr( + 'class="profile-stat-num">', '<').replace(",", "")), + "verified" : 'title="Verified account"' in html, + } + + def _extract_quote(self, html): + html, _, quote = html.partition('class="quote') + if quote: + quote, _, tail = quote.partition('class="tweet-published') + return (html + tail, quote) + return (html, None) def _pagination(self, path): - base_url = url = self.root + path + quoted = self.config("quoted", False) + + if self.user_id: + self.user = self.request( + "{}/i/user/{}".format(self.root, self.user_id), + allow_redirects=False, + ).headers["location"].rpartition("/")[2] + base_url = url = "{}/{}{}".format(self.root, self.user, path) while True: - page = self.request(url).text + tweets_html = self.request(url).text.split( + '