New upstream version 1.24.1. (tag: upstream/1.24.1)

author: Unit 193 <unit193@unit193.net> 2022-12-04 23:25:06 -0500
committer: Unit 193 <unit193@unit193.net> 2022-12-04 23:25:06 -0500
commit: 3338dfce719c999467ffe08fd45663be8190057a (patch)
tree: fd3235a1379c19508bbb47b8e8b95d5d9164b0d3 /gallery_dl/extractor/nitter.py
parent: 7af5cc29d1c02d20a6890b7b7ba78ab41532a763 (diff)
1 file changed, 264 insertions, 50 deletions
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index 1ba8253..dfe78ae 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -10,6 +10,7 @@
 
 from .common import BaseExtractor, Message
 from .. import text
+import binascii
 
 
 class NitterExtractor(BaseExtractor):
@@ -20,51 +21,102 @@ class NitterExtractor(BaseExtractor):
     archive_fmt = "{tweet_id}_{num}"
 
     def __init__(self, match):
+        self.cookiedomain = self.root.partition("://")[2]
         BaseExtractor.__init__(self, match)
-        self.user = match.group(match.lastindex)
+
+        lastindex = match.lastindex
+        self.user = match.group(lastindex)
+        self.user_id = match.group(lastindex + 1)
+        self.user_obj = None
 
     def items(self):
-        for tweet_html in self.tweets():
-            tweet = self._tweet_from_html(tweet_html)
-
-            attachments_html = tweet.pop("_attach", "")
-            if attachments_html:
-                attachments = list(text.extract_iter(
-                    attachments_html, 'href="', '"'))
-                attachments.extend(text.extract_iter(
-                    attachments_html, 'data-url="', '"'))
+        retweets = self.config("retweets", False)
+        videos = self.config("videos", True)
+        if videos:
+            ytdl = (videos == "ytdl")
+            videos = True
+            self._cookiejar.set("hlsPlayback", "on", domain=self.cookiedomain)
+
+        for tweet in self.tweets():
+
+            if not retweets and tweet["retweet"]:
+                self.log.debug("Skipping %s (retweet)", tweet["tweet_id"])
+                continue
+
+            attachments = tweet.pop("_attach", "")
+            if attachments:
+                files = []
+                append = files.append
+
+                for url in text.extract_iter(
+                        attachments, 'href="', '"'):
+
+                    if "/enc/" in url:
+                        name = binascii.a2b_base64(url.rpartition(
+                            "/")[2]).decode().rpartition("/")[2]
+                    else:
+                        name = url.rpartition("%2F")[2]
+
+                    if url[0] == "/":
+                        url = self.root + url
+                    file = {
+                        "url": url,
+                        "_http_retry_codes": (404,),
+                    }
+                    file["filename"], _, file["extension"] = \
+                        name.rpartition(".")
+                    append(file)
+
+                if videos and not files:
+                    if ytdl:
+                        append({
+                            "url": "ytdl:{}/i/status/{}".format(
+                                self.root, tweet["tweet_id"]),
+                            "extension": None,
+                        })
+                    else:
+                        for url in text.extract_iter(
+                                attachments, 'data-url="', '"'):
+
+                            if "/enc/" in url:
+                                name = binascii.a2b_base64(url.rpartition(
+                                    "/")[2]).decode().rpartition("/")[2]
+                            else:
+                                name = url.rpartition("%2F")[2]
+
+                            if url[0] == "/":
+                                url = self.root + url
+                            append({
+                                "url"      : "ytdl:" + url,
+                                "filename" : name.rpartition(".")[0],
+                                "extension": "mp4",
+                            })
             else:
-                attachments = ()
-            tweet["count"] = len(attachments)
+                files = ()
+            tweet["count"] = len(files)
 
             yield Message.Directory, tweet
-            for tweet["num"], url in enumerate(attachments, 1):
-                if url[0] == "/":
-                    url = self.root + url
-                if "/video/" in url:
-                    url = "ytdl:" + url
-                    tweet["filename"] = url.rpartition(
-                        "%2F")[2].partition(".")[0]
-                    tweet["extension"] = "mp4"
-                else:
-                    text.nameext_from_url(url, tweet)
-                yield Message.Url, url, tweet
+            for tweet["num"], file in enumerate(files, 1):
+                url = file["url"]
+                file.update(tweet)
+                yield Message.Url, url, file
 
     def _tweet_from_html(self, html):
         extr = text.extract_from(html)
-        user = {
+        author = {
             "name": extr('class="fullname" href="/', '"'),
             "nick": extr('title="', '"'),
         }
         extr('<span class="tweet-date', '')
         link = extr('href="', '"')
         return {
-            "user": user,
-            "date": text.parse_datetime(
+            "author"  : author,
+            "user"    : self.user_obj or author,
+            "date"    : text.parse_datetime(
                 extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
             "tweet_id": link.rpartition("/")[2].partition("#")[0],
             "content": extr('class="tweet-content', "</div").partition(">")[2],
-            "_attach": extr('class="attachments', 'class="tweet-stats'),
+            "_attach" : extr('class="attachments', 'class="tweet-stats'),
             "comments": text.parse_int(extr(
                 'class="icon-comment', '</div>').rpartition(">")[2]),
             "retweets": text.parse_int(extr(
@@ -73,17 +125,87 @@ class NitterExtractor(BaseExtractor):
                 'class="icon-quote', '</div>').rpartition(">")[2]),
             "likes"   : text.parse_int(extr(
                 'class="icon-heart', '</div>').rpartition(">")[2]),
+            "retweet" : 'class="retweet-header' in html,
+            "quoted": False,
+        }
+
+    def _tweet_from_quote(self, html):
+        extr = text.extract_from(html)
+        author = {
+            "name": extr('class="fullname" href="/', '"'),
+            "nick": extr('title="', '"'),
         }
+        extr('<span class="tweet-date', '')
+        link = extr('href="', '"')
+        return {
+            "author"  : author,
+            "user"    : self.user_obj or author,
+            "date"    : text.parse_datetime(
+                extr('title="', '"'), "%b %d, %Y · %I:%M %p %Z"),
+            "tweet_id": link.rpartition("/")[2].partition("#")[0],
+            "content": extr('class="quote-text', "</div").partition(">")[2],
+            "_attach" : extr('class="attachments', '''
+                </div>'''),
+            "retweet" : False,
+            "quoted": True,
+        }
+
+    def _user_from_html(self, html):
+        extr = text.extract_from(html, html.index('class="profile-tabs'))
+        banner = extr('class="profile-banner"><a href="', '"')
+        return {
+            "id"              : banner.split("%2F")[4] if banner else None,
+            "profile_banner"  : self.root + banner if banner else "",
+            "profile_image"   : self.root + extr(
+                'class="profile-card-avatar" href="', '"'),
+            "nick"            : extr('title="', '"'),
+            "name"            : extr('title="@', '"'),
+            "description"     : extr('<p dir="auto">', '<'),
+            "date"            : text.parse_datetime(
+                extr('class="profile-joindate"><span title="', '"'),
+                "%I:%M %p - %d %b %Y"),
+            "statuses_count"  : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "friends_count"   : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "followers_count" : text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "favourites_count": text.parse_int(extr(
+                'class="profile-stat-num">', '<').replace(",", "")),
+            "verified"        : 'title="Verified account"' in html,
+        }
+
+    def _extract_quote(self, html):
+        html, _, quote = html.partition('class="quote')
+        if quote:
+            quote, _, tail = quote.partition('class="tweet-published')
+            return (html + tail, quote)
+        return (html, None)
 
     def _pagination(self, path):
-        base_url = url = self.root + path
+        quoted = self.config("quoted", False)
+
+        if self.user_id:
+            self.user = self.request(
+                "{}/i/user/{}".format(self.root, self.user_id),
+                allow_redirects=False,
+            ).headers["location"].rpartition("/")[2]
+        base_url = url = "{}/{}{}".format(self.root, self.user, path)
 
         while True:
-            page = self.request(url).text
+            tweets_html = self.request(url).text.split(
+                '<div class="timeline-item')
 
-            yield from page.split('<div class="timeline-item')[1:]
+            if self.user_obj is None:
+                self.user_obj = self._user_from_html(tweets_html[0])
 
-            more = text.extr(page, '<div class="show-more"><a href="?', '"')
+            for html, quote in map(self._extract_quote, tweets_html[1:]):
+                yield self._tweet_from_html(html)
+                if quoted and quote:
+                    yield self._tweet_from_quote(quote)
+
+            more = text.extr(
+                tweets_html[-1], '<div class="show-more"><a href="?', '"')
             if not more:
                 return
             url = base_url + "?" + text.unescape(more)
@@ -116,10 +238,12 @@ BASE_PATTERN = NitterExtractor.update({
     },
 })
 
+USER_PATTERN = BASE_PATTERN + r"/(i(?:/user/|d:)(\d+)|[^/?#]+)"
+
 
 class NitterTweetsExtractor(NitterExtractor):
     subcategory = "tweets"
-    pattern = BASE_PATTERN + r"/([^/?#]+)(?:/tweets)?(?:$|\?|#)"
+    pattern = USER_PATTERN + r"(?:/tweets)?(?:$|\?|#)"
     test = (
         ("https://nitter.net/supernaturepics", {
             "pattern": r"https://nitter\.net/pic/orig"
@@ -127,6 +251,10 @@ class NitterTweetsExtractor(NitterExtractor):
             "range": "1-20",
             "count": 20,
             "keyword": {
+                "author": {
+                    "name": "supernaturepics",
+                    "nick": "Nature Pictures"
+                },
                 "comments": int,
                 "content": str,
                 "count": 1,
@@ -136,25 +264,44 @@ class NitterTweetsExtractor(NitterExtractor):
                 "retweets": int,
                 "tweet_id": r"re:\d+",
                 "user": {
+                    "date": "dt:2015-01-12 10:25:00",
+                    "description": "The very best nature pictures.",
+                    "favourites_count": int,
+                    "followers_count": int,
+                    "friends_count": int,
+                    "id": "2976459548",
                     "name": "supernaturepics",
-                    "nick": "Nature Pictures"
+                    "nick": "Nature Pictures",
+                    "profile_banner": "https://nitter.net/pic/https%3A%2F%2Fpb"
+                                      "s.twimg.com%2Fprofile_banners%2F2976459"
+                                      "548%2F1421058583%2F1500x500",
+                    "profile_image": "https://nitter.net/pic/pbs.twimg.com%2Fp"
+                                     "rofile_images%2F554585280938659841%2FFLV"
+                                     "AlX18.jpeg",
+                    "statuses_count": 1568,
+                    "verified": False,
                 },
             },
         }),
+        ("https://nitter.pussthecat.org/i/user/2976459548", {
+            "url": "c740a2683db2c8ed2f350afc0494475c4444025b",
+            "pattern": r"https://nitter.pussthecat\.org/pic/orig"
+                       r"/media%2FCGMNYZvW0AIVoom\.jpg",
+            "range": "1",
+        }),
         ("https://nitter.lacontrevoie.fr/supernaturepics"),
-        ("https://nitter.pussthecat.org/supernaturepics"),
         ("https://nitter.1d4.us/supernaturepics"),
-        ("https://nitter.kavin.rocks/supernaturepics"),
+        ("https://nitter.kavin.rocks/id:2976459548"),
         ("https://nitter.unixfox.eu/supernaturepics"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user)
+        return self._pagination("")
 
 
 class NitterRepliesExtractor(NitterExtractor):
     subcategory = "replies"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/with_replies"
+    pattern = USER_PATTERN + r"/with_replies"
     test = (
         ("https://nitter.net/supernaturepics/with_replies", {
             "pattern": r"https://nitter\.net/pic/orig"
@@ -164,37 +311,41 @@ class NitterRepliesExtractor(NitterExtractor):
         ("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"),
         ("https://nitter.pussthecat.org/supernaturepics/with_replies"),
         ("https://nitter.1d4.us/supernaturepics/with_replies"),
-        ("https://nitter.kavin.rocks/supernaturepics/with_replies"),
-        ("https://nitter.unixfox.eu/supernaturepics/with_replies"),
+        ("https://nitter.kavin.rocks/id:2976459548/with_replies"),
+        ("https://nitter.unixfox.eu/i/user/2976459548/with_replies"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user + "/with_replies")
+        return self._pagination("/with_replies")
 
 
 class NitterMediaExtractor(NitterExtractor):
     subcategory = "media"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/media"
+    pattern = USER_PATTERN + r"/media"
     test = (
         ("https://nitter.net/supernaturepics/media", {
             "pattern": r"https://nitter\.net/pic/orig"
                        r"/media%2F[\w-]+\.(jpg|png)$",
             "range": "1-20",
         }),
+        ("https://nitter.kavin.rocks/id:2976459548/media", {
+            "pattern": r"https://nitter\.kavin\.rocks/pic/orig"
+                       r"/media%2F[\w-]+\.(jpg|png)$",
+            "range": "1-20",
+        }),
         ("https://nitter.lacontrevoie.fr/supernaturepics/media"),
         ("https://nitter.pussthecat.org/supernaturepics/media"),
         ("https://nitter.1d4.us/supernaturepics/media"),
-        ("https://nitter.kavin.rocks/supernaturepics/media"),
-        ("https://nitter.unixfox.eu/supernaturepics/media"),
+        ("https://nitter.unixfox.eu/i/user/2976459548/media"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user + "/media")
+        return self._pagination("/media")
 
 
 class NitterSearchExtractor(NitterExtractor):
     subcategory = "search"
-    pattern = BASE_PATTERN + r"/([^/?#]+)/search"
+    pattern = USER_PATTERN + r"/search"
     test = (
         ("https://nitter.net/supernaturepics/search", {
             "pattern": r"https://nitter\.net/pic/orig"
@@ -204,12 +355,12 @@ class NitterSearchExtractor(NitterExtractor):
         ("https://nitter.lacontrevoie.fr/supernaturepics/search"),
         ("https://nitter.pussthecat.org/supernaturepics/search"),
         ("https://nitter.1d4.us/supernaturepics/search"),
-        ("https://nitter.kavin.rocks/supernaturepics/search"),
-        ("https://nitter.unixfox.eu/supernaturepics/search"),
+        ("https://nitter.kavin.rocks/id:2976459548/search"),
+        ("https://nitter.unixfox.eu/i/user/2976459548/search"),
     )
 
     def tweets(self):
-        return self._pagination("/" + self.user + "/search")
+        return self._pagination("/search")
 
 
 class NitterTweetExtractor(NitterExtractor):
@@ -218,11 +369,30 @@ class NitterTweetExtractor(NitterExtractor):
     directory_fmt = ("{category}", "{user[name]}")
     filename_fmt = "{tweet_id}_{num}.{extension}"
     archive_fmt = "{tweet_id}_{num}"
-    pattern = BASE_PATTERN + r"/[^/?#]+/status/(\d+)"
+    pattern = BASE_PATTERN + r"/(i/web|[^/?#]+)/status/(\d+())"
     test = (
         ("https://nitter.net/supernaturepics/status/604341487988576256", {
             "url": "3f2b64e175bf284aa672c3bb53ed275e470b919a",
             "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
+            "keyword": {
+                "comments": 16,
+                "content": "Big Wedeene River, Canada",
+                "count": 1,
+                "date": "dt:2015-05-29 17:40:00",
+                "extension": "jpg",
+                "filename": "CGMNYZvW0AIVoom",
+                "likes": int,
+                "num": 1,
+                "quotes": 10,
+                "retweets": int,
+                "tweet_id": "604341487988576256",
+                "url": "https://nitter.net/pic/orig"
+                       "/media%2FCGMNYZvW0AIVoom.jpg",
+                "user": {
+                    "name": "supernaturepics",
+                    "nick": "Nature Pictures",
+                },
+            },
         }),
         # 4 images
         ("https://nitter.lacontrevoie.fr/i/status/894001459754180609", {
@@ -234,6 +404,10 @@ class NitterTweetExtractor(NitterExtractor):
                        r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F"
                        r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F"
                        r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5",
+            "keyword": {
+                "extension": "mp4",
+                "filename": "nv8hUQC1R0SjhzcZ",
+            },
         }),
         # content with emoji, newlines, hashtags (#338)
         ("https://nitter.1d4.us/playpokemon/status/1263832915173048321", {
@@ -249,8 +423,48 @@ class NitterTweetExtractor(NitterExtractor):
             "url": "e115bd1c86c660064e392b05269bbcafcd8c8b7a",
             "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
         }),
+        # Reply to deleted tweet (#403, #838)
+        ("https://nitter.unixfox.eu/i/web/status/1170041925560258560", {
+            "pattern": r"https://nitter\.unixfox\.eu/pic/orig"
+                       r"/media%2FEDzS7VrU0AAFL4_\.jpg",
+        }),
+        # "quoted" option (#854)
+        ("https://nitter.net/StobiesGalaxy/status/1270755918330896395", {
+            "options": (("quoted", True),),
+            "pattern": r"https://nitter\.net/pic/orig/media%2FEa[KG].+\.jpg",
+            "count": 8,
+        }),
+        # quoted tweet (#526, #854)
+        ("https://nitter.1d4.us/StobiesGalaxy/status/1270755918330896395", {
+            "pattern": r"https://nitter\.1d4\.us/pic/orig"
+                       r"/enc/bWVkaWEvRWFL\w+LmpwZw==",
+            "keyword": {"filename": r"re:EaK.{12}"},
+            "count": 4,
+        }),
+        # deleted quote tweet (#2225)
+        ("https://nitter.lacontrevoie.fr/i/status/1460044411165888515", {
+            "count": 0,
+        }),
+        # "Misleading" content
+        ("https://nitter.pussthecat.org/i/status/1486373748911575046", {
+            "count": 4,
+        }),
+        # age-restricted (#2354)
+        ("https://nitter.unixfox.eu/mightbecurse/status/1492954264909479936", {
+            "keywords": {"date": "dt:2022-02-13 20:10:09"},
+            "count": 1,
+        }),
     )
 
     def tweets(self):
         url = "{}/i/status/{}".format(self.root, self.user)
-        return (self.request(url).text,)
+        html = text.extr(self.request(url).text, 'class="main-tweet', '''\
+                </div>
+              </div></div></div>''')
+        html, quote = self._extract_quote(html)
+        tweet = self._tweet_from_html(html)
+        if quote and self.config("quoted", False):
+            quoted = self._tweet_from_quote(quote)
+            quoted["user"] = tweet["user"]
+            return (tweet, quoted)
+        return (tweet,)
author	Unit 193 <unit193@unit193.net>	2022-12-04 23:25:06 -0500
committer	Unit 193 <unit193@unit193.net>	2022-12-04 23:25:06 -0500
commit	3338dfce719c999467ffe08fd45663be8190057a (patch)
tree	fd3235a1379c19508bbb47b8e8b95d5d9164b0d3 /gallery_dl/extractor/nitter.py
parent	7af5cc29d1c02d20a6890b7b7ba78ab41532a763 (diff)