diff options
Diffstat (limited to 'gallery_dl/extractor')
| -rw-r--r-- | gallery_dl/extractor/artstation.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/aryion.py | 66 | ||||
| -rw-r--r-- | gallery_dl/extractor/common.py | 21 | ||||
| -rw-r--r-- | gallery_dl/extractor/foolslide.py | 6 | ||||
| -rw-r--r-- | gallery_dl/extractor/imgbb.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/imgur.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/instagram.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/kissmanga.py | 16 | ||||
| -rw-r--r-- | gallery_dl/extractor/mastodon.py | 31 | ||||
| -rw-r--r-- | gallery_dl/extractor/naver.py | 1 | ||||
| -rw-r--r-- | gallery_dl/extractor/pinterest.py | 88 | ||||
| -rw-r--r-- | gallery_dl/extractor/slickpic.py | 3 | ||||
| -rw-r--r-- | gallery_dl/extractor/speakerdeck.py | 4 | ||||
| -rw-r--r-- | gallery_dl/extractor/tsumino.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/tumblr.py | 2 | ||||
| -rw-r--r-- | gallery_dl/extractor/twitter.py | 149 | ||||
| -rw-r--r-- | gallery_dl/extractor/webtoons.py | 1 | ||||
| -rw-r--r-- | gallery_dl/extractor/weibo.py | 96 |
18 files changed, 314 insertions, 186 deletions
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index c504dba..64a4bf4 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -66,6 +66,8 @@ class ArtstationExtractor(Extractor): data["title"] = text.unescape(data["title"]) data["description"] = text.unescape(text.remove_html( data["description"])) + data["date"] = text.parse_datetime( + data["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") assets = data["assets"] del data["assets"] diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py index 7575de9..04bb146 100644 --- a/gallery_dl/extractor/aryion.py +++ b/gallery_dl/extractor/aryion.py @@ -26,9 +26,24 @@ class AryionExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) - self.offset = 0 + self.recursive = True - def posts(self, url): + def items(self): + for post_id in self.posts(): + post = self._parse_post(post_id) + if post: + yield Message.Directory, post + yield Message.Url, post["url"], post + elif post is False and self.recursive: + base = self.root + "/g4/view/" + data = {"_extractor": AryionPostExtractor} + for post_id in self._pagination(base + post_id): + yield Message.Queue, base + post_id, data + + def posts(self): + """Yield relevant post IDs""" + + def _pagination(self, url): while True: page = self.request(url).text yield from text.extract_iter( @@ -39,11 +54,14 @@ class AryionExtractor(Extractor): return url = self.root + text.rextract(page, "href='", "'", pos)[0] - def parse_post(self, post_id): + def _parse_post(self, post_id): url = "{}/g4/data.php?id={}".format(self.root, post_id) with self.request(url, method="HEAD", fatal=False) as response: if response.status_code >= 400: + self.log.warning( + "Unable to fetch post %s ('%s %s')", + post_id, response.status_code, response.reason) return None headers = response.headers @@ -106,9 +124,11 @@ class AryionExtractor(Extractor): class AryionGalleryExtractor(AryionExtractor): """Extractor for a user's gallery on eka's portal""" subcategory = "gallery" + categorytransfer = True pattern = BASE_PATTERN + r"/(?:gallery/|user/|latest.php\?name=)([^/?&#]+)" test = ( ("https://aryion.com/g4/gallery/jameshoward", { + "options": (("recursive", False),), "pattern": r"https://aryion\.com/g4/data\.php\?id=\d+$", "range": "48-52", "count": 5, @@ -117,17 +137,24 @@ class AryionGalleryExtractor(AryionExtractor): ("https://aryion.com/g4/latest.php?name=jameshoward"), ) + def __init__(self, match): + AryionExtractor.__init__(self, match) + self.recursive = self.config("recursive", True) + self.offset = 0 + def skip(self, num): + if self.recursive: + num = 0 self.offset += num return num - def items(self): - url = "{}/g4/latest.php?name={}".format(self.root, self.user) - for post_id in util.advance(self.posts(url), self.offset): - post = self.parse_post(post_id) - if post: - yield Message.Directory, post - yield Message.Url, post["url"], post + def posts(self): + if self.recursive: + url = "{}/g4/gallery/{}".format(self.root, self.user) + return self._pagination(url) + else: + url = "{}/g4/latest.php?name={}".format(self.root, self.user) + return util.advance(self._pagination(url), self.offset) class AryionPostExtractor(AryionExtractor): @@ -164,19 +191,6 @@ class AryionPostExtractor(AryionExtractor): }), ) - def items(self): - post_id = self.user - self.user = None - post = self.parse_post(post_id) - - if post: - yield Message.Directory, post - yield Message.Url, post["url"], post - - elif post is False: - folder_url = "{}/g4/view/{}".format(self.root, post_id) - data = {"_extractor": AryionPostExtractor} - - for post_id in self.posts(folder_url): - url = "{}/g4/view/{}".format(self.root, post_id) - yield Message.Queue, url, data + def posts(self): + post_id, self.user = self.user, None + return (post_id,) diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index dd685df..bbbd8a6 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -328,14 +328,15 @@ class Extractor(): test = (test, None) yield test - def _dump_response(self, response): + def _dump_response(self, response, history=True): """Write the response content to a .dump file in the current directory. The file name is derived from the response url, replacing special characters with "_" """ - for resp in response.history: - self._dump_response(resp) + if history: + for resp in response.history: + self._dump_response(resp, False) if hasattr(Extractor, "_dump_index"): Extractor._dump_index += 1 @@ -350,7 +351,8 @@ class Extractor(): try: with open(fname + ".dump", 'wb') as fp: - util.dump_response(response, fp) + util.dump_response( + response, fp, headers=(self._write_pages == "all")) except Exception as e: self.log.warning("Failed to dump HTTP request (%s: %s)", e.__class__.__name__, e) @@ -490,10 +492,13 @@ class SharedConfigMixin(): """Enable sharing of config settings based on 'basecategory'""" basecategory = "" - def config(self, key, default=None, *, sentinel=util.SENTINEL): - value = Extractor.config(self, key, sentinel) - return value if value is not sentinel else config.interpolate( - ("extractor", self.basecategory, self.subcategory), key, default) + def config(self, key, default=None): + return config.interpolate_common( + ("extractor",), ( + (self.category, self.subcategory), + (self.basecategory, self.subcategory), + ), key, default, + ) def generate_extractors(extractor_data, symtable, classes): diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 86f63ae..731f54b 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -225,9 +225,9 @@ EXTRACTORS = { }), ), "test-manga": - ("https://sensescans.com/reader/series/hakkenden/", { - "url": "3e0559029c21ca5af8a2082dd6de1567fcec4d83", - "keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23", + ("https://sensescans.com/reader/series/yotsubato/", { + "url": "ee4dca7c421bf15ac039200f8c0bcb0858153640", + "keyword": "f94961bd731bd878bbd4d48555bc3ace1d937364", }), }, "worldthree": { diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 8d2c937..3882a92 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -124,8 +124,8 @@ class ImgbbAlbumExtractor(ImgbbExtractor): }), ("https://ibb.co/album/i5PggF?sort=title_asc", { "range": "1-80", - "url": "a2dfc58fe3348fa37e242082bd5a85eaa490d0a5", - "keyword": "5bb79c82411c3770d673fac64a0a98fa28111c3b", + "url": "afdf5fc95d8e09d77e8f44312f3e9b843987bb5a", + "keyword": "f090e14d0e5f7868595082b2c95da1309c84872d", }), # no user data (#471) ("https://ibb.co/album/kYKpwF", { diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 44fa5f2..20b698b 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -210,6 +210,7 @@ class ImgurAlbumExtractor(ImgurExtractor): album = self.api.album(self.key) album["date"] = text.parse_timestamp(album["datetime"]) images = album["images"] + count = len(images) try: del album["images"] @@ -218,11 +219,12 @@ class ImgurAlbumExtractor(ImgurExtractor): pass yield Message.Version, 1 - yield Message.Directory, {"album": album, "count": len(images)} for num, image in enumerate(images, 1): url = self._prepare(image) image["num"] = num + image["count"] = count image["album"] = album + yield Message.Directory, image yield Message.Url, url, image diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 3781711..bf6b10f 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -386,7 +386,7 @@ class InstagramImageExtractor(InstagramExtractor): # GraphVideo ("https://www.instagram.com/p/Bqxp0VSBgJg/", { - "pattern": r"/47129943_191645575115739_8539303288426725376_n\.mp4", + "pattern": r"/46840863_726311431074534_7805566102611403091_n\.mp4", "keyword": { "date": "dt:2018-11-29 19:23:58", "description": str, @@ -404,7 +404,7 @@ class InstagramImageExtractor(InstagramExtractor): # GraphVideo (IGTV) ("https://www.instagram.com/tv/BkQjCfsBIzi/", { - "pattern": r"/10000000_1760663964018792_716207142595461120_n\.mp4", + "pattern": r"/10000000_597132547321814_702169244961988209_n\.mp4", "keyword": { "date": "dt:2018-06-20 19:51:32", "description": str, diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py index ade245b..348453d 100644 --- a/gallery_dl/extractor/kissmanga.py +++ b/gallery_dl/extractor/kissmanga.py @@ -9,10 +9,9 @@ """Extract manga-chapters and entire manga from https://kissmanga.com/""" from .common import ChapterExtractor, MangaExtractor, Extractor -from .. import text, aes +from .. import text, aes, exception from ..cache import cache import hashlib -import time import ast import re @@ -25,7 +24,18 @@ class RedirectMixin(): response = Extractor.request(self, url, **kwargs) if not response.history or "/AreYouHuman" not in response.url: return response - time.sleep(2) + if self.config("captcha", "stop") == "wait": + self.log.warning( + "Redirect to \n%s\nVisit this URL in your browser, solve " + "the CAPTCHA, and press ENTER to continue", response.url) + try: + input() + except (EOFError, OSError): + pass + else: + raise exception.StopExtraction( + "Redirect to \n%s\nVisit this URL in your browser and " + "solve the CAPTCHA to continue", response.url) class KissmangaBase(RedirectMixin): diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 002c8f7..fa1fecc 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -27,22 +27,25 @@ class MastodonExtractor(Extractor): Extractor.__init__(self, match) self.api = MastodonAPI(self) - def config(self, key, default=None, *, sentinel=util.SENTINEL): - value = Extractor.config(self, key, sentinel) - return value if value is not sentinel else config.interpolate( - ("extractor", "mastodon", self.instance, self.subcategory), - key, default, + def config(self, key, default=None): + return config.interpolate_common( + ("extractor",), ( + (self.category, self.subcategory), + (self.basecategory, self.instance, self.subcategory), + ), key, default, ) def items(self): yield Message.Version, 1 for status in self.statuses(): - attachments = self.prepare(status) - yield Message.Directory, status - for media in attachments: - status["media"] = media - url = media["url"] - yield Message.Url, url, text.nameext_from_url(url, status) + attachments = status["media_attachments"] + if attachments: + self.prepare(status) + yield Message.Directory, status + for media in attachments: + status["media"] = media + url = media["url"] + yield Message.Url, url, text.nameext_from_url(url, status) def statuses(self): """Return an iterable containing all relevant Status-objects""" @@ -50,11 +53,11 @@ class MastodonExtractor(Extractor): def prepare(self, status): """Prepare a status object""" + del status["media_attachments"] status["instance"] = self.instance status["tags"] = [tag["name"] for tag in status["tags"]] - attachments = status["media_attachments"] - del status["media_attachments"] - return attachments + status["date"] = text.parse_datetime( + status["created_at"][:19], "%Y-%m-%dT%H:%M:%S") class MastodonUserExtractor(MastodonExtractor): diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index c980a38..413a58a 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -81,6 +81,7 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): class NaverBlogExtractor(NaverBase, Extractor): """Extractor for a user's blog on blog.naver.com""" subcategory = "blog" + categorytransfer = True pattern = (r"(?:https?://)?blog\.naver\.com/" r"(?:PostList.nhn\?(?:[^&#]+&)*blogId=([^&#]+)|(\w+)/?$)") test = ( diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 24a0a55..3bbe06a 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -1,15 +1,16 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2019 Mike Fährmann +# Copyright 2016-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.pinterest.com/""" +"""Extractors for https://www.pinterest.com/""" from .common import Extractor, Message from .. import text, exception +import itertools import json @@ -86,12 +87,17 @@ class PinterestBoardExtractor(PinterestExtractor): subcategory = "board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") archive_fmt = "{board[id]}_{id}" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)(?!.*#related$)" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/?$" test = ( ("https://www.pinterest.com/g1952849/test-/", { "pattern": r"https://i\.pinimg\.com/originals/", "count": 2, }), + # board with sections (#835) + ("https://www.pinterest.com/g1952849/stuff/", { + "options": (("sections", True),), + "count": 5, + }), ("https://www.pinterest.com/g1952848/test/", { "exception": exception.GalleryDLException, }), @@ -100,16 +106,51 @@ class PinterestBoardExtractor(PinterestExtractor): def __init__(self, match): PinterestExtractor.__init__(self, match) self.user = text.unquote(match.group(1)) - self.board = text.unquote(match.group(2)) - self.board_id = 0 + self.board_name = text.unquote(match.group(2)) + self.board = None def metadata(self): - board = self.api.board(self.user, self.board) - self.board_id = board["id"] - return {"board": board} + self.board = self.api.board(self.user, self.board_name) + return {"board": self.board} def pins(self): - return self.api.board_pins(self.board_id) + board = self.board + + if board["section_count"] and self.config("sections", True): + pins = [self.api.board_pins(board["id"])] + for section in self.api.board_sections(board["id"]): + pins.append(self.api.board_section_pins(section["id"])) + return itertools.chain.from_iterable(pins) + else: + return self.api.board_pins(board["id"]) + + +class PinterestSectionExtractor(PinterestExtractor): + """Extractor for board sections on pinterest.com""" + subcategory = "section" + directory_fmt = ("{category}", "{board[owner][username]}", + "{board[name]}", "{section[title]}") + archive_fmt = "{board[id]}_{id}" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/([^/?#&]+)" + test = ("https://www.pinterest.com/g1952849/stuff/section", { + "count": 2, + }) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.user = text.unquote(match.group(1)) + self.board_slug = text.unquote(match.group(2)) + self.section_slug = text.unquote(match.group(3)) + self.section = None + + def metadata(self): + section = self.section = self.api.board_section( + self.user, self.board_slug, self.section_slug) + section.pop("preview_pins", None) + return {"board": section.pop("board"), "section": section} + + def pins(self): + return self.api.board_section_pins(self.section["id"]) class PinterestRelatedPinExtractor(PinterestPinExtractor): @@ -136,7 +177,7 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): subcategory = "related-board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}", "related") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+).*#related$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/?#related$" test = ("https://www.pinterest.com/g1952849/test-/#related", { "range": "31-70", "count": 40, @@ -144,7 +185,7 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): }) def pins(self): - return self.api.board_related(self.board_id) + return self.api.board_related(self.board["id"]) class PinterestPinitExtractor(PinterestExtractor): @@ -188,9 +229,10 @@ class PinterestAPI(): "*/*, q=0.01", "Accept-Language" : "en-US,en;q=0.5", "X-Pinterest-AppState": "active", - "X-APP-VERSION" : "cb1c7f9", + "X-APP-VERSION" : "b00dd49", "X-Requested-With" : "XMLHttpRequest", - "Origin" : BASE_URL + "/", + "Origin" : BASE_URL, + "Referer" : BASE_URL + "/", } def __init__(self, extractor): @@ -206,9 +248,9 @@ class PinterestAPI(): options = {"pin": pin_id, "add_vase": True, "pins_only": True} return self._pagination("RelatedPinFeed", options) - def board(self, user, board): + def board(self, user, board_name): """Query information about a board""" - options = {"slug": board, "username": user, + options = {"slug": board_name, "username": user, "field_set_key": "detailed"} return self._call("Board", options)["resource_response"]["data"] @@ -217,6 +259,22 @@ class PinterestAPI(): options = {"board_id": board_id} return self._pagination("BoardFeed", options) + def board_section(self, user, board_slug, section_slug): + """Yield a specific board section""" + options = {"board_slug": board_slug, "section_slug": section_slug, + "username": user} + return self._call("BoardSection", options)["resource_response"]["data"] + + def board_sections(self, board_id): + """Yield all sections of a specific board""" + options = {"board_id": board_id} + return self._pagination("BoardSections", options) + + def board_section_pins(self, section_id): + """Yield all pins from a board section""" + options = {"section_id": section_id} + return self._pagination("BoardSectionPins", options) + def board_related(self, board_id): """Yield related pins of a specific board""" options = {"board_id": board_id, "add_vase": True} diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py index 1063716..05ec117 100644 --- a/gallery_dl/extractor/slickpic.py +++ b/gallery_dl/extractor/slickpic.py @@ -42,7 +42,8 @@ class SlickpicAlbumExtractor(SlickpicExtractor): ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", { "range": "34", "content": ("cec6630e659dc72db1ee1a9a6f3b525189261988", - "6f81e1e74c6cd6db36844e7211eef8e7cd30055d"), + "6f81e1e74c6cd6db36844e7211eef8e7cd30055d", + "22e83645fc242bc3584eca7ec982c8a53a4d8a44"), }), ) diff --git a/gallery_dl/extractor/speakerdeck.py b/gallery_dl/extractor/speakerdeck.py index 1a9691c..a3819c7 100644 --- a/gallery_dl/extractor/speakerdeck.py +++ b/gallery_dl/extractor/speakerdeck.py @@ -23,8 +23,10 @@ class SpeakerdeckPresentationExtractor(Extractor): r"/([^/?&#]+)/([^/?&#]+)") test = ( (("https://speakerdeck.com/speakerdeck/introduction-to-speakerdeck"), { - "url": "e97d4a7d5c64267e921c13eb7946d7074794a0d2", + "pattern": r"https://files.speakerdeck.com/presentations/" + r"50021f75cf1db900020005e7/slide_\d+.jpg", "content": "75c7abf0969b0bcab23e0da9712c95ee5113db3a", + "count": 6, }), ) diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index 31dbdad..5809463 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -57,7 +57,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): "collection": "", "artist" : ["Itou Life"], "group" : ["Itou Life"], - "parody" : ["Fate/Grand Order"], + "parody" : list, "characters": list, "tags" : list, "type" : "Doujinshi", diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 3e3a5a0..70fead8 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -194,7 +194,7 @@ class TumblrExtractor(Extractor): return not self.reblogs def _skip_reblog_same_blog(self, post): - return self.blog != post["reblogged_root_uuid"] + return self.blog != post.get("reblogged_root_uuid") class TumblrUserExtractor(TumblrExtractor): diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 7cabb8c..1e985e3 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -31,6 +31,7 @@ class TwitterExtractor(Extractor): self.retweets = self.config("retweets", True) self.replies = self.config("replies", True) self.twitpic = self.config("twitpic", False) + self.quoted = self.config("quoted", True) self.videos = self.config("videos", True) self._user_cache = {} @@ -41,8 +42,9 @@ class TwitterExtractor(Extractor): for tweet in self.tweets(): - if not self.retweets and "retweeted_status_id_str" in tweet or \ - not self.replies and "in_reply_to_user_id_str" in tweet: + if (not self.retweets and "retweeted_status_id_str" in tweet or + not self.replies and "in_reply_to_user_id_str" in tweet or + not self.quoted and "quoted" in tweet): continue if self.twitpic: @@ -60,7 +62,7 @@ class TwitterExtractor(Extractor): tdata["width"] = media["original_info"].get("width", 0) tdata["height"] = media["original_info"].get("height", 0) - if "video_info" in media and self.videos: + if "video_info" in media: if self.videos == "ytdl": url = "ytdl:{}/i/web/status/{}".format( @@ -68,7 +70,7 @@ class TwitterExtractor(Extractor): tdata["extension"] = None yield Message.Url, url, tdata - else: + elif self.videos: video_info = media["video_info"] variant = max( video_info["variants"], @@ -149,11 +151,10 @@ class TwitterExtractor(Extractor): if "in_reply_to_screen_name" in tweet: tdata["reply_to"] = tweet["in_reply_to_screen_name"] - if "full_text_quoted" in tweet: - tdata["content_quoted"] = tweet["full_text_quoted"] - if "author" in tweet: tdata["author"] = self._transform_user(tweet["author"]) + else: + tdata["author"] = tdata["user"] return tdata @@ -264,6 +265,27 @@ class TwitterMediaExtractor(TwitterExtractor): return TwitterAPI(self).timeline_media(self.user) +class TwitterLikesExtractor(TwitterExtractor): + """Extractor for liked tweets""" + subcategory = "likes" + pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/(?!search)([^/?&#]+)/likes(?!\w)") + test = ("https://twitter.com/supernaturepics/likes",) + + def tweets(self): + return TwitterAPI(self).timeline_favorites(self.user) + + +class TwitterBookmarkExtractor(TwitterExtractor): + """Extractor for bookmarked tweets""" + subcategory = "bookmark" + pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" + test = ("https://twitter.com/i/bookmarks",) + + def tweets(self): + return TwitterAPI(self).timeline_bookmark() + + class TwitterSearchExtractor(TwitterExtractor): """Extractor for all images from a search timeline""" subcategory = "search" @@ -279,7 +301,7 @@ class TwitterSearchExtractor(TwitterExtractor): return {"search": text.unquote(self.user)} def tweets(self): - return TwitterAPI(self).search(self.user) + return TwitterAPI(self).search(text.unquote(self.user)) class TwitterTweetExtractor(TwitterExtractor): @@ -298,7 +320,6 @@ class TwitterTweetExtractor(TwitterExtractor): }), # video ("https://twitter.com/perrypumas/status/1065692031626829824", { - "options": (("videos", True),), "pattern": r"https://video.twimg.com/ext_tw_video/.+\.mp4\?tag=5", }), # content with emoji, newlines, hashtags (#338) @@ -310,23 +331,25 @@ class TwitterTweetExtractor(TwitterExtractor): "It’s our \\(Mystery\\) Gift to you, Trainers! \n\n❓🎁➡️ " )}, }), - # Reply to another tweet (#403) - ("https://twitter.com/tyson_hesse/status/1103767554424598528", { - "options": (("videos", "ytdl"),), - "pattern": r"ytdl:https://twitter.com/i/web.+/1103767554424598528", + # Reply to deleted tweet (#403, #838) + ("https://twitter.com/i/web/status/1170041925560258560", { + "pattern": r"https://pbs.twimg.com/media/EDzS7VrU0AAFL4_.jpg:orig", }), # 'replies' option (#705) - ("https://twitter.com/tyson_hesse/status/1103767554424598528", { + ("https://twitter.com/i/web/status/1170041925560258560", { "options": (("replies", False),), "count": 0, }), - # /i/web/ URL - ("https://twitter.com/i/web/status/1155074198240292865", { - "pattern": r"https://pbs.twimg.com/media/EAel0vUUYAAZ4Bq.jpg:orig", + # quoted tweet (#526, #854) + ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", { + "pattern": r"https://pbs\.twimg\.com/media/Ea[KG].+\.jpg", + "count": 8, }), - # quoted tweet (#526) - ("https://twitter.com/Pistachio/status/1222690391817932803", { - "pattern": r"https://pbs\.twimg\.com/media/EPfMfDUU8AAnByO\.jpg", + # "quoted" option (#854) + ("https://twitter.com/StobiesGalaxy/status/1270755918330896395", { + "options": (("quoted", False),), + "pattern": r"https://pbs\.twimg\.com/media/EaK.+\.jpg", + "count": 4, }), # TwitPic embeds (#579) ("https://twitter.com/i/web/status/112900228289540096", { @@ -344,16 +367,6 @@ class TwitterTweetExtractor(TwitterExtractor): return TwitterAPI(self).tweet(self.tweet_id) -class TwitterBookmarkExtractor(TwitterExtractor): - """Extractor for bookmarked tweets""" - subcategory = "bookmark" - pattern = r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com/i/bookmarks()" - test = ("https://twitter.com/i/bookmarks",) - - def tweets(self): - return TwitterAPI(self).bookmarks() - - class TwitterAPI(): def __init__(self, extractor): @@ -409,16 +422,21 @@ class TwitterAPI(): self.headers["x-twitter-auth-type"] = "OAuth2Session" else: # guest token - guest_token = _guest_token(self.extractor, self.headers) + guest_token = self._guest_token() self.headers["x-guest-token"] = guest_token cookies.set("gt", guest_token, domain=".twitter.com") def tweet(self, tweet_id): endpoint = "2/timeline/conversation/{}.json".format(tweet_id) + tweets = [] for tweet in self._pagination(endpoint): if tweet["id_str"] == tweet_id: - return (tweet,) - return () + tweets.append(tweet) + if "quoted_status_id_str" in tweet: + tweet_id = tweet["quoted_status_id_str"] + else: + break + return tweets def timeline_profile(self, screen_name): user = self.user_by_screen_name(screen_name) @@ -430,17 +448,26 @@ class TwitterAPI(): endpoint = "2/timeline/media/{}.json".format(user["rest_id"]) return self._pagination(endpoint) + def timeline_favorites(self, screen_name): + user = self.user_by_screen_name(screen_name) + endpoint = "2/timeline/favorites/{}.json".format(user["rest_id"]) + return self._pagination(endpoint) + + def timeline_bookmark(self): + endpoint = "2/timeline/bookmark.json" + return self._pagination(endpoint) + def search(self, query): endpoint = "2/search/adaptive.json" params = self.params.copy() - params["q"] = text.unquote(query) + params["q"] = query + params["tweet_search_mode"] = "live" + params["query_source"] = "typed_query" + params["pc"] = "1" + params["spelling_corrections"] = "1" return self._pagination( endpoint, params, "sq-I-t-", "sq-cursor-bottom") - def bookmarks(self): - endpoint = "2/timeline/bookmark.json" - return self._pagination(endpoint) - def user_by_screen_name(self, screen_name): endpoint = "graphql/-xfUfZsnR_zqjFd-IfrN5A/UserByScreenName" params = { @@ -449,10 +476,16 @@ class TwitterAPI(): } return self._call(endpoint, params)["data"]["user"] - def _call(self, endpoint, params): + @cache(maxage=3600) + def _guest_token(self): + endpoint = "1.1/guest/activate.json" + return self._call(endpoint, None, "POST")["guest_token"] + + def _call(self, endpoint, params, method="GET"): url = "https://api.twitter.com/" + endpoint response = self.extractor.request( - url, params=params, headers=self.headers, fatal=None) + url, method=method, params=params, headers=self.headers, + fatal=None) if response.status_code < 400: return response.json() if response.status_code == 429: @@ -479,28 +512,30 @@ class TwitterAPI(): for entry in instr[0]["addEntries"]["entries"]: if entry["entryId"].startswith(entry_tweet): - tid = entry["content"]["item"]["content"]["tweet"]["id"] - if tid not in tweets: + try: + tweet = tweets[ + entry["content"]["item"]["content"]["tweet"]["id"]] + except KeyError: self.extractor.log.debug( - "Skipping unavailable Tweet %s", tid) + "Skipping unavailable Tweet %s", + entry["entryId"][6:]) continue - tweet = tweets[tid] tweet["user"] = users[tweet["user_id_str"]] - if "quoted_status_id_str" in tweet: - quoted = tweets.get(tweet["quoted_status_id_str"]) - if quoted: - tweet["full_text_quoted"] = quoted["full_text"] - if "extended_entities" in quoted: - tweet["extended_entities"] = \ - quoted["extended_entities"] - elif "retweeted_status_id_str" in tweet: + if "retweeted_status_id_str" in tweet: retweet = tweets.get(tweet["retweeted_status_id_str"]) if retweet: tweet["author"] = users[retweet["user_id_str"]] - yield tweet + if "quoted_status_id_str" in tweet: + quoted = tweets.get(tweet["quoted_status_id_str"]) + if quoted: + quoted["author"] = users[quoted["user_id_str"]] + quoted["user"] = tweet["user"] + quoted["quoted"] = True + yield quoted + elif entry["entryId"].startswith(entry_cursor): cursor = entry["content"]["operation"]["cursor"] if not cursor.get("stopOnEmptyResponse"): @@ -515,11 +550,3 @@ class TwitterAPI(): if not cursor or not tweet: return params["cursor"] = cursor - - -@cache(maxage=3600) -def _guest_token(extr, headers): - return extr.request( - "https://api.twitter.com/1.1/guest/activate.json", - method="POST", headers=headers, - ).json().get("guest_token") diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py index 3b992a2..d42730e 100644 --- a/gallery_dl/extractor/webtoons.py +++ b/gallery_dl/extractor/webtoons.py @@ -96,6 +96,7 @@ class WebtoonsEpisodeExtractor(WebtoonsExtractor): class WebtoonsComicExtractor(WebtoonsExtractor): """Extractor for an entire comic on webtoons.com""" subcategory = "comic" + categorytransfer = True pattern = (BASE_PATTERN + r"/([^/?&#]+)/([^/?&#]+))" r"/list(?:\?([^#]+))") test = ( diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index aa9bdae..d1ad388 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. import text, exception +import itertools import json @@ -30,53 +31,53 @@ class WeiboExtractor(Extractor): for status in self.statuses(): - yield Message.Directory, status - obj = status - num = 1 - - while True: - - if "pics" in obj: - for image in obj["pics"]: - pid = image["pid"] - if "large" in image: - image = image["large"] - geo = image.get("geo") or {} - data = text.nameext_from_url(image["url"], { - "num" : num, - "pid" : pid, - "url" : image["url"], - "width" : text.parse_int(geo.get("width")), - "height": text.parse_int(geo.get("height")), - "status": status, - }) - yield Message.Url, image["url"], data - num += 1 - - if self.videos and "media_info" in obj.get("page_info", ()): - info = obj["page_info"]["media_info"] - url = info.get("stream_url_hd") or info.get("stream_url") - - if url: - data = text.nameext_from_url(url, { - "num" : num, - "pid" : 0, - "url" : url, - "width" : 0, - "height": 0, - "status": status, - }) - if data["extension"] == "m3u8": - url = "ytdl:" + url - data["extension"] = "mp4" - data["_ytdl_extra"] = {"protocol": "m3u8_native"} - yield Message.Url, url, data - num += 1 - - if self.retweets and "retweeted_status" in obj: - obj = obj["retweeted_status"] - else: - break + files = self._files_from_status(status) + if self.retweets and "retweeted_status" in status: + files = itertools.chain( + files, + self._files_from_status(status["retweeted_status"]), + ) + + for num, file in enumerate(files, 1): + if num == 1: + status["date"] = text.parse_datetime( + status["created_at"], "%a %b %d %H:%M:%S %z %Y") + yield Message.Directory, status + file["status"] = status + file["num"] = num + yield Message.Url, file["url"], file + + def _files_from_status(self, status): + images = status.pop("pics", ()) + page_info = status.pop("page_info", ()) + + for image in images: + pid = image["pid"] + if "large" in image: + image = image["large"] + geo = image.get("geo") or {} + yield text.nameext_from_url(image["url"], { + "url" : image["url"], + "pid" : pid, + "width" : text.parse_int(geo.get("width")), + "height": text.parse_int(geo.get("height")), + }) + + if self.videos and "media_info" in page_info: + info = page_info["media_info"] + url = info.get("stream_url_hd") or info.get("stream_url") + if url: + data = text.nameext_from_url(url, { + "url" : url, + "pid" : 0, + "width" : 0, + "height": 0, + }) + if data["extension"] == "m3u8": + data["extension"] = "mp4" + data["url"] = "ytdl:" + url + data["_ytdl_extra"] = {"protocol": "m3u8_native"} + yield data def statuses(self): """Returns an iterable containing all relevant 'status' objects""" @@ -124,6 +125,7 @@ class WeiboStatusExtractor(WeiboExtractor): test = ( ("https://m.weibo.cn/detail/4323047042991618", { "pattern": r"https?://wx\d+.sinaimg.cn/large/\w+.jpg", + "keyword": {"status": {"date": "dt:2018-12-30 13:56:36"}}, }), ("https://m.weibo.cn/detail/4339748116375525", { "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_hd", |
