| field | value |
|---|---|
| author | 2024-03-25 02:57:44 -0400 |
| committer | 2024-03-25 02:57:44 -0400 |
| commit | 6e662211019a89caec44de8a57c675872b0b5498 (patch) |
| tree | 5d9d5a2b7efc3a24dd6074e99b253b639fe5af1d /gallery_dl/extractor |
| parent | 01166fa52707cc282467427cf0e65c1b8983c4be (diff) |

New upstream version 1.26.9 (upstream/1.26.9)
Diffstat (limited to 'gallery_dl/extractor')
37 files changed, 745 insertions, 317 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index a665249..591e6a8 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -194,7 +194,6 @@ modules = [
     "directlink",
     "recursive",
     "oauth",
-    "test",
     "ytdl",
     "generic",
 ]

diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index b58b3d3..49fde7b 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -29,11 +29,13 @@ class ArtstationExtractor(Extractor):
         self.user = match.group(1) or match.group(2)

     def items(self):
-        data = self.metadata()
-
-        projects = self.projects()
+        videos = self.config("videos", True)
+        previews = self.config("previews", False)
         external = self.config("external", False)
         max_posts = self.config("max-posts")
+
+        data = self.metadata()
+        projects = self.projects()
         if max_posts:
             projects = itertools.islice(projects, max_posts)
@@ -45,13 +47,29 @@ class ArtstationExtractor(Extractor):
                 asset["num"] = num
                 yield Message.Directory, asset

-                if adict["has_embedded_player"] and external:
+                if adict["has_embedded_player"]:
                     player = adict["player_embedded"]
                     url = (text.extr(player, 'src="', '"') or
                            text.extr(player, "src='", "'"))
-                    if url and not url.startswith(self.root):
-                        asset["extension"] = None
-                        yield Message.Url, "ytdl:" + url, asset
+                    if url.startswith(self.root):
+                        # video clip hosted on artstation
+                        if videos:
+                            page = self.request(url).text
+                            url = text.extr(page, ' src="', '"')
+                            text.nameext_from_url(url, asset)
+                            yield Message.Url, url, asset
+                    elif url:
+                        # external URL
+                        if external:
+                            asset["extension"] = "mp4"
+                            yield Message.Url, "ytdl:" + url, asset
+                    else:
+                        self.log.debug(player)
+                        self.log.warning(
+                            "Failed to extract embedded player URL (%s)",
+                            adict.get("id"))
+
+                    if not previews:
                         continue

                 if adict["has_image"]:
@@ -59,10 +77,11 @@ class ArtstationExtractor(Extractor):
                     text.nameext_from_url(url, asset)
                     url = self._no_cache(url)

-                    lhs, _, rhs = url.partition("/large/")
-                    if rhs:
-                        url = lhs + "/4k/" + rhs
-                        asset["_fallback"] = self._image_fallback(lhs, rhs)
+                    if "/video_clips/" not in url:
+                        lhs, _, rhs = url.partition("/large/")
+                        if rhs:
+                            url = lhs + "/4k/" + rhs
+                            asset["_fallback"] = self._image_fallback(lhs, rhs)

                     yield Message.Url, url, asset
@@ -175,7 +194,7 @@ class ArtstationUserExtractor(ArtstationExtractor):
     subcategory = "user"
     pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
                r"/(?!artwork|projects|search)([^/?#]+)(?:/albums/all)?"
-               r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$")
+               r"|((?!www)[\w-]+)\.artstation\.com(?:/projects)?)/?$")
     example = "https://www.artstation.com/USER"

     def projects(self):
@@ -192,7 +211,7 @@ class ArtstationAlbumExtractor(ArtstationExtractor):
     archive_fmt = "a_{album[id]}_{asset[id]}"
     pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com"
                r"/(?!artwork|projects|search)([^/?#]+)"
-               r"|((?!www)\w+)\.artstation\.com)/albums/(\d+)")
+               r"|((?!www)[\w-]+)\.artstation\.com)/albums/(\d+)")
     example = "https://www.artstation.com/USER/albums/12345"

     def __init__(self, match):
@@ -226,7 +245,7 @@ class ArtstationLikesExtractor(ArtstationExtractor):
     directory_fmt = ("{category}", "{userinfo[username]}", "Likes")
     archive_fmt = "f_{userinfo[id]}_{asset[id]}"
     pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
-               r"/(?!artwork|projects|search)([^/?#]+)/likes/?")
+               r"/(?!artwork|projects|search)([^/?#]+)/likes")
     example = "https://www.artstation.com/USER/likes"

     def projects(self):
@@ -234,6 +253,54 @@ class ArtstationLikesExtractor(ArtstationExtractor):
         return self._pagination(url)


+class ArtstationCollectionExtractor(ArtstationExtractor):
+    """Extractor for an artstation collection"""
+    subcategory = "collection"
+    directory_fmt = ("{category}", "{user}",
+                     "{collection[id]} {collection[name]}")
+    archive_fmt = "c_{collection[id]}_{asset[id]}"
+    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
+               r"/(?!artwork|projects|search)([^/?#]+)/collections/(\d+)")
+    example = "https://www.artstation.com/USER/collections/12345"
+
+    def __init__(self, match):
+        ArtstationExtractor.__init__(self, match)
+        self.collection_id = match.group(2)
+
+    def metadata(self):
+        url = "{}/collections/{}.json".format(
+            self.root, self.collection_id)
+        params = {"username": self.user}
+        collection = self.request(
+            url, params=params, notfound="collection").json()
+        return {"collection": collection, "user": self.user}
+
+    def projects(self):
+        url = "{}/collections/{}/projects.json".format(
+            self.root, self.collection_id)
+        params = {"collection_id": self.collection_id}
+        return self._pagination(url, params)
+
+
+class ArtstationCollectionsExtractor(ArtstationExtractor):
+    """Extractor for an artstation user's collections"""
+    subcategory = "collections"
+    pattern = (r"(?:https?://)?(?:www\.)?artstation\.com"
+               r"/(?!artwork|projects|search)([^/?#]+)/collections/?$")
+    example = "https://www.artstation.com/USER/collections"
+
+    def items(self):
+        url = self.root + "/collections.json"
+        params = {"username": self.user}
+
+        for collection in self.request(
+                url, params=params, notfound="collections").json():
+            url = "{}/{}/collections/{}".format(
+                self.root, self.user, collection["id"])
+            collection["_extractor"] = ArtstationCollectionExtractor
+            yield Message.Queue, url, collection
+
+
 class ArtstationChallengeExtractor(ArtstationExtractor):
     """Extractor for submissions of artstation challenges"""
     subcategory = "challenge"
@@ -355,7 +422,7 @@ class ArtstationImageExtractor(ArtstationExtractor):
     """Extractor for images from a single artstation project"""
     subcategory = "image"
     pattern = (r"(?:https?://)?(?:"
-               r"(?:\w+\.)?artstation\.com/(?:artwork|projects|search)"
+               r"(?:[\w-]+\.)?artstation\.com/(?:artwork|projects|search)"
               r"|artstn\.co/p)/(\w+)")
     example = "https://www.artstation.com/artwork/abcde"
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index 8de0d7b..84c3187 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -39,12 +39,19 @@ class BlueskyExtractor(Extractor):
         self._metadata_facets = ("facets" in meta)

         self.api = BlueskyAPI(self)
-        self._user = None
+        self._user = self._user_did = None
+        self.instance = self.root.partition("://")[2]

     def items(self):
         for post in self.posts():
             if "post" in post:
                 post = post["post"]
+
+            pid = post["uri"].rpartition("/")[2]
+            if self._user_did and post["author"]["did"] != self._user_did:
+                self.log.debug("Skipping %s (repost)", pid)
+                continue
+
             post.update(post["record"])
             del post["record"]
@@ -75,7 +82,8 @@ class BlueskyExtractor(Extractor):
             if self._metadata_user:
                 post["user"] = self._user or post["author"]

-            post["post_id"] = post["uri"].rpartition("/")[2]
+            post["instance"] = self.instance
+            post["post_id"] = pid
             post["count"] = len(images)
             post["date"] = text.parse_datetime(
                 post["createdAt"][:19], "%Y-%m-%dT%H:%M:%S")
@@ -101,10 +109,14 @@ class BlueskyExtractor(Extractor):
                 post["width"] = post["height"] = 0

             image = file["image"]
-            post["filename"] = link = image["ref"]["$link"]
+            try:
+                cid = image["ref"]["$link"]
+            except KeyError:
+                cid = image["cid"]
+            post["filename"] = cid
             post["extension"] = image["mimeType"].rpartition("/")[2]

-            yield Message.Url, base + link, post
+            yield Message.Url, base + cid, post

     def posts(self):
         return ()
@@ -230,6 +242,7 @@ class BlueskyFollowingExtractor(BlueskyExtractor):
     def items(self):
         for user in self.api.get_follows(self.user):
             url = "https://bsky.app/profile/" + user["did"]
+            user["_extractor"] = BlueskyUserExtractor
             yield Message.Queue, url, user
@@ -314,7 +327,7 @@ class BlueskyAPI():
         endpoint = "app.bsky.feed.getFeed"
         params = {
             "feed" : "at://{}/app.bsky.feed.generator/{}".format(
-                self._did_from_actor(actor), feed),
+                self._did_from_actor(actor, False), feed),
             "limit": "100",
         }
         return self._pagination(endpoint, params)
@@ -331,7 +344,7 @@ class BlueskyAPI():
         endpoint = "app.bsky.feed.getListFeed"
         params = {
             "list" : "at://{}/app.bsky.graph.list/{}".format(
-                self._did_from_actor(actor), list),
+                self._did_from_actor(actor, False), list),
             "limit": "100",
         }
         return self._pagination(endpoint, params)
@@ -378,14 +391,17 @@ class BlueskyAPI():
         }
         return self._pagination(endpoint, params, "posts")

-    def _did_from_actor(self, actor):
+    def _did_from_actor(self, actor, user_did=True):
         if actor.startswith("did:"):
             did = actor
         else:
             did = self.resolve_handle(actor)

-        if self.extractor._metadata_user:
-            self.extractor._user = self.get_profile(did)
+        extr = self.extractor
+        if user_did and not extr.config("reposts", False):
+            extr._user_did = did
+        if extr._metadata_user:
+            extr._user = self.get_profile(did)

         return did
@@ -434,13 +450,20 @@ class BlueskyAPI():
             if response.status_code < 400:
                 return response.json()
             if response.status_code == 429:
-                self.extractor.wait(seconds=60)
+                until = response.headers.get("RateLimit-Reset")
+                self.extractor.wait(until=until)
                 continue

+            try:
+                data = response.json()
+                msg = "API request failed ('{}: {}')".format(
+                    data["error"], data["message"])
+            except Exception:
+                msg = "API request failed ({} {})".format(
+                    response.status_code, response.reason)
+
             self.extractor.log.debug("Server response: %s", response.text)
-            raise exception.StopExtraction(
-                "API request failed (%s %s)",
-                response.status_code, response.reason)
+            raise exception.StopExtraction(msg)

     def _pagination(self, endpoint, params, key="feed"):
         while True:
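The 429 handling above now honors the server-provided RateLimit-Reset header instead of a fixed 60-second wait. A minimal sketch of the same idea with the standard library, assuming the header carries a Unix timestamp (which is what gallery-dl's wait(until=...) expects):

```python
import time

# Wait until the epoch given in RateLimit-Reset; fall back to a fixed
# delay if the header is missing. The fallback value is an assumption.
def wait_for_rate_limit(response, fallback=60):
    until = response.headers.get("RateLimit-Reset")
    delay = float(until) - time.time() if until else fallback
    time.sleep(max(delay, 0))
```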
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 1a0e47d..a093347 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -54,7 +54,6 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
             "album_id"   : self.album_id,
             "album_name" : text.unescape(info[0]),
             "album_size" : size[1:-1],
-            "description": text.unescape(info[2]) if len(info) > 2 else "",
             "count"      : len(urls),
         }

diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index cf0f8c9..d14e13a 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -203,9 +203,15 @@ class Extractor():
                 self.log.debug("%s (%s/%s)", msg, tries, retries+1)
                 if tries > retries:
                     break
-                self.sleep(
-                    max(tries, self._interval()) if self._interval else tries,
-                    "retry")
+
+                if self._interval:
+                    seconds = self._interval()
+                    if seconds < tries:
+                        seconds = tries
+                else:
+                    seconds = tries
+
+                self.sleep(seconds, "retry")
                 tries += 1

         raise exception.HttpError(msg, response)
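The common.py rewrite above is behavior-preserving: the retry delay is still the larger of the attempt counter and the configured interval, only spelled out as explicit branches. A sketch of the equivalence, assuming _interval() returns the configured delay in seconds:

```python
# Equivalent to both the old one-liner and the new branches: never sleep
# less than the current attempt number.
def retry_delay(tries, interval=None):
    seconds = interval() if interval else 0
    return max(seconds, tries)
```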
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 0cf4f88..ca8acaa 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -18,12 +18,12 @@ import binascii
 import time
 import re

-
 BASE_PATTERN = (
     r"(?:https?://)?(?:"
     r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|"
     r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)"
 )
+DEFAULT_AVATAR = "https://a.deviantart.net/avatars/default.gif"


 class DeviantartExtractor(Extractor):
@@ -47,8 +47,9 @@ class DeviantartExtractor(Extractor):
         self.extra = self.config("extra", False)
         self.quality = self.config("quality", "100")
         self.original = self.config("original", True)
-        self.comments = self.config("comments", False)
         self.intermediary = self.config("intermediary", True)
+        self.comments_avatars = self.config("comments-avatars", False)
+        self.comments = self.comments_avatars or self.config("comments", False)

         self.api = DeviantartOAuthAPI(self)
         self.group = False
@@ -83,6 +84,16 @@ class DeviantartExtractor(Extractor):
         else:
             self.commit_journal = None

+    def request(self, url, **kwargs):
+        if "fatal" not in kwargs:
+            kwargs["fatal"] = False
+        while True:
+            response = Extractor.request(self, url, **kwargs)
+            if response.status_code != 403 or \
+                    b"Request blocked." not in response.content:
+                return response
+            self.wait(seconds=300, reason="CloudFront block")
+
     def skip(self, num):
         self.offset += num
         return num
@@ -100,9 +111,9 @@ class DeviantartExtractor(Extractor):
         if self.user:
             group = self.config("group", True)
             if group:
-                profile = self.api.user_profile(self.user)
-                if profile:
-                    self.user = profile["user"]["username"]
+                user = _user_details(self, self.user)
+                if user:
+                    self.user = user["username"]
                     self.group = False
                 elif group == "skip":
                     self.log.info("Skipping group '%s'", self.user)
@@ -172,6 +183,20 @@ class DeviantartExtractor(Extractor):
                     deviation["is_original"] = True
                 yield self.commit_journal(deviation, journal)

+            if self.comments_avatars:
+                for comment in deviation["comments"]:
+                    user = comment["user"]
+                    name = user["username"].lower()
+                    if user["usericon"] == DEFAULT_AVATAR:
+                        self.log.debug(
+                            "Skipping avatar of '%s' (default)", name)
+                        continue
+                    _user_details.update(name, user)
+
+                    url = "{}/{}/avatar/".format(self.root, name)
+                    comment["_extractor"] = DeviantartAvatarExtractor
+                    yield Message.Queue, url, comment
+
             if not self.extra:
                 continue
@@ -198,7 +223,9 @@ class DeviantartExtractor(Extractor):
         """Adjust the contents of a Deviation-object"""
         if "index" not in deviation:
             try:
-                if deviation["url"].startswith("https://sta.sh"):
+                if deviation["url"].startswith((
+                    "https://www.deviantart.com/stash/", "https://sta.sh",
+                )):
                     filename = deviation["content"]["src"].split("/")[5]
                     deviation["index_base36"] = filename.partition("-")[0][1:]
                     deviation["index"] = id_from_base36(
@@ -445,18 +472,12 @@ class DeviantartExtractor(Extractor):

     def _limited_request(self, url, **kwargs):
         """Limits HTTP requests to one every 2 seconds"""
-        kwargs["fatal"] = None
         diff = time.time() - DeviantartExtractor._last_request
         if diff < 2.0:
             self.sleep(2.0 - diff, "request")
-
-        while True:
-            response = self.request(url, **kwargs)
-            if response.status_code != 403 or \
-                    b"Request blocked." not in response.content:
-                DeviantartExtractor._last_request = time.time()
-                return response
-            self.wait(seconds=180)
+        response = self.request(url, **kwargs)
+        DeviantartExtractor._last_request = time.time()
+        return response

     def _fetch_premium(self, deviation):
         try:
@@ -569,13 +590,18 @@ class DeviantartAvatarExtractor(DeviantartExtractor):

     def deviations(self):
         name = self.user.lower()
-        profile = self.api.user_profile(name)
-        if not profile:
+        user = _user_details(self, name)
+        if not user:
             return ()

-        user = profile["user"]
         icon = user["usericon"]
-        index = icon.rpartition("?")[2]
+        if icon == DEFAULT_AVATAR:
+            self.log.debug("Skipping avatar of '%s' (default)", name)
+            return ()
+
+        _, sep, index = icon.rpartition("?")
+        if not sep:
+            index = "0"

         formats = self.config("formats")
         if not formats:
@@ -658,7 +684,8 @@ class DeviantartStashExtractor(DeviantartExtractor):
     """Extractor for sta.sh-ed deviations"""
     subcategory = "stash"
     archive_fmt = "{index}.{extension}"
-    pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)"
+    pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)"
+               r"/([a-z0-9]+)")
     example = "https://sta.sh/abcde"

     skip = Extractor.skip
@@ -679,7 +706,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
         if uuid:
             deviation = self.api.deviation(uuid)
             deviation["index"] = text.parse_int(text.extr(
-                page, 'gmi-deviationid="', '"'))
+                page, '\\"deviationId\\":', ','))
             yield deviation
             return
@@ -1086,9 +1113,8 @@ class DeviantartOAuthAPI():
         if not isinstance(self.mature, str):
             self.mature = "true" if self.mature else "false"

-        self.folders = extractor.config("folders", False)
-        self.metadata = extractor.extra or extractor.config("metadata", False)
         self.strategy = extractor.config("pagination")
+        self.folders = extractor.config("folders", False)
         self.public = extractor.config("public", True)

         client_id = extractor.config("client-id")
@@ -1106,6 +1132,42 @@ class DeviantartOAuthAPI():
             token = None
         self.refresh_token_key = token

+        metadata = extractor.config("metadata", False)
+        if not metadata:
+            metadata = bool(extractor.extra)
+        if metadata:
+            self.metadata = True
+
+            if isinstance(metadata, str):
+                if metadata == "all":
+                    metadata = ("submission", "camera", "stats",
+                                "collection", "gallery")
+                else:
+                    metadata = metadata.replace(" ", "").split(",")
+            elif not isinstance(metadata, (list, tuple)):
+                metadata = ()
+
+            self._metadata_params = {"mature_content": self.mature}
+            self._metadata_public = None
+            if metadata:
+                # extended metadata
+                self.limit = 10
+                for param in metadata:
+                    self._metadata_params["ext_" + param] = "1"
+                if "ext_collection" in self._metadata_params or \
+                        "ext_gallery" in self._metadata_params:
+                    if token:
+                        self._metadata_public = False
+                    else:
+                        self.log.error("'collection' and 'gallery' metadata "
+                                       "require a refresh token")
+            else:
+                # base metadata
+                self.limit = 50
+        else:
+            self.metadata = False
+            self.limit = None
+
         self.log.debug(
             "Using %s API credentials (client-id %s)",
             "default" if self.client_id == self.CLIENT_ID else "custom",
@@ -1115,14 +1177,14 @@ class DeviantartOAuthAPI():
     def browse_deviantsyouwatch(self, offset=0):
         """Yield deviations from users you watch"""
         endpoint = "/browse/deviantsyouwatch"
-        params = {"limit": "50", "offset": offset,
+        params = {"limit": 50, "offset": offset,
                   "mature_content": self.mature}
         return self._pagination(endpoint, params, public=False)

     def browse_posts_deviantsyouwatch(self, offset=0):
         """Yield posts from users you watch"""
         endpoint = "/browse/posts/deviantsyouwatch"
-        params = {"limit": "50", "offset": offset,
+        params = {"limit": 50, "offset": offset,
                   "mature_content": self.mature}
         return self._pagination(endpoint, params, public=False, unpack=True)
@@ -1131,7 +1193,7 @@ class DeviantartOAuthAPI():
         endpoint = "/browse/newest"
         params = {
             "q"             : query,
-            "limit"         : 50 if self.metadata else 120,
+            "limit"         : 120,
             "offset"        : offset,
             "mature_content": self.mature,
         }
@@ -1142,7 +1204,7 @@ class DeviantartOAuthAPI():
         endpoint = "/browse/popular"
         params = {
             "q"             : query,
-            "limit"         : 50 if self.metadata else 120,
+            "limit"         : 120,
             "timerange"     : timerange,
             "offset"        : offset,
             "mature_content": self.mature,
         }
@@ -1249,8 +1311,11 @@ class DeviantartOAuthAPI():
             "deviationids[{}]={}".format(num, deviation["deviationid"])
             for num, deviation in enumerate(deviations)
         )
-        params = {"mature_content": self.mature}
-        return self._call(endpoint, params=params)["metadata"]
+        return self._call(
+            endpoint,
+            params=self._metadata_params,
+            public=self._metadata_public,
+        )["metadata"]

     def gallery(self, username, folder_id, offset=0, extend=True,
                 public=None):
         """Yield all Deviation-objects contained in a gallery folder"""
@@ -1357,9 +1422,14 @@ class DeviantartOAuthAPI():
             self.authenticate(None if public else self.refresh_token_key)
             kwargs["headers"] = self.headers
             response = self.extractor.request(url, **kwargs)
-            data = response.json()
-            status = response.status_code

+            try:
+                data = response.json()
+            except ValueError:
+                self.log.error("Unable to parse API response")
+                data = {}
+
+            status = response.status_code
             if 200 <= status < 400:
                 if self.delay > self.delay_min:
                     self.delay -= 1
@@ -1412,6 +1482,9 @@ class DeviantartOAuthAPI():
         if public is None:
             public = self.public

+        if self.limit and params["limit"] > self.limit:
+            params["limit"] = (params["limit"] // self.limit) * self.limit
+
         while True:
             data = self._call(endpoint, params=params, public=public)
             try:
@@ -1483,6 +1556,15 @@ class DeviantartOAuthAPI():

     def _metadata(self, deviations):
         """Add extended metadata to each deviation object"""
+        if len(deviations) <= self.limit:
+            self._metadata_batch(deviations)
+        else:
+            n = self.limit
+            for index in range(0, len(deviations), n):
+                self._metadata_batch(deviations[index:index+n])
+
+    def _metadata_batch(self, deviations):
+        """Fetch extended metadata for a single batch of deviations"""
         for deviation, metadata in zip(
                 deviations, self.deviation_metadata(deviations)):
             deviation.update(metadata)
@@ -1667,6 +1749,14 @@ class DeviantartEclipseAPI():
         return token


+@memcache(keyarg=1)
+def _user_details(extr, name):
+    try:
+        return extr.api.user_profile(name)["user"]
+    except Exception:
+        return None
+
+
@cache(maxage=36500*86400, keyarg=0)
def _refresh_token_cache(token):
    if token and token[0] == "#":
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index f7dc3cc..c94a110 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -77,6 +77,8 @@ class FlickrImageExtractor(FlickrExtractor):
         photo = self.api.photos_getInfo(self.item_id)
         if self.api.exif:
             photo.update(self.api.photos_getExif(self.item_id))
+        if self.api.contexts:
+            photo.update(self.api.photos_getAllContexts(self.item_id))

         if photo["media"] == "video" and self.api.videos:
             self.api._extract_video(photo)
@@ -268,6 +270,8 @@ class FlickrAPI(oauth.OAuth1API):

         self.exif = extractor.config("exif", False)
         self.videos = extractor.config("videos", True)
+        self.contexts = extractor.config("contexts", False)
+
         self.maxsize = extractor.config("size-max")
         if isinstance(self.maxsize, str):
             for fmt, fmtname, fmtwidth in self.FORMATS:
@@ -311,6 +315,13 @@ class FlickrAPI(oauth.OAuth1API):
         params = {"user_id": user_id}
         return self._pagination("people.getPhotos", params)

+    def photos_getAllContexts(self, photo_id):
+        """Returns all visible sets and pools the photo belongs to."""
+        params = {"photo_id": photo_id}
+        data = self._call("photos.getAllContexts", params)
+        del data["stat"]
+        return data
+
     def photos_getExif(self, photo_id):
         """Retrieves a list of EXIF/TIFF/GPS tags for a given photo."""
         params = {"photo_id": photo_id}
@@ -444,6 +455,8 @@ class FlickrAPI(oauth.OAuth1API):

         if self.exif:
             photo.update(self.photos_getExif(photo["id"]))
+        if self.contexts:
+            photo.update(self.photos_getAllContexts(photo["id"]))
         photo["id"] = text.parse_int(photo["id"])

         if "owner" in photo:
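A rough sketch of what enabling the new "contexts" option does per photo: one extra photos.getAllContexts call, merged into the photo dict with the "stat" field dropped. The api object and call shape mirror the diff; the response keys ("set", "pool") are assumptions based on Flickr's API:

```python
# Merge set/pool membership into a photo dict, as the new option does.
def add_contexts(api, photo):
    data = api._call("photos.getAllContexts", {"photo_id": photo["id"]})
    del data["stat"]
    photo.update(data)   # typically adds "set" and/or "pool" lists
```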
pnum params["limit"] = self.per_page while True: - favs = self._api_request(params, "favorite", True) + favs = self._api_request(params, "favorite") favs.reverse() if skip: @@ -195,7 +245,9 @@ class GelbooruFavoriteExtractor(GelbooruBase, skip = 0 for fav in favs: - yield from self._api_request({"id": fav["favorite"]}) + for post in self._api_request({"id": fav["favorite"]}): + post["date_favorited"] = text.parse_timestamp(fav["added"]) + yield post params["pid"] -= 1 if params["pid"] < 0: diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 289f91c..f0eb4e9 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -41,9 +41,13 @@ class GofileFolderExtractor(Extractor): folder = self._get_content(self.content_id, password) yield Message.Directory, folder + try: + contents = folder.pop("children") + except KeyError: + raise exception.AuthorizationError("Password required") + num = 0 - contents = folder.pop("contents") - for content_id in folder["childs"]: + for content_id in folder["childrenIds"]: content = contents[content_id] content["folder"] = folder @@ -67,31 +71,32 @@ class GofileFolderExtractor(Extractor): @memcache() def _create_account(self): self.log.debug("Creating temporary account") - return self._api_request("createAccount")["token"] + return self._api_request("accounts", method="POST")["token"] @cache(maxage=86400) def _get_website_token(self): self.log.debug("Fetching website token") page = self.request(self.root + "/dist/js/alljs.js").text - return text.extr(page, 'fetchData.wt = "', '"') + return text.extr(page, 'wt: "', '"') def _get_content(self, content_id, password=None): + headers = {"Authorization": "Bearer " + self.api_token} + params = {"wt": self.website_token} if password is not None: - password = hashlib.sha256(password.encode()).hexdigest() - return self._api_request("getContent", { - "contentId" : content_id, - "token" : self.api_token, - "wt" : self.website_token, - "password" : password, - }) - - def _api_request(self, endpoint, params=None): + params["password"] = hashlib.sha256(password.encode()).hexdigest() + return self._api_request("contents/" + content_id, params, headers) + + def _api_request(self, endpoint, params=None, headers=None, method="GET"): response = self.request( - "https://api.gofile.io/" + endpoint, params=params).json() + "https://api.gofile.io/" + endpoint, + method=method, params=params, headers=headers, + ).json() if response["status"] != "ok": if response["status"] == "error-notFound": raise exception.NotFoundError("content") + if response["status"] == "error-passwordRequired": + raise exception.AuthorizationError("Password required") raise exception.StopExtraction( "%s failed (Status: %s)", endpoint, response["status"]) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 20491b5..aadce6c 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -25,7 +25,7 @@ class HiperdexBase(): @memcache(keyarg=1) def manga_data(self, manga, page=None): if not page: - url = "{}/manga/{}/".format(self.root, manga) + url = "{}/mangas/{}/".format(self.root, manga) page = self.request(url).text extr = text.extract_from(page) @@ -33,7 +33,7 @@ class HiperdexBase(): "url" : text.unescape(extr( 'property="og:url" content="', '"')), "manga" : text.unescape(extr( - '"headline": "', '"')), + ' property="name" title="', '"')), "score" : text.parse_float(extr( 'id="averagerate">', '<')), "author" : text.remove_html(extr( @@ -68,8 +68,8 @@ 
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index 289f91c..f0eb4e9 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -41,9 +41,13 @@ class GofileFolderExtractor(Extractor):
         folder = self._get_content(self.content_id, password)
         yield Message.Directory, folder

+        try:
+            contents = folder.pop("children")
+        except KeyError:
+            raise exception.AuthorizationError("Password required")
+
         num = 0
-        contents = folder.pop("contents")
-        for content_id in folder["childs"]:
+        for content_id in folder["childrenIds"]:
             content = contents[content_id]
             content["folder"] = folder
@@ -67,31 +71,32 @@ class GofileFolderExtractor(Extractor):
     @memcache()
     def _create_account(self):
         self.log.debug("Creating temporary account")
-        return self._api_request("createAccount")["token"]
+        return self._api_request("accounts", method="POST")["token"]

     @cache(maxage=86400)
     def _get_website_token(self):
         self.log.debug("Fetching website token")
         page = self.request(self.root + "/dist/js/alljs.js").text
-        return text.extr(page, 'fetchData.wt = "', '"')
+        return text.extr(page, 'wt: "', '"')

     def _get_content(self, content_id, password=None):
+        headers = {"Authorization": "Bearer " + self.api_token}
+        params = {"wt": self.website_token}
         if password is not None:
-            password = hashlib.sha256(password.encode()).hexdigest()
-        return self._api_request("getContent", {
-            "contentId" : content_id,
-            "token"     : self.api_token,
-            "wt"        : self.website_token,
-            "password"  : password,
-        })
-
-    def _api_request(self, endpoint, params=None):
+            params["password"] = hashlib.sha256(password.encode()).hexdigest()
+        return self._api_request("contents/" + content_id, params, headers)
+
+    def _api_request(self, endpoint, params=None, headers=None, method="GET"):
         response = self.request(
-            "https://api.gofile.io/" + endpoint, params=params).json()
+            "https://api.gofile.io/" + endpoint,
+            method=method, params=params, headers=headers,
+        ).json()

         if response["status"] != "ok":
             if response["status"] == "error-notFound":
                 raise exception.NotFoundError("content")
+            if response["status"] == "error-passwordRequired":
+                raise exception.AuthorizationError("Password required")
             raise exception.StopExtraction(
                 "%s failed (Status: %s)", endpoint, response["status"])

diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index 20491b5..aadce6c 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -25,7 +25,7 @@ class HiperdexBase():
     @memcache(keyarg=1)
     def manga_data(self, manga, page=None):
         if not page:
-            url = "{}/manga/{}/".format(self.root, manga)
+            url = "{}/mangas/{}/".format(self.root, manga)
             page = self.request(url).text
         extr = text.extract_from(page)
@@ -33,7 +33,7 @@ class HiperdexBase():
             "url"   : text.unescape(extr(
                 'property="og:url" content="', '"')),
             "manga" : text.unescape(extr(
-                '"headline": "', '"')),
+                ' property="name" title="', '"')),
             "score" : text.parse_float(extr(
                 'id="averagerate">', '<')),
             "author": text.remove_html(extr(
@@ -68,8 +68,8 @@ class HiperdexBase():

 class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
     """Extractor for manga chapters from hiperdex.com"""
-    pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))"
-    example = "https://hiperdex.com/manga/MANGA/CHAPTER/"
+    pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))"
+    example = "https://hiperdex.com/mangas/MANGA/CHAPTER/"

     def __init__(self, match):
         root, path, self.manga, self.chapter = match.groups()
@@ -90,8 +90,8 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
 class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
     """Extractor for manga from hiperdex.com"""
     chapterclass = HiperdexChapterExtractor
-    pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$"
-    example = "https://hiperdex.com/manga/MANGA/"
+    pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$"
+    example = "https://hiperdex.com/mangas/MANGA/"

     def __init__(self, match):
         root, path, self.manga = match.groups()

diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index c249a3e..dfd9a31 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -101,9 +101,8 @@ class IdolcomplexExtractor(SankakuExtractor):
         page = self.request(url, retries=10).text
         extr = text.extract_from(page)

-        pid_alnum = extr('/posts/', '"')
-        vavg = extr('itemprop="ratingValue">', "<")
-        vcnt = extr('itemprop="reviewCount">', "<")
+        vavg = extr('id="rating"', "</ul>")
+        vcnt = extr('>Votes</strong>:', "<")
         pid = extr(">Post ID:", "<")
         created = extr(' title="', '"')
@@ -120,10 +119,10 @@ class IdolcomplexExtractor(SankakuExtractor):
         rating = extr(">Rating:", "<br")

         data = {
-            "id"          : text.parse_int(pid),
-            "id_alnum"    : pid_alnum,
+            "id"          : pid.strip(),
             "md5"         : file_url.rpartition("/")[2].partition(".")[0],
-            "vote_average": text.parse_float(vavg),
+            "vote_average": (1.0 * vavg.count('class="star-full"') +
+                             0.5 * vavg.count('class="star-half"')),
             "vote_count"  : text.parse_int(vcnt),
             "created_at"  : created,
             "date"        : text.parse_datetime(
@@ -222,8 +221,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
     subcategory = "pool"
     directory_fmt = ("{category}", "pool", "{pool}")
     archive_fmt = "p_{pool}_{id}"
-    pattern = BASE_PATTERN + r"/pools?/show/(\d+)"
-    example = "https://idol.sankakucomplex.com/pools/show/12345"
+    pattern = BASE_PATTERN + r"/pools?/(?:show/)?(\w+)"
+    example = "https://idol.sankakucomplex.com/pools/0123456789abcdef"
     per_page = 24

     def __init__(self, match):
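The idolcomplex change above rebuilds vote_average from rendered star icons after the site dropped its itemprop microdata. A worked sketch over an invented HTML fragment (the class names come from the diff, the markup around them is assumed):

```python
# Two full stars and one half star -> 2.5
vavg = ('<li class="star-full"></li><li class="star-full"></li>'
        '<li class="star-half"></li>')
vote_average = (1.0 * vavg.count('class="star-full"') +
                0.5 * vavg.count('class="star-half"'))
assert vote_average == 2.5
```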
"{}/profile/{}/galleries?folderid=-1".format( self.root, self.user) else: + folder_name = None url = "{}/organizer/{}/".format(self.root, folder_id) params = {"page": 0} + extr = text.extract_from(self.request(url, params=params).text) + if not folder_name: + folder_name = extr("class'blk_galleries'><b>", "</b>") + while True: - extr = text.extract_from(self.request(url, params=params).text) cnt = 0 while True: - gid = extr('<a href="/gallery/', '"') + gid = extr(' id="gid-', '"') if not gid: break - yield gid, extr("<b>", "<") + yield gid, extr("<b>", "<"), folder_name cnt += 1 if cnt < 20: break params["page"] += 1 + extr = text.extract_from(self.request(url, params=params).text) class ImagefapUserExtractor(ImagefapExtractor): diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 8884d3e..86b1edd 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -39,10 +39,15 @@ class ImgurExtractor(Extractor): image["url"] = url = "https://i.imgur.com/{}.{}".format( image["id"], image["ext"]) image["date"] = text.parse_datetime(image["created_at"]) + image["_http_validate"] = self._validate text.nameext_from_url(url, image) return url + def _validate(self, response): + return (not response.history or + not response.url.endswith("/removed.png")) + def _items_queue(self, items): album_ex = ImgurAlbumExtractor image_ex = ImgurImageExtractor diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 6eae7db..9c2b1de 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -165,7 +165,7 @@ class InstagramExtractor(Extractor): data = { "post_id" : post["pk"], "post_shortcode": post["code"], - "likes": post["like_count"], + "likes": post.get("like_count", 0), "pinned": post.get("timeline_pinned_user_ids", ()), "date": text.parse_timestamp(post.get("taken_at")), } @@ -689,7 +689,10 @@ class InstagramRestAPI(): def reels_media(self, reel_ids): endpoint = "/v1/feed/reels_media/" params = {"reel_ids": reel_ids} - return self._call(endpoint, params=params)["reels_media"] + try: + return self._call(endpoint, params=params)["reels_media"] + except KeyError: + raise exception.AuthorizationError("Login required") def tags_media(self, tag): for section in self.tags_sections(tag): @@ -733,7 +736,7 @@ class InstagramRestAPI(): not user["followed_by_viewer"]: name = user["username"] s = "" if name.endswith("s") else "s" - raise exception.StopExtraction("%s'%s posts are private", name, s) + self.extractor.log.warning("%s'%s posts are private", name, s) self.extractor._assign_user(user) return user["id"] diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index fd5a73a..9c77b7a 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -15,7 +15,7 @@ import itertools import json import re -BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" +BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(su|party)" USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})" @@ -41,9 +41,12 @@ class KemonopartyExtractor(Extractor): self.revisions = self.config("revisions") if self.revisions: self.revisions_unique = (self.revisions == "unique") + order = self.config("order-revisions") + self.revisions_reverse = order[0] in ("r", "a") if order else False + self._prepare_ddosguard_cookies() self._find_inline = re.compile( - 
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' + r'src="(?:https?://(?:kemono|coomer)\.(?:su|party))?(/inline/[^"]+' r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall self._json_dumps = json.JSONEncoder( ensure_ascii=False, check_circular=False, @@ -232,6 +235,7 @@ class KemonopartyExtractor(Extractor): except exception.HttpError: post["revision_hash"] = self._revision_hash(post) post["revision_index"] = 1 + post["revision_count"] = 1 return (post,) revs.insert(0, post) @@ -247,22 +251,30 @@ class KemonopartyExtractor(Extractor): uniq.append(rev) revs = uniq - idx = len(revs) + cnt = idx = len(revs) for rev in revs: rev["revision_index"] = idx + rev["revision_count"] = cnt idx -= 1 + if self.revisions_reverse: + revs.reverse() + return revs def _revisions_all(self, url): revs = self.request(url + "/revisions").json() - idx = len(revs) + cnt = idx = len(revs) for rev in revs: rev["revision_hash"] = self._revision_hash(rev) rev["revision_index"] = idx + rev["revision_count"] = cnt idx -= 1 + if self.revisions_reverse: + revs.reverse() + return revs def _revision_hash(self, revision): diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index d4ccf33..12e8860 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -104,7 +104,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): filename_fmt = "{category}_{id}{title:?_//}.{extension}" directory_fmt = ("{category}",) archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/i/(\w+)" + pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)" example = "https://lensdump.com/i/ID" def __init__(self, match): diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 68b4196..030d7d1 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -70,7 +70,11 @@ class MastodonExtractor(BaseExtractor): def _check_moved(self, account): self._check_moved = None - if "moved" in account: + # Certain fediverse software (such as Iceshrimp and Sharkey) have a + # null account "moved" field instead of not having it outright. + # To handle this, check if the "moved" value is truthy instead + # if only it exists. 
+ if account.get("moved"): self.log.warning("Account '%s' moved to '%s'", account["acct"], account["moved"]["acct"]) diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 55faf9e..d3150e6 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -26,7 +26,8 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): "{post[date]:%Y-%m-%d} {post[title]}") archive_fmt = "{blog[id]}_{post[num]}_{num}" pattern = (r"(?:https?://)?blog\.naver\.com/" - r"(?:PostView\.nhn\?blogId=(\w+)&logNo=(\d+)|(\w+)/(\d+)/?$)") + r"(?:PostView\.n(?:aver|hn)\?blogId=(\w+)&logNo=(\d+)|" + r"(\w+)/(\d+)/?$)") example = "https://blog.naver.com/BLOGID/12345" def __init__(self, match): @@ -46,8 +47,10 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): extr = text.extract_from(page) data = { "post": { - "title" : extr('"og:title" content="', '"'), - "description": extr('"og:description" content="', '"'), + "title" : text.unescape(extr( + '"og:title" content="', '"')), + "description": text.unescape(extr( + '"og:description" content="', '"')).replace(" ", " "), "num" : text.parse_int(self.post_id), }, "blog": { @@ -62,10 +65,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): return data def images(self, page): - return [ - (url.replace("://post", "://blog", 1).partition("?")[0], None) - for url in text.extract_iter(page, 'data-lazy-src="', '"') - ] + results = [] + for url in text.extract_iter(page, 'data-lazy-src="', '"'): + url = url.replace("://post", "://blog", 1).partition("?")[0] + if "\ufffd" in text.unquote(url): + url = text.unquote(url, encoding="EUC-KR") + results.append((url, None)) + return results class NaverBlogExtractor(NaverBase, Extractor): @@ -73,7 +79,8 @@ class NaverBlogExtractor(NaverBase, Extractor): subcategory = "blog" categorytransfer = True pattern = (r"(?:https?://)?blog\.naver\.com/" - r"(?:PostList.nhn\?(?:[^&#]+&)*blogId=([^&#]+)|(\w+)/?$)") + r"(?:PostList\.n(?:aver|hn)\?(?:[^&#]+&)*blogId=([^&#]+)|" + r"(\w+)/?$)") example = "https://blog.naver.com/BLOGID" def __init__(self, match): @@ -81,12 +88,11 @@ class NaverBlogExtractor(NaverBase, Extractor): self.blog_id = match.group(1) or match.group(2) def items(self): - # fetch first post number url = "{}/PostList.nhn?blogId={}".format(self.root, self.blog_id) - post_num = text.extract( + post_num = text.extr( self.request(url).text, 'gnFirstLogNo = "', '"', - )[0] + ) # setup params for API calls url = "{}/PostViewBottomTitleListAsync.nhn".format(self.root) diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 9614513..c50c013 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -19,7 +19,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): directory_fmt = ("{category}", "{user_id}") filename_fmt = "{image_id}_p{num}.{extension}" archive_fmt = "{image_id}_{num}" - request_interval = (1.0, 2.0) + request_interval = (2.0, 4.0) def __init__(self, match): BaseExtractor.__init__(self, match) diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index d36f509..2bce597 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -219,7 +219,10 @@ class NitterExtractor(BaseExtractor): self.user_obj = self._user_from_html(tweets_html[0]) for html, quote in map(self._extract_quote, tweets_html[1:]): - yield self._tweet_from_html(html) + tweet = self._tweet_from_html(html) + if not tweet["date"]: + continue + yield tweet if quoted and quote: yield self._tweet_from_quote(quote) 
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 5226724..b21e1eb 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -26,13 +26,13 @@ class PahealExtractor(Extractor):
         data = self.get_metadata()

         for post in self.get_posts():
-            url = post["file_url"]
-            for key in ("id", "width", "height"):
-                post[key] = text.parse_int(post[key])
+            post["id"] = text.parse_int(post["id"])
             post["tags"] = text.unquote(post["tags"])
+            post["width"] = text.parse_int(post["width"])
+            post["height"] = text.parse_int(post["height"])
             post.update(data)
             yield Message.Directory, post
-            yield Message.Url, url, post
+            yield Message.Url, post["file_url"], post

     def get_metadata(self):
         """Return general metadata"""
@@ -114,17 +114,19 @@ class PahealTagExtractor(PahealExtractor):
         tags, data, date = data.split("\n")
         dimensions, size, ext = data.split(" // ")

-        tags = text.unescape(tags)
         width, _, height = dimensions.partition("x")
         height, _, duration = height.partition(", ")

         return {
-            "id": pid, "md5": md5, "file_url": url,
-            "width": width, "height": height,
-            "duration": text.parse_float(duration[:-1]),
-            "tags": tags,
-            "size": text.parse_bytes(size[:-1]),
-            "date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
+            "id"       : pid,
+            "md5"      : md5,
+            "file_url" : url,
+            "width"    : width,
+            "height"   : height,
+            "duration" : text.parse_float(duration[:-1]),
+            "tags"     : text.unescape(tags),
+            "size"     : text.parse_bytes(size[:-1]),
+            "date"     : text.parse_datetime(date, "%B %d, %Y; %H:%M"),
             "filename" : "{} - {}".format(pid, tags),
             "extension": ext,
         }

diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index b9821f2..862a7db 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -650,7 +650,7 @@ class PixivNovelExtractor(PixivExtractor):
             yield Message.Directory, novel

             try:
-                content = self.api.novel_text(novel["id"])["novel_text"]
+                content = self.api.novel_webview(novel["id"])["text"]
             except Exception:
                 self.log.warning("Unable to download novel %s", novel["id"])
                 continue
@@ -663,7 +663,7 @@ class PixivNovelExtractor(PixivExtractor):
                 illusts = {}

                 for marker in text.extract_iter(content, "[", "]"):
-                    if marker.startswith("[jumpuri:If you would like to "):
+                    if marker.startswith("uploadedimage:"):
                         desktop = True
                     elif marker.startswith("pixivimage:"):
                         illusts[marker[11:].partition("-")[0]] = None
@@ -918,6 +918,15 @@ class PixivAppAPI():
         params = {"novel_id": novel_id}
         return self._call("/v1/novel/text", params)

+    def novel_webview(self, novel_id):
+        params = {"id": novel_id, "viewer_version": "20221031_ai"}
+        return self._call(
+            "/webview/v2/novel", params, self._novel_webview_parse)
+
+    def _novel_webview_parse(self, response):
+        return util.json_loads(text.extr(
+            response.text, "novel: ", ",\n"))
+
     def search_illust(self, word, sort=None, target=None, duration=None,
                       date_start=None, date_end=None):
         params = {"word": word, "search_target": target,
@@ -962,13 +971,17 @@ class PixivAppAPI():
         params = {"illust_id": illust_id}
         return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"]

-    def _call(self, endpoint, params=None):
+    def _call(self, endpoint, params=None, parse=None):
         url = "https://app-api.pixiv.net" + endpoint

         while True:
             self.login()
             response = self.extractor.request(url, params=params, fatal=False)
-            data = response.json()
+
+            if parse:
+                data = parse(response)
+            else:
+                data = response.json()

             if "error" not in data:
                 return data
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index 7ff40a3..c7283fc 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -143,6 +143,9 @@ class PornhubGifExtractor(PornhubExtractor):
             "url"      : extr('"contentUrl": "', '"'),
             "date"     : text.parse_datetime(
                 extr('"uploadDate": "', '"'), "%Y-%m-%d"),
+            "viewkey"  : extr('From this video: '
+                              '<a href="/view_video.php?viewkey=', '"'),
+            "timestamp": extr('lass="directLink tstamp" rel="nofollow">', '<'),
             "user"     : text.remove_html(extr("Created by:", "</div>")),
         }

diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 2ef0f9f..e099c7e 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -191,6 +191,8 @@ class RedditExtractor(Extractor):
             try:
                 if "reddit_video_preview" in post["preview"]:
                     video = post["preview"]["reddit_video_preview"]
+                    if "fallback_url" in video:
+                        yield video["fallback_url"]
                     if "dash_url" in video:
                         yield "ytdl:" + video["dash_url"]
                     if "hls_url" in video:
@@ -200,6 +202,12 @@ class RedditExtractor(Extractor):

             try:
                 for image in post["preview"]["images"]:
+                    variants = image.get("variants")
+                    if variants:
+                        if "gif" in variants:
+                            yield variants["gif"]["source"]["url"]
+                        if "mp4" in variants:
+                            yield variants["mp4"]["source"]["url"]
                     yield image["source"]["url"]
             except Exception as exc:
                 self.log.debug("%s: %s", exc.__class__.__name__, exc)
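The reddit change above yields animated preview variants ahead of the still source image. A sketch of the resulting candidate order, assuming the usual preview JSON layout:

```python
# Yield preview URLs in the order the updated extractor tries them:
# GIF variant, MP4 variant, then the still source image.
def preview_urls(image):
    variants = image.get("variants") or {}
    if "gif" in variants:
        yield variants["gif"]["source"]["url"]
    if "mp4" in variants:
        yield variants["mp4"]["source"]["url"]
    yield image["source"]["url"]
```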
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index 6185acb..327bcd1 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -52,23 +52,22 @@ class RedgifsExtractor(Extractor):
                 gif.update(metadata)
                 gif["count"] = cnt
+                gif["date"] = text.parse_timestamp(gif.get("createDate"))
                 yield Message.Directory, gif

             for num, gif in enumerate(gifs, enum):
-                url = self._process(gif)
+                gif["_fallback"] = formats = self._formats(gif)
+                url = next(formats, None)
+
                 if not url:
                     self.log.warning(
                         "Skipping '%s' (format not available)", gif["id"])
                     continue
+
                 gif["num"] = num
                 gif["count"] = cnt
                 yield Message.Url, url, gif

-    def _process(self, gif):
-        gif["_fallback"] = formats = self._formats(gif)
-        gif["date"] = text.parse_timestamp(gif.get("createDate"))
-        return next(formats, None)
-
     def _formats(self, gif):
         urls = gif["urls"]
         for fmt in self.formats:

diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 0b29ed0..38a2d16 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -7,7 +7,7 @@
 """Extractors for https://skeb.jp/"""

 from .common import Extractor, Message
-from .. import text
+from .. import text, exception
 import itertools
@@ -26,6 +26,19 @@ class SkebExtractor(Extractor):
     def _init(self):
         self.thumbnails = self.config("thumbnails", False)
         self.article = self.config("article", False)
+        self.headers = {"Accept": "application/json, text/plain, */*"}
+
+        if "Authorization" not in self.session.headers:
+            self.headers["Authorization"] = "Bearer null"
+
+    def request(self, url, **kwargs):
+        while True:
+            try:
+                return Extractor.request(self, url, **kwargs)
+            except exception.HttpError as exc:
+                if exc.status == 429 and "request_key" in exc.response.cookies:
+                    continue
+                raise

     def items(self):
         metadata = self.metadata()
@@ -42,6 +55,12 @@ class SkebExtractor(Extractor):
                 url = file["file_url"]
                 yield Message.Url, url, text.nameext_from_url(url, post)

+    def _items_users(self):
+        base = self.root + "/@"
+        for user in self.users():
+            user["_extractor"] = SkebUserExtractor
+            yield Message.Queue, base + user["screen_name"], user
+
     def posts(self):
         """Return post number"""

@@ -49,11 +68,11 @@ class SkebExtractor(Extractor):
         """Return additional metadata"""

     def _pagination(self, url, params):
-        headers = {"Authorization": "Bearer null"}
         params["offset"] = 0

         while True:
-            posts = self.request(url, params=params, headers=headers).json()
+            posts = self.request(
+                url, params=params, headers=self.headers).json()

             for post in posts:
                 parts = post["path"].split("/")
@@ -70,11 +89,24 @@ class SkebExtractor(Extractor):
                 return
             params["offset"] += 30

+    def _pagination_users(self, endpoint, params):
+        url = "{}/api{}".format(self.root, endpoint)
+        params["offset"] = 0
+        params["limit"] = 90
+
+        while True:
+            data = self.request(
+                url, params=params, headers=self.headers).json()
+            yield from data
+
+            if len(data) < params["limit"]:
+                return
+            params["offset"] += params["limit"]
+
     def _get_post_data(self, user_name, post_num):
         url = "{}/api/users/{}/works/{}".format(
             self.root, user_name, post_num)
-        headers = {"Authorization": "Bearer null"}
-        resp = self.request(url, headers=headers).json()
+        resp = self.request(url, headers=self.headers).json()
         creator = resp["creator"]
         post = {
             "post_id"          : resp["id"],
@@ -244,22 +276,23 @@ class SkebFollowingExtractor(SkebExtractor):
     pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators"
     example = "https://skeb.jp/@USER/following_creators"

-    def items(self):
-        for user in self.users():
-            url = "{}/@{}".format(self.root, user["screen_name"])
-            user["_extractor"] = SkebUserExtractor
-            yield Message.Queue, url, user
+    items = SkebExtractor._items_users

     def users(self):
-        url = "{}/api/users/{}/following_creators".format(
-            self.root, self.user_name)
-        params = {"sort": "date", "offset": 0, "limit": 90}
-        headers = {"Authorization": "Bearer null"}
+        endpoint = "/users/{}/following_creators".format(self.user_name)
+        params = {"sort": "date"}
+        return self._pagination_users(endpoint, params)

-        while True:
-            data = self.request(url, params=params, headers=headers).json()
-            yield from data
-            if len(data) < params["limit"]:
-                return
-            params["offset"] += params["limit"]
+
+class SkebFollowingUsersExtractor(SkebExtractor):
+    """Extractor for your followed users"""
+    subcategory = "following-users"
+    pattern = r"(?:https?://)?skeb\.jp/following_users()"
+    example = "https://skeb.jp/following_users"
+
+    items = SkebExtractor._items_users
+
+    def users(self):
+        endpoint = "/following_users"
+        params = {}
+        return self._pagination_users(endpoint, params)
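The skeb refactor above centralizes user-list fetching in _pagination_users(): offset/limit pages of 90 entries, stopping at the first short page. Its core loop as a standalone sketch (the fetch callback is an assumption standing in for the authorized API request):

```python
# Page through an offset/limit user endpoint until a short page appears.
def paginate_users(fetch, limit=90):
    offset = 0
    while True:
        data = fetch(offset=offset, limit=limit)
        yield from data
        if len(data) < limit:
            return
        offset += limit
```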
diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py
index 9d46fd6..8582824 100644
--- a/gallery_dl/extractor/steamgriddb.py
+++ b/gallery_dl/extractor/steamgriddb.py
@@ -163,6 +163,9 @@ class SteamgriddbAssetExtractor(SteamgriddbExtractor):
     def assets(self):
         endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
         asset = self._call(endpoint)["asset"]
+        if asset is None:
+            raise exception.NotFoundError("asset ({}:{})".format(
+                self.asset_type, self.asset_id))
         return (asset,)

diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 31fb891..d4adfed 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -175,7 +175,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
             "author_id"  : text.parse_int(extr('data-user-id="', '"')),
             "author_nick": text.unescape(extr('alt="', '"')),
             "date"       : self._parse_datetime(extr(
-                'class="section-subtitle">', '<')),
+                '<span class="star_link-types">', '<')),
             "content"    : (extr(
                 '<div class="post-content', '<div class="post-uploads')
                 .partition(">")[2]),

diff --git a/gallery_dl/extractor/test.py b/gallery_dl/extractor/test.py
deleted file mode 100644
index e3f9f74..0000000
--- a/gallery_dl/extractor/test.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2016-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Utility extractor to execute tests of other extractors"""
-
-from .common import Extractor, Message
-from .. import extractor, exception
-
-
-class TestExtractor(Extractor):
-    """Extractor to select and run the test URLs of other extractors
-
-    The general form is 'test:<categories>:<subcategories>:<indices>', where
-    <categories> and <subcategories> are comma-separated (sub)category names
-    and <indices> is a comma-separated list of array indices.
-    To select all possible values for a field use the star '*' character or
-    leave the field empty.
-
-    Examples:
-        - test:pixiv
-            run all pixiv tests
-
-        - test:pixiv:user,favorite:0
-            run the first test of the PixivUser- and PixivFavoriteExtractor
-
-        - test:
-            run all tests
-    """
-    category = "test"
-    pattern = r"t(?:est)?:([^:]*)(?::([^:]*)(?::(\*|[\d,]*))?)?$"
-    example = "test:CATEGORY"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        categories, subcategories, indices = match.groups()
-        self.categories = self._split(categories)
-        self.subcategories = self._split(subcategories)
-        self.indices = self._split(indices) or self
-
-    def items(self):
-        extractors = extractor.extractors()
-
-        if self.categories:
-            extractors = [
-                extr for extr in extractors
-                if extr.category in self.categories
-            ]
-
-        if self.subcategories:
-            extractors = [
-                extr for extr in extractors
-                if extr.subcategory in self.subcategories
-            ]
-
-        tests = [
-            test
-            for extr in extractors
-            for index, test in enumerate(extr._get_tests())
-            if str(index) in self.indices
-        ]
-
-        if not tests:
-            raise exception.NotFoundError("test")
-
-        for test in tests:
-            yield Message.Queue, test[0], {}
-
-    @staticmethod
-    def __contains__(_):
-        return True
-
-    @staticmethod
-    def _split(value):
-        if value and value != "*":
-            return value.split(",")
-        return None
"/1.1/guest/activate.json" - self.extractor.log.info("Requesting guest token") + self.log.info("Requesting guest token") return str(self._call( endpoint, None, "POST", False, "https://api.twitter.com", )["guest_token"]) @@ -1274,17 +1290,35 @@ class TwitterAPI(): if response.status_code < 400: data = response.json() - if not data.get("errors") or not any( - (e.get("message") or "").lower().startswith("timeout") - for e in data["errors"]): - return data # success or non-timeout errors - msg = data["errors"][0].get("message") or "Unspecified" - self.extractor.log.debug("Internal Twitter error: '%s'", msg) + errors = data.get("errors") + if not errors: + return data - if self.headers["x-twitter-auth-type"]: - self.extractor.log.debug("Retrying API request") - continue # retry + retry = False + for error in errors: + msg = error.get("message") or "Unspecified" + self.log.debug("API error: '%s'", msg) + + if "this account is temporarily locked" in msg: + msg = "Account temporarily locked" + if self.extractor.config("locked") != "wait": + raise exception.AuthorizationError(msg) + self.log.warning("%s. Press ENTER to retry.", msg) + try: + input() + except (EOFError, OSError): + pass + retry = True + + elif msg.lower().startswith("timeout"): + retry = True + + if not retry: + return data + elif self.headers["x-twitter-auth-type"]: + self.log.debug("Retrying API request") + continue # fall through to "Login Required" response.status_code = 404 @@ -1374,7 +1408,7 @@ class TwitterAPI(): try: tweet = tweets[tweet_id] except KeyError: - self.extractor.log.debug("Skipping %s (deleted)", tweet_id) + self.log.debug("Skipping %s (deleted)", tweet_id) continue if "retweeted_status_id_str" in tweet: @@ -1606,8 +1640,10 @@ class TwitterAPI(): variables["cursor"] = cursor def _pagination_users(self, endpoint, variables, path=None): - params = {"variables": None, - "features" : self._json_dumps(self.features_pagination)} + params = { + "variables": None, + "features" : self._json_dumps(self.features_pagination), + } while True: cursor = entry = None @@ -1651,9 +1687,9 @@ class TwitterAPI(): if text.startswith("Age-restricted"): if self._nsfw_warning: self._nsfw_warning = False - self.extractor.log.warning('"%s"', text) + self.log.warning('"%s"', text) - self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text) + self.log.debug("Skipping %s ('%s')", tweet_id, text) @cache(maxage=365*86400, keyarg=1) diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 5374f1c..6dfb23c 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -26,17 +26,39 @@ class VipergirlsExtractor(Extractor): cookies_domain = ".vipergirls.to" cookies_names = ("vg_userid", "vg_password") + def _init(self): + domain = self.config("domain") + if domain: + self.root = text.ensure_http_scheme(domain) + def items(self): self.login() + posts = self.posts() + + like = self.config("like") + if like: + user_hash = posts[0].get("hash") + if len(user_hash) < 16: + self.log.warning("Login required to like posts") + like = False - for post in self.posts(): + posts = posts.iter("post") + if self.page: + util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15) + + for post in posts: data = post.attrib data["thread_id"] = self.thread_id yield Message.Directory, data + + image = None for image in post: yield Message.Queue, image.attrib["main_url"], data + if image is not None and like: + self.like(post, user_hash) + def login(self): if self.cookies_check(self.cookies_names): 
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index 3bb635d..e91f45f 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -50,7 +50,7 @@ class WarosuThreadExtractor(Extractor):
         title = text.unescape(text.extr(page, "class=filetitle>", "<"))
         return {
             "board" : self.board,
-            "board_name": boardname.rpartition(" - ")[2],
+            "board_name": boardname.split(" - ")[1],
             "thread" : self.thread,
             "title" : title,
         }
@@ -64,8 +64,7 @@ class WarosuThreadExtractor(Extractor):
     def parse(self, post):
         """Build post object by extracting data from an HTML post"""
         data = self._extract_post(post)
-        if "<span> File:" in post:
-            self._extract_image(post, data)
+        if "<span> File:" in post and self._extract_image(post, data):
             part = data["image"].rpartition("/")[2]
             data["tim"], _, data["extension"] = part.partition(".")
             data["ext"] = "." + data["extension"]
@@ -91,6 +90,11 @@ class WarosuThreadExtractor(Extractor):
             "", "<").rstrip().rpartition(".")[0])
         extr("<br>", "")
 
-        data["image"] = url = extr("<a href=", ">")
-        if url[0] == "/":
-            data["image"] = self.root + url
+        url = extr("<a href=", ">")
+        if url:
+            if url[0] == "/":
+                data["image"] = self.root + url
+            else:
+                data["image"] = url
+            return True
+        return False
+ data["extension"] @@ -91,6 +90,11 @@ class WarosuThreadExtractor(Extractor): "", "<").rstrip().rpartition(".")[0]) extr("<br>", "") - data["image"] = url = extr("<a href=", ">") - if url[0] == "/": - data["image"] = self.root + url + url = extr("<a href=", ">") + if url: + if url[0] == "/": + data["image"] = self.root + url + else: + data["image"] = url + return True + return False diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py index 5b45148..83b1642 100644 --- a/gallery_dl/extractor/weibo.py +++ b/gallery_dl/extractor/weibo.py @@ -30,9 +30,9 @@ class WeiboExtractor(Extractor): self._prefix, self.user = match.groups() def _init(self): - self.retweets = self.config("retweets", True) - self.videos = self.config("videos", True) self.livephoto = self.config("livephoto", True) + self.retweets = self.config("retweets", False) + self.videos = self.config("videos", True) self.gifs = self.config("gifs", True) self.gifs_video = (self.gifs == "video") @@ -59,15 +59,25 @@ class WeiboExtractor(Extractor): for status in self.statuses(): - files = [] - if self.retweets and "retweeted_status" in status: + if "ori_mid" in status and not self.retweets: + self.log.debug("Skipping %s (快转 retweet)", status["id"]) + continue + + if "retweeted_status" in status: + if not self.retweets: + self.log.debug("Skipping %s (retweet)", status["id"]) + continue + + # videos of the original post are in status + # images of the original post are in status["retweeted_status"] + files = [] + self._extract_status(status, files) + self._extract_status(status["retweeted_status"], files) + if original_retweets: status = status["retweeted_status"] - self._extract_status(status, files) - else: - self._extract_status(status, files) - self._extract_status(status["retweeted_status"], files) else: + files = [] self._extract_status(status, files) status["date"] = text.parse_datetime( @@ -118,7 +128,7 @@ class WeiboExtractor(Extractor): append(pic["largest"].copy()) file = {"url": pic["video"]} - file["filehame"], _, file["extension"] = \ + file["filename"], _, file["extension"] = \ pic["video"].rpartition("%2F")[2].rpartition(".") append(file) @@ -176,23 +186,34 @@ class WeiboExtractor(Extractor): data = data["data"] statuses = data["list"] - if not statuses: - return yield from statuses - if "next_cursor" in data: # videos, newvideo - if data["next_cursor"] == -1: + # videos, newvideo + cursor = data.get("next_cursor") + if cursor: + if cursor == -1: return - params["cursor"] = data["next_cursor"] - elif "page" in params: # home, article - params["page"] += 1 - elif data["since_id"]: # album + params["cursor"] = cursor + continue + + # album + since_id = data.get("since_id") + if since_id: params["sinceid"] = data["since_id"] - else: # feed, last album page - try: - params["since_id"] = statuses[-1]["id"] - 1 - except KeyError: + continue + + # home, article + if "page" in params: + if not statuses: return + params["page"] += 1 + continue + + # feed, last album page + try: + params["since_id"] = statuses[-1]["id"] - 1 + except LookupError: + return def _sina_visitor_system(self, response): self.log.info("Sina Visitor System") diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py index c93f33f..ac00682 100644 --- a/gallery_dl/extractor/wikimedia.py +++ b/gallery_dl/extractor/wikimedia.py @@ -162,6 +162,11 @@ BASE_PATTERN = WikimediaExtractor.update({ "pattern": r"(?:www\.)?pidgi\.net", "api-path": "/wiki/api.php", }, + "azurlanewiki": { + "root": "https://azurlane.koumakan.jp", 
+ "pattern": r"azurlane\.koumakan\.jp", + "api-path": "/w/api.php", + }, }) diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py index 46e574e..da9d6b0 100644 --- a/gallery_dl/extractor/xvideos.py +++ b/gallery_dl/extractor/xvideos.py @@ -11,6 +11,9 @@ from .common import GalleryExtractor, Extractor, Message from .. import text, util +BASE_PATTERN = (r"(?:https?://)?(?:www\.)?xvideos\.com" + r"/(?:profiles|(?:amateur-|model-)?channels)") + class XvideosBase(): """Base class for xvideos extractors""" @@ -25,9 +28,7 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor): "{gallery[id]} {gallery[title]}") filename_fmt = "{category}_{gallery[id]}_{num:>03}.{extension}" archive_fmt = "{gallery[id]}_{num}" - pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" - r"/(?:profiles|amateur-channels|model-channels)" - r"/([^/?#]+)/photos/(\d+)") + pattern = BASE_PATTERN + r"/([^/?#]+)/photos/(\d+)" example = "https://www.xvideos.com/profiles/USER/photos/12345" def __init__(self, match): @@ -58,22 +59,35 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor): }, } - @staticmethod - def images(page): - """Return a list of all image urls for this gallery""" - return [ + def images(self, page): + results = [ (url, None) for url in text.extract_iter( page, '<a class="embed-responsive-item" href="', '"') ] + if not results: + return + + while len(results) % 500 == 0: + path = text.rextract(page, ' href="', '"', page.find(">Next</"))[0] + if not path: + break + page = self.request(self.root + path).text + results.extend( + (url, None) + for url in text.extract_iter( + page, '<a class="embed-responsive-item" href="', '"') + ) + + return results + class XvideosUserExtractor(XvideosBase, Extractor): """Extractor for user profiles on xvideos.com""" subcategory = "user" categorytransfer = True - pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" - r"/profiles/([^/?#]+)/?(?:#.*)?$") + pattern = BASE_PATTERN + r"/([^/?#]+)/?(?:#.*)?$" example = "https://www.xvideos.com/profiles/USER" def __init__(self, match): diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py index 6ee96e6..fc61dff 100644 --- a/gallery_dl/extractor/zerochan.py +++ b/gallery_dl/extractor/zerochan.py @@ -10,7 +10,7 @@ from .booru import BooruExtractor from ..cache import cache -from .. import text, exception +from .. 
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 6ee96e6..fc61dff 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -10,7 +10,7 @@
 
 from .booru import BooruExtractor
 from ..cache import cache
-from .. import text, exception
+from .. import text, util, exception
 
 BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
 
@@ -21,8 +21,11 @@ class ZerochanExtractor(BooruExtractor):
     root = "https://www.zerochan.net"
     filename_fmt = "{id}.{extension}"
     archive_fmt = "{id}"
+    page_start = 1
+    per_page = 250
     cookies_domain = ".zerochan.net"
     cookies_names = ("z_id", "z_hash")
+    request_interval = (0.5, 1.5)
 
     def login(self):
         self._logged_in = True
@@ -86,7 +89,7 @@ class ZerochanExtractor(BooruExtractor):
 
         return data
 
-    def _parse_entry_json(self, entry_id):
+    def _parse_entry_api(self, entry_id):
         url = "{}/{}?json".format(self.root, entry_id)
         item = self.request(url).json()
 
@@ -117,14 +120,22 @@ class ZerochanTagExtractor(ZerochanExtractor):
         ZerochanExtractor.__init__(self, match)
         self.search_tag, self.query = match.groups()
 
+    def _init(self):
+        if self.config("pagination") == "html":
+            self.posts = self.posts_html
+            self.per_page = 24
+        else:
+            self.posts = self.posts_api
+            self.session.headers["User-Agent"] = util.USERAGENT
+
     def metadata(self):
         return {"search_tags": text.unquote(
             self.search_tag.replace("+", " "))}
 
-    def posts(self):
+    def posts_html(self):
         url = self.root + "/" + self.search_tag
         params = text.parse_query(self.query)
-        params["p"] = text.parse_int(params.get("p"), 1)
+        params["p"] = text.parse_int(params.get("p"), self.page_start)
         metadata = self.config("metadata")
 
         while True:
@@ -140,7 +151,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
                 if metadata:
                     entry_id = extr('href="/', '"')
                     post = self._parse_entry_html(entry_id)
-                    post.update(self._parse_entry_json(entry_id))
+                    post.update(self._parse_entry_api(entry_id))
                     yield post
                 else:
                     yield {
@@ -157,6 +168,41 @@ class ZerochanTagExtractor(ZerochanExtractor):
                 break
             params["p"] += 1
 
+    def posts_api(self):
+        url = self.root + "/" + self.search_tag
+        metadata = self.config("metadata")
+        params = {
+            "json": "1",
+            "l" : self.per_page,
+            "p" : self.page_start,
+        }
+
+        static = "https://static.zerochan.net/.full."
+
+        while True:
+            data = self.request(url, params=params).json()
+            try:
+                posts = data["items"]
+            except ValueError:
+                return
+
+            if metadata:
+                for post in posts:
+                    post_id = post["id"]
+                    post.update(self._parse_entry_html(post_id))
+                    post.update(self._parse_entry_api(post_id))
+            else:
+                for post in posts:
+                    base = static + str(post["id"])
+                    post["file_url"] = base + ".jpg"
+                    post["_fallback"] = (base + ".png",)
+
+            yield from posts
+
+            if not data.get("next"):
+                return
+            params["p"] += 1
+
 
 class ZerochanImageExtractor(ZerochanExtractor):
     subcategory = "image"
@@ -170,5 +216,5 @@ class ZerochanImageExtractor(ZerochanExtractor):
     def posts(self):
         post = self._parse_entry_html(self.image_id)
         if self.config("metadata"):
-            post.update(self._parse_entry_json(self.image_id))
+            post.update(self._parse_entry_api(self.image_id))
         return (post,)
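Note: zerochan tag searches now page through the site's JSON API by default (250 posts per page, with static.zerochan.net file URLs and a PNG fallback); the old HTML scraper remains available behind the new "pagination" option. A minimal sketch for selecting it, assuming the standard gallery-dl config layout (option name and value taken from the diff):

    {
        "extractor": {
            "zerochan": {
                "pagination": "html"
            }
        }
    }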
