Diffstat (limited to 'gallery_dl/extractor')
 gallery_dl/extractor/arcalive.py      |   5
 gallery_dl/extractor/aryion.py        |   2
 gallery_dl/extractor/bluesky.py       |   6
 gallery_dl/extractor/civitai.py       | 106
 gallery_dl/extractor/fanbox.py        |  10
 gallery_dl/extractor/flickr.py        | 112
 gallery_dl/extractor/idolcomplex.py   |  46
 gallery_dl/extractor/instagram.py     |  20
 gallery_dl/extractor/mangadex.py      | 151
 gallery_dl/extractor/mastodon.py      |   8
 gallery_dl/extractor/motherless.py    |  42
 gallery_dl/extractor/pinterest.py     |   3
 gallery_dl/extractor/pixeldrain.py    |  70
 gallery_dl/extractor/pixiv.py         |  28
 gallery_dl/extractor/sankaku.py       |  21
 gallery_dl/extractor/subscribestar.py |  24
 gallery_dl/extractor/vipergirls.py    |  29
17 files changed, 512 insertions, 171 deletions
diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py
index 8c44256..3c39a1a 100644
--- a/gallery_dl/extractor/arcalive.py
+++ b/gallery_dl/extractor/arcalive.py
@@ -17,6 +17,7 @@ class ArcaliveExtractor(Extractor):
     """Base class for Arca.live extractors"""
     category = "arcalive"
     root = "https://arca.live"
+    useragent = "net.umanle.arca.android.playstore/0.9.75"
     request_interval = (0.5, 1.5)

     def _init(self):
@@ -149,9 +150,7 @@ class ArcaliveAPI():
         self.log = extractor.log
         self.root = extractor.root + "/api/app"

-        headers = extractor.session.headers
-        headers["User-Agent"] = "net.umanle.arca.android.playstore/0.9.75"
-        headers["X-Device-Token"] = util.generate_token(64)
+        extractor.session.headers["X-Device-Token"] = util.generate_token(64)

     def board(self, board_slug, params):
         endpoint = "/list/channel/" + board_slug
diff --git a/gallery_dl/extractor/aryion.py b/gallery_dl/extractor/aryion.py
index 17b780e..ca88187 100644
--- a/gallery_dl/extractor/aryion.py
+++ b/gallery_dl/extractor/aryion.py
@@ -169,7 +169,7 @@ class AryionExtractor(Extractor):
                 "<p>", "</p>"), "", "")),
             "filename" : fname,
             "extension": ext,
-            "_mtime"   : lmod,
+            "_http_lastmodified": lmod,
         }
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index ec274b8..6f4abd5 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -49,7 +49,11 @@ class BlueskyExtractor(Extractor):
                 self.log.debug("Skipping %s (repost)", self._pid(post))
                 continue
             embed = post.get("embed")
-            post.update(post.pop("record"))
+            try:
+                post.update(post.pop("record"))
+            except Exception:
+                self.log.debug("Skipping %s (no 'record')", self._pid(post))
+                continue

             while True:
                 self._prepare(post)
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index de8f86c..56fe851 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -45,6 +45,20 @@ class CivitaiExtractor(Extractor):
             self._image_quality = "original=true"
             self._image_ext = "png"

+        quality_video = self.config("quality-videos")
+        if quality_video:
+            if not isinstance(quality_video, str):
+                quality_video = ",".join(quality_video)
+            if quality_video[0] == "+":
+                quality_video = (self._image_quality + "," +
+                                 quality_video.lstrip("+,"))
+            self._video_quality = quality_video
+        elif quality_video is not None and quality:
+            self._video_quality = self._image_quality
+        else:
+            self._video_quality = "quality=100"
+        self._video_ext = "webm"
+
         metadata = self.config("metadata")
         if metadata:
             if isinstance(metadata, str):
@@ -82,9 +96,8 @@ class CivitaiExtractor(Extractor):
                         "user": post.pop("user"),
                     }
                     if self._meta_version:
-                        data["version"] = version = self.api.model_version(
-                            post["modelVersionId"]).copy()
-                        data["model"] = version.pop("model")
+                        data["model"], data["version"] = \
+                            self._extract_meta_version(post)

                     yield Message.Directory, data
                     for file in self._image_results(images):
@@ -95,26 +108,22 @@ class CivitaiExtractor(Extractor):
         images = self.images()
         if images:
             for image in images:
-                url = self._url(image)
+
                 if self._meta_generation:
-                    image["generation"] = self.api.image_generationdata(
-                        image["id"])
+                    image["generation"] = \
+                        self._extract_meta_generation(image)
                 if self._meta_version:
-                    if "modelVersionId" in image:
-                        version_id = image["modelVersionId"]
-                    else:
-                        post = image["post"] = self.api.post(
-                            image["postId"])
-                        post.pop("user", None)
-                        version_id = post["modelVersionId"]
-                    image["version"] = version = self.api.model_version(
-                        version_id).copy()
-                    image["model"] = version.pop("model")
-
+                    image["model"], image["version"] = \
+                        self._extract_meta_version(image, False)
                 image["date"] = text.parse_datetime(
                     image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+                url = self._url(image)
                 text.nameext_from_url(url, image)
-                image["extension"] = self._image_ext
+                if not image["extension"]:
+                    image["extension"] = (
+                        self._video_ext if image.get("type") == "video" else
+                        self._image_ext)
                 yield Message.Directory, image
                 yield Message.Url, url, image
             return
@@ -130,20 +139,23 @@ class CivitaiExtractor(Extractor):

     def _url(self, image):
         url = image["url"]
+        video = image.get("type") == "video"
+        quality = self._video_quality if video else self._image_quality
+
         if "/" in url:
             parts = url.rsplit("/", 3)
             image["uuid"] = parts[1]
-            parts[2] = self._image_quality
+            parts[2] = quality
             return "/".join(parts)
-        image["uuid"] = url

+        image["uuid"] = url
         name = image.get("name")
         if not name:
             mime = image.get("mimeType") or self._image_ext
             name = "{}.{}".format(image.get("id"), mime.rpartition("/")[2])
         return (
             "https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/{}/{}/{}".format(
-                url, self._image_quality, name)
+                url, quality, name)
         )

     def _image_results(self, images):
@@ -154,11 +166,13 @@ class CivitaiExtractor(Extractor):
                 "url" : self._url(file),
             })
             if not data["extension"]:
-                data["extension"] = self._image_ext
+                data["extension"] = (
+                    self._video_ext if file.get("type") == "video" else
+                    self._image_ext)
             if "id" not in file and data["filename"].isdecimal():
                 file["id"] = text.parse_int(data["filename"])
             if self._meta_generation:
-                file["generation"] = self.api.image_generationdata(file["id"])
+                file["generation"] = self._extract_meta_generation(file)
             yield data

     def _parse_query(self, value):
@@ -166,6 +180,38 @@ class CivitaiExtractor(Extractor):
             value, {"tags", "reactions", "baseModels", "tools", "techniques",
                     "types", "fileFormats"})

+    def _extract_meta_generation(self, image):
+        try:
+            return self.api.image_generationdata(image["id"])
+        except Exception as exc:
+            return self.log.debug("", exc_info=exc)
+
+    def _extract_meta_version(self, item, is_post=True):
+        try:
+            version_id = self._extract_version_id(item, is_post)
+            if version_id:
+                version = self.api.model_version(version_id).copy()
+                return version.pop("model", None), version
+        except Exception as exc:
+            self.log.debug("", exc_info=exc)
+        return None, None
+
+    def _extract_version_id(self, item, is_post=True):
+        version_id = item.get("modelVersionId")
+        if version_id:
+            return version_id
+
+        version_ids = item.get("modelVersionIds")
+        if version_ids:
+            return version_ids[0]
+
+        if is_post:
+            return None
+
+        item["post"] = post = self.api.post(item["postId"])
+        post.pop("user", None)
+        return self._extract_version_id(post)
+

 class CivitaiModelExtractor(CivitaiExtractor):
     subcategory = "model"
@@ -235,16 +281,20 @@ class CivitaiModelExtractor(CivitaiExtractor):

         files = []
         for num, file in enumerate(version["files"], 1):
+            name, sep, ext = file["name"].rpartition(".")
+            if not sep:
+                name = ext
+                ext = "bin"
             file["uuid"] = "model-{}-{}-{}".format(
                 model["id"], version["id"], file["id"])
             files.append({
                 "num"      : num,
                 "file"     : file,
-                "filename" : file["name"],
-                "extension": "bin",
-                "url"      : file.get("downloadUrl") or
-                    "{}/api/download/models/{}".format(
-                        self.root, version["id"]),
+                "filename" : name,
+                "extension": ext,
+                "url"      : (file.get("downloadUrl") or
+                              "{}/api/download/models/{}".format(
+                                  self.root, version["id"])),
                 "_http_headers" : {
                     "Authorization": self.api.headers.get("Authorization")},
                 "_http_validate": self._validate_file_model,
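The "quality-videos" handling in the civitai.py hunk above is easiest to see in isolation. A minimal sketch of the same normalization (the function name and the third parameter are invented for illustration; the real code keys the middle branch off whether an image "quality" option was configured):

    # Sketch of the "quality-videos" normalization; default strings from the diff.
    def video_quality(option, image_quality="original=true", quality_set=False):
        if option:                              # non-empty string or list
            if not isinstance(option, str):
                option = ",".join(option)       # list -> comma-separated string
            if option[0] == "+":                # "+x,y" appends to image quality
                option = image_quality + "," + option.lstrip("+,")
            return option
        if option is not None and quality_set:  # set-but-empty: reuse image quality
            return image_quality
        return "quality=100"                    # default for videos

    print(video_quality(None))                       # quality=100
    print(video_quality(["transcode=true", "quality=90"]))
    print(video_quality("+transcode=true"))          # original=true,transcode=true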
image["model"] = version.pop("model") - + image["model"], image["version"] = \ + self._extract_meta_version(image, False) image["date"] = text.parse_datetime( image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ") + + url = self._url(image) text.nameext_from_url(url, image) - image["extension"] = self._image_ext + if not image["extension"]: + image["extension"] = ( + self._video_ext if image.get("type") == "video" else + self._image_ext) yield Message.Directory, image yield Message.Url, url, image return @@ -130,20 +139,23 @@ class CivitaiExtractor(Extractor): def _url(self, image): url = image["url"] + video = image.get("type") == "video" + quality = self._video_quality if video else self._image_quality + if "/" in url: parts = url.rsplit("/", 3) image["uuid"] = parts[1] - parts[2] = self._image_quality + parts[2] = quality return "/".join(parts) - image["uuid"] = url + image["uuid"] = url name = image.get("name") if not name: mime = image.get("mimeType") or self._image_ext name = "{}.{}".format(image.get("id"), mime.rpartition("/")[2]) return ( "https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/{}/{}/{}".format( - url, self._image_quality, name) + url, quality, name) ) def _image_results(self, images): @@ -154,11 +166,13 @@ class CivitaiExtractor(Extractor): "url" : self._url(file), }) if not data["extension"]: - data["extension"] = self._image_ext + data["extension"] = ( + self._video_ext if file.get("type") == "video" else + self._image_ext) if "id" not in file and data["filename"].isdecimal(): file["id"] = text.parse_int(data["filename"]) if self._meta_generation: - file["generation"] = self.api.image_generationdata(file["id"]) + file["generation"] = self._extract_meta_generation(file) yield data def _parse_query(self, value): @@ -166,6 +180,38 @@ class CivitaiExtractor(Extractor): value, {"tags", "reactions", "baseModels", "tools", "techniques", "types", "fileFormats"}) + def _extract_meta_generation(self, image): + try: + return self.api.image_generationdata(image["id"]) + except Exception as exc: + return self.log.debug("", exc_info=exc) + + def _extract_meta_version(self, item, is_post=True): + try: + version_id = self._extract_version_id(item, is_post) + if version_id: + version = self.api.model_version(version_id).copy() + return version.pop("model", None), version + except Exception as exc: + self.log.debug("", exc_info=exc) + return None, None + + def _extract_version_id(self, item, is_post=True): + version_id = item.get("modelVersionId") + if version_id: + return version_id + + version_ids = item.get("modelVersionIds") + if version_ids: + return version_ids[0] + + if is_post: + return None + + item["post"] = post = self.api.post(item["postId"]) + post.pop("user", None) + return self._extract_version_id(post) + class CivitaiModelExtractor(CivitaiExtractor): subcategory = "model" @@ -235,16 +281,20 @@ class CivitaiModelExtractor(CivitaiExtractor): files = [] for num, file in enumerate(version["files"], 1): + name, sep, ext = file["name"].rpartition(".") + if not sep: + name = ext + ext = "bin" file["uuid"] = "model-{}-{}-{}".format( model["id"], version["id"], file["id"]) files.append({ "num" : num, "file" : file, - "filename" : file["name"], - "extension": "bin", - "url" : file.get("downloadUrl") or - "{}/api/download/models/{}".format( - self.root, version["id"]), + "filename" : name, + "extension": ext, + "url" : (file.get("downloadUrl") or + "{}/api/download/models/{}".format( + self.root, version["id"])), "_http_headers" : { "Authorization": self.api.headers.get("Authorization")}, 
"_http_validate": self._validate_file_model, diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 3b43134..8981c29 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -26,12 +26,18 @@ class FanboxExtractor(Extractor): directory_fmt = ("{category}", "{creatorId}") filename_fmt = "{id}_{num}.{extension}" archive_fmt = "{id}_{num}" + browser = "firefox" _warning = True def _init(self): self.headers = { - "Accept": "application/json, text/plain, */*", - "Origin": self.root, + "Accept" : "application/json, text/plain, */*", + "Origin" : "https://www.fanbox.cc", + "Referer": "https://www.fanbox.cc/", + "Cookie" : None, + "Sec-Fetch-Dest": "empty", + "Sec-Fetch-Mode": "cors", + "Sec-Fetch-Site": "same-site", } self.embeds = self.config("embeds", True) diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index e85a375..eb68c3e 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -23,13 +23,10 @@ class FlickrExtractor(Extractor): request_interval = (1.0, 2.0) request_interval_min = 0.5 - def __init__(self, match): - Extractor.__init__(self, match) - self.item_id = match.group(1) - def _init(self): self.api = FlickrAPI(self) self.user = None + self.item_id = self.groups[0] def items(self): data = self.metadata() @@ -51,6 +48,8 @@ class FlickrExtractor(Extractor): def metadata(self): """Return general metadata""" self.user = self.api.urls_lookupUser(self.item_id) + if self.config("profile", False): + self.user.update(self.api.people_getInfo(self.user["nsid"])) return {"user": self.user} def photos(self): @@ -75,23 +74,26 @@ class FlickrImageExtractor(FlickrExtractor): r"|flic\.kr/p/([A-Za-z1-9]+))") example = "https://www.flickr.com/photos/USER/12345" - def __init__(self, match): - FlickrExtractor.__init__(self, match) - if not self.item_id: + def items(self): + item_id, enc_id = self.groups + if enc_id is not None: alphabet = ("123456789abcdefghijkmnopqrstu" "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ") - self.item_id = util.bdecode(match.group(2), alphabet) + item_id = util.bdecode(enc_id, alphabet) - def items(self): - photo = self.api.photos_getInfo(self.item_id) + photo = self.api.photos_getInfo(item_id) - self.api._extract_metadata(photo) + self.api._extract_metadata(photo, False) if photo["media"] == "video" and self.api.videos: self.api._extract_video(photo) else: self.api._extract_photo(photo) - photo["user"] = photo["owner"] + if self.config("profile", False): + photo["user"] = self.api.people_getInfo(photo["owner"]["nsid"]) + else: + photo["user"] = photo["owner"] + photo["title"] = photo["title"]["_content"] photo["comments"] = text.parse_int(photo["comments"]["_content"]) photo["description"] = photo["description"]["_content"] @@ -120,11 +122,8 @@ class FlickrAlbumExtractor(FlickrExtractor): pattern = BASE_PATTERN + r"/photos/([^/?#]+)/(?:album|set)s(?:/(\d+))?" 
example = "https://www.flickr.com/photos/USER/albums/12345" - def __init__(self, match): - FlickrExtractor.__init__(self, match) - self.album_id = match.group(2) - def items(self): + self.album_id = self.groups[1] if self.album_id: return FlickrExtractor.items(self) return self._album_items() @@ -163,12 +162,9 @@ class FlickrGalleryExtractor(FlickrExtractor): pattern = BASE_PATTERN + r"/photos/([^/?#]+)/galleries/(\d+)" example = "https://www.flickr.com/photos/USER/galleries/12345/" - def __init__(self, match): - FlickrExtractor.__init__(self, match) - self.gallery_id = match.group(2) - def metadata(self): data = FlickrExtractor.metadata(self) + self.gallery_id = self.groups[1] data["gallery"] = self.api.galleries_getInfo(self.gallery_id) return data @@ -223,13 +219,10 @@ class FlickrSearchExtractor(FlickrExtractor): pattern = BASE_PATTERN + r"/search/?\?([^#]+)" example = "https://flickr.com/search/?text=QUERY" - def __init__(self, match): - FlickrExtractor.__init__(self, match) - self.search = text.parse_query(match.group(1)) + def metadata(self): + self.search = text.parse_query(self.groups[0]) if "text" not in self.search: self.search["text"] = "" - - def metadata(self): return {"search": self.search} def photos(self): @@ -275,13 +268,27 @@ class FlickrAPI(oauth.OAuth1API): "appletv" : 1, "iphone_wifi": 0, } + LICENSES = { + "0": "All Rights Reserved", + "1": "Attribution-NonCommercial-ShareAlike License", + "2": "Attribution-NonCommercial License", + "3": "Attribution-NonCommercial-NoDerivs License", + "4": "Attribution License", + "5": "Attribution-ShareAlike License", + "6": "Attribution-NoDerivs License", + "7": "No known copyright restrictions", + "8": "United States Government Work", + "9": "Public Domain Dedication (CC0)", + "10": "Public Domain Mark", + } def __init__(self, extractor): oauth.OAuth1API.__init__(self, extractor) - self.exif = extractor.config("exif", False) self.videos = extractor.config("videos", True) - self.contexts = extractor.config("contexts", False) + self.meta_exif = extractor.config("exif", False) + self.meta_info = extractor.config("info", False) + self.meta_contexts = extractor.config("contexts", False) self.maxsize = extractor.config("size-max") if isinstance(self.maxsize, str): @@ -321,6 +328,26 @@ class FlickrAPI(oauth.OAuth1API): params = {"group_id": group_id} return self._pagination("groups.pools.getPhotos", params) + def people_getInfo(self, user_id): + """Get information about a user.""" + params = {"user_id": user_id} + user = self._call("people.getInfo", params) + + try: + user = user["person"] + for key in ("description", "username", "realname", "location", + "profileurl", "photosurl", "mobileurl"): + if isinstance(user.get(key), dict): + user[key] = user[key]["_content"] + photos = user["photos"] + for key in ("count", "firstdate", "firstdatetaken"): + if isinstance(photos.get(key), dict): + photos[key] = photos[key]["_content"] + except Exception: + pass + + return user + def people_getPhotos(self, user_id): """Return photos from the given user's photostream.""" params = {"user_id": user_id} @@ -469,14 +496,15 @@ class FlickrAPI(oauth.OAuth1API): self._extract_metadata(photo) photo["id"] = text.parse_int(photo["id"]) - if "owner" in photo: + if "owner" not in photo: + photo["owner"] = self.extractor.user + elif not self.meta_info: photo["owner"] = { "nsid" : photo["owner"], "username" : photo["ownername"], "path_alias": photo["pathalias"], } - else: - photo["owner"] = self.extractor.user + del photo["pathalias"] del photo["ownername"] @@ 
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 0f88cac..624bba2 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -165,13 +165,16 @@ class InstagramExtractor(Extractor):
         if "items" in post:  # story or highlight
             items = post["items"]
             reel_id = str(post["id"]).rpartition(":")[2]
+            expires = post.get("expiring_at")
             data = {
-                "expires": text.parse_timestamp(post.get("expiring_at")),
+                "expires": text.parse_timestamp(expires),
                 "post_id": reel_id,
                 "post_shortcode": shortcode_from_id(reel_id),
             }
             if "title" in post:
                 data["highlight_title"] = post["title"]
+            if expires and not post.get("seen"):
+                post["seen"] = expires - 86400
         else:  # regular image/video post
             data = {
@@ -583,7 +586,10 @@ class InstagramStoriesExtractor(InstagramExtractor):
         reel_id = self.highlight_id or self.api.user_id(self.user)
         reels = self.api.reels_media(reel_id)

-        if self.media_id and reels:
+        if not reels:
+            return ()
+
+        if self.media_id:
             reel = reels[0]
             for item in reel["items"]:
                 if item["pk"] == self.media_id:
@@ -592,6 +598,16 @@ class InstagramStoriesExtractor(InstagramExtractor):
             else:
                 raise exception.NotFoundError("story")

+        elif self.config("split"):
+            reel = reels[0]
+            reels = []
+            for item in reel["items"]:
+                item.pop("user", None)
+                copy = reel.copy()
+                copy.update(item)
+                copy["items"] = (item,)
+                reels.append(copy)
+
         return reels
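Run on toy data, the new "split" branch above turns one reel into a list of single-item reels that each keep the reel-level metadata:

    # replay of the "split" transformation on made-up story data
    reel = {"id": "123", "user": {"pk": 1}, "items": [{"pk": 11}, {"pk": 12}]}

    reels = []
    for item in reel["items"]:
        item.pop("user", None)     # keep the reel's "user", not the item's
        copy = reel.copy()
        copy.update(item)          # merge item fields over reel fields
        copy["items"] = (item,)    # each copy carries exactly one item
        reels.append(copy)

    print([r["pk"] for r in reels])  # [11, 12]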
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 7f87cff..42a508d 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -29,11 +29,8 @@ class MangadexExtractor(Extractor):
     useragent = util.USERAGENT
     _cache = {}

-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.uuid = match.group(1)
-
     def _init(self):
+        self.uuid = self.groups[0]
         self.api = MangadexAPI(self)

     def items(self):
@@ -44,6 +41,12 @@ class MangadexExtractor(Extractor):
             self._cache[uuid] = data
             yield Message.Queue, self.root + "/chapter/" + uuid, data

+    def _items_manga(self):
+        data = {"_extractor": MangadexMangaExtractor}
+        for manga in self.manga():
+            url = "{}/title/{}".format(self.root, manga["id"])
+            yield Message.Queue, url, data
+
     def _transform(self, chapter):
         relationships = defaultdict(list)
         for item in chapter["relationships"]:
@@ -130,7 +133,7 @@ class MangadexChapterExtractor(MangadexExtractor):

 class MangadexMangaExtractor(MangadexExtractor):
     """Extractor for manga from mangadex.org"""
     subcategory = "manga"
-    pattern = BASE_PATTERN + r"/(?:title|manga)/(?!feed$)([0-9a-f-]+)"
+    pattern = BASE_PATTERN + r"/(?:title|manga)/(?!follows|feed$)([0-9a-f-]+)"
     example = ("https://mangadex.org/title"
                "/01234567-89ab-cdef-0123-456789abcdef")
@@ -139,17 +142,29 @@ class MangadexMangaExtractor(MangadexExtractor):


 class MangadexFeedExtractor(MangadexExtractor):
-    """Extractor for chapters from your Followed Feed"""
+    """Extractor for chapters from your Updates Feed"""
     subcategory = "feed"
-    pattern = BASE_PATTERN + r"/title/feed$()"
+    pattern = BASE_PATTERN + r"/titles?/feed$()"
     example = "https://mangadex.org/title/feed"

     def chapters(self):
         return self.api.user_follows_manga_feed()


+class MangadexFollowingExtractor(MangadexExtractor):
+    """Extractor for followed manga from your Library"""
+    subcategory = "following"
+    pattern = BASE_PATTERN + r"/titles?/follows(?:\?([^#]+))?$"
+    example = "https://mangadex.org/title/follows"
+
+    items = MangadexExtractor._items_manga
+
+    def manga(self):
+        return self.api.user_follows_manga()
+
+
 class MangadexListExtractor(MangadexExtractor):
-    """Extractor for mangadex lists"""
+    """Extractor for mangadex MDLists"""
     subcategory = "list"
     pattern = (BASE_PATTERN +
                r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?")
@@ -161,17 +176,17 @@ class MangadexListExtractor(MangadexExtractor):
         if match.group(2) == "feed":
             self.subcategory = "list-feed"
         else:
-            self.items = self._items_titles
+            self.items = self._items_manga

     def chapters(self):
         return self.api.list_feed(self.uuid)

-    def _items_titles(self):
-        data = {"_extractor": MangadexMangaExtractor}
-        for item in self.api.list(self.uuid)["relationships"]:
-            if item["type"] == "manga":
-                url = "{}/title/{}".format(self.root, item["id"])
-                yield Message.Queue, url, data
+    def manga(self):
+        return [
+            item
+            for item in self.api.list(self.uuid)["relationships"]
+            if item["type"] == "manga"
+        ]


 class MangadexAuthorExtractor(MangadexExtractor):
@@ -196,10 +211,18 @@ class MangadexAPI():

     def __init__(self, extr):
         self.extractor = extr
-        self.headers = {}
+        self.headers = None
+        self.headers_auth = {}

         self.username, self.password = extr._get_auth_info()
-        if not self.username:
+        if self.username:
+            self.client_id = cid = extr.config("client-id")
+            self.client_secret = extr.config("client-secret")
+            if cid:
+                self._authenticate_impl = self._authenticate_impl_client
+            else:
+                self._authenticate_impl = self._authenticate_impl_legacy
+        else:
             self.authenticate = util.noop

         server = extr.config("api-server")
@@ -218,10 +241,10 @@ class MangadexAPI():
         return self._call("/chapter/" + uuid, params)["data"]

     def list(self, uuid):
-        return self._call("/list/" + uuid)["data"]
+        return self._call("/list/" + uuid, None, True)["data"]

     def list_feed(self, uuid):
-        return self._pagination_chapters("/list/" + uuid + "/feed")
+        return self._pagination_chapters("/list/" + uuid + "/feed", None, True)

     @memcache(keyarg=1)
     def manga(self, uuid):
@@ -240,28 +263,73 @@ class MangadexAPI():
         }
         return self._pagination_chapters("/manga/" + uuid + "/feed", params)

+    def user_follows_manga(self):
+        params = {"contentRating": None}
+        return self._pagination_manga(
+            "/user/follows/manga", params, True)
+
     def user_follows_manga_feed(self):
         params = {"order[publishAt]": "desc"}
-        return self._pagination_chapters("/user/follows/manga/feed", params)
+        return self._pagination_chapters(
+            "/user/follows/manga/feed", params, True)

     def authenticate(self):
-        self.headers["Authorization"] = \
+        self.headers_auth["Authorization"] = \
             self._authenticate_impl(self.username, self.password)

     @cache(maxage=900, keyarg=1)
-    def _authenticate_impl(self, username, password):
+    def _authenticate_impl_client(self, username, password):
+        refresh_token = _refresh_token_cache((username, "personal"))
+        if refresh_token:
+            self.extractor.log.info("Refreshing access token")
+            data = {
+                "grant_type"   : "refresh_token",
+                "refresh_token": refresh_token,
+                "client_id"    : self.client_id,
+                "client_secret": self.client_secret,
+            }
+        else:
+            self.extractor.log.info("Logging in as %s", username)
+            data = {
+                "grant_type"   : "password",
+                "username"     : self.username,
+                "password"     : self.password,
+                "client_id"    : self.client_id,
+                "client_secret": self.client_secret,
+            }
+
+        self.extractor.log.debug("Using client-id '%s…'", self.client_id[:24])
+        url = ("https://auth.mangadex.org/realms/mangadex"
+               "/protocol/openid-connect/token")
+        data = self.extractor.request(
+            url, method="POST", data=data, fatal=None).json()
+
+        try:
+            access_token = data["access_token"]
+        except Exception:
+            raise exception.AuthenticationError(data.get("error_description"))
+
+        if refresh_token != data.get("refresh_token"):
+            _refresh_token_cache.update(
+                (username, "personal"), data["refresh_token"])
+
+        return "Bearer " + access_token
+
+    @cache(maxage=900, keyarg=1)
+    def _authenticate_impl_legacy(self, username, password):
         refresh_token = _refresh_token_cache(username)
         if refresh_token:
             self.extractor.log.info("Refreshing access token")
             url = self.root + "/auth/refresh"
-            data = {"token": refresh_token}
+            json = {"token": refresh_token}
         else:
             self.extractor.log.info("Logging in as %s", username)
             url = self.root + "/auth/login"
-            data = {"username": username, "password": password}
+            json = {"username": username, "password": password}

+        self.extractor.log.debug("Using legacy login method")
         data = self.extractor.request(
-            url, method="POST", json=data, fatal=None).json()
+            url, method="POST", json=json, fatal=None).json()
         if data.get("result") != "ok":
             raise exception.AuthenticationError()
@@ -269,13 +337,15 @@ class MangadexAPI():
             _refresh_token_cache.update(username, data["token"]["refresh"])
         return "Bearer " + data["token"]["session"]

-    def _call(self, endpoint, params=None):
+    def _call(self, endpoint, params=None, auth=False):
         url = self.root + endpoint
+        headers = self.headers_auth if auth else self.headers

         while True:
-            self.authenticate()
+            if auth:
+                self.authenticate()
             response = self.extractor.request(
-                url, params=params, headers=self.headers, fatal=None)
+                url, params=params, headers=headers, fatal=None)

             if response.status_code < 400:
                 return response.json()
@@ -284,12 +354,12 @@ class MangadexAPI():
                 self.extractor.wait(until=until)
                 continue

-            msg = ", ".join('{title}: {detail}'.format_map(error)
+            msg = ", ".join('{title}: "{detail}"'.format_map(error)
                             for error in response.json()["errors"])
             raise exception.StopExtraction(
                 "%s %s (%s)", response.status_code, response.reason, msg)

-    def _pagination_chapters(self, endpoint, params=None):
+    def _pagination_chapters(self, endpoint, params=None, auth=False):
         if params is None:
             params = {}

@@ -299,21 +369,22 @@ class MangadexAPI():
         params["translatedLanguage[]"] = lang
         params["includes[]"] = ("scanlation_group",)

-        return self._pagination(endpoint, params)
+        return self._pagination(endpoint, params, auth)

-    def _pagination_manga(self, endpoint, params=None):
+    def _pagination_manga(self, endpoint, params=None, auth=False):
         if params is None:
             params = {}

-        return self._pagination(endpoint, params)
+        return self._pagination(endpoint, params, auth)

-    def _pagination(self, endpoint, params):
+    def _pagination(self, endpoint, params, auth=False):
         config = self.extractor.config

-        ratings = config("ratings")
-        if ratings is None:
-            ratings = ("safe", "suggestive", "erotica", "pornographic")
-        params["contentRating[]"] = ratings
+        if "contentRating" not in params:
+            ratings = config("ratings")
+            if ratings is None:
+                ratings = ("safe", "suggestive", "erotica", "pornographic")
+            params["contentRating[]"] = ratings
         params["offset"] = 0

         api_params = config("api-parameters")
@@ -321,7 +392,7 @@ class MangadexAPI():
             params.update(api_params)

         while True:
-            data = self._call(endpoint, params)
+            data = self._call(endpoint, params, auth)
             yield from data["data"]

             params["offset"] = data["offset"] + data["limit"]
@@ -329,6 +400,6 @@ class MangadexAPI():
             return


-@cache(maxage=28*86400, keyarg=0)
+@cache(maxage=90*86400, keyarg=0)
 def _refresh_token_cache(username):
     return None
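The personal-client flow above boils down to one POST against MangaDex's OpenID token endpoint. A standalone sketch with requests; the endpoint URL and form fields are the ones in the hunk, error handling is omitted, and the credentials come from a personal API client created in your MangaDex account settings:

    import requests

    TOKEN_URL = ("https://auth.mangadex.org/realms/mangadex"
                 "/protocol/openid-connect/token")

    def login(username, password, client_id, client_secret):
        data = {
            "grant_type"   : "password",
            "username"     : username,
            "password"     : password,
            "client_id"    : client_id,
            "client_secret": client_secret,
        }
        tokens = requests.post(TOKEN_URL, data=data).json()
        # keep the refresh_token around; access tokens are short-lived,
        # which is why the extractor caches them for only 900 seconds
        return tokens["access_token"], tokens["refresh_token"]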
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 5e78ad4..8b38474 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -196,11 +196,15 @@ class MastodonFollowingExtractor(MastodonExtractor):
 class MastodonStatusExtractor(MastodonExtractor):
     """Extractor for images from a status"""
     subcategory = "status"
-    pattern = (BASE_PATTERN + r"/(?:@[^/?#]+|(?:users/[^/?#]+/)?statuses)"
-               r"/(?!following)([^/?#]+)")
+    pattern = (BASE_PATTERN + r"/(?:@[^/?#]+|(?:users/[^/?#]+/)?"
+               r"(?:statuses|notice|objects()))/(?!following)([^/?#]+)")
     example = "https://mastodon.social/@USER/12345"

     def statuses(self):
+        if self.groups[-2] is not None:
+            url = "{}/objects/{}".format(self.root, self.item)
+            location = self.request_location(url)
+            self.item = location.rpartition("/")[2]
         return (MastodonAPI(self).status(self.item),)
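For /objects/… URLs the status ID is not in the URL itself; the instance redirects to the canonical status URL, and request_location (gallery-dl's internal helper) reads that redirect target. A sketch of the same resolution with plain requests, assuming a standard HTTP redirect:

    import requests

    def resolve_object(root, object_id):
        url = "{}/objects/{}".format(root, object_id)
        # don't follow the redirect; we only want its target
        location = requests.get(url, allow_redirects=False).headers["Location"]
        return location.rpartition("/")[2]   # last path segment = status ID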
diff --git a/gallery_dl/extractor/motherless.py b/gallery_dl/extractor/motherless.py
index c5b9322..ce83ded 100644
--- a/gallery_dl/extractor/motherless.py
+++ b/gallery_dl/extractor/motherless.py
@@ -23,21 +23,6 @@ class MotherlessExtractor(Extractor):
     filename_fmt = "{id} {title}.{extension}"
     archive_fmt = "{id}"

-
-class MotherlessMediaExtractor(MotherlessExtractor):
-    """Extractor for a single image/video from motherless.com"""
-    subcategory = "media"
-    pattern = (BASE_PATTERN +
-               r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?"
-               r"(?!G)[A-Z0-9]+)")
-    example = "https://motherless.com/ABC123"
-
-    def items(self):
-        file = self._extract_media(self.groups[0])
-        url = file["url"]
-        yield Message.Directory, file
-        yield Message.Url, url, text.nameext_from_url(url, file)
-
     def _extract_media(self, path):
         url = self.root + "/" + path
         page = self.request(url).text
@@ -95,6 +80,21 @@ class MotherlessMediaExtractor(MotherlessExtractor):
         return ""


+class MotherlessMediaExtractor(MotherlessExtractor):
+    """Extractor for a single image/video from motherless.com"""
+    subcategory = "media"
+    pattern = (BASE_PATTERN +
+               r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?"
+               r"(?!G)[A-Z0-9]+)")
+    example = "https://motherless.com/ABC123"
+
+    def items(self):
+        file = self._extract_media(self.groups[0])
+        url = file["url"]
+        yield Message.Directory, file
+        yield Message.Url, url, text.nameext_from_url(url, file)
+
+
 class MotherlessGalleryExtractor(MotherlessExtractor):
     """Extractor for a motherless.com gallery"""
     subcategory = "gallery"
@@ -119,6 +119,10 @@ class MotherlessGalleryExtractor(MotherlessExtractor):

         for num, thumb in enumerate(self._pagination(page), 1):
             file = self._parse_thumb_data(thumb)
+
+            if file["type"] == "video":
+                file = self._extract_media(file["id"])
+
             file.update(data)
             file["num"] = num
             url = file["url"]
@@ -151,17 +155,13 @@ class MotherlessGalleryExtractor(MotherlessExtractor):

     def _parse_thumb_data(self, thumb):
         extr = text.extract_from(thumb)
+
         data = {
             "id"       : extr('data-codename="', '"'),
             "type"     : extr('data-mediatype="', '"'),
             "thumbnail": extr('class="static" src="', '"'),
             "title"    : extr(' alt="', '"'),
         }
-
-        type = data["type"]
-        url = data["thumbnail"].replace("thumb", type)
-        if type == "video":
-            url = "{}/{}.mp4".format(url.rpartition("/")[0], data["id"])
-        data["url"] = url
+        data["url"] = data["thumbnail"].replace("thumb", data["type"])

         return data
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index ad8c681..62fa9be 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -132,6 +132,9 @@ class PinterestExtractor(Extractor):
                              "extension": "txt",
                              "media_id": block.get("id")}

+                elif type == "story_pin_product_sticker_block":
+                    continue
+
                 elif type == "story_pin_static_sticker_block":
                     continue
diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py
index 83f3577..7a4d1a5 100644
--- a/gallery_dl/extractor/pixeldrain.py
+++ b/gallery_dl/extractor/pixeldrain.py
@@ -96,3 +96,73 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor):
             file["date"] = self.parse_datetime(file["date_upload"])
             text.nameext_from_url(file["name"], file)
             yield Message.Url, url, file
+
+
+class PixeldrainFolderExtractor(PixeldrainExtractor):
+    """Extractor for pixeldrain filesystem files and directories"""
+    subcategory = "folder"
+    filename_fmt = "{filename[:230]}.{extension}"
+    archive_fmt = "{path}_{num}"
+    pattern = BASE_PATTERN + r"/(?:d|api/filesystem)/([^?]+)"
+    example = "https://pixeldrain.com/d/abcdefgh"
+
+    def metadata(self, data):
+        return {
+            "type"       : data["type"],
+            "path"       : data["path"],
+            "name"       : data["name"],
+            "mime_type"  : data["file_type"],
+            "size"       : data["file_size"],
+            "hash_sha256": data["sha256_sum"],
+            "date"       : self.parse_datetime(data["created"]),
+        }
+
+    def items(self):
+        recursive = self.config("recursive")
+
+        url = "{}/api/filesystem/{}".format(self.root, self.groups[0])
+        stat = self.request(url + "?stat").json()
+
+        paths = stat["path"]
+        path = paths[stat["base_index"]]
+        if path["type"] == "dir":
+            children = [
+                child
+                for child in stat["children"]
+                if child["name"] != ".search_index.gz"
+            ]
+        else:
+            children = (path,)
+
+        folder = self.metadata(path)
+        folder["id"] = paths[0]["id"]
+
+        yield Message.Directory, folder
+
+        num = 0
+        for child in children:
+            if child["type"] == "file":
+                num += 1
+                url = "{}/api/filesystem{}?attach".format(
+                    self.root, child["path"])
+                share_url = "{}/d{}".format(self.root, child["path"])
+
+                data = self.metadata(child)
+                data.update({
+                    "id"       : folder["id"],
+                    "num"      : num,
+                    "url"      : url,
+                    "share_url": share_url,
+                })
+                data["filename"], _, data["extension"] = \
+                    child["name"].rpartition(".")
+
+                yield Message.Url, url, data
+
+            elif child["type"] == "dir":
+                if recursive:
+                    url = "{}/d{}".format(self.root, child["path"])
+                    child["_extractor"] = PixeldrainFolderExtractor
+                    yield Message.Queue, url, child
+
+            else:
+                self.log.debug("'%s' is of unknown type (%s)",
+                               child.get("name"), child["type"])
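The filesystem API used by the new pixeldrain extractor needs only a single ?stat request per node: the response describes the node itself (path[base_index]) and, for directories, lists its children. A hedged sketch of that traversal with requests:

    import requests

    def list_folder(node, root="https://pixeldrain.com"):
        stat = requests.get(
            "{}/api/filesystem/{}?stat".format(root, node)).json()
        base = stat["path"][stat["base_index"]]   # the requested node itself
        if base["type"] != "dir":
            return [base]                         # a single file
        return stat["children"]                   # directory listing

    # node ID taken from a https://pixeldrain.com/d/... share link
    for child in list_folder("abcdefgh"):
        print(child["type"], child["path"])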
+ r"(?!G)[A-Z0-9]+)") + example = "https://motherless.com/ABC123" + + def items(self): + file = self._extract_media(self.groups[0]) + url = file["url"] + yield Message.Directory, file + yield Message.Url, url, text.nameext_from_url(url, file) + + class MotherlessGalleryExtractor(MotherlessExtractor): """Extractor for a motherless.com gallery""" subcategory = "gallery" @@ -119,6 +119,10 @@ class MotherlessGalleryExtractor(MotherlessExtractor): for num, thumb in enumerate(self._pagination(page), 1): file = self._parse_thumb_data(thumb) + + if file["type"] == "video": + file = self._extract_media(file["id"]) + file.update(data) file["num"] = num url = file["url"] @@ -151,17 +155,13 @@ class MotherlessGalleryExtractor(MotherlessExtractor): def _parse_thumb_data(self, thumb): extr = text.extract_from(thumb) + data = { "id" : extr('data-codename="', '"'), "type" : extr('data-mediatype="', '"'), "thumbnail": extr('class="static" src="', '"'), "title" : extr(' alt="', '"'), } - - type = data["type"] - url = data["thumbnail"].replace("thumb", type) - if type == "video": - url = "{}/{}.mp4".format(url.rpartition("/")[0], data["id"]) - data["url"] = url + data["url"] = data["thumbnail"].replace("thumb", data["type"]) return data diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index ad8c681..62fa9be 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -132,6 +132,9 @@ class PinterestExtractor(Extractor): "extension": "txt", "media_id": block.get("id")} + elif type == "story_pin_product_sticker_block": + continue + elif type == "story_pin_static_sticker_block": continue diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py index 83f3577..7a4d1a5 100644 --- a/gallery_dl/extractor/pixeldrain.py +++ b/gallery_dl/extractor/pixeldrain.py @@ -96,3 +96,73 @@ class PixeldrainAlbumExtractor(PixeldrainExtractor): file["date"] = self.parse_datetime(file["date_upload"]) text.nameext_from_url(file["name"], file) yield Message.Url, url, file + + +class PixeldrainFolderExtractor(PixeldrainExtractor): + """Extractor for pixeldrain filesystem files and directories""" + subcategory = "folder" + filename_fmt = "{filename[:230]}.{extension}" + archive_fmt = "{path}_{num}" + pattern = BASE_PATTERN + r"/(?:d|api/filesystem)/([^?]+)" + example = "https://pixeldrain.com/d/abcdefgh" + + def metadata(self, data): + return { + "type" : data["type"], + "path" : data["path"], + "name" : data["name"], + "mime_type" : data["file_type"], + "size" : data["file_size"], + "hash_sha256": data["sha256_sum"], + "date" : self.parse_datetime(data["created"]), + } + + def items(self): + recursive = self.config("recursive") + + url = "{}/api/filesystem/{}".format(self.root, self.groups[0]) + stat = self.request(url + "?stat").json() + + paths = stat["path"] + path = paths[stat["base_index"]] + if path["type"] == "dir": + children = [ + child + for child in stat["children"] + if child["name"] != ".search_index.gz" + ] + else: + children = (path,) + + folder = self.metadata(path) + folder["id"] = paths[0]["id"] + + yield Message.Directory, folder + + num = 0 + for child in children: + if child["type"] == "file": + num += 1 + url = "{}/api/filesystem{}?attach".format( + self.root, child["path"]) + share_url = "{}/d{}".format(self.root, child["path"]) + data = self.metadata(child) + data.update({ + "id" : folder["id"], + "num" : num, + "url" : url, + "share_url": share_url, + }) + data["filename"], _, data["extension"] = \ + 
child["name"].rpartition(".") + yield Message.Url, url, data + + elif child["type"] == "dir": + if recursive: + url = "{}/d{}".format(self.root, child["path"]) + child["_extractor"] = PixeldrainFolderExtractor + yield Message.Queue, url, child + + else: + self.log.debug("'%s' is of unknown type (%s)", + child.get("name"), child["type"]) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index c063216..73c5c1c 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -136,7 +136,21 @@ class PixivExtractor(Extractor): self.log.warning("%s: 'limit_sanity_level' warning", work_id) if self.sanity_workaround: body = self._request_ajax("/illust/" + str(work_id)) - return self._extract_ajax(work, body) + if work["type"] == "ugoira": + if not self.load_ugoira: + return () + self.log.info("%s: Retrieving Ugoira AJAX metadata", + work["id"]) + try: + self._extract_ajax(work, body) + return self._extract_ugoira(work, url) + except Exception as exc: + self.log.debug("", exc_info=exc) + self.log.warning( + "%s: Unable to extract Ugoira URL. Provide " + "logged-in cookies to access it", work["id"]) + else: + return self._extract_ajax(work, body) elif limit_type == "limit_mypixiv_360.png": work["_mypixiv"] = True @@ -161,7 +175,12 @@ class PixivExtractor(Extractor): return () def _extract_ugoira(self, work, img_url): - ugoira = self.api.ugoira_metadata(work["id"]) + if work.get("_ajax"): + ugoira = self._request_ajax( + "/illust/" + str(work["id"]) + "/ugoira_meta") + img_url = ugoira["src"] + else: + ugoira = self.api.ugoira_metadata(work["id"]) work["_ugoira_frame_data"] = work["frames"] = frames = ugoira["frames"] work["_ugoira_original"] = self.load_ugoira_original work["_http_adjust_extension"] = False @@ -198,7 +217,10 @@ class PixivExtractor(Extractor): ] else: - zip_url = ugoira["zip_urls"]["medium"] + if work.get("_ajax"): + zip_url = ugoira["originalSrc"] + else: + zip_url = ugoira["zip_urls"]["medium"] work["date_url"] = self._date_from_url(zip_url) url = zip_url.replace("_ugoira600x600", "_ugoira1920x1080", 1) return ({"url": url},) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index c7303f2..3485db9 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -47,6 +47,10 @@ class SankakuExtractor(BooruExtractor): def _init(self): self.api = SankakuAPI(self) + if self.config("tags") == "extended": + self._tags = self._tags_extended + self._tags_findall = re.compile( + r"tag-type-([^\"' ]+).*?\?tags=([^\"'&]+)").findall def _file_url(self, post): url = post["file_url"] @@ -85,6 +89,23 @@ class SankakuExtractor(BooruExtractor): post["tags_" + name] = values post["tag_string_" + name] = " ".join(values) + def _tags_extended(self, post, page): + try: + url = "https://chan.sankakucomplex.com/posts/" + post["id"] + page = self.request(url).text + except Exception as exc: + return self.log.warning( + "%s: Failed to extract extended tag categories (%s: %s)", + post["id"], exc.__class__.__name__, exc) + + tags = collections.defaultdict(list) + tag_sidebar = text.extr(page, '<ul id="tag-sidebar"', "</ul>") + for tag_type, tag_name in self._tags_findall(tag_sidebar): + tags[tag_type].append(text.unescape(text.unquote(tag_name))) + for type, values in tags.items(): + post["tags_" + type] = values + post["tag_string_" + type] = " ".join(values) + def _notes(self, post, page): if post.get("has_notes"): post["notes"] = self.api.notes(post["id"]) diff --git a/gallery_dl/extractor/subscribestar.py 
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 1054a63..a83f2da 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -40,8 +40,14 @@ class SubscribestarExtractor(Extractor):
         for post_html in self.posts():
             media = self._media_from_post(post_html)
             data = self._data_from_post(post_html)
-            data["title"] = text.unescape(text.extr(
-                data["content"], "<h1>", "</h1>"))
+
+            content = data["content"]
+            if "<html><body>" in content:
+                data["content"] = content = text.extr(
+                    content, "<body>", "</body>")
+            data["title"] = text.unescape(
+                text.rextract(content, "<h1>", "</h1>")[0] or "")
+
             yield Message.Directory, data
             for num, item in enumerate(media, 1):
                 item.update(data)
@@ -189,7 +195,12 @@ class SubscribestarExtractor(Extractor):
             "author_nick": text.unescape(extr('>', '<')),
             "date"       : self._parse_datetime(extr(
                 'class="post-date">', '</').rpartition(">")[2]),
-            "content"    : extr('<body>', '</body>').strip(),
+            "content"    : extr(
+                '<div class="post-content" data-role="post_content-text">',
+                '</div><div class="post-uploads for-youtube"').strip(),
+            "tags"       : list(text.extract_iter(extr(
+                '<div class="post_tags for-post">',
+                '<div class="post-actions">'), '?tag=', '"')),
         }

     def _parse_datetime(self, dt):
@@ -243,7 +254,12 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
             "post_id"    : text.parse_int(extr('data-id="', '"')),
             "date"       : self._parse_datetime(extr(
                 '<div class="section-title_date">', '<')),
-            "content"    : extr('<body>', '</body>').strip(),
+            "content"    : extr(
+                '<div class="post-content" data-role="post_content-text">',
+                '</div><div class="post-uploads for-youtube"').strip(),
+            "tags"       : list(text.extract_iter(extr(
+                '<div class="post_tags for-post">',
+                '<div class="post-actions">'), '?tag=', '"')),
             "author_name": text.unescape(extr(
                 'class="star_link" href="/', '"')),
             "author_id"  : text.parse_int(extr('data-user-id="', '"')),
diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py
index af3f32d..1dd3482 100644
--- a/gallery_dl/extractor/vipergirls.py
+++ b/gallery_dl/extractor/vipergirls.py
@@ -43,31 +43,40 @@ class VipergirlsExtractor(Extractor):

     def items(self):
         self.login()
-        posts = self.posts()
+        root = self.posts()
+        forum_title = root[1].attrib["title"]
+        thread_title = root[2].attrib["title"]

         like = self.config("like")
         if like:
-            user_hash = posts[0].get("hash")
+            user_hash = root[0].get("hash")
             if len(user_hash) < 16:
                 self.log.warning("Login required to like posts")
                 like = False

-        posts = posts.iter("post")
+        posts = root.iter("post")
         if self.page:
             util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)

         for post in posts:
+            images = list(post)
+
             data = post.attrib
+            data["forum_title"] = forum_title
             data["thread_id"] = self.thread_id
+            data["thread_title"] = thread_title
+            data["post_id"] = data.pop("id")
+            data["post_num"] = data.pop("number")
+            data["post_title"] = data.pop("title")
+            data["count"] = len(images)
+            del data["imagecount"]

             yield Message.Directory, data
-
-            image = None
-            for image in post:
-                yield Message.Queue, image.attrib["main_url"], data
-
-            if image is not None and like:
-                self.like(post, user_hash)
+            if images:
+                for data["num"], image in enumerate(images, 1):
+                    yield Message.Queue, image.attrib["main_url"], data
+                if like:
+                    self.like(post, user_hash)

     def login(self):
         if self.cookies_check(self.cookies_names):
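The positional root[1]/root[2] lookups in the vipergirls.py hunk assume a fixed XML layout for the thread API. A self-contained replay of that parsing with a made-up response (element names and attributes are inferred from the code, not from API documentation):

    import xml.etree.ElementTree as ET

    xml = """<result hash="">
      <user hash=""/>
      <forum title="Some Forum"/>
      <thread title="Some Thread"/>
      <post id="1" number="1" title="Post 1" imagecount="1">
        <image main_url="https://example.org/full.jpg"
               thumb_url="https://example.org/thumb.jpg"/>
      </post>
    </result>"""

    root = ET.fromstring(xml)
    print(root[1].attrib["title"], "/", root[2].attrib["title"])
    for post in root.iter("post"):
        # children of <post> are its images, numbered from 1
        for num, image in enumerate(post, 1):
            print(num, image.attrib["main_url"])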
