Diffstat (limited to 'gallery_dl/extractor')
28 files changed, 851 insertions, 376 deletions
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 4dc4f0d..fd973c3 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -166,7 +166,7 @@ class _500pxGalleryExtractor(_500pxExtractor):
         }
         gallery = self._request_graphql(
             "GalleriesDetailQueryRendererQuery", variables,
-            "1afc7dede86ff73456b4defbc5aeb593e330b990943d114cbef7da5be0d7ce2f",
+            "fd367cacf9bebcdc0620bd749dbd8fc9b0ccbeb54fc76b8b4b95e66a8c0cba49",
         )["gallery"]
         self._photos = gallery["photos"]

@@ -194,8 +194,8 @@ class _500pxGalleryExtractor(_500pxExtractor):
             variables["cursor"] = photos["pageInfo"]["endCursor"]
             photos = self._request_graphql(
                 "GalleriesDetailPaginationContainerQuery", variables,
-                "3fcbc9ea1589f31c86fc43a0a02c2163"
-                "cab070f9d376651f270de9f30f031539",
+                "457c66d976f56863c81795f03e98cb54"
+                "3c7c6cdae7abeab8fe9e8e8a67479fa9",
             )["galleryByOwnerIdAndSlugOrToken"]["photos"]
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index fafb785..b248735 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -94,12 +94,12 @@ class _8musesAlbumExtractor(Extractor):
             if albums:
                 for album in albums:
                     url = self.root + "/comics/album/" + album["permalink"]
-                    album = {
-                        "url"    : url,
-                        "name"   : album["name"],
-                        "private": album["isPrivate"],
+                    yield Message.Queue, url, {
+                        "url"       : url,
+                        "name"      : album["name"],
+                        "private"   : album["isPrivate"],
+                        "_extractor": _8musesAlbumExtractor,
                     }
-                    yield Message.Queue, url, album

             if data["page"] >= data["pages"]:
                 return
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 53bc726..b8e39bc 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -59,7 +59,6 @@ modules = [
     "kabeuchi",
     "keenspot",
     "khinsider",
-    "kissmanga",
     "komikcast",
     "konachan",
     "lineblog",
@@ -118,6 +117,7 @@ modules = [
     "vsco",
     "wallhaven",
     "warosu",
+    "weasyl",
     "webtoons",
     "weibo",
     "wikiart",
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 1126615..be498bc 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -31,8 +31,14 @@ class BehanceExtractor(Extractor):
     def _update(data):
         # compress data to simple lists
         if data["fields"] and isinstance(data["fields"][0], dict):
-            data["fields"] = [field["name"] for field in data["fields"]]
-        data["owners"] = [owner["display_name"] for owner in data["owners"]]
+            data["fields"] = [
+                field.get("name") or field.get("label")
+                for field in data["fields"]
+            ]
+        data["owners"] = [
+            owner.get("display_name") or owner.get("displayName")
+            for owner in data["owners"]
+        ]

         tags = data.get("tags") or ()
         if tags and isinstance(tags[0], dict):
@@ -101,7 +107,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
         cookies = {
             "_evidon_consent_cookie":
                 '{"consent_date":"2019-01-31T09:41:15.132Z"}',
-            "bcp": "815b5eee-8bdf-4898-ac79-33c2bcc0ed19",
+            "bcp": "4c34489d-914c-46cd-b44c-dfd0e661136d",
             "gk_suid": "66981391",
             "gki": '{"feature_project_view":false,'
                    '"feature_discover_login_prompt":false,'
@@ -184,14 +190,267 @@ class BehanceCollectionExtractor(BehanceExtractor):
         self.collection_id = match.group(1)

     def galleries(self):
-        url = "{}/collection/{}/a".format(self.root, self.collection_id)
-        params = {"offset": 0}
-        headers = {"X-Requested-With": "XMLHttpRequest"}
+        url = self.root + "/v3/graphql"
+        headers = {
+            "Origin" : self.root,
+            "Referer": self.root + "/collection/" + self.collection_id,
+            "X-BCP"  : "4c34489d-914c-46cd-b44c-dfd0e661136d",
"X-NewRelic-ID" : "VgUFVldbGwsFU1BRDwUBVw==", + "X-Requested-With": "XMLHttpRequest", + } + cookies = { + "bcp" : "4c34489d-914c-46cd-b44c-dfd0e661136d", + "gk_suid": "66981391", + "ilo0" : "true", + } + + query = """ +query GetMoodboardItemsAndRecommendations( + $id: Int! + $firstItem: Int! + $afterItem: String + $shouldGetRecommendations: Boolean! + $shouldGetItems: Boolean! + $shouldGetMoodboardFields: Boolean! + ) { + viewer @include(if: $shouldGetMoodboardFields) { + isOptedOutOfRecommendations + } + moodboard(id: $id) { + ...moodboardFields @include(if: $shouldGetMoodboardFields) + + items(first: $firstItem, after: $afterItem) @include(if: $shouldGetItems) + { + pageInfo { + endCursor + hasNextPage + } + nodes { + ...nodesFields + } + } + + recommendedItems(first: 80) @include(if: $shouldGetRecommendations) { + nodes { + ...nodesFields + fetchSource + } + } + } + } + + fragment moodboardFields on Moodboard { + id + label + privacy + followerCount + isFollowing + projectCount + url + isOwner + owners { + id + displayName + url + firstName + location + locationUrl + images { + size_50 { + url + } + size_100 { + url + } + size_115 { + url + } + size_230 { + url + } + size_138 { + url + } + size_276 { + url + } + } + } + } + + fragment projectFields on Project { + id + isOwner + publishedOn + matureAccess + hasMatureContent + modifiedOn + name + url + isPrivate + slug + fields { + label + } + colors { + r + g + b + } + owners { + url + displayName + id + location + locationUrl + isProfileOwner + images { + size_50 { + url + } + size_100 { + url + } + size_115 { + url + } + size_230 { + url + } + size_138 { + url + } + size_276 { + url + } + } + } + covers { + size_original { + url + } + size_max_808 { + url + } + size_808 { + url + } + size_404 { + url + } + size_202 { + url + } + size_230 { + url + } + size_115 { + url + } + } + stats { + views { + all + } + appreciations { + all + } + comments { + all + } + } + } + + fragment exifDataValueFields on exifDataValue { + id + label + value + searchValue + } + + fragment nodesFields on MoodboardItem { + id + entityType + width + height + flexWidth + flexHeight + images { + size + url + } + + entity { + ... on Project { + ...projectFields + } + + ... on ImageModule { + project { + ...projectFields + } + + exifData { + lens { + ...exifDataValueFields + } + software { + ...exifDataValueFields + } + makeAndModel { + ...exifDataValueFields + } + focalLength { + ...exifDataValueFields + } + iso { + ...exifDataValueFields + } + location { + ...exifDataValueFields + } + flash { + ...exifDataValueFields + } + exposureMode { + ...exifDataValueFields + } + shutterSpeed { + ...exifDataValueFields + } + aperture { + ...exifDataValueFields + } + } + } + + ... 
+    ... on MediaCollectionComponent {
+      project {
+        ...projectFields
+      }
+    }
+  }
+}
+"""
+        variables = {
+            "afterItem": "MAo=",
+            "firstItem": 40,
+            "id"       : self.collection_id,
+            "shouldGetItems"          : True,
+            "shouldGetMoodboardFields": False,
+            "shouldGetRecommendations": False,
+        }
+        data = {"query": query, "variables": variables}

         while True:
-            data = self.request(url, params=params, headers=headers).json()
-            for item in data["items"]:
-                yield item["project"]
-            if len(data["items"]) < 40:
+            items = self.request(
+                url, method="POST", headers=headers,
+                cookies=cookies, json=data,
+            ).json()["data"]["moodboard"]["items"]
+
+            for node in items["nodes"]:
+                yield node["entity"]
+
+            if not items["pageInfo"]["hasNextPage"]:
                 return
-            params["offset"] += len(data["items"])
+            variables["afterItem"] = items["pageInfo"]["endCursor"]
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index a0f4d1c..9cceaee 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -262,9 +262,11 @@ class DeviantartExtractor(Extractor):
                 return folder
         raise exception.NotFoundError("folder")

-    def _folder_urls(self, folders, category):
-        url = "{}/{}/{}/0/".format(self.root, self.user, category)
-        return [(url + folder["name"], folder) for folder in folders]
+    def _folder_urls(self, folders, category, extractor):
+        base = "{}/{}/{}/0/".format(self.root, self.user, category)
+        for folder in folders:
+            folder["_extractor"] = extractor
+            yield base + folder["name"], folder

     def _update_content_default(self, deviation, content):
         public = "premium_folder_data" not in deviation
@@ -450,7 +452,7 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
         if self.flat and not self.group:
             return self.api.gallery_all(self.user, self.offset)
         folders = self.api.gallery_folders(self.user)
-        return self._folder_urls(folders, "gallery")
+        return self._folder_urls(folders, "gallery", DeviantartFolderExtractor)


 class DeviantartFolderExtractor(DeviantartExtractor):
@@ -589,7 +591,8 @@ class DeviantartFavoriteExtractor(DeviantartExtractor):
             self.api.collections(self.user, folder["folderid"])
             for folder in folders
         )
-        return self._folder_urls(folders, "favourites")
+        return self._folder_urls(
+            folders, "favourites", DeviantartCollectionExtractor)


 class DeviantartCollectionExtractor(DeviantartExtractor):
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index cb4df11..06b5ba2 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -392,6 +392,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
     def items(self):
         self.login()
         yield Message.Version, 1
+        data = {"_extractor": ExhentaiGalleryExtractor}

         while True:
             last = None
@@ -402,7 +403,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
                 if url == last:
                     continue
                 last = url
-                yield Message.Queue, url, {}
+                yield Message.Queue, url, data

             if 'class="ptdd">><' in page or ">No hits found</p>" in page:
                 return
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
index a2d8c04..44863a9 100644
--- a/gallery_dl/extractor/fallenangels.py
+++ b/gallery_dl/extractor/fallenangels.py
@@ -66,9 +66,9 @@ class FallenangelsMangaExtractor(MangaExtractor):
     category = "fallenangels"
     pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$"
     test = (
-        ("http://manga.fascans.com/manga/trinity-seven", {
-            "url": "293057f264de6c438b979bd1c3de4719568db452",
-            "keyword": "50e0374dba60734230e4284b5ffdadef5104ae62",
("https://manga.fascans.com/manga/chronos-ruler", { + "url": "eea07dd50f5bc4903aa09e2cc3e45c7241c9a9c2", + "keyword": "c414249525d4c74ad83498b3c59a813557e59d7e", }), ("https://truyen.fascans.com/manga/rakudai-kishi-no-eiyuutan", { "url": "51a731a6b82d5eb7a335fbae6b02d06aeb2ab07b", diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 0ab42db..bf925b6 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -173,8 +173,7 @@ EXTRACTORS = { ), "test-manga": ("https://sensescans.com/reader/series/yotsubato/", { - "url": "305e6eb6160e3bb90c3de39ff5fb7c971e052087", - "keyword": "562fb5a7362a4cb43d59d5c8a6ea8080fc65cf99", + "count": ">= 3", }), }, "_ckey": "chapterclass", diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 950a174..2a5ef6e 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -236,7 +236,9 @@ class FuraffinityPostExtractor(FuraffinityExtractor): pattern = BASE_PATTERN + r"/(?:view|full)/(\d+)" test = ( ("https://www.furaffinity.net/view/21835115/", { - "url": "d80254eb4fba654597b4df8320d55916e11ba375", + "pattern": r"https://d\d*\.facdn\.net/(download/)?art/mirlinthloth" + r"/music/1488278723/1480267446.mirlinthloth_dj_fennmink" + r"_-_bude_s_4_ever\.mp3", "keyword": { "artist" : "mirlinthloth", "artist_url" : "mirlinthloth", diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index ac1bca3..ba2fe5d 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -100,13 +100,13 @@ class GfycatImageExtractor(GfycatExtractor): "gfyName": "GrayGenerousCowrie", "gfyNumber": "755075459", "title": "Bottom's up", - "userName": "jackson3oh3", + "username": "jackson3oh3", "createDate": 1495884169, "md5": "a4796e05b0db9ba9ce5140145cd318aa", "width": 400, "height": 224, - "frameRate": 23, - "numFrames": 158, + "frameRate": 23.0, + "numFrames": 158.0, "views": int, }, }), diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py index 1ab71d6..833135e 100644 --- a/gallery_dl/extractor/hentaicafe.py +++ b/gallery_dl/extractor/hentaicafe.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2019 Mike Fährmann +# Copyright 2018-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -50,17 +50,17 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): # single chapter ("https://hentai.cafe/hazuki-yuuto-summer-blues/", { "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b", - "keyword": "eb9f98544098c961bd8cf5dbe69e6da51c4fb2f6", + "keyword": "5af1c570bb5f533a32b3375f9cdaa17a0152ba67", }), # multi-chapter ("https://hentai.cafe/saitom-saitom-box/", { "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", - "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb", + "keyword": "3c28517d356cac6acbd9895c9eeefae505304078", }), # new-style URL ("https://hentai.cafe/hc.fyi/2782", { "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", - "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb", + "keyword": "3c28517d356cac6acbd9895c9eeefae505304078", }), # foolslide URL ("https://hentai.cafe/manga/series/saitom-box/", { @@ -80,12 +80,14 @@ class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): chapters.reverse() return chapters - tags , pos = text.extract(page, "<p>Tags: ", "</br>") + url , pos = text.extract(page, '<link rel="canonical" href="', '"') + tags , pos 
+        tags  , pos = text.extract(page, "<p>Tags: ", "</br>", pos)
         artist, pos = text.extract(page, "\nArtists: ", "</br>", pos)
         manga , pos = text.extract(page, "/manga/read/", "/", pos)
         data = {
-            "tags"  : text.split_html(tags)[::2],
-            "artist": text.split_html(artist),
+            "manga_id": text.parse_int(url.rpartition("/")[2]),
+            "tags"    : text.split_html(tags)[::2],
+            "artist"  : text.split_html(artist),
         }
         HentaicafeChapterExtractor._data(manga).update(data)
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 6e82091..5eb46b6 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -34,7 +34,7 @@ class HentaifoundryExtractor(Extractor):
         yield Message.Directory, data
         self.set_filters()
-        for page_url in util.advance(self.get_image_pages(), self.start_post):
+        for page_url in util.advance(self._pagination(), self.start_post):
             image = self.get_image_metadata(page_url)
             image.update(data)
             yield Message.Url, image["src"], image
@@ -50,13 +50,12 @@ class HentaifoundryExtractor(Extractor):
         self.request(self.root + "/?enterAgree=1")
         return {"user": self.user}

-    def get_image_pages(self):
-        """Yield urls of all relevant image pages"""
+    def _pagination(self, begin='thumbTitle"><a href="', end='"'):
         num = self.start_page

         while True:
             page = self.request("{}/page/{}".format(self.page_url, num)).text
-            yield from text.extract_iter(page, 'thumbTitle"><a href="', '"')
+            yield from text.extract_iter(page, begin, end)

             if 'class="pager"' not in page or 'class="last hidden"' in page:
                 return
@@ -90,6 +89,33 @@ class HentaifoundryExtractor(Extractor):

         return text.nameext_from_url(data["src"], data)

+    def get_story_metadata(self, html):
+        """Collect url and metadata for a story"""
+        extr = text.extract_from(html)
+        data = {
+            "user"  : self.user,
+            "title" : text.unescape(extr(
+                "<div class='titlebar'>", "</a>").rpartition(">")[2]),
+            "author": text.unescape(extr('alt="', '"')),
+            "date"  : text.parse_datetime(extr(
+                ">Updated<", "</span>").rpartition(">")[2], "%B %d, %Y"),
+            "status": extr("class='indent'>", "<"),
+        }
+
+        for c in ("Chapters", "Words", "Comments", "Views", "Rating"):
+            data[c.lower()] = text.parse_int(extr(
+                ">" + c + ":</span>", "<").replace(",", ""))
+
+        data["description"] = text.unescape(extr(
+            "class='storyDescript'>", "<div"))
+        path = extr('href="', '"')
+        data["src"] = self.root + path
+        data["index"] = text.parse_int(path.rsplit("/", 2)[1])
+        data["ratings"] = [text.unescape(r) for r in text.extract_iter(extr(
+            "class='ratings_box'", "</div>"), "title='", "'")]
+
+        return text.nameext_from_url(data["src"], data)
+
     def set_filters(self):
         """Set site-internal filters to show all images"""
         token = text.unquote(text.extract(
@@ -127,19 +153,41 @@ class HentaifoundryUserExtractor(HentaifoundryExtractor):
     """Extractor for all images of a hentai-foundry-user"""
     subcategory = "user"
     pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
-               r"/(?:pictures/user/([^/]+)(?:/page/(\d+))?/?$"
-               r"|user/([^/]+)/profile)")
+               r"/user/([^/]+)/profile")
+    test = ("https://www.hentai-foundry.com/user/Tenpura/profile",)
+
+    def __init__(self, match):
+        HentaifoundryExtractor.__init__(self, match, match.group(1))
+
+    def items(self):
+        user = "/user/" + self.user
+        return self._dispatch_extractors((
+            (HentaifoundryPicturesExtractor ,
+             self.root + "/pictures" + user),
+            (HentaifoundryScrapsExtractor,
+             self.root + "/pictures" + user + "/scraps"),
+            (HentaifoundryStoriesExtractor,
+             self.root + "/stories" + user),
+            (HentaifoundryFavoriteExtractor,
+             self.root + user + "/faves/pictures"),
+        ), ("pictures",))
+
+
+class HentaifoundryPicturesExtractor(HentaifoundryExtractor):
+    """Extractor for all pictures of a hentaifoundry user"""
+    subcategory = "pictures"
+    pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+               r"/pictures/user/([^/]+)(?:/page/(\d+))?/?$")
     test = (
         ("https://www.hentai-foundry.com/pictures/user/Tenpura", {
             "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28",
         }),
         ("https://www.hentai-foundry.com/pictures/user/Tenpura/page/3"),
-        ("https://www.hentai-foundry.com/user/Tenpura/profile"),
     )

     def __init__(self, match):
         HentaifoundryExtractor.__init__(
-            self, match, match.group(1) or match.group(3), match.group(2))
+            self, match, match.group(1), match.group(2))
         self.page_url = "{}/pictures/user/{}".format(self.root, self.user)

     def get_job_metadata(self):
@@ -284,3 +332,68 @@ class HentaifoundryImageExtractor(HentaifoundryExtractor):

     def skip(self, _):
         return 0
+
+
+class HentaifoundryStoriesExtractor(HentaifoundryExtractor):
+    """Extractor for stories of a hentai-foundry user"""
+    subcategory = "stories"
+    pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+               r"/stories/user/([^/]+)(?:/page/(\d+))?/?$")
+    test = ("https://www.hentai-foundry.com/stories/user/SnowWolf35", {
+        "count": ">= 35",
+        "keyword": {
+            "author"     : "SnowWolf35",
+            "chapters"   : int,
+            "comments"   : int,
+            "date"       : "type:datetime",
+            "description": str,
+            "index"      : int,
+            "rating"     : int,
+            "ratings"    : list,
+            "status"     : "re:(Inc|C)omplete",
+            "title"      : str,
+            "user"       : "SnowWolf35",
+            "views"      : int,
+            "words"      : int,
+        },
+    })
+
+    def __init__(self, match):
+        HentaifoundryExtractor.__init__(self, match, match.group(1))
+        self.page_url = "{}/stories/user/{}".format(self.root, self.user)
+
+    def items(self):
+        self.get_job_metadata()
+        self.set_filters()
+        stories = self._pagination('<div class="storyRow">', '</tr></table>')
+        for story_html in util.advance(stories, self.start_post):
+            story = self.get_story_metadata(story_html)
+            yield Message.Directory, story
+            yield Message.Url, story["src"], story
+
+
+class HentaifoundryStoryExtractor(HentaifoundryExtractor):
+    """Extractor for a hentaifoundry story"""
+    subcategory = "story"
+    pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com"
+               r"/stories/user/([^/]+)/(\d+)")
+    test = (("https://www.hentai-foundry.com/stories/user/SnowWolf35"
+             "/26416/Overwatch-High-Chapter-Voting-Location"), {
+        "url": "5a67cfa8c3bf7634c8af8485dd07c1ea74ee0ae8",
+        "keyword": {"title": "Overwatch High Chapter Voting Location"},
+    })
+
+    def __init__(self, match):
+        HentaifoundryExtractor.__init__(self, match, match.group(1))
+        self.index = match.group(2)
+
+    def items(self):
+        story_url = "{}/stories/user/{}/{}/x?enterAgree=1".format(
+            self.root, self.user, self.index)
+        page = self.request(story_url).text
+        story = self.get_story_metadata(page)
+        yield Message.Directory, story
+        yield Message.Url, story["src"], story
+
+    def skip(self, _):
+        return 0
diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py
deleted file mode 100644
index 348453d..0000000
--- a/gallery_dl/extractor/kissmanga.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2020 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
- -"""Extract manga-chapters and entire manga from https://kissmanga.com/""" - -from .common import ChapterExtractor, MangaExtractor, Extractor -from .. import text, aes, exception -from ..cache import cache -import hashlib -import ast -import re - - -class RedirectMixin(): - """Detect and handle redirects to CAPTCHA pages""" - - def request(self, url, **kwargs): - while True: - response = Extractor.request(self, url, **kwargs) - if not response.history or "/AreYouHuman" not in response.url: - return response - if self.config("captcha", "stop") == "wait": - self.log.warning( - "Redirect to \n%s\nVisit this URL in your browser, solve " - "the CAPTCHA, and press ENTER to continue", response.url) - try: - input() - except (EOFError, OSError): - pass - else: - raise exception.StopExtraction( - "Redirect to \n%s\nVisit this URL in your browser and " - "solve the CAPTCHA to continue", response.url) - - -class KissmangaBase(RedirectMixin): - """Base class for kissmanga extractors""" - category = "kissmanga" - archive_fmt = "{chapter_id}_{page}" - root = "https://kissmanga.com" - - @staticmethod - def parse_chapter_string(data): - """Parse 'chapter_string' value contained in 'data'""" - data["chapter_string"] = text.unescape(data["chapter_string"]) - - match = re.match(( - r"(?:[Vv]ol\.0*(\d+) )?" - r"(?:[Cc]h\.)?0*(\d+)" - r"(?:[.:]0*(\d+))?" - r"(?: *[:-]? *(.+))?" - ), data["chapter_string"]) - - if not match: - match = re.match(( - r".+?(?: -)? ()" - r"0*(\d+)(?:[Vv.]0*(\d+))?" - r"(?: *[:-]? *(.+))?" - ), data["chapter_string"]) - - if match: - volume, chapter, minor, title = match.groups() - else: - volume, chapter, minor, title = 0, 0, "", data["chapter_string"] - - data["volume"] = text.parse_int(volume) - data["chapter"] = text.parse_int(chapter) - data["chapter_minor"] = "." 
+ minor if minor else "" - data["title"] = title if title and title != "Read Online" else "" - return data - - -class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): - """Extractor for manga-chapters from kissmanga.com""" - pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" - r"(/Manga/[^/?&#]+/[^/?&#]+\?id=(\d+))") - test = ( - ("https://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", { - "url": "46e63fd63e9e16f19bc1e6c7a45dc060815642fd", - "keyword": "1cd0b5214ac7ae4d53e2fd8fec40ceec84cd09bf", - }), - ("https://kissmanga.com/Manga/Urban-Tales/a?id=256717", { - "url": "c26be8bf9c2abacee2076979d021634092cf38f1", - "keyword": "e1d16780df8e04076ed2b5f0637c5b710ec2f2ea", - }), - ("https://kissmanga.com/Manga/Monster/Monster-79?id=7608", { - "count": 23, - "keyword": "f433a7a8fae840e17dace316a243fa27faab86de", - }), - ("https://kissmanga.com/Manga/Houseki-no-Kuni/Oneshot?id=404189", { - "count": 49, - "keyword": "cea131c9fe9c71309b3270cd86718d4d1198c31c", - }), - ("https://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608"), - ) - - def __init__(self, match): - ChapterExtractor.__init__(self, match) - self.chapter_id = match.group(2) - self.session.headers["Referer"] = self.root - - def metadata(self, page): - title = text.extract(page, "<title>", "</title>")[0].strip() - manga, cinfo = title.split("\n")[1:3] - data = { - "manga": manga.strip(), - "chapter_string": cinfo.strip(), - "chapter_id": text.parse_int(self.chapter_id), - "lang": "en", - "language": "English", - } - return self.parse_chapter_string(data) - - def images(self, page): - self.session.headers["Referer"] = None - try: - key = self.build_aes_key(page) - iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0, - 0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3) - return [ - (aes.aes_cbc_decrypt_text( - data, key, iv).partition("&")[0], None) - for data in text.extract_iter( - page, 'push(wrapKA("', '"' - ) - ] - except UnicodeDecodeError: - self.log.error("Failed to decrypt image URLs") - except (ValueError, IndexError): - self.log.error("Failed to get AES key") - return [] - - def build_aes_key(self, page): - chko = self._chko_from_external_script() - - for script in self._scripts(page): - for stmt in [s.strip() for s in script.split(";")]: - - if stmt.startswith("var _"): - name, _, value = stmt[4:].partition(" = ") - name += "[0]" - value = ast.literal_eval(value)[0] - - elif stmt.startswith("chko = "): - stmt = stmt[7:] - if stmt == name: - chko = value - elif stmt == "chko + " + name: - chko = chko + value - elif stmt == name + " + chko": - chko = value + chko - else: - self.log.warning("unrecognized expression: '%s'", stmt) - - elif stmt.startswith("key = "): - pass - - else: - self.log.warning("unrecognized statement: '%s'", stmt) - - return list(hashlib.sha256(chko.encode("ascii")).digest()) - - @staticmethod - def _scripts(page): - end = 0 - while True: - pos = page.find("key = ", end) - if pos == -1: - return - beg = page.rindex('<script type="text/javascript">', 0, pos) + 31 - end = page.index('</script>', pos) - yield page[beg:end] - - @cache(maxage=3600) - def _chko_from_external_script(self): - script = self.request(self.root + "/Scripts/lo.js").text - - pos = script.index("var chko") - var = text.extract(script, "=", "[", pos)[0].lstrip() - idx = text.extract(script, "[", "]", pos)[0] - - pos = script.index(var) - lst = text.extract(script, "=", ";", pos)[0] - return ast.literal_eval(lst.strip())[int(idx)] - - -class KissmangaMangaExtractor(KissmangaBase, MangaExtractor): - """Extractor for 
manga from kissmanga.com""" - chapterclass = KissmangaChapterExtractor - pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" - r"(/Manga/[^/?&#]+/?)$") - test = ( - ("https://kissmanga.com/Manga/Dropout", { - "url": "9e3a6f715b229aa3fafa42a1d5da5d65614cb532", - "keyword": "32b09711c28b481845acc32e3bb6054cfc90224d", - }), - ("https://kissmanga.com/manga/feng-shen-ji"), # lowercase - ) - - def chapters(self, page): - results = [] - manga, pos = text.extract(page, ' class="barTitle">', '\ninformation') - page , pos = text.extract(page, ' class="listing">', '</table>', pos) - manga = manga.strip() - needle = '" title="Read ' + manga + ' ' - manga = text.unescape(manga) - - for item in text.extract_iter(page, '<a href="', ' online">'): - url, _, chapter = item.partition(needle) - data = { - "manga": manga, "chapter_string": chapter, - "chapter_id": text.parse_int(url.rpartition("=")[2]), - "lang": "en", "language": "English", - } - self.parse_chapter_string(data) - results.append((self.root + url, data)) - return results diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index 0e04f97..5743498 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -167,6 +167,8 @@ class MangoxoChannelExtractor(MangoxoExtractor): self.login() num = total = 1 url = "{}/channel/{}/album/".format(self.root, self.channel_id) + data = {"_extractor": MangoxoAlbumExtractor} + yield Message.Version, 1 while True: @@ -174,7 +176,7 @@ class MangoxoChannelExtractor(MangoxoExtractor): for album in text.extract_iter( page, '<a class="link black" href="', '"'): - yield Message.Queue, album, {} + yield Message.Queue, album, data if num == 1: total = self._total_pages(page) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 19a2b92..f9dc886 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -19,8 +19,8 @@ class NewgroundsExtractor(Extractor): """Base class for newgrounds extractors""" category = "newgrounds" directory_fmt = ("{category}", "{artist[:10]:J, }") - filename_fmt = "{category}_{index}_{title}.{extension}" - archive_fmt = "{index}" + filename_fmt = "{category}_{_index}_{title}.{extension}" + archive_fmt = "{_index}" root = "https://www.newgrounds.com" cookiedomain = ".newgrounds.com" cookienames = ("NG_GG_username", "vmk1du5I8m") @@ -44,6 +44,13 @@ class NewgroundsExtractor(Extractor): if url: yield Message.Directory, post yield Message.Url, url, text.nameext_from_url(url, post) + + for num, url in enumerate(text.extract_iter( + post["_comment"], 'data-smartload-src="', '"'), 1): + post["num"] = num + post["_index"] = "{}_{:>02}".format(post["index"], num) + text.nameext_from_url(url, post) + yield Message.Url, url, post else: self.log.warning( "Unable to get download URL for '%s'", post_url) @@ -97,8 +104,9 @@ class NewgroundsExtractor(Extractor): else: data = self._extract_media_data(extr, post_url) - data["comment"] = text.unescape(text.remove_html(extr( - 'id="author_comments">', '</div>'), "", "")) + data["_comment"] = extr('id="author_comments"', '</div>') + data["comment"] = text.unescape(text.remove_html( + data["_comment"].partition(">")[2], "", "")) data["favorites"] = text.parse_int(extr( 'id="faves_load">', '<').replace(",", "")) data["score"] = text.parse_float(extr('id="score_number">', '<')) @@ -125,33 +133,54 @@ class NewgroundsExtractor(Extractor): "width" : text.parse_int(full('width="', '"')), "height" : text.parse_int(full('height="', '"')), } - 
data["index"] = text.parse_int( - data["url"].rpartition("/")[2].partition("_")[0]) + index = data["url"].rpartition("/")[2].partition("_")[0] + data["index"] = text.parse_int(index) + data["_index"] = index return data @staticmethod def _extract_audio_data(extr, url): + index = url.split("/")[5] return { "title" : text.unescape(extr('"og:title" content="', '"')), "description": text.unescape(extr(':description" content="', '"')), "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "url" : extr('{"url":"', '"').replace("\\/", "/"), - "index" : text.parse_int(url.split("/")[5]), + "index" : text.parse_int(index), + "_index" : index, "rating" : "", } - @staticmethod - def _extract_media_data(extr, url): + def _extract_media_data(self, extr, url): + index = url.split("/")[5] + title = extr('"og:title" content="', '"') + src = extr('{"url":"', '"') + + if src: + src = src.replace("\\/", "/") + date = text.parse_datetime(extr( + 'itemprop="datePublished" content="', '"')) + else: + url = self.root + "/portal/video/" + index + headers = { + "Accept": "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Referer": self.root, + } + data = self.request(url, headers=headers).json() + src = data["sources"]["360p"][0]["src"].replace(".360p.", ".") + date = text.parse_timestamp(src.rpartition("?")[2]) + return { - "title" : text.unescape(extr('"og:title" content="', '"')), - "url" : extr('{"url":"', '"').replace("\\/", "/"), - "date" : text.parse_datetime(extr( - 'itemprop="datePublished" content="', '"')), + "title" : text.unescape(title), + "url" : src, + "date" : date, "description": text.unescape(extr( 'itemprop="description" content="', '"')), "rating" : extr('class="rated-', '"'), - "index" : text.parse_int(url.split("/")[5]), + "index" : text.parse_int(index), + "_index" : index, } def _pagination(self, kind): @@ -215,6 +244,10 @@ class NewgroundsImageExtractor(NewgroundsExtractor): ("https://art.ngfiles.com/images/0/94_tomfulp_ryu-is-hawt.gif", { "url": "57f182bcbbf2612690c3a54f16ffa1da5105245e", }), + ("https://www.newgrounds.com/art/view/sailoryon/yon-dream-buster", { + "url": "84eec95e663041a80630df72719f231e157e5f5d", + "count": 2, + }) ) def __init__(self, match): @@ -236,23 +269,21 @@ class NewgroundsMediaExtractor(NewgroundsExtractor): pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" r"(/(?:portal/view|audio/listen)/\d+)") test = ( - ("https://www.newgrounds.com/portal/view/589549", { - "url": "48d916d819c99139e6a3acbbf659a78a867d363e", - "content": "ceb865426727ec887177d99e0d20bb021e8606ae", + ("https://www.newgrounds.com/portal/view/595355", { + "pattern": r"https://uploads\.ungrounded\.net/alternate/564000" + r"/564957_alternate_31\.mp4\?1359712249", "keyword": { - "artist" : ["psychogoldfish", "tomfulp"], - "comment" : "re:People have been asking me how I like the ", - "date" : "dt:2012-02-08 21:40:56", - "description": "re:People have been asking how I like the ", + "artist" : ["kickinthehead", "danpaladin", "tomfulp"], + "comment" : "re:My fan trailer for Alien Hominid HD!", + "date" : "dt:2013-02-01 09:50:49", "favorites" : int, - "filename" : "527818_alternate_1896", - "index" : 589549, - "rating" : "t", + "filename" : "564957_alternate_31", + "index" : 595355, + "rating" : "e", "score" : float, - "tags" : ["newgrounds", "psychogoldfish", - "rage", "redesign-2012"], - "title" : "Redesign Rage", - "user" : "psychogoldfish", + "tags" : ["alienhominid", "trailer"], + "title" : "Alien Hominid Fan Trailer", + 
"user" : "kickinthehead", }, }), ("https://www.newgrounds.com/audio/listen/609768", { diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index aae17a3..2394acf 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2019 Mike Fährmann +# Copyright 2015-2020 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -127,9 +127,25 @@ class NijieExtractor(AsynchronousMixin, Extractor): class NijieUserExtractor(NijieExtractor): - """Extractor for works of a nijie-user""" + """Extractor for nijie user profiles""" subcategory = "user" - pattern = BASE_PATTERN + r"/members(?:_illust)?\.php\?id=(\d+)" + cookiedomain = None + pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)" + test = ("https://nijie.info/members.php?id=44",) + + def items(self): + base = "{}/{{}}.php?id={}".format(self.root, self.user_id) + return self._dispatch_extractors(( + (NijieIllustrationExtractor, base.format("members_illust")), + (NijieDoujinExtractor , base.format("members_dojin")), + (NijieFavoriteExtractor , base.format("user_like_illust_view")), + ), ("illustration", "doujin")) + + +class NijieIllustrationExtractor(NijieExtractor): + """Extractor for all illustrations of a nijie-user""" + subcategory = "illustration" + pattern = BASE_PATTERN + r"/members_illust\.php\?id=(\d+)" test = ( ("https://nijie.info/members_illust.php?id=44", { "url": "66c4ff94c6e77c0765dd88f2d8c663055fda573e", @@ -152,7 +168,6 @@ class NijieUserExtractor(NijieExtractor): ("https://nijie.info/members_illust.php?id=43", { "exception": exception.NotFoundError, }), - ("https://nijie.info/members.php?id=44"), ) def image_ids(self): diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py index abf88cd..5e7e387 100644 --- a/gallery_dl/extractor/nozomi.py +++ b/gallery_dl/extractor/nozomi.py @@ -106,7 +106,7 @@ class NozomiPostExtractor(NozomiExtractor): # multiple images per post ("https://nozomi.la/post/25588032.html", { "url": "6aa3b7db385abcc9d374bdffd19187bccbf8f228", - "keyword": "0aa99cbaaeada2984a1fbf912274409c6ba106d4", + "keyword": "8c3a2561ccc9ad429be9850d1383a952d0b4a8ab", "count": 7, }), ) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index c07c4b7..6d7b27a 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -180,16 +180,11 @@ class OAuthBase(Extractor): self.send(msg) def _generate_message(self, names, values): - if len(names) == 1: - _vh = "This value has" - _is = "is" - _it = "it" - _va = "this value" - else: - _vh = "These values have" - _is = "are" - _it = "them" - _va = "these values" + _vh, _va, _is, _it = ( + ("This value has", "this value", "is", "it") + if len(names) == 1 else + ("These values have", "these values", "are", "them") + ) msg = "\nYour {} {}\n\n{}\n\n".format( " and ".join("'" + n + "'" for n in names), @@ -197,23 +192,21 @@ class OAuthBase(Extractor): "\n".join(values), ) - if self.cache: - opt = self.oauth_config(names[0]) - if opt is None or opt == "cache": - msg += _vh + " been cached and will automatically be used." - else: - msg += ( - "Set 'extractor.{}.{}' to \"cache\" to use {}.".format( - self.subcategory, names[0], _it, - ) - ) + opt = self.oauth_config(names[0]) + if self.cache and (opt is None or opt == "cache"): + msg += _vh + " been cached and will automatically be used." 
         else:
             msg += "Put " + _va + " into your configuration file as \n"
             msg += " and\n".join(
                 "'extractor." + self.subcategory + "." + n + "'"
                 for n in names
             )
-            msg += "."
+            if self.cache:
+                msg += (
+                    "\nor set\n'extractor.{}.{}' to \"cache\""
+                    .format(self.subcategory, names[0])
+                )
+            msg += "\nto use {}.".format(_it)

         return msg
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 8f2d633..f08055c 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -95,8 +95,8 @@ class PahealPostExtractor(PahealExtractor):
     pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
                r"/post/view/(\d+)")
     test = ("https://rule34.paheal.net/post/view/481609", {
-        "url": "d3fd0f82762716fe3fb03c9c923e61c13ce22204",
-        "keyword": "35748081bfeaab48f909f4b097a4d79b2be12538",
+        "url": "a91d579be030753282f55b8cb4eeaa89c45a9116",
+        "keyword": "44154bdac3d6cf289d0d9739a566acd8b7839e50",
         "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
     })
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index eaf97fd..ee8f9bb 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -105,7 +105,7 @@ class PixivUserExtractor(PixivExtractor):
         # avatar (#595, 623)
         ("https://www.pixiv.net/en/users/173530", {
             "options": (("avatar", True),),
-            "content": "22af450d4dbaf4973d370f164f66f48c7382a6de",
+            "content": "4e57544480cc2036ea9608103e8f024fa737fe66",
             "range": "1",
         }),
         # deleted account
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index bbbc709..6b36cdd 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -29,9 +29,9 @@ class PornhubGalleryExtractor(PornhubExtractor):
     archive_fmt = "{id}"
     pattern = BASE_PATTERN + r"/album/(\d+)"
     test = (
-        ("https://www.pornhub.com/album/1708982", {
+        ("https://www.pornhub.com/album/17218841", {
             "pattern": r"https://\w+.phncdn.com/pics/albums/\d+/\d+/\d+/\d+/",
-            "count": 93,
+            "count": 81,
             "keyword": {
                 "id": int,
                 "num": int,
@@ -40,11 +40,11 @@ class PornhubGalleryExtractor(PornhubExtractor):
                 "caption": str,
                 "user": "Unknown",
                 "gallery": {
-                    "id"   : 1708982,
+                    "id"   : 17218841,
                     "score": int,
                     "views": int,
                     "tags" : list,
-                    "title": "Random Hentai",
+                    "title": "Hentai/Ecchi 41",
                 },
             },
         }),
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 8290d2d..e5b4b44 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -16,7 +16,7 @@ import time
 import json


-BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)"
+BASE_PATTERN = r"(?:https?://)?((?:[^/.]+\.)?reactor\.cc)"


 class ReactorExtractor(SharedConfigMixin, Extractor):
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index dda4809..7030c81 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -1,20 +1,19 @@
 # -*- coding: utf-8 -*-

-# Copyright 2016-2019 Mike Fährmann
+# Copyright 2016-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.

-"""Extract comic-issues and entire comics from https://readcomiconline.to/"""
+"""Extractors for https://readcomiconline.to/"""

-from .common import ChapterExtractor, MangaExtractor
-from .kissmanga import RedirectMixin
-from .. import text
+from .common import Extractor, ChapterExtractor, MangaExtractor
+from .. import text, exception
 import re


-class ReadcomiconlineBase(RedirectMixin):
+class ReadcomiconlineBase():
     """Base class for readcomiconline extractors"""
     category = "readcomiconline"
     directory_fmt = ("{category}", "{comic}", "{issue:>03}")
@@ -22,6 +21,25 @@
     archive_fmt = "{issue_id}_{page}"
     root = "https://readcomiconline.to"

+    def request(self, url, **kwargs):
+        """Detect and handle redirects to CAPTCHA pages"""
+        while True:
+            response = Extractor.request(self, url, **kwargs)
+            if not response.history or "/AreYouHuman" not in response.url:
+                return response
+            if self.config("captcha", "stop") == "wait":
+                self.log.warning(
+                    "Redirect to \n%s\nVisit this URL in your browser, solve "
+                    "the CAPTCHA, and press ENTER to continue", response.url)
+                try:
+                    input()
+                except (EOFError, OSError):
+                    pass
+            else:
+                raise exception.StopExtraction(
+                    "Redirect to \n%s\nVisit this URL in your browser and "
+                    "solve the CAPTCHA to continue", response.url)
+

 class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
     """Extractor for comic-issues from readcomiconline.to"""
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index b07d024..a9252f5 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -152,7 +152,7 @@ class SankakuTagExtractor(SankakuExtractor):
     test = (
         ("https://chan.sankakucomplex.com/?tags=bonocho", {
             "count": 5,
-            "pattern": r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
+            "pattern": r"https://c?s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
                        r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
         }),
         # respect 'page' query parameter
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 236a001..c98a300 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -110,16 +110,17 @@ class TwitterExtractor(Extractor):
         twitpics = []
         for url in tweet["entities"].get("urls", ()):
             url = url["expanded_url"]
-            if "//twitpic.com/" in url:
+            if "//twitpic.com/" in url and "/photos/" not in url:
                 response = self.request(url, fatal=False)
                 if response.status_code >= 400:
                     continue
                 url = text.extract(
                     response.text, 'name="twitter:image" value="', '"')[0]
-                twitpics.append({
-                    "original_info": {},
-                    "media_url"    : url,
-                })
+                if url:
+                    twitpics.append({
+                        "original_info": {},
+                        "media_url"    : url,
+                    })
         if twitpics:
             if "extended_entities" in tweet:
                 tweet["extended_entities"]["media"].extend(twitpics)
@@ -312,6 +313,7 @@ class TwitterSearchExtractor(TwitterExtractor):
     test = ("https://twitter.com/search?q=nature", {
         "range": "1-40",
         "count": 40,
+        "archive": False,
     })

     def metadata(self):
@@ -378,6 +380,15 @@ class TwitterTweetExtractor(TwitterExtractor):
             "url": "0f6a841e23948e4320af7ae41125e0c5b3cadc98",
             "content": "f29501e44d88437fe460f5c927b7543fda0f6e34",
         }),
+        # original retweets (#1026)
+        ("https://twitter.com/jessica_3978/status/1296304589591810048", {
+            "options": (("retweets", "original"),),
+            "count": 2,
+            "keyword": {
+                "tweet_id": 1296296016002547713,
+                "date"    : "dt:2020-08-20 04:00:28",
+            },
+        }),
     )

     def __init__(self, match):
@@ -451,7 +462,8 @@ class TwitterAPI():
         endpoint = "2/timeline/conversation/{}.json".format(tweet_id)
         tweets = []
         for tweet in self._pagination(endpoint):
-            if tweet["id_str"] == tweet_id:
+            if tweet["id_str"] == tweet_id or \
+                    tweet.get("_retweet_id_str") == tweet_id:
                 tweets.append(tweet)
                 if "quoted_status_id_str" in tweet:
                     tweet_id = tweet["quoted_status_id_str"]
@@ -536,6 +548,7 @@
entry_tweet="tweet-", entry_cursor="cursor-bottom-"): if params is None: params = self.params.copy() + original_retweets = (self.extractor.retweets == "original") while True: cursor = tweet = None @@ -558,12 +571,17 @@ class TwitterAPI(): "Skipping %s (deleted)", entry["entryId"][len(entry_tweet):]) continue - tweet["user"] = users[tweet["user_id_str"]] if "retweeted_status_id_str" in tweet: retweet = tweets.get(tweet["retweeted_status_id_str"]) - if retweet: + if original_retweets: + if not retweet: + continue + retweet["_retweet_id_str"] = tweet["id_str"] + tweet = retweet + elif retweet: tweet["author"] = users[retweet["user_id_str"]] + tweet["user"] = users[tweet["user_id_str"]] yield tweet if "quoted_status_id_str" in tweet: diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py new file mode 100644 index 0000000..a39fbf1 --- /dev/null +++ b/gallery_dl/extractor/weasyl.py @@ -0,0 +1,236 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.weasyl.com/""" + +from .common import Extractor, Message +from .. import text + +BASE_PATTERN = r"(?:https://)?(?:www\.)?weasyl.com/" + + +class WeasylExtractor(Extractor): + category = "weasyl" + directory_fmt = ("{category}", "{owner_login}") + filename_fmt = "{submitid} {title}.{extension}" + archive_fmt = "{submitid}" + root = "https://www.weasyl.com" + + @staticmethod + def populate_submission(data): + # Some submissions don't have content and can be skipped + if "submission" in data["media"]: + data["url"] = data["media"]["submission"][0]["url"] + data["date"] = text.parse_datetime( + data["posted_at"][:19], "%Y-%m-%dT%H:%M:%S") + text.nameext_from_url(data["url"], data) + return True + return False + + def request_submission(self, submitid): + return self.request( + "{}/api/submissions/{}/view".format(self.root, submitid)).json() + + def retrieve_journal(self, journalid): + data = self.request( + "{}/api/journals/{}/view".format(self.root, journalid)).json() + data["extension"] = "html" + data["html"] = "text:" + data["content"] + data["date"] = text.parse_datetime(data["posted_at"]) + return data + + def submissions(self, owner_login, folderid=None): + url = "{}/api/users/{}/gallery".format(self.root, owner_login) + params = { + "nextid" : None, + "folderid": folderid, + } + + while True: + data = self.request(url, params=params).json() + for submission in data["submissions"]: + if self.populate_submission(submission): + submission["folderid"] = folderid + # Do any submissions have more than one url? If so + # a urllist of the submission array urls would work. 
+                    yield Message.Url, submission["url"], submission
+            if not data["nextid"]:
+                return
+            params["nextid"] = data["nextid"]
+
+
+class WeasylSubmissionExtractor(WeasylExtractor):
+    subcategory = "submission"
+    pattern = BASE_PATTERN + r"(?:~[\w-]+/submissions|submission)/(\d+)"
+    test = (
+        ("https://www.weasyl.com/~fiz/submissions/2031/a-wesley", {
+            "pattern": "https://cdn.weasyl.com/~fiz/submissions/2031/41ebc1c29"
+                       "40be928532785dfbf35c37622664d2fbb8114c3b063df969562fc5"
+                       "1/fiz-a-wesley.png",
+            "keyword": {
+                "comments"    : int,
+                "date"        : "dt:2012-04-20 00:38:04",
+                "description" : "<p>(flex)</p>",
+                "favorites"   : int,
+                "folder_name" : "Wesley Stuff",
+                "folderid"    : 2081,
+                "friends_only": False,
+                "owner"       : "Fiz",
+                "owner_login" : "fiz",
+                "rating"      : "general",
+                "submitid"    : 2031,
+                "subtype"     : "visual",
+                "tags"        : list,
+                "title"       : "A Wesley!",
+                "type"        : "submission",
+                "views"       : int,
+            },
+        }),
+        ("https://www.weasyl.com/submission/2031/a-wesley"),
+    )
+
+    def __init__(self, match):
+        WeasylExtractor.__init__(self, match)
+        self.submitid = match.group(1)
+
+    def items(self):
+        data = self.request_submission(self.submitid)
+        if self.populate_submission(data):
+            yield Message.Directory, data
+            yield Message.Url, data["url"], data
+
+
+class WeasylSubmissionsExtractor(WeasylExtractor):
+    subcategory = "submissions"
+    pattern = BASE_PATTERN + r"(?:~|submissions/)([\w-]+)/?$"
+    test = (
+        ("https://www.weasyl.com/~tanidareal", {
+            "count": ">= 200"
+        }),
+        ("https://www.weasyl.com/submissions/tanidareal"),
+    )
+
+    def __init__(self, match):
+        WeasylExtractor.__init__(self, match)
+        self.owner_login = match.group(1)
+
+    def items(self):
+        yield Message.Version, 1
+        yield Message.Directory, {"owner_login": self.owner_login}
+        yield from self.submissions(self.owner_login)
+
+
+class WeasylFolderExtractor(WeasylExtractor):
+    subcategory = "folder"
+    directory_fmt = ("{category}", "{owner_login}", "{folder_name}")
+    pattern = BASE_PATTERN + r"submissions/([\w-]+)\?folderid=(\d+)"
+    test = ("https://www.weasyl.com/submissions/tanidareal?folderid=7403", {
+        "count": ">= 12"
+    })
+
+    def __init__(self, match):
+        WeasylExtractor.__init__(self, match)
+        self.owner_login, self.folderid = match.groups()
+
+    def items(self):
+        yield Message.Version, 1
+        iter = self.submissions(self.owner_login, self.folderid)
+        # Folder names are only on single submission api calls
+        msg, url, data = next(iter)
+        details = self.request_submission(data["submitid"])
+        yield Message.Directory, details
+        yield msg, url, data
+        yield from iter
+
+
+class WeasylJournalExtractor(WeasylExtractor):
+    subcategory = "journal"
+    filename_fmt = "{journalid} {title}.{extension}"
+    archive_fmt = "{journalid}"
+    pattern = BASE_PATTERN + r"journal/(\d+)"
+    test = ("https://www.weasyl.com/journal/17647/bbcode", {
+        "keyword": {
+            "title"  : "BBCode",
+            "date"   : "dt:2013-09-19 23:11:23",
+            "content": "<p><a>javascript:alert(42);</a></p>"
+                       "<p>No more of that!</p>",
+        },
+    })
+
+    def __init__(self, match):
+        WeasylExtractor.__init__(self, match)
+        self.journalid = match.group(1)
+
+    def items(self):
+        data = self.retrieve_journal(self.journalid)
+        yield Message.Version, 1
+        yield Message.Directory, data
+        yield Message.Url, data["html"], data
+
+
+class WeasylJournalsExtractor(WeasylExtractor):
+    subcategory = "journals"
+    filename_fmt = "{journalid} {title}.{extension}"
+    archive_fmt = "{journalid}"
+    pattern = BASE_PATTERN + r"journals/([\w-]+)"
+    test = ("https://www.weasyl.com/journals/charmander", {
+        "count": ">= 2",
+    })
+
+    def __init__(self, match):
+        WeasylExtractor.__init__(self, match)
+        self.owner_login = match.group(1)
+
+    def items(self):
+        yield Message.Version, 1
+        yield Message.Directory, {"owner_login": self.owner_login}
+
+        url = "{}/journals/{}".format(self.root, self.owner_login)
+        page = self.request(url).text
+        for journalid in text.extract_iter(page, 'href="/journal/', '/'):
+            data = self.retrieve_journal(journalid)
+            yield Message.Url, data["html"], data
+
+
+class WeasylFavoriteExtractor(WeasylExtractor):
+    subcategory = "favorite"
+    directory_fmt = ("{category}", "{owner_login}", "Favorites")
+    pattern = BASE_PATTERN + r"favorites\?userid=(\d+)&feature=submit"
+    test = ("https://www.weasyl.com/favorites?userid=184616&feature=submit", {
+        "count": ">= 5",
+    })
+
+    def __init__(self, match):
+        WeasylExtractor.__init__(self, match)
+        self.userid = match.group(1)
+
+    def items(self):
+        owner_login = lastid = None
+        url = self.root + "/favorites"
+        params = {
+            "userid" : self.userid,
+            "feature": "submit",
+        }
+
+        while True:
+            page = self.request(url, params=params).text
+            pos = page.index('id="favorites-content"')
+
+            if not owner_login:
+                owner_login = text.extract(page, '<a href="/~', '"')[0]
+                yield Message.Directory, {"owner_login": owner_login}
+
+            for submitid in text.extract_iter(page, "/submissions/", "/", pos):
+                if submitid == lastid:
+                    continue
+                lastid = submitid
+                submission = self.request_submission(submitid)
+                if self.populate_submission(submission):
+                    yield Message.Url, submission["url"], submission
+
+            if "&nextid=" not in page:
+                return
+            params["nextid"] = submitid
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 0b1b2d9..a325f87 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -47,21 +47,31 @@ class WeiboExtractor(Extractor):
                 file["num"] = num
                 yield Message.Url, file["url"], file

+    def statuses(self):
+        """Returns an iterable containing all relevant 'status' objects"""
+
+    def _status_by_id(self, status_id):
+        url = "{}/detail/{}".format(self.root, status_id)
+        page = self.request(url, fatal=False).text
+        data = text.extract(page, "var $render_data = [", "][0] || {};")[0]
+        return json.loads(data)["status"] if data else None
+
     def _files_from_status(self, status):
-        images = status.pop("pics", ())
         page_info = status.pop("page_info", ())
-
-        for image in images:
-            pid = image["pid"]
-            if "large" in image:
-                image = image["large"]
-            geo = image.get("geo") or {}
-            yield text.nameext_from_url(image["url"], {
-                "url"   : image["url"],
-                "pid"   : pid,
-                "width" : text.parse_int(geo.get("width")),
-                "height": text.parse_int(geo.get("height")),
-            })
+        if "pics" in status:
+            if len(status["pics"]) < status["pic_num"]:
+                status = self._status_by_id(status["id"]) or status
+            for image in status.pop("pics"):
+                pid = image["pid"]
+                if "large" in image:
+                    image = image["large"]
+                geo = image.get("geo") or {}
+                yield text.nameext_from_url(image["url"], {
+                    "url"   : image["url"],
+                    "pid"   : pid,
+                    "width" : text.parse_int(geo.get("width")),
+                    "height": text.parse_int(geo.get("height")),
+                })

         if self.videos and "media_info" in page_info:
             info = page_info["media_info"]
@@ -79,9 +89,6 @@ class WeiboExtractor(Extractor):
             data["_ytdl_extra"] = {"protocol": "m3u8_native"}
             yield data

-    def statuses(self):
-        """Returns an iterable containing all relevant 'status' objects"""
-

 class WeiboUserExtractor(WeiboExtractor):
     """Extractor for all images of a user on weibo.cn"""
@@ -107,13 +114,13 @@ class WeiboUserExtractor(WeiboExtractor):
         while True:
             data = self.request(url, params=params).json()
+            cards = data["data"]["cards"]

-            for card in data["data"]["cards"]:
+            if not cards:
+                return
+            for card in cards:
                 if "mblog" in card:
                     yield card["mblog"]
-
-            if not data["data"]["cards"]:
-                return
             params["page"] += 1
@@ -145,9 +152,7 @@ class WeiboStatusExtractor(WeiboExtractor):
         self.status_id = match.group(1)

     def statuses(self):
-        url = "{}/detail/{}".format(self.root, self.status_id)
-        page = self.request(url, notfound="status").text
-        data = text.extract(page, "var $render_data = [", "][0] || {};")[0]
-        if not data:
+        status = self._status_by_id(self.status_id)
+        if not status:
             raise exception.NotFoundError("status")
-        return (json.loads(data)["status"],)
+        return (status,)
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 2548ead..b7d116a 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -59,13 +59,13 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor):

     def metadata(self, page):
         extr = text.extract_from(page)
+        title = extr('"title":"', '"')
         user = {
             "id"     : text.parse_int(extr('"id_user":', ',')),
             "display": extr('"display":"', '"'),
             "sex"    : extr('"sex":"', '"'),
             "name"   : self.user,
         }
-        title = extr('"title":"', '"')
         user["description"] = extr(
             '<small class="mobile-hide">', '</small>').strip()
         tags = extr('<em>Tagged:</em>', '<').strip()
