diff options
Diffstat (limited to 'gallery_dl/extractor/newgrounds.py')
| -rw-r--r-- | gallery_dl/extractor/newgrounds.py | 321 |
1 files changed, 252 insertions, 69 deletions
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 1ca1073..5454e52 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -9,43 +9,107 @@ """Extractors for https://www.newgrounds.com/""" from .common import Extractor, Message -from .. import text +from .. import text, exception +from ..cache import cache import json class NewgroundsExtractor(Extractor): """Base class for newgrounds extractors""" category = "newgrounds" - directory_fmt = ("{category}", "{user}") + directory_fmt = ("{category}", "{artist[:10]:J, }") filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" + root = "https://www.newgrounds.com" + cookiedomain = ".newgrounds.com" + cookienames = ("NG_GG_username", "vmk1du5I8m") def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) - self.root = "https://{}.newgrounds.com".format(self.user) + self.user_root = "https://{}.newgrounds.com".format(self.user) def items(self): - data = self.get_metadata() + self.login() yield Message.Version, 1 - yield Message.Directory, data - for page_url in self.get_page_urls(): - image = self.parse_page_data(page_url) - image.update(data) - url = image["url"] - yield Message.Url, url, text.nameext_from_url(url, image) + for post_url in self.posts(): + try: + file = self.extract_post(post_url) + url = file["url"] + # except Exception: + except OSError: + url = None + if not url: + self.log.warning("Unable to get download URL for %s", post_url) + continue + yield Message.Directory, file + yield Message.Url, url, text.nameext_from_url(url, file) - def get_metadata(self): - """Collect metadata for extractor-job""" - return {"user": self.user} - - def get_page_urls(self): + def posts(self): """Return urls of all relevant image pages""" + return self._pagination(self.subcategory) + + def login(self): + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + + @cache(maxage=360*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + url = self.root + "/passport/" + page = self.request(url).text + headers = {"Origin": self.root, "Referer": url} + + url = text.urljoin(self.root, text.extract(page, 'action="', '"')[0]) + data = { + "username": username, + "password": password, + "remember": "1", + "login" : "1", + } + + response = self.request(url, method="POST", headers=headers, data=data) + if not response.history: + raise exception.AuthenticationError() + + return { + cookie.name: cookie.value + for cookie in response.history[0].cookies + if cookie.expires and cookie.domain == self.cookiedomain + } + + def extract_post(self, post_url): + page = self.request(post_url).text + extr = text.extract_from(page) + + if "/art/view/" in post_url: + data = self._extract_image_data(extr, post_url) + elif "/audio/listen/" in post_url: + data = self._extract_audio_data(extr, post_url) + else: + data = self._extract_media_data(extr, post_url) + + data["comment"] = text.unescape(text.remove_html(extr( + 'id="author_comments">', '</div>'), "", "")) + data["favorites"] = text.parse_int(extr( + 'id="faves_load">', '<').replace(",", "")) + data["score"] = text.parse_float(extr('id="score_number">', '<')) + data["tags"] = text.split_html(extr( + '<dd class="tags momag">', '</dd>')) + data["artist"] = [ + text.extract(user, '//', '.')[0] + for user in text.extract_iter(page, '<div class="item-user">', '>') + ] + + data["tags"].sort() + data["user"] = self.user or data["artist"][0] + return data - def parse_page_data(self, page_url): - """Collect url and metadata from an image page""" - extr = text.extract_from(self.request(page_url).text) + @staticmethod + def _extract_image_data(extr, url): full = text.extract_from(json.loads(extr('"full_image_text":', '});'))) data = { "title" : text.unescape(extr('"og:title" content="', '"')), @@ -53,53 +117,68 @@ class NewgroundsExtractor(Extractor): "date" : text.parse_datetime(extr( 'itemprop="datePublished" content="', '"')), "rating" : extr('class="rated-', '"'), - "favorites" : text.parse_int(extr('id="faves_load">', '<')), - "score" : text.parse_float(extr('id="score_number">', '<')), - "tags" : text.split_html(extr( - '<dd class="tags momag">', '</dd>')), "url" : full('src="', '"'), "width" : text.parse_int(full('width="', '"')), "height" : text.parse_int(full('height="', '"')), } - data["tags"].sort() data["index"] = text.parse_int( data["url"].rpartition("/")[2].partition("_")[0]) return data - def _pagination(self, url): + @staticmethod + def _extract_audio_data(extr, url): + return { + "title" : text.unescape(extr('"og:title" content="', '"')), + "description": text.unescape(extr(':description" content="', '"')), + "date" : text.parse_datetime(extr( + 'itemprop="datePublished" content="', '"')), + "url" : extr('{"url":"', '"').replace("\\/", "/"), + "index" : text.parse_int(url.split("/")[5]), + "rating" : "", + } + + @staticmethod + def _extract_media_data(extr, url): + return { + "title" : text.unescape(extr('"og:title" content="', '"')), + "url" : extr('{"url":"', '"').replace("\\/", "/"), + "date" : text.parse_datetime(extr( + 'itemprop="datePublished" content="', '"')), + "description": text.unescape(extr( + 'itemprop="description" content="', '"')), + "rating" : extr('class="rated-', '"'), + "index" : text.parse_int(url.split("/")[5]), + } + + def _pagination(self, kind): + root = self.user_root headers = { - "Referer": self.root, - "X-Requested-With": "XMLHttpRequest", "Accept": "application/json, text/javascript, */*; q=0.01", + "X-Requested-With": "XMLHttpRequest", + "Referer": root, } + url = "{}/{}/page/1".format(root, kind) while True: - data = self.request(url, headers=headers).json() + with self.request(url, headers=headers, fatal=False) as response: + try: + data = response.json() + except ValueError: + return + if not data: + return + if "errors" in data: + msg = ", ".join(text.unescape(e) for e in data["errors"]) + raise exception.StopExtraction(msg) for year in data["sequence"]: for item in data["years"][str(year)]["items"]: page_url = text.extract(item, 'href="', '"')[0] - yield text.urljoin(self.root, page_url) + yield text.urljoin(root, page_url) if not data["more"]: return - url = text.urljoin(self.root, data["more"]) - - -class NewgroundsUserExtractor(NewgroundsExtractor): - """Extractor for all images of a newgrounds user""" - subcategory = "user" - pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com(?:/art)?/?$" - test = ( - ("https://blitzwuff.newgrounds.com/art", { - "url": "24b19c4a135a09889fac7b46a74e427e4308d02b", - "keyword": "62981f7bdd66e1f1c72ab1d9b932423c156bc9a1", - }), - ("https://blitzwuff.newgrounds.com/"), - ) - - def get_page_urls(self): - return self._pagination(self.root + "/art/page/1") + url = text.urljoin(root, data["more"]) class NewgroundsImageExtractor(NewgroundsExtractor): @@ -109,14 +188,28 @@ class NewgroundsImageExtractor(NewgroundsExtractor): r"(?:www\.)?newgrounds\.com/art/view/([^/?&#]+)/[^/?&#]+" r"|art\.ngfiles\.com/images/\d+/\d+_([^_]+)_([^.]+))") test = ( - ("https://www.newgrounds.com/art/view/blitzwuff/ffx", { - "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818", - "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e", - "content": "cb067d6593598710292cdd340d350d14a26fe075", + ("https://www.newgrounds.com/art/view/tomfulp/ryu-is-hawt", { + "url": "57f182bcbbf2612690c3a54f16ffa1da5105245e", + "content": "8f395e08333eb2457ba8d8b715238f8910221365", + "keyword": { + "artist" : ["tomfulp"], + "comment" : "re:Consider this the bottom threshold for ", + "date" : "type:datetime", + "description": "re:Consider this the bottom threshold for ", + "favorites" : int, + "filename" : "94_tomfulp_ryu-is-hawt", + "height" : 476, + "index" : 94, + "rating" : "e", + "score" : float, + "tags" : ["ryu", "streetfighter"], + "title" : "Ryu is Hawt", + "user" : "tomfulp", + "width" : 447, + }, }), - ("https://art.ngfiles.com/images/587000/587551_blitzwuff_ffx.png", { - "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818", - "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e", + ("https://art.ngfiles.com/images/0/94_tomfulp_ryu-is-hawt.gif", { + "url": "57f182bcbbf2612690c3a54f16ffa1da5105245e", }), ) @@ -124,30 +217,120 @@ class NewgroundsImageExtractor(NewgroundsExtractor): NewgroundsExtractor.__init__(self, match) if match.group(2): self.user = match.group(2) - self.page_url = "https://www.newgrounds.com/art/view/{}/{}".format( + self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format( self.user, match.group(3)) else: - self.page_url = match.group(0) + url = match.group(0) + if not url.startswith("http"): + url = "https://" + url + self.post_url = url + + def posts(self): + return (self.post_url,) + + +class NewgroundsMediaExtractor(NewgroundsExtractor): + """Extractor for a media file from newgrounds.com""" + subcategory = "media" + pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com" + r"(/(?:portal/view|audio/listen)/\d+)") + test = ( + ("https://www.newgrounds.com/portal/view/589549", { + "url": "48d916d819c99139e6a3acbbf659a78a867d363e", + "content": "ceb865426727ec887177d99e0d20bb021e8606ae", + "keyword": { + "artist" : ["psychogoldfish", "tomfulp"], + "comment" : "re:People have been asking me how I like the ", + "date" : "type:datetime", + "description": "re:People have been asking how I like the ", + "favorites" : int, + "filename" : "527818_alternate_1896", + "index" : 589549, + "rating" : "t", + "score" : float, + "tags" : ["newgrounds", "psychogoldfish", + "rage", "redesign-2012"], + "title" : "Redesign Rage", + "user" : "psychogoldfish", + }, + }), + ("https://www.newgrounds.com/audio/listen/609768", { + "url": "f4c5490ae559a3b05e46821bb7ee834f93a43c95", + "keyword": { + "artist" : ["zj", "tomfulp"], + "comment" : "re:RECORDED 12-09-2014\n\nFrom The ZJ \"Late ", + "date" : "type:datetime", + "description": "From The ZJ Report Show!", + "favorites" : int, + "index" : 609768, + "rating" : "", + "score" : float, + "tags" : ["fulp", "interview", "tom", "zj"], + "title" : "ZJ Interviews Tom Fulp!", + "user" : "zj", + }, + }), + ) + + def __init__(self, match): + NewgroundsExtractor.__init__(self, match) + self.user = "" + self.post_url = self.root + match.group(1) - def get_page_urls(self): - return (self.page_url,) + def posts(self): + return (self.post_url,) -class NewgroundsVideoExtractor(NewgroundsExtractor): - """Extractor for all videos of a newgrounds user""" - subcategory = "video" - filename_fmt = "{category}_{index}.{extension}" +class NewgroundsArtExtractor(NewgroundsExtractor): + """Extractor for all images of a newgrounds user""" + subcategory = "art" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/art/?$" + test = ("https://tomfulp.newgrounds.com/art", { + "pattern": NewgroundsImageExtractor.pattern, + "count": ">= 3", + }) + + +class NewgroundsAudioExtractor(NewgroundsExtractor): + """Extractor for all audio submissions of a newgrounds user""" + subcategory = "audio" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/audio/?$" + test = ("https://tomfulp.newgrounds.com/audio", { + "pattern": r"https://audio.ngfiles.com/\d+/\d+_.+\.mp3", + "count": ">= 4", + }) + + +class NewgroundsMoviesExtractor(NewgroundsExtractor): + """Extractor for all movies of a newgrounds user""" + subcategory = "movies" pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$" test = ("https://tomfulp.newgrounds.com/movies", { - "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+", - "count": ">= 32", + "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+_.+", + "range": "1-10", + "count": 10, }) - def get_page_urls(self): - return self._pagination(self.root + "/movies/page/1") - def parse_page_data(self, page_url): - return { - "url" : "ytdl:" + page_url, - "index": text.parse_int(page_url.rpartition("/")[2]), - } +class NewgroundsUserExtractor(NewgroundsExtractor): + """Extractor for a newgrounds user profile""" + subcategory = "user" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/?$" + test = ( + ("https://tomfulp.newgrounds.com", { + "pattern": "https://tomfulp.newgrounds.com/art$", + }), + ("https://tomfulp.newgrounds.com", { + "options": (("include", "all"),), + "pattern": "https://tomfulp.newgrounds.com/(art|audio|movies)$", + "count": 3, + }), + ) + + def items(self): + base = self.user_root + "/" + return self._dispatch_extractors(( + (NewgroundsArtExtractor , base + "art"), + (NewgroundsAudioExtractor , base + "audio"), + (NewgroundsMoviesExtractor, base + "movies"), + ), ("art",)) |
