author     2019-08-04 17:52:59 -0400
committer  2019-08-04 17:52:59 -0400
commit     64ad8e7bd15df71ab1116eede414558631bcad32 (patch)
tree       7416e191aedce591087903a943198aed13fa0b26 /gallery_dl/extractor
parent     2a63a9c9b7032a76894c48ac4d9cea732fcaee49 (diff)
New upstream version 1.10.1 (tag: upstream/1.10.1)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py         3
-rw-r--r--  gallery_dl/extractor/adultempire.py     58
-rw-r--r--  gallery_dl/extractor/behance.py         22
-rw-r--r--  gallery_dl/extractor/dynastyscans.py     2
-rw-r--r--  gallery_dl/extractor/exhentai.py       102
-rw-r--r--  gallery_dl/extractor/gelbooru.py         1
-rw-r--r--  gallery_dl/extractor/imgbb.py          179
-rw-r--r--  gallery_dl/extractor/luscious.py         2
-rw-r--r--  gallery_dl/extractor/ngomik.py           2
-rw-r--r--  gallery_dl/extractor/sankaku.py          4
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py   4
-rw-r--r--  gallery_dl/extractor/tsumino.py          2
-rw-r--r--  gallery_dl/extractor/vsco.py           176
13 files changed, 509 insertions(+), 48 deletions(-)
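
The bulk of this release is three new extractor modules: adultempire, imgbb, and vsco. Adding a module's name to the list in gallery_dl/extractor/__init__.py is all that is needed for URL matching to pick it up. A minimal sketch of that lookup, assuming gallery-dl 1.10.1 is installed and using its extractor.find() helper (the URL is taken from the test cases in the diff below; the printed values are expectations, not guaranteed output):

    # Resolve a URL against gallery-dl's extractor registry.
    # find() tries the compiled `pattern` of every module listed in
    # gallery_dl/extractor/__init__.py and returns an instance of the
    # first matching extractor class, or None if nothing matches.
    from gallery_dl import extractor

    ex = extractor.find("https://www.adultempire.com/5998/gallery.html")
    if ex is not None:
        print(ex.category)     # expected: "adultempire"
        print(ex.subcategory)  # expected: "gallery"
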
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 189c163..0b24111 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -17,6 +17,7 @@ modules = [
     "500px",
     "8chan",
     "8muses",
+    "adultempire",
     "artstation",
     "behance",
     "bobx",
@@ -42,6 +43,7 @@ modules = [
     "idolcomplex",
     "imagebam",
     "imagefap",
+    "imgbb",
     "imgbox",
     "imgth",
     "imgur",
@@ -95,6 +97,7 @@ modules = [
     "tumblr",
     "twitter",
     "vanillarock",
+    "vsco",
     "wallhaven",
     "warosu",
     "weibo",
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
new file mode 100644
index 0000000..5ea835f
--- /dev/null
+++ b/gallery_dl/extractor/adultempire.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.adultempire.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class AdultempireGalleryExtractor(GalleryExtractor):
+    """Extractor for image galleries from www.adultempire.com"""
+    category = "adultempire"
+    root = "https://www.adultempire.com"
+    pattern = (r"(?:https?://)?(?:www\.)?adult(?:dvd)?empire\.com"
+               r"(/(\d+)/gallery\.html)")
+    test = (
+        ("https://www.adultempire.com/5998/gallery.html", {
+            "range": "1",
+            "keyword": "0533ef1184892be8ac02b17286797c95f389ba63",
+            "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
+        }),
+        ("https://www.adultdvdempire.com/5683/gallery.html", {
+            "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
+            "keyword": "59fe5d95929efc5040a819a5f77aba7a022bb85a",
+        }),
+    )
+
+    def __init__(self, match):
+        GalleryExtractor.__init__(self, match)
+        self.gallery_id = match.group(2)
+
+    def metadata(self, page):
+        extr = text.extract_from(page, page.index('<div id="content">'))
+        return {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "title"     : text.unescape(extr('title="', '"')),
+            "studio"    : extr(">studio</small>", "<").strip(),
+            "date"      : text.parse_datetime(extr(
+                ">released</small>", "<").strip(), "%m/%d/%Y"),
+            "actors"    : text.split_html(extr(
+                '<ul class="item-details item-cast-list ', '</ul>'))[1:],
+        }
+
+    def images(self, page):
+        params = {"page": 1}
+        while True:
+            urls = list(text.extract_iter(page, 'rel="L"><img src="', '"'))
+            for url in urls:
+                yield url.replace("_200.", "_9600."), None
+            if len(urls) < 24:
+                return
+            params["page"] += 1
+            page = self.request(self.chapter_url, params=params).text
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 111d560..467a935 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -30,7 +30,8 @@ class BehanceExtractor(Extractor):
     @staticmethod
     def _update(data):
         # compress data to simple lists
-        data["fields"] = [field["name"] for field in data["fields"]]
+        if data["fields"] and isinstance(data["fields"][0], dict):
+            data["fields"] = [field["name"] for field in data["fields"]]
         data["owners"] = [owner["display_name"] for owner in data["owners"]]
         if "tags" in data:
             data["tags"] = [tag["title"] for tag in data["tags"]]
@@ -140,11 +141,11 @@ class BehanceUserExtractor(BehanceExtractor):
 
     def galleries(self):
         url = "{}/{}/projects".format(self.root, self.user)
-        headers = {"X-Requested-With": "XMLHttpRequest"}
         params = {"offset": 0}
+        headers = {"X-Requested-With": "XMLHttpRequest"}
 
         while True:
-            data = self.request(url, headers=headers, params=params).json()
+            data = self.request(url, params=params, headers=headers).json()
             work = data["profile"]["activeSection"]["work"]
             yield from work["projects"]
 
             if not work["hasMore"]:
@@ -157,8 +158,8 @@ class BehanceCollectionExtractor(BehanceExtractor):
     subcategory = "collection"
     categorytransfer = True
     pattern = r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)"
-    test = ("https://www.behance.net/collection/170615607/Sky", {
-        "count": ">= 13",
+    test = ("https://www.behance.net/collection/71340149/inspiration", {
+        "count": ">= 145",
         "pattern": BehanceGalleryExtractor.pattern,
     })
@@ -168,12 +169,13 @@ class BehanceCollectionExtractor(BehanceExtractor):
 
     def galleries(self):
         url = "{}/collection/{}/a".format(self.root, self.collection_id)
+        params = {"offset": 0}
         headers = {"X-Requested-With": "XMLHttpRequest"}
-        params = {}
 
         while True:
-            data = self.request(url, headers=headers, params=params).json()
-            yield from data["output"]
-            if not data.get("offset"):
+            data = self.request(url, params=params, headers=headers).json()
+            for item in data["items"]:
+                yield item["project"]
+
+            if len(data["items"]) < 40:
                 return
-            params["offset"] = data["offset"]
+            params["offset"] += len(data["items"])
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index b10bd35..9cc6738 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -100,7 +100,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
     test = (
         ("https://dynasty-scans.com/images?with[]=4930&with[]=5211", {
             "url": "6b570eedd8a741c2cd34fb98b22a49d772f84191",
-            "keyword": "a1e2d05c1406a08b02f347389616a6babb1b50bf",
+            "keyword": "fa7ff94f82cdf942f7734741d758f160a6b0905a",
         }),
         ("https://dynasty-scans.com/images", {
             "range": "1",
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 20e0746..1833b1a 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from galleries at https://exhentai.org/"""
+"""Extractors for https://e-hentai.org/ and https://exhentai.org/"""
 
 from .common import Extractor, Message
 from .. import text, util, exception
@@ -23,16 +23,19 @@ BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
 class ExhentaiExtractor(Extractor):
     """Base class for exhentai extractors"""
     category = "exhentai"
-    directory_fmt = ("{category}", "{gallery_id}")
+    directory_fmt = ("{category}", "{gallery_id} {title}")
     filename_fmt = (
         "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}")
     archive_fmt = "{gallery_id}_{num}"
-    cookiedomain = ".exhentai.org"
     cookienames = ("ipb_member_id", "ipb_pass_hash")
+    cookiedomain = ".exhentai.org"
     root = "https://exhentai.org"
 
+    LIMIT = False
+
     def __init__(self, match):
-        if match.group(1) != "ex":
+        version = match.group(1)
+        if version != "ex":
             self.root = "https://e-hentai.org"
             self.cookiedomain = ".e-hentai.org"
         Extractor.__init__(self, match)
@@ -45,6 +48,8 @@ class ExhentaiExtractor(Extractor):
         if self.wait_max < self.wait_min:
             self.wait_max = self.wait_min
         self.session.headers["Referer"] = self.root + "/"
+        if version != "ex":
+            self.session.cookies.set("nw", "1", domain=self.cookiedomain)
 
     def request(self, *args, **kwargs):
         response = Extractor.request(self, *args, **kwargs)
@@ -63,6 +68,9 @@ class ExhentaiExtractor(Extractor):
 
     def login(self):
         """Login and set necessary cookies"""
+        if self.LIMIT:
+            self.log.error("Image limit reached!")
+            raise exception.StopExtraction()
         if self._check_cookies(self.cookienames):
             return
         username, password = self._get_auth_info()
@@ -92,7 +100,7 @@ class ExhentaiExtractor(Extractor):
         }
 
         response = self.request(
             url, method="POST", headers=headers, data=data)
-        if "You are now logged in as:" not in response.text:
+        if b"You are now logged in as:" not in response.content:
             raise exception.AuthenticationError()
         return {c: response.cookies[c] for c in self.cookienames}
@@ -112,9 +120,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
                r"(?:/g/(\d+)/([\da-f]{10})"
                r"|/s/([\da-f]{10})/(\d+)-(\d+))")
     test = (
-        ("https://exhentai.org/g/960460/4f0e369d82/", {
-            "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
-            "content": "493d759de534355c9f55f8e365565b62411de146",
+        ("https://exhentai.org/g/1200119/d55c44d3d0/", {
+            "keyword": "1b353fad00dff0665b1746cdd151ab5cc326df23",
+            "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
         }),
         ("https://exhentai.org/g/960461/4f0e369d82/", {
             "exception": exception.NotFoundError,
@@ -122,13 +130,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         ("http://exhentai.org/g/962698/7f02358e00/", {
             "exception": exception.AuthorizationError,
         }),
-        ("https://exhentai.org/s/3957343c3b/960460-5", {
+        ("https://exhentai.org/s/f68367b4c8/1200119-3", {
             "count": 2,
         }),
-        ("https://e-hentai.org/s/3957343c3b/960460-5", {
+        ("https://e-hentai.org/s/f68367b4c8/1200119-3", {
             "count": 2,
         }),
-        ("https://g.e-hentai.org/g/960460/4f0e369d82/"),
+        ("https://g.e-hentai.org/g/1200119/d55c44d3d0/"),
     )
 
     def __init__(self, match):
@@ -143,14 +151,25 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
 
     def items(self):
         self.login()
+
+        if self.limits:
+            self._init_limits()
+
         if self.gallery_token:
             gpage = self._gallery_page()
             self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
+            if not self.image_token:
+                self.log.error("Failed to extract initial image token")
+                self.log.debug("Page content:\n%s", gpage)
+                return
             self.wait()
             ipage = self._image_page()
         else:
             ipage = self._image_page()
             part = text.extract(ipage, 'hentai.org/g/', '"')[0]
+            if not part:
+                self.log.error("Failed to extract gallery token")
+                self.log.debug("Page content:\n%s", ipage)
+                return
             self.gallery_token = part.split("/")[1]
             self.wait()
             gpage = self._gallery_page()
@@ -211,12 +230,17 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         iurl = extr('<img id="img" src="', '"')
         orig = extr('hentai.org/fullimg.php', '"')
 
-        if self.original and orig:
-            url = self.root + "/fullimg.php" + text.unescape(orig)
-            data = self._parse_original_info(extr('ownload original', '<'))
-        else:
-            url = iurl
-            data = self._parse_image_info(url)
+        try:
+            if self.original and orig:
+                url = self.root + "/fullimg.php" + text.unescape(orig)
+                data = self._parse_original_info(extr('ownload original', '<'))
+            else:
+                url = iurl
+                data = self._parse_image_info(url)
+        except IndexError:
+            self.log.error("Unable to parse image info for '%s'", url)
+            self.log.debug("Page content:\n%s", page)
+            raise exception.StopExtraction()
 
         data["num"] = self.image_num
         data["image_token"] = self.key["start"] = extr('var startkey="', '";')
@@ -242,13 +266,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
             origurl, pos = text.extract(page["i7"], '<a href="', '"')
 
-            if self.original and origurl:
-                url = text.unescape(origurl)
-                data = self._parse_original_info(
-                    text.extract(page["i7"], "ownload original", "<", pos)[0])
-            else:
-                url = imgurl
-                data = self._parse_image_info(url)
+            try:
+                if self.original and origurl:
+                    url = text.unescape(origurl)
+                    data = self._parse_original_info(text.extract(
+                        page["i7"], "ownload original", "<", pos)[0])
+                else:
+                    url = imgurl
+                    data = self._parse_image_info(url)
+            except IndexError:
+                self.log.error("Unable to parse image info for '%s'", url)
+                self.log.debug("Page content:\n%s", page)
+                raise exception.StopExtraction()
 
             data["num"] = request["page"]
             data["image_token"] = imgkey
@@ -266,6 +295,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.AuthorizationError()
         if page.startswith(("Key missing", "Gallery not found")):
             raise exception.NotFoundError("gallery")
+        if "hentai.org/mpv/" in page:
+            self.log.warning("Enabled Multi-Page Viewer is not supported")
         return page
 
     def _image_page(self):
@@ -277,17 +308,24 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             raise exception.NotFoundError("image page")
         return page
 
+    def _init_limits(self):
+        self._update_limits()
+        if self._remaining <= 0:
+            self.log.error("Image limit reached!")
+            ExhentaiExtractor.LIMIT = True
+            raise exception.StopExtraction()
+
     def _check_limits(self, data):
-        if not self._remaining or data["num"] % 20 == 0:
+        if data["num"] % 20 == 0:
             self._update_limits()
         self._remaining -= data["cost"]
+
         if self._remaining <= 0:
             url = "{}/s/{}/{}-{}".format(
                 self.root, data["image_token"], self.gallery_id, data["num"])
-            self.log.error(
-                "Image limit reached! Reset it and continue with "
-                "'%s' as URL.", url)
+            self.log.error("Image limit reached! Continue with "
+                           "'%s' as URL after resetting it.", url)
+            ExhentaiExtractor.LIMIT = True
             raise exception.StopExtraction()
 
     def _update_limits(self):
@@ -301,6 +339,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         page = self.request(url, cookies=cookies).text
 
         current, pos = text.extract(page, "<strong>", "</strong>")
         maximum, pos = text.extract(page, "<strong>", "</strong>", pos)
+        self.log.debug("Image Limits: %s/%s", current, maximum)
         self._remaining = text.parse_int(maximum) - text.parse_int(current)
 
     @staticmethod
@@ -330,7 +369,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
     subcategory = "search"
     pattern = BASE_PATTERN + r"/?\?(.*)$"
     test = (
-        ("https://exhentai.org/?f_search=touhou"),
+        ("https://e-hentai.org/?f_search=touhou"),
        (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
          "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
          "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
@@ -372,7 +411,10 @@ class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
     subcategory = "favorite"
     pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"
     test = (
-        ("https://exhentai.org/favorites.php"),
+        ("https://e-hentai.org/favorites.php", {
+            "count": 1,
+            "pattern": r"https?://e-hentai\.org/g/1200119/d55c44d3d0"
+        }),
         ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou"
          "&f_apply=Search+Favorites"),
     )
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 15bd0a8..ce2e83b 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -30,6 +30,7 @@ class GelbooruExtractor(booru.XmlParserMixin,
             self.params.update({"page": "dapi", "s": "post", "q": "index"})
         else:
             self.items = self.items_noapi
+            self.session.cookies["fringeBenefits"] = "yup"
 
     def items_noapi(self):
         data = self.get_metadata()
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
new file mode 100644
index 0000000..442634b
--- /dev/null
+++ b/gallery_dl/extractor/imgbb.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imgbb.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import json
+
+
+class ImgbbExtractor(Extractor):
+    """Base class for imgbb extractors"""
+    category = "imgbb"
+    filename_fmt = "{title} {id}.{extension}"
+    archive_fmt = "{id}"
+    root = "https://imgbb.com"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.page_url = self.sort = None
+
+    def items(self):
+        self.login()
+        page = self.request(self.page_url, params={"sort": self.sort}).text
+        data = self.metadata(page)
+        first = True
+
+        yield Message.Version, 1
+        for img in self.images(page):
+            image = {
+                "id"       : img["url_viewer"].rpartition("/")[2],
+                "user"     : img["user"]["username"],
+                "title"    : text.unescape(img["title"]),
+                "url"      : img["image"]["url"],
+                "extension": img["image"]["extension"],
+                "size"     : text.parse_int(img["image"]["size"]),
+                "width"    : text.parse_int(img["width"]),
+                "height"   : text.parse_int(img["height"]),
+            }
+            image.update(data)
+            if first:
+                first = False
+                yield Message.Directory, data
+            yield Message.Url, image["url"], image
+
+    def login(self):
+        username, password = self._get_auth_info()
+        if username:
+            self._update_cookies(self._login_impl(username, password))
+
+    @cache(maxage=360*24*3600, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+
+        url = self.root + "/login"
+        page = self.request(url).text
+        token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0]
+
+        headers = {"Referer": url}
+        data = {
+            "auth_token"   : token,
+            "login-subject": username,
+            "password"     : password,
+        }
+        response = self.request(
+            url, method="POST", headers=headers, data=data)
+
+        if not response.history:
+            raise exception.AuthenticationError()
+        return self.session.cookies
+
+    def _pagination(self, page, endpoint, params):
+        params["page"] = 2
+        data = None
+
+        while True:
+            for img in text.extract_iter(page, "data-object='", "'"):
+                yield json.loads(text.unquote(img))
+            if data:
+                if params["seek"] == data["seekEnd"]:
+                    return
+                params["seek"] = data["seekEnd"]
+                params["page"] += 1
+            data = self.request(endpoint, "POST", data=params).json()
+            page = data["html"]
+
+
+class ImgbbAlbumExtractor(ImgbbExtractor):
+    """Extractor for albums on imgbb.com"""
+    subcategory = "album"
+    directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
+    pattern = r"(?:https?://)?ibb\.co/album/([^/?&#]+)/?(?:\?([^#]+))?"
+    test = (
+        ("https://ibb.co/album/c6p5Yv", {
+            "range": "1-80",
+            "url": "8adaf0f7dfc19ff8bc4712c97f534af8b1e06412",
+            "keyword": "155b665a53e83d359e914cab7c69d5b829444d64",
+        }),
+        ("https://ibb.co/album/c6p5Yv?sort=title_asc", {
+            "range": "1-80",
+            "url": "d6c45041d5c8323c435b183a976f3fde2af7c547",
+            "keyword": "30c3262214e2044bbcf6bf2dee8e3ca7ebd62b71",
+        }),
+    )
+
+    def __init__(self, match):
+        ImgbbExtractor.__init__(self, match)
+        self.album_name = None
+        self.album_id = match.group(1)
+        self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
+        self.page_url = "https://ibb.co/album/" + self.album_id
+
+    def metadata(self, page):
+        album, pos = text.extract(page, '"og:title" content="', '"')
+        user , pos = text.extract(page, 'rel="author">', '<', pos)
+        return {
+            "album_id"  : self.album_id,
+            "album_name": text.unescape(album),
+            "user"      : user.lower(),
+        }
+
+    def images(self, page):
+        seek, pos = text.extract(page, 'data-seek="', '"')
+        tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
+
+        return self._pagination(page, "https://ibb.co/json", {
+            "action"    : "list",
+            "list"      : "images",
+            "from"      : "album",
+            "sort"      : self.sort,
+            "albumid"   : self.album_id,
+            "seek"      : seek,
+            "auth_token": tokn,
+            "params_hidden[list]"   : "images",
+            "params_hidden[from]"   : "album",
+            "params_hidden[albumid]": self.album_id,
+        })
+
+
+class ImgbbUserExtractor(ImgbbExtractor):
+    """Extractor for user profiles in imgbb.com"""
+    subcategory = "user"
+    directory_fmt = ("{category}", "{user}")
+    pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
+    test = ("https://folkie.imgbb.com", {
+        "range": "1-80",
+        "pattern": r"https?://i\.ibb\.co/\w+/[^/?&#]+",
+    })
+
+    def __init__(self, match):
+        ImgbbExtractor.__init__(self, match)
+        self.user = match.group(1)
+        self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
+        self.page_url = "https://{}.imgbb.com/".format(self.user)
+
+    def metadata(self, page):
+        return {"user": self.user}
+
+    def images(self, page):
+        seek, pos = text.extract(page, 'data-seek="', '"')
+        tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
+        user, pos = text.extract(page, '.obj.resource={"id":"', '"', pos)
+
+        return self._pagination(page, self.page_url + "json", {
+            "action"    : "list",
+            "list"      : "images",
+            "from"      : "user",
+            "sort"      : self.sort,
+            "seek"      : seek,
+            "userid"    : user,
+            "auth_token": tokn,
+            "params_hidden[userid]": user,
+            "params_hidden[from]"  : "user",
+        })
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 65ae843..879d38b 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
     test = (
         ("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
             "url": "7e4984a271a1072ac6483e4228a045895aff86f3",
-            "keyword": "c597c132834f4990f90bf5dee5de2a9d4ba263a4",
+            "keyword": "ab4e5b71583fd439b4c8012a642aa8b58d8d0758",
             "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
         }),
         ("https://luscious.net/albums/virgin-killer-sweater_282582/", {
diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py
index 8135a8a..f3608b2 100644
--- a/gallery_dl/extractor/ngomik.py
+++ b/gallery_dl/extractor/ngomik.py
@@ -44,7 +44,7 @@ class NgomikChapterExtractor(ChapterExtractor):
 
     @staticmethod
     def images(page):
-        readerarea = text.extract(page, 'id=readerarea', 'class=chnav')[0]
+        readerarea = text.extract(page, 'id="readerarea"', 'class="chnav"')[0]
         return [
             (text.unescape(url), None)
             for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea)
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 012cb8b..da9735e 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -283,9 +283,9 @@ class SankakuPostExtractor(SankakuExtractor):
         "options": (("tags", True),),
         "keyword": {
             "tags_artist": "bonocho",
-            "tags_copyright": "batman_(series) the_dark_knight",
-            "tags_medium": "sketch copyright_name",
             "tags_studio": "dc_comics",
+            "tags_medium": "sketch copyright_name",
+            "tags_copyright": str,
             "tags_character": str,
             "tags_general": str,
         },
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 55eda9f..0189fc9 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -34,11 +34,11 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
     test = (
         ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
             "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
-            "keyword": "4b3b5766b277a5d0acbec90fa8f2343262b07efd",
+            "keyword": "bfe08310e7d9a572f568f6900e0ed0eb295aa2b3",
         }),
         ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
             "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
-            "keyword": "f47a416d680717855bbc3e4f0cd44479f61d9aa4",
+            "keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
         }),
     )
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 03ee144..66ad431 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -65,7 +65,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
             "uploader"  : "sehki",
             "lang"      : "en",
             "language"  : "English",
-            "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
+            "thumbnail" : "re:https?://www.tsumino.com/Image/Thumb/40996",
         },
     }),
     ("https://www.tsumino.com/Read/View/45834"),
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
new file mode 100644
index 0000000..639ec82
--- /dev/null
+++ b/gallery_dl/extractor/vsco.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://vsco.co/"""
+
+from .common import Extractor, Message
+from .. import text
import text +import json + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)" + + +class VscoExtractor(Extractor): + """Base class for vsco extractors""" + category = "vsco" + root = "https://vsco.co" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{id}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1).lower() + + def items(self): + yield Message.Version, 1 + yield Message.Directory, {"user": self.user} + for img in self.images(): + url = "https://" + (img.get("video_url") or img["responsive_url"]) + data = text.nameext_from_url(url, { + "id" : img["_id"], + "user" : self.user, + "grid" : img["grid_name"], + "meta" : img.get("image_meta") or {}, + "tags" : [tag["text"] for tag in img.get("tags") or ()], + "date" : text.parse_timestamp(img["upload_date"] // 1000), + "video" : img["is_video"], + "width" : img["width"], + "height": img["height"], + "description": img["description"], + }) + yield Message.Url, url, data + + def images(self): + """Return an iterable with all relevant image objects""" + + def _extract_preload_state(self, url): + page = self.request(url, notfound=self.subcategory).text + return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0]) + + def _pagination(self, url, params, token, key, extra): + headers = { + "Referer" : "{}/{}".format(self.root, self.user), + "Authorization" : "Bearer " + token, + "X-Client-Platform": "web", + "X-Client-Build" : "1", + } + + yield from map(self._transform_media, extra) + + while True: + data = self.request(url, params=params, headers=headers).json() + if not data.get(key): + return + yield from data[key] + params["page"] += 1 + + @staticmethod + def _transform_media(media): + media["_id"] = media["id"] + media["is_video"] = media["isVideo"] + media["grid_name"] = media["gridName"] + media["upload_date"] = media["uploadDate"] + media["responsive_url"] = media["responsiveUrl"] + media["video_url"] = media.get("videoUrl") + media["image_meta"] = media.get("imageMeta") + return media + + +class VscoUserExtractor(VscoExtractor): + """Extractor for images from a user on vsco.co""" + subcategory = "user" + pattern = BASE_PATTERN + r"/images/" + test = ("https://vsco.co/missuri/images/1", { + "range": "1-80", + "count": 80, + "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+", + }) + + def images(self): + url = "{}/{}/images/1".format(self.root, self.user) + data = self._extract_preload_state(url) + + tkn = data["users"]["currentUser"]["tkn"] + sid = str(data["sites"]["siteByUsername"][self.user]["site"]["id"]) + + url = "{}/api/2.0/medias".format(self.root) + params = {"page": 2, "size": "30", "site_id": sid} + return self._pagination(url, params, tkn, "media", ( + data["medias"]["byId"][mid]["media"] + for mid in data["medias"]["bySiteId"][sid]["medias"]["1"] + )) + + +class VscoCollectionExtractor(VscoExtractor): + """Extractor for images from a collection on vsco.co""" + subcategory = "collection" + directory_fmt = ("{category}", "{user}", "collection") + archive_fmt = "c_{user}_{id}" + pattern = BASE_PATTERN + r"/collection/" + test = ("https://vsco.co/vsco/collection/1", { + "range": "1-80", + "count": 80, + "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+", + }) + + def images(self): + url = "{}/{}/collection/1".format(self.root, self.user) + data = self._extract_preload_state(url) + + tkn = data["users"]["currentUser"]["tkn"] + cid = (data["sites"]["siteByUsername"][self.user] + 
["site"]["siteCollectionId"]) + + url = "{}/api/2.0/collections/{}/medias".format(self.root, cid) + params = {"page": 2, "size": "20"} + return self._pagination(url, params, tkn, "medias", ( + data["medias"]["byId"][mid]["media"] + for mid in data + ["collections"]["byCollectionId"][cid]["collection"]["1"] + )) + + +class VscoImageExtractor(VscoExtractor): + """Extractor for individual images on vsco.co""" + subcategory = "image" + pattern = BASE_PATTERN + r"/media/([0-9a-fA-F]+)" + test = ( + ("https://vsco.co/erenyildiz/media/5d34b93ef632433030707ce2", { + "url": "faa214d10f859f374ad91da3f7547d2439f5af08", + "content": "1394d070828d82078035f19a92f404557b56b83f", + "keyword": { + "id" : "5d34b93ef632433030707ce2", + "user" : "erenyildiz", + "grid" : "erenyildiz", + "meta" : dict, + "tags" : list, + "date" : "type:datetime", + "video" : False, + "width" : 1537, + "height": 1537, + "description": "re:Ni seviyorum. #vsco #vscox #vscochallenges", + }, + }), + ("https://vsco.co/jimenalazof/media/5b4feec558f6c45c18c040fd", { + "url": "08e7eef3301756ce81206c0b47c1e9373756a74a", + "content": "e739f058d726ee42c51c180a505747972a7dfa47", + "keyword": {"video" : True}, + }), + ) + + def __init__(self, match): + VscoExtractor.__init__(self, match) + self.media_id = match.group(2) + + def images(self): + url = "{}/{}/media/{}".format(self.root, self.user, self.media_id) + data = self._extract_preload_state(url) + media = data["medias"]["byId"].popitem()[1]["media"] + return (self._transform_media(media),) |