diff options
| author | 2019-08-26 19:34:45 -0400 | |
|---|---|---|
| committer | 2019-08-26 19:34:45 -0400 | |
| commit | b75d158d014d6c43d7d785c46c9372a9cf84d144 (patch) | |
| tree | 7dca4a7e61fe8b6e2bff2142fc19891e783a7d6d /gallery_dl/extractor | |
| parent | 64ad8e7bd15df71ab1116eede414558631bcad32 (diff) | |
New upstream version 1.10.2 (upstream/1.10.2)
Diffstat (limited to 'gallery_dl/extractor')
24 files changed, 673 insertions, 379 deletions
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py index 5ea835f..5e2480a 100644 --- a/gallery_dl/extractor/adultempire.py +++ b/gallery_dl/extractor/adultempire.py @@ -21,12 +21,12 @@ class AdultempireGalleryExtractor(GalleryExtractor): test = ( ("https://www.adultempire.com/5998/gallery.html", { "range": "1", - "keyword": "0533ef1184892be8ac02b17286797c95f389ba63", + "keyword": "25c8171f5623678491a0d7bdf38a7a6ebfa4a361", "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e", }), ("https://www.adultdvdempire.com/5683/gallery.html", { "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d", - "keyword": "59fe5d95929efc5040a819a5f77aba7a022bb85a", + "keyword": "0fe9a6e3f0a331b95ba77f66a643705ca86e8ec5", }), ) @@ -42,8 +42,8 @@ class AdultempireGalleryExtractor(GalleryExtractor): "studio" : extr(">studio</small>", "<").strip(), "date" : text.parse_datetime(extr( ">released</small>", "<").strip(), "%m/%d/%Y"), - "actors" : text.split_html(extr( - '<ul class="item-details item-cast-list ', '</ul>'))[1:], + "actors" : sorted(text.split_html(extr( + '<ul class="item-details item-cast-list ', '</ul>'))[1:]), } def images(self, page): diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py index f7b3bc1..2892bd4 100644 --- a/gallery_dl/extractor/artstation.py +++ b/gallery_dl/extractor/artstation.py @@ -41,6 +41,7 @@ class ArtstationExtractor(Extractor): player = adict["player_embedded"] url = text.extract(player, 'src="', '"')[0] if not url.startswith(self.root): + asset["extension"] = None yield Message.Url, "ytdl:" + url, asset continue diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py index c63085a..54a8878 100644 --- a/gallery_dl/extractor/booru.py +++ b/gallery_dl/extractor/booru.py @@ -41,10 +41,8 @@ class BooruExtractor(SharedConfigMixin, Extractor): return pages * self.per_page def items(self): - data = self.get_metadata() - yield Message.Version, 1 - yield Message.Directory, data 
+ data = self.get_metadata() self.reset_page() while True: @@ -59,9 +57,11 @@ class BooruExtractor(SharedConfigMixin, Extractor): if url.startswith("/"): url = text.urljoin(self.api_url, url) image.update(data) + text.nameext_from_url(url, image) if self.extags: self.extended_tags(image) - yield Message.Url, url, text.nameext_from_url(url, image) + yield Message.Directory, image + yield Message.Url, url, image if len(images) < self.per_page: return diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 5c40e2a..a90af1c 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -87,7 +87,8 @@ class Extractor(): raise exception.HttpError(exc) else: code = response.status_code - if 200 <= code < 400 or not fatal and \ + if 200 <= code < 400 or fatal is None and \ + (400 <= code < 500) or not fatal and \ (400 <= code < 429 or 431 <= code < 500): if encoding: response.encoding = encoding diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 63e2913..bd1299b 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -27,7 +27,7 @@ BASE_PATTERN = ( class DeviantartExtractor(Extractor): - """Base class for deviantart extractors""" + """Base class for deviantart extractors using the OAuth API""" category = "deviantart" directory_fmt = ("{category}", "{author[username]!l}") filename_fmt = "{category}_{index}_{title}.{extension}" @@ -38,11 +38,15 @@ class DeviantartExtractor(Extractor): self.offset = 0 self.flat = self.config("flat", True) self.extra = self.config("extra", False) + self.quality = self.config("quality", "100") self.original = self.config("original", True) self.user = match.group(1) or match.group(2) self.group = False self.api = DeviantartAPI(self) + if self.quality: + self.quality = "q_{}".format(self.quality) + if self.original != "image": self._update_content = self._update_content_default else: @@ -81,12 +85,15 @@ class 
DeviantartExtractor(Extractor): text.ext_from_url(content["src"]) != "gif": self._update_content(deviation, content) - if deviation["index"] <= 790677560 and \ - content["src"].startswith("https://images-wixmp-"): - # https://github.com/r888888888/danbooru/issues/4069 - content["src"] = re.sub( - r"(/f/[^/]+/[^/]+)/v\d+/.*", - r"/intermediary\1", content["src"]) + if content["src"].startswith("https://images-wixmp-"): + if deviation["index"] <= 790677560: + # https://github.com/r888888888/danbooru/issues/4069 + content["src"] = re.sub( + r"(/f/[^/]+/[^/]+)/v\d+/.*", + r"/intermediary\1", content["src"]) + if self.quality: + content["src"] = re.sub( + r"q_\d+", self.quality, content["src"]) yield self.commit(deviation, content) @@ -133,8 +140,16 @@ class DeviantartExtractor(Extractor): @staticmethod def commit(deviation, target): url = target["src"] - deviation["target"] = text.nameext_from_url(url, target.copy()) - deviation["extension"] = deviation["target"]["extension"] + thumb = deviation["thumbs"][0]["src"] if "thumbs" in deviation else url + target = text.nameext_from_url(thumb, target.copy()) + if target["filename"].endswith("-150"): + target["filename"] = target["filename"][:-4] + if not target["filename"].count("-"): + name, _, hid = target["filename"].rpartition("_") + target["filename"] = name + "-" + hid + deviation["target"] = target + deviation["filename"] = target["filename"] + deviation["extension"] = target["extension"] = text.ext_from_url(url) return Message.Url, url, deviation def _commit_journal_html(self, deviation, journal): @@ -225,14 +240,6 @@ class DeviantartExtractor(Extractor): if mtype and mtype.startswith("image/"): content.update(data) - def _html_request(self, url, **kwargs): - cookies = {"userinfo": ( - '__167217c8e6aac1a3331f;{"username":"","uniqueid":"ab2e8b184471bf0' - 'e3f8ed3ee7a3220aa","vd":"Bc7vEx,BdC7Fy,A,J,A,,B,A,B,BdC7Fy,BdC7XU' - ',J,J,A,BdC7XU,13,A,B,A,,A,A,B,A,A,,A","attr":56}' - )} - return self.request(url, 
cookies=cookies, **kwargs) - class DeviantartGalleryExtractor(DeviantartExtractor): """Extractor for all deviations from an artist's gallery""" @@ -360,68 +367,6 @@ class DeviantartFolderExtractor(DeviantartExtractor): deviation["folder"] = self.folder -class DeviantartDeviationExtractor(DeviantartExtractor): - """Extractor for single deviations""" - subcategory = "deviation" - archive_fmt = "{index}.{extension}" - pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)" - test = ( - (("https://www.deviantart.com/shimoda7/art/" - "For-the-sake-of-a-memory-10073852"), { - "options": (("original", 0),), - "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e", - }), - ("https://www.deviantart.com/zzz/art/zzz-1234567890", { - "exception": exception.NotFoundError, - }), - (("https://www.deviantart.com/myria-moon/art/" - "Aime-Moi-part-en-vadrouille-261986576"), { - "pattern": (r"https?://s3\.amazonaws\.com/origin-orig\." - r"deviantart\.net/a383/f/2013/135/e/7/[^.]+\.jpg\?"), - }), - # wixmp URL rewrite - (("https://www.deviantart.com/citizenfresh/art/" - "Hverarond-14-the-beauty-of-the-earth-789295466"), { - "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" - r"/intermediary/f/[^/]+/[^.]+\.jpg$") - }), - # non-download URL for GIFs (#242) - (("https://www.deviantart.com/skatergators/art/" - "COM-Monique-Model-781571783"), { - "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" - r"/f/[^/]+/[^.]+\.gif\?token="), - }), - # external URLs from description (#302) - (("https://www.deviantart.com/uotapo/art/" - "INANAKI-Memorial-Humane7-590297498"), { - "options": (("extra", 1), ("original", 0)), - "pattern": r"https?://sta\.sh/\w+$", - "range": "2-", - "count": 4, - }), - # old-style URLs - ("https://shimoda7.deviantart.com" - "/art/For-the-sake-of-a-memory-10073852"), - ("https://myria-moon.deviantart.com" - "/art/Aime-Moi-part-en-vadrouille-261986576"), - ("https://zzz.deviantart.com/art/zzz-1234567890"), - ) - - skip = Extractor.skip - - def __init__(self, match): - 
DeviantartExtractor.__init__(self, match) - self.path = match.group(3) - - def deviations(self): - url = "{}/{}/{}".format(self.root, self.user, self.path) - response = self._html_request(url, fatal=False) - deviation_id = text.extract(response.text, '//deviation/', '"')[0] - if response.status_code >= 400 or not deviation_id: - raise exception.NotFoundError("image") - return (self.api.deviation(deviation_id),) - - class DeviantartStashExtractor(DeviantartExtractor): """Extractor for sta.sh-ed deviations""" subcategory = "stash" @@ -558,54 +503,6 @@ class DeviantartJournalExtractor(DeviantartExtractor): return self.api.browse_user_journals(self.user, self.offset) -class DeviantartScrapsExtractor(DeviantartExtractor): - """Extractor for an artist's scraps""" - subcategory = "scraps" - directory_fmt = ("{category}", "{username}", "Scraps") - archive_fmt = "s_{username}_{index}.{extension}" - pattern = BASE_PATTERN + r"/gallery/\?catpath=scraps\b" - test = ( - ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", { - "count": 12, - "options": (("original", False),), - }), - ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"), - ) - - def deviations(self): - url = "{}/{}/gallery/?catpath=scraps".format(self.root, self.user) - page = self._html_request(url).text - csrf, pos = text.extract(page, '"csrf":"', '"') - iid , pos = text.extract(page, '"requestid":"', '"', pos) - - url = "https://www.deviantart.com/dapi/v1/gallery/0" - data = { - "username": self.user, - "offset": self.offset, - "limit": "24", - "catpath": "scraps", - "_csrf": csrf, - "dapiIid": iid + "-jsok7403-1.1" - } - - while True: - content = self.request( - url, method="POST", data=data).json()["content"] - - for item in content["results"]: - if item["html"].startswith('<div class="ad-container'): - continue - deviation_url = text.extract(item["html"], 'href="', '"')[0] - page = self._html_request(deviation_url).text - deviation_id = text.extract(page, '//deviation/', '"')[0] - if 
deviation_id: - yield self.api.deviation(deviation_id) - - if not content["has_more"]: - return - data["offset"] = content["next_offset"] - - class DeviantartPopularExtractor(DeviantartExtractor): """Extractor for popular deviations""" subcategory = "popular" @@ -649,6 +546,247 @@ class DeviantartPopularExtractor(DeviantartExtractor): deviation["popular"] = self.popular +class DeviantartExtractorV2(Extractor): + """Base class for deviantart extractors using the NAPI""" + category = "deviantart" + directory_fmt = ("{category}", "{author[username]!l}") + filename_fmt = "{category}_{index}_{title}.{extension}" + root = "https://www.deviantart.com" + + def __init__(self, match=None): + Extractor.__init__(self, match) + self.offset = 0 + self.extra = self.config("extra", False) + self.quality = self.config("quality", "100") + self.user = match.group(1) or match.group(2) + + if self.quality: + self.quality = "q_{}".format(self.quality) + + def items(self): + url = ( + self.root + "/_napi/da-browse/shared_api/deviation/extended_fetch" + ) + params = { + "deviationid" : None, + "username" : None, + "type" : None, + "include_session": "false", + } + headers = { + "Referer": self.root, + } + + yield Message.Version, 1 + for deviation in self.deviations(): + params["deviationid"] = deviation["deviationId"] + params["username"] = deviation["author"]["username"] + params["type"] = "journal" if deviation["isJournal"] else "art" + data = self.request(url, params=params, headers=headers).json() + + if "deviation" not in data: + self.log.warning("Skipping %s", params["deviationid"]) + continue + deviation = self._extract(data) + + yield Message.Directory, deviation + yield Message.Url, deviation["target"]["src"], deviation + if self.extra: + for match in DeviantartStashExtractor.pattern.finditer( + deviation["description"]): + deviation["_extractor"] = DeviantartStashExtractor + yield Message.Queue, match.group(0), deviation + + def _extract(self, data): + deviation = 
data["deviation"] + extended = deviation["extended"] + files = deviation["files"] + del deviation["extended"] + del deviation["files"] + + # prepare deviation metadata + deviation["description"] = extended.get("description", "") + deviation["username"] = self.user.lower() + deviation["stats"] = extended["stats"] + deviation["stats"]["comments"] = data["comments"]["total"] + deviation["index"] = deviation["deviationId"] + deviation["tags"] = [t["name"] for t in extended.get("tags") or ()] + deviation["date"] = text.parse_datetime( + deviation["publishedTime"]) + deviation["category_path"] = "/".join( + extended[key]["displayNameEn"] + for key in ("typeFacet", "contentFacet", "categoryFacet") + if key in extended + ) + + # extract download target + target = files[-1] + name = files[0]["src"] + + if target["type"] == "gif": + pass + elif target["type"] == "video": + # select largest video + target = max( + files, key=lambda x: text.parse_int(x.get("quality", "")[:-1])) + name = target["src"] + elif target["type"] == "flash": + if target["src"].startswith("https://sandbox.deviantart.com"): + # extract SWF file from "sandbox" + target["src"] = text.extract( + self.request(target["src"]).text, + 'id="sandboxembed" src="', '"', + )[0] + elif "download" in extended: + target = extended["download"] + target["src"] = target["url"] + del target["url"] + + # url rewrites + if target["src"].startswith("https://images-wixmp-"): + if deviation["index"] <= 790677560: + # https://github.com/r888888888/danbooru/issues/4069 + target["src"] = re.sub( + r"(/f/[^/]+/[^/]+)/v\d+/.*", + r"/intermediary\1", target["src"]) + if self.quality: + target["src"] = re.sub( + r"q_\d+", self.quality, target["src"]) + + text.nameext_from_url(name, target) + if target["filename"].endswith("-150"): + target["filename"] = target["filename"][:-4] + if not target["filename"].count("-"): + name, _, hid = target["filename"].rpartition("_") + target["filename"] = name + "-" + hid + deviation["target"] = 
target + deviation["filename"] = target["filename"] + deviation["extension"] = target["extension"] = ( + text.ext_from_url(target["src"])) + return deviation + + +class DeviantartDeviationExtractor(DeviantartExtractorV2): + """Extractor for single deviations""" + subcategory = "deviation" + archive_fmt = "{index}.{extension}" + pattern = BASE_PATTERN + r"/(art|journal)/(?:[^/?&#]+-)?(\d+)" + test = ( + (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), { + "options": (("original", 0),), + "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e", + }), + ("https://www.deviantart.com/zzz/art/zzz-1234567890", { + "count": 0, + }), + (("https://www.deviantart.com/myria-moon/art/Aime-Moi-261986576"), { + "pattern": (r"https://www.deviantart.com/download/261986576" + r"/[\w-]+\.jpg\?token=\w+&ts=\d+"), + }), + # wixmp URL rewrite + (("https://www.deviantart.com/citizenfresh/art/Hverarond-789295466"), { + "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" + r"/intermediary/f/[^/]+/[^.]+\.jpg$") + }), + # wixmp URL rewrite v2 (#369) + (("https://www.deviantart.com/josephbiwald/art/Destiny-2-804940104"), { + "pattern": r"https://images-wixmp-\w+\.wixmp\.com/.*,q_100," + }), + # non-download URL for GIFs (#242) + (("https://www.deviantart.com/skatergators/art/COM-Moni-781571783"), { + "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" + r"/f/[^/]+/[^.]+\.gif\?token="), + }), + # external URLs from description (#302) + (("https://www.deviantart.com/uotapo/art/INANAKI-Memo-590297498"), { + "options": (("extra", 1), ("original", 0)), + "pattern": r"https?://sta\.sh/\w+$", + "range": "2-", + "count": 4, + }), + # video + ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", { + "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b", + "keyword": { + "target": { + "duration": 306, + "extension": "mp4", + "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5", + "filesize": 9963639, + "quality": "1080p", + "src": str, + "type": "video", + }, + } + }), + # 
archive + ("https://www.deviantart.com/itsvenue/art/-brush-pngs-14-763300948", { + "pattern": r"https://.+deviantart.com/download/763300948/.*\.rar", + }), + # swf + ("https://www.deviantart.com/ikatxfruti/art/Bang-Bang-528130222", { + "pattern": r"https://images-wixmp-.*wixmp.com/f/.*\.swf", + }), + # old-style URLs + ("https://shimoda7.deviantart.com" + "/art/For-the-sake-of-a-memory-10073852"), + ("https://myria-moon.deviantart.com" + "/art/Aime-Moi-part-en-vadrouille-261986576"), + ("https://zzz.deviantart.com/art/zzz-1234567890"), + ) + + skip = Extractor.skip + + def __init__(self, match): + DeviantartExtractorV2.__init__(self, match) + self.type = match.group(3) + self.deviation_id = match.group(4) + + def deviations(self): + return ({ + "deviationId": self.deviation_id, + "author" : {"username": self.user}, + "isJournal" : self.type == "journal", + },) + + +class DeviantartScrapsExtractor(DeviantartExtractorV2): + """Extractor for an artist's scraps""" + subcategory = "scraps" + directory_fmt = ("{category}", "{username}", "Scraps") + archive_fmt = "s_{username}_{index}.{extension}" + pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b" + test = ( + ("https://www.deviantart.com/shimoda7/gallery/scraps", { + "count": 12, + }), + ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps"), + ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"), + ) + + def deviations(self): + url = self.root + "/_napi/da-user-profile/api/gallery/contents" + params = { + "username" : self.user, + "offset" : self.offset, + "limit" : "24", + "scraps_folder": "true", + } + headers = { + "Referer": "{}/{}/gallery/scraps".format(self.root, self.user), + } + + while True: + data = self.request(url, params=params, headers=headers).json() + + for obj in data["results"]: + yield obj["deviation"] + + if not data["hasMore"]: + return + params["offset"] = data["nextOffset"] + + class DeviantartAPI(): """Minimal interface for the DeviantArt API @@ -805,7 +943,7 @@ class 
DeviantartAPI(): self.authenticate(None if public else self.refresh_token) response = self.extractor.request( - url, headers=self.headers, params=params, fatal=False) + url, headers=self.headers, params=params, fatal=None) data = response.json() status = response.status_code diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index ce2e83b..4ec7f00 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -33,16 +33,16 @@ class GelbooruExtractor(booru.XmlParserMixin, self.session.cookies["fringeBenefits"] = "yup" def items_noapi(self): - data = self.get_metadata() - yield Message.Version, 1 - yield Message.Directory, data + data = self.get_metadata() for post in self.get_posts(): post = self.get_post_data(post) url = post["file_url"] post.update(data) - yield Message.Url, url, text.nameext_from_url(url, post) + text.nameext_from_url(url, post) + yield Message.Directory, post + yield Message.Url, url, post def get_posts(self): """Return an iterable containing all relevant post objects""" diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py index c112465..e4f18b3 100644 --- a/gallery_dl/extractor/hitomi.py +++ b/gallery_dl/extractor/hitomi.py @@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor): test = ( ("https://hitomi.la/galleries/867789.html", { "url": "cb759868d090fe0e2655c3e29ebf146054322b6d", - "keyword": "067b5d9b9c0f98530cd5dd2444e0f5a5b4b00d38", + "keyword": "d097a8db8e810045131b4510c41714004f9eff3a", }), ("https://hitomi.la/galleries/1036181.html", { # "aa" subdomain for gallery-id ending in 1 (#142) diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py index 6980185..76b2c38 100644 --- a/gallery_dl/extractor/imagebam.py +++ b/gallery_dl/extractor/imagebam.py @@ -41,14 +41,14 @@ class ImagebamGalleryExtractor(ImagebamExtractor): pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)" test = ( 
("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { - "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", + "url": "76d976788ae2757ac81694736b07b72356f5c4c8", "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a", "content": "596e6bfa157f2c7169805d50075c2986549973a8", }), ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", { # more than 100 images; see issue #219 "count": 107, - "url": "f92ce5b17676b6ea69288f0aef26f4cdbea7fd8d", + "url": "32ae6fe5dc3e4ca73ff6252e522d16473595d1d1", }), ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { "exception": exception.NotFoundError, @@ -108,7 +108,7 @@ class ImagebamImageExtractor(ImagebamExtractor): r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)") test = ( ("http://www.imagebam.com/image/94d56c502511890", { - "url": "b384893c35a01a09c58018db71ddc4cf2480be95", + "url": "5e9ba3b1451f8ded0ae3a1b84402888893915d4a", "keyword": "4263d4840007524129792b8587a562b5d20c2687", "content": "0c8768055e4e20e7c7259608b67799171b691140", }), diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py index 442634b..4aa670b 100644 --- a/gallery_dl/extractor/imgbb.py +++ b/gallery_dl/extractor/imgbb.py @@ -17,6 +17,7 @@ import json class ImgbbExtractor(Extractor): """Base class for imgbb extractors""" category = "imgbb" + directory_fmt = ("{category}", "{user}") filename_fmt = "{title} {id}.{extension}" archive_fmt = "{id}" root = "https://imgbb.com" @@ -145,7 +146,6 @@ class ImgbbAlbumExtractor(ImgbbExtractor): class ImgbbUserExtractor(ImgbbExtractor): """Extractor for user profiles in imgbb.com""" subcategory = "user" - directory_fmt = ("{category}", "{user}") pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$" test = ("https://folkie.imgbb.com", { "range": "1-80", @@ -177,3 +177,34 @@ class ImgbbUserExtractor(ImgbbExtractor): "params_hidden[userid]": user, "params_hidden[from]" : "user", }) + + +class ImgbbImageExtractor(ImgbbExtractor): + subcategory = "image" + 
pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)" + test = ("https://ibb.co/NLZHgqS", { + "url": "fbca86bac09de6fc0304054b2170b423ca1e84fa", + "keyword": "5d70e779bad03b2dc5273b627638045168671157", + }) + + def __init__(self, match): + ImgbbExtractor.__init__(self, match) + self.image_id = match.group(1) + + def items(self): + url = "https://ibb.co/" + self.image_id + extr = text.extract_from(self.request(url).text) + + image = { + "id" : self.image_id, + "title" : text.unescape(extr('"og:title" content="', '"')), + "url" : extr('"og:image" content="', '"'), + "width" : text.parse_int(extr('"og:image:width" content="', '"')), + "height": text.parse_int(extr('"og:image:height" content="', '"')), + "user" : extr('rel="author">', '<').lower(), + } + image["extension"] = text.ext_from_url(image["url"]) + + yield Message.Version, 1 + yield Message.Directory, image + yield Message.Url, image["url"], image diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index c5e3d17..8523523 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -20,13 +20,19 @@ class ImgurExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.item_id = match.group(1) + self.key = match.group(1) self.mp4 = self.config("mp4", True) - def _get_data(self, path): + def _extract_data(self, path): response = self.request(self.root + path, notfound=self.subcategory) - data = text.extract(response.text, "image : ", ",\n")[0] - return self._clean(json.loads(data)) + data = json.loads(text.extract( + response.text, "image : ", ",\n")[0]) + try: + del data["adConfig"] + del data["isAd"] + except KeyError: + pass + return data def _prepare(self, image): image["ext"] = image["ext"].partition("?")[0] @@ -37,18 +43,9 @@ class ImgurExtractor(Extractor): image["extension"] = image["ext"][1:] return url - @staticmethod - def _clean(data): - try: - del data["adConfig"] - del data["isAd"] - except KeyError: - pass - return data - 
class ImgurImageExtractor(ImgurExtractor): - """Extractor for individual images from imgur.com""" + """Extractor for individual images on imgur.com""" subcategory = "image" filename_fmt = "{category}_{hash}{title:?_//}.{extension}" archive_fmt = "{hash}" @@ -101,22 +98,21 @@ class ImgurImageExtractor(ImgurExtractor): ) def items(self): - image = self._get_data("/" + self.item_id) + image = self._extract_data("/" + self.key) url = self._prepare(image) - yield Message.Version, 1 yield Message.Directory, image yield Message.Url, url, image class ImgurAlbumExtractor(ImgurExtractor): - """Extractor for image albums from imgur.com""" + """Extractor for imgur albums""" subcategory = "album" directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}") filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}" archive_fmt = "{album[hash]}_{hash}" pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com" - r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})") + r"/(?:a|t/unmuted)/(\w{7}|\w{5})") test = ( ("https://imgur.com/a/TcBmP", { "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", @@ -147,7 +143,7 @@ class ImgurAlbumExtractor(ImgurExtractor): "width": int, }, }), - ("https://imgur.com/gallery/eD9CT", { # large album + ("https://imgur.com/a/eD9CT", { # large album "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937", }), ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash @@ -164,13 +160,13 @@ class ImgurAlbumExtractor(ImgurExtractor): ) def items(self): - album = self._get_data("/a/" + self.item_id + "/all") + album = self._extract_data("/a/" + self.key + "/all") images = album["album_images"]["images"] del album["album_images"] if int(album["num_images"]) > len(images): url = "{}/ajaxalbums/getimages/{}/hit.json".format( - self.root, self.item_id) + self.root, self.key) images = self.request(url).json()["data"]["images"] yield Message.Version, 1 @@ -180,3 +176,32 @@ class ImgurAlbumExtractor(ImgurExtractor): image["num"] = num image["album"] = album yield 
Message.Url, url, image + + +class ImgurGalleryExtractor(ImgurExtractor): + """Extractor for imgur galleries""" + subcategory = "gallery" + pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com" + r"/gallery/(\w{7}|\w{5})") + test = ( + ("https://imgur.com/gallery/zf2fIms", { # non-album gallery (#380) + "pattern": "https://imgur.com/zf2fIms", + }), + ("https://imgur.com/gallery/eD9CT", { + "pattern": "https://imgur.com/a/eD9CT", + }), + ) + + def items(self): + url = self.root + "/a/" + self.key + with self.request(url, method="HEAD", fatal=False) as response: + code = response.status_code + + if code < 400: + extr = ImgurAlbumExtractor + else: + extr = ImgurImageExtractor + url = self.root + "/" + self.key + + yield Message.Version, 1 + yield Message.Queue, url, {"_extractor": extr} diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index 475e24b..e5cfe8b 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -8,11 +8,10 @@ """Extract images from https://www.instagram.com/""" -import hashlib -import json from .common import Extractor, Message from .. 
import text, exception from ..cache import cache +import json class InstagramExtractor(Extractor): @@ -37,10 +36,11 @@ class InstagramExtractor(Extractor): data.update(metadata) yield Message.Directory, data - if data['typename'] == 'GraphImage': + if data['typename'] in ('GraphImage', 'GraphStoryImage', 'GraphStoryVideo'): yield Message.Url, data['display_url'], \ text.nameext_from_url(data['display_url'], data) elif data['typename'] == 'GraphVideo': + data["extension"] = None yield Message.Url, \ 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data @@ -140,33 +140,113 @@ class InstagramExtractor(Extractor): return medias + def _extract_stories(self, url): + if self.highlight_id: + user_id = '' + highlight_id = '"{}"'.format(self.highlight_id) + query_hash = '30a89afdd826d78a5376008a7b81c205' + else: + page = self.request(url).text + shared_data = self._extract_shared_data(page) + + # If no stories are present the URL redirects to `ProfilePage' + if 'StoriesPage' not in shared_data['entry_data']: + return [] + + user_id = '"{}"'.format( + shared_data['entry_data']['StoriesPage'][0]['user']['id']) + highlight_id = '' + query_hash = 'cda12de4f7fd3719c0569ce03589f4c4' + + variables = ( + '{{' + '"reel_ids":[{}],"tag_names":[],"location_ids":[],' + '"highlight_reel_ids":[{}],"precomposed_overlay":true,' + '"show_story_viewer_list":true,' + '"story_viewer_fetch_count":50,"story_viewer_cursor":"",' + '"stories_video_dash_manifest":false}}' + ).format(user_id, highlight_id) + headers = { + "X-Requested-With": "XMLHttpRequest", + } + url = '{}/graphql/query/?query_hash={}&variables={}'.format( + self.root, + query_hash, + variables, + ) + shared_data = self.request(url, headers=headers).json() + + # If there are stories present but the user is not authenticated or + # does not have permissions no stories are returned. 
+ if not shared_data['data']['reels_media']: + return [] # no stories present + + medias = [] + for media in shared_data['data']['reels_media'][0]['items']: + media_data = { + 'owner_id': media['owner']['id'], + 'username': media['owner']['username'], + 'date': text.parse_timestamp(media['taken_at_timestamp']), + 'expires': text.parse_timestamp(media['expiring_at_timestamp']), + 'media_id': media['id'], + 'typename': media['__typename'], + } + if media['__typename'] == 'GraphStoryImage': + media_data.update({ + 'display_url': media['display_url'], + 'height': text.parse_int(media['dimensions']['height']), + 'width': text.parse_int(media['dimensions']['width']), + }) + elif media['__typename'] == 'GraphStoryVideo': + vr = media['video_resources'][0] + media_data.update({ + 'duration': text.parse_float(media['video_duration']), + 'display_url': vr['src'], + 'height': text.parse_int(vr['config_height']), + 'width': text.parse_int(vr['config_width']), + }) + medias.append(media_data) + + return medias + def _extract_page(self, url, page_type): shared_data_fields = { 'ProfilePage': { + 'page': 'ProfilePage', 'node': 'user', 'node_id': 'id', 'edge_to_medias': 'edge_owner_to_timeline_media', 'variables_id': 'id', - 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41', + 'query_hash': 'f2405b236d85e8296cf30347c9f08c2a', + }, + 'ProfileChannelPage': { + 'page': 'ProfilePage', + 'node': 'user', + 'node_id': 'id', + 'edge_to_medias': 'edge_felix_video_timeline', + 'variables_id': 'id', + 'query_hash': 'bc78b344a68ed16dd5d7f264681c4c76', }, 'TagPage': { + 'page': 'TagPage', 'node': 'hashtag', 'node_id': 'name', 'edge_to_medias': 'edge_hashtag_to_media', 'variables_id': 'tag_name', - 'query_hash': 'f92f56d47dc7a55b606908374b43a314', + 'query_hash': 'f12c9ec5e46a3173b2969c712ad84744', }, } page = self.request(url).text shared_data = self._extract_shared_data(page) psdf = shared_data_fields[page_type] + csrf = shared_data["config"]["csrf_token"] while True: # Deal with different 
structure of pages: the first page # has interesting data in `entry_data', next pages in `data'. if 'entry_data' in shared_data: - base_shared_data = shared_data['entry_data'][page_type][0]['graphql'] + base_shared_data = shared_data['entry_data'][psdf['page']][0]['graphql'] # variables_id is available only in the first page variables_id = base_shared_data[psdf['node']][psdf['node_id']] @@ -192,7 +272,8 @@ class InstagramExtractor(Extractor): ) headers = { "X-Requested-With": "XMLHttpRequest", - "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(), + "X-CSRFToken": csrf, + "X-IG-App-ID": "936619743392459", } url = '{}/graphql/query/?query_hash={}&variables={}'.format( self.root, @@ -204,14 +285,20 @@ class InstagramExtractor(Extractor): def _extract_profilepage(self, url): yield from self._extract_page(url, 'ProfilePage') + def _extract_profilechannelpage(self, url): + yield from self._extract_page(url, 'ProfileChannelPage') + def _extract_tagpage(self, url): yield from self._extract_page(url, 'TagPage') + def _extract_storiespage(self, url): + yield from self._extract_stories(url) + class InstagramImageExtractor(InstagramExtractor): """Extractor for PostPage""" subcategory = "image" - pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)" + pattern = r"(?:https?://)?(?:www\.)?instagram\.com/(?:p|tv)/([^/?&#]+)" test = ( # GraphImage ("https://www.instagram.com/p/BqvsDleB3lV/", { @@ -258,6 +345,22 @@ class InstagramImageExtractor(InstagramExtractor): } }), + # GraphVideo (IGTV) + ("https://www.instagram.com/tv/BkQjCfsBIzi/", { + "url": "64208f408e11cbbca86c2df4488e90262ae9d9ec", + "keyword": { + "date": "type:datetime", + "description": str, + "height": int, + "likes": int, + "media_id": "1806097553666903266", + "shortcode": "BkQjCfsBIzi", + "typename": "GraphVideo", + "username": "instagram", + "width": int, + } + }), + # GraphSidecar with 2 embedded GraphVideo objects ("https://www.instagram.com/p/BtOvDOfhvRr/", { "count": 2, @@ -283,10 
+386,11 @@ class InstagramUserExtractor(InstagramExtractor): """Extractor for ProfilePage""" subcategory = "user" pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" - r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)") + r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" + r"([^/?&#]+)/?$") test = ("https://www.instagram.com/instagram/", { - "range": "1-12", - "count": ">= 12", + "range": "1-16", + "count": ">= 16", }) def __init__(self, match): @@ -298,6 +402,26 @@ class InstagramUserExtractor(InstagramExtractor): return self._extract_profilepage(url) +class InstagramChannelExtractor(InstagramExtractor): + """Extractor for ProfilePage channel""" + subcategory = "channel" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)" + r"([^/?&#]+)/channel") + test = ("https://www.instagram.com/instagram/channel/", { + "range": "1-16", + "count": ">= 16", + }) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.username = match.group(1) + + def instagrams(self): + url = '{}/{}/channel/'.format(self.root, self.username) + return self._extract_profilechannelpage(url) + + class InstagramTagExtractor(InstagramExtractor): """Extractor for TagPage""" subcategory = "tag" @@ -305,8 +429,8 @@ class InstagramTagExtractor(InstagramExtractor): pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" r"/explore/tags/([^/?&#]+)") test = ("https://www.instagram.com/explore/tags/instagram/", { - "range": "1-12", - "count": ">= 12", + "range": "1-16", + "count": ">= 16", }) def __init__(self, match): @@ -319,3 +443,22 @@ class InstagramTagExtractor(InstagramExtractor): def instagrams(self): url = '{}/explore/tags/{}/'.format(self.root, self.tag) return self._extract_tagpage(url) + + +class InstagramStoriesExtractor(InstagramExtractor): + """Extractor for StoriesPage""" + subcategory = "stories" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/stories/([^/?&#]+)(?:/(\d+))?") + test = ( + 
("https://www.instagram.com/stories/instagram/"), + ("https://www.instagram.com/stories/highlights/18042509488170095/"), + ) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.username, self.highlight_id = match.groups() + + def instagrams(self): + url = '{}/stories/{}/'.format(self.root, self.username) + return self._extract_storiespage(url) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 879d38b..a73eb86 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor): test = ( ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { "url": "7e4984a271a1072ac6483e4228a045895aff86f3", - "keyword": "ab4e5b71583fd439b4c8012a642aa8b58d8d0758", + "keyword": "07c0b915f2ab1cc3bbf28b76e7950fccee1213f3", "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", }), ("https://luscious.net/albums/virgin-killer-sweater_282582/", { diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py index 282c389..1ca1073 100644 --- a/gallery_dl/extractor/newgrounds.py +++ b/gallery_dl/extractor/newgrounds.py @@ -93,7 +93,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor): test = ( ("https://blitzwuff.newgrounds.com/art", { "url": "24b19c4a135a09889fac7b46a74e427e4308d02b", - "keyword": "98566e0c8096a8099b8d71962fea7e31c8b098d4", + "keyword": "62981f7bdd66e1f1c72ab1d9b932423c156bc9a1", }), ("https://blitzwuff.newgrounds.com/"), ) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py index 4884497..ab5932d 100644 --- a/gallery_dl/extractor/patreon.py +++ b/gallery_dl/extractor/patreon.py @@ -11,6 +11,8 @@ from .common import Extractor, Message from .. 
import text from ..cache import memcache +import collections +import json class PatreonExtractor(Extractor): @@ -33,70 +35,92 @@ class PatreonExtractor(Extractor): for post in self.posts(): yield Message.Directory, post + ids = set() post["num"] = 0 content = post.get("content") postfile = post.get("post_file") - for url in text.extract_iter(content or "", 'src="', '"'): + for image in post["images"]: + url = image.get("download_url") + if not url: + continue + ids.add(url.split("/")[-2]) + name = image.get("file_name") or self._filename(url) or url + post["num"] += 1 - yield Message.Url, url, text.nameext_from_url(url, post) + post["type"] = "image" + yield Message.Url, url, text.nameext_from_url(name, post) - if postfile: + if postfile and postfile["url"].split("/")[-2] not in ids: post["num"] += 1 + post["type"] = "postfile" text.nameext_from_url(postfile["name"], post) yield Message.Url, postfile["url"], post for attachment in post["attachments"]: post["num"] += 1 + post["type"] = "attachment" text.nameext_from_url(attachment["name"], post) yield Message.Url, attachment["url"], post + if content: + for url in text.extract_iter(content, 'src="', '"'): + post["num"] += 1 + post["type"] = "content" + yield Message.Url, url, text.nameext_from_url(url, post) + def posts(self): """Return all relevant post objects""" def _pagination(self, url): headers = {"Referer": self.root} - empty = [] while url: posts = self.request(url, headers=headers).json() - if "included" not in posts: - return - - # collect attachments - attachments = {} - for inc in posts["included"]: - if inc["type"] == "attachment": - attachments[inc["id"]] = inc["attributes"] - - # update posts - for post in posts["data"]: - attr = post["attributes"] - attr["id"] = text.parse_int(post["id"]) - attr["date"] = text.parse_datetime( - attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") - attr["creator"] = self._user( - post["relationships"]["user"]["links"]["related"]) - - # add attachments to post attributes - 
files = post["relationships"].get("attachments") - if files: - attr["attachments"] = [ - attachments[f["id"]] - for f in files["data"] - ] - else: - attr["attachments"] = empty - - yield attr + if "included" in posts: + included = self._transform(posts["included"]) + for post in posts["data"]: + yield self._process(post, included) if "links" not in posts: return url = posts["links"].get("next") + def _process(self, post, included): + """Process and extend a 'post' object""" + attr = post["attributes"] + attr["id"] = text.parse_int(post["id"]) + attr["images"] = self._files(post, included, "images") + attr["attachments"] = self._files(post, included, "attachments") + attr["date"] = text.parse_datetime( + attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + attr["creator"] = self._user( + post["relationships"]["user"]["links"]["related"]) + return attr + + @staticmethod + def _transform(included): + """Transform 'included' into an easier to handle format""" + result = collections.defaultdict(dict) + for inc in included: + result[inc["type"]][inc["id"]] = inc["attributes"] + return result + + @staticmethod + def _files(post, included, key): + """Build a list of files""" + files = post["relationships"].get(key) + if files and files.get("data"): + return [ + included[file["type"]][file["id"]] + for file in files["data"] + ] + return [] + @memcache(keyarg=1) def _user(self, url): + """Fetch user information""" user = self.request(url).json()["data"] attr = user["attributes"] attr["id"] = user["id"] @@ -104,14 +128,21 @@ class PatreonExtractor(Extractor): attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z") return attr + def _filename(self, url): + """Fetch filename from its Content-Disposition header""" + response = self.request(url, method="HEAD", fatal=False) + cd = response.headers.get("Content-Disposition") + return text.extract(cd, 'filename="', '"')[0] + @staticmethod def _build_url(endpoint, query): return ( "https://www.patreon.com/api/" + endpoint + - 
"?include=user,attachments,user_defined_tags,campaign,poll.choices" - ",poll.current_user_responses.user,poll.current_user_responses.cho" - "ice,poll.current_user_responses.poll,access_rules.tier.null" + "?include=user,images,attachments,user_defined_tags,campaign,poll." + "choices,poll.current_user_responses.user,poll.current_user_respon" + "ses.choice,poll.current_user_responses.poll,access_rules.tier.nul" + "l" "&fields[post]=change_visibility_at,comment_count,content,current_" "user_can_delete,current_user_can_view,current_user_has_liked,embe" @@ -133,7 +164,8 @@ class PatreonCreatorExtractor(PatreonExtractor): """Extractor for a creator's works""" subcategory = "creator" pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" - r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?") + r"/(?!(?:home|join|posts|login|signup)(?:$|[/?&#]))" + r"([^/?&#]+)/?") test = ("https://www.patreon.com/koveliana", { "range": "1-25", "count": ">= 25", @@ -144,6 +176,7 @@ class PatreonCreatorExtractor(PatreonExtractor): "creator": dict, "date": "type:datetime", "id": int, + "images": list, "like_count": int, "post_type": str, "published_at": str, @@ -181,3 +214,26 @@ class PatreonUserExtractor(PatreonExtractor): "&filter[is_following]=true" )) return self._pagination(url) + + +class PatreonPostExtractor(PatreonExtractor): + """Extractor for media from a single post""" + subcategory = "post" + pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" + r"/posts/[^/?&#]*?(\d+)") + test = ("https://www.patreon.com/posts/precious-metal-23563293", { + "count": 4, + }) + + def __init__(self, match): + PatreonExtractor.__init__(self, match) + self.post_id = match.group(1) + + def posts(self): + url = "{}/posts/{}".format(self.root, self.post_id) + page = self.request(url).text + data = text.extract(page, "window.patreon.bootstrap,", "\n});")[0] + post = json.loads(data + "}")["post"] + + included = self._transform(post["included"]) + return (self._process(post["data"], included),) diff --git 
a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index 76d4dc4..4f8ee9c 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -18,8 +18,8 @@ class PixivExtractor(Extractor): """Base class for pixiv extractors""" category = "pixiv" directory_fmt = ("{category}", "{user[id]} {user[account]}") - filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}" - archive_fmt = "{id}{num}.{extension}" + filename_fmt = "{id}_p{num}.{extension}" + archive_fmt = "{id}{suffix}.{extension}" def __init__(self, match): Extractor.__init__(self, match) @@ -40,9 +40,10 @@ class PixivExtractor(Extractor): del work["meta_single_page"] del work["image_urls"] del work["meta_pages"] - work["num"] = "" + work["num"] = 0 work["tags"] = [tag["name"] for tag in work["tags"]] work["date"] = text.parse_datetime(work["create_date"]) + work["suffix"] = "" work.update(metadata) yield Message.Directory, work @@ -55,20 +56,17 @@ class PixivExtractor(Extractor): url = ugoira["zip_urls"]["medium"].replace( "_ugoira600x600", "_ugoira1920x1080") work["frames"] = ugoira["frames"] - work["extension"] = "zip" - yield Message.Url, url, work + yield Message.Url, url, text.nameext_from_url(url, work) elif work["page_count"] == 1: url = meta_single_page["original_image_url"] - work["extension"] = url.rpartition(".")[2] - yield Message.Url, url, work + yield Message.Url, url, text.nameext_from_url(url, work) else: - for num, img in enumerate(meta_pages): + for work["num"], img in enumerate(meta_pages): url = img["image_urls"]["original"] - work["num"] = "_p{:02}".format(num) - work["extension"] = url.rpartition(".")[2] - yield Message.Url, url, work + work["suffix"] = "_p{:02}".format(work["num"]) + yield Message.Url, url, text.nameext_from_url(url, work) def works(self): """Return an iterable containing all relevant 'work'-objects""" diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py index fa4eb81..aa5c9c6 100644 --- 
a/gallery_dl/extractor/pururin.py +++ b/gallery_dl/extractor/pururin.py @@ -29,7 +29,7 @@ class PururinGalleryExtractor(GalleryExtractor): "artist" : ["Shoda Norihiro"], "group" : ["Obsidian Order"], "parody" : ["Kantai Collection"], - "characters": ["Iowa", "Teitoku"], + "characters": ["Admiral", "Iowa"], "tags" : list, "type" : "Doujinshi", "collection": "", diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 59d502a..f97454b 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -117,6 +117,8 @@ class ReactorExtractor(SharedConfigMixin, Extractor): url = text.extract(image, ' src="', '"')[0] if not url: continue + if url.startswith("//"): + url = "http:" + url width = text.extract(image, ' width="', '"')[0] height = text.extract(image, ' height="', '"')[0] image_id = url.rpartition("-")[2].partition(".")[0] @@ -268,8 +270,8 @@ class JoyreactorPostExtractor(ReactorPostExtractor): "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47", }), ("http://joyreactor.com/post/3668724", { # youtube embed - "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a", - "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651", + "url": "bf1666eddcff10c9b58f6be63fa94e4e13074214", + "keyword": "989112c7888e9cc80fd35870180c6c98165d953b", }), ("http://joyreactor.cc/post/1299", { # "malformed" JSON "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde", diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 2ba4b99..94e95e8 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -234,7 +234,7 @@ class RedditAPI(): url = "https://oauth.reddit.com" + endpoint params["raw_json"] = 1 self.authenticate() - response = self.extractor.request(url, params=params, fatal=False) + response = self.extractor.request(url, params=params, fatal=None) remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: wait = int(response.headers["x-ratelimit-reset"]) 
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py index da9735e..bb8a2ae 100644 --- a/gallery_dl/extractor/sankaku.py +++ b/gallery_dl/extractor/sankaku.py @@ -40,17 +40,18 @@ class SankakuExtractor(SharedConfigMixin, Extractor): def items(self): self.login() - data = self.get_metadata() yield Message.Version, 1 - yield Message.Directory, data + data = self.get_metadata() for post_id in util.advance(self.get_posts(), self.start_post): self.wait() post = self.get_post_data(post_id) url = post["file_url"] post.update(data) - yield Message.Url, url, text.nameext_from_url(url, post) + text.nameext_from_url(url, post) + yield Message.Directory, post + yield Message.Url, url, post def skip(self, num): self.start_post += num diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index afd4eaa..38b7813 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -78,6 +78,7 @@ class SexcomExtractor(Extractor): path += "/hd" data["url"] = self.root + path else: + data["extension"] = None data["url"] = "ytdl:" + text.extract( extr('<iframe', '>'), ' src="', '"')[0] else: diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index 5ad372d..8567155 100644 --- a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -8,14 +8,16 @@ """Extract hentai-manga from https://www.simply-hentai.com/""" -from .common import GalleryExtractor, Extractor, Message +from .common import GalleryExtractor from .. 
import text, util, exception +import json class SimplyhentaiGalleryExtractor(GalleryExtractor): """Extractor for image galleries from simply-hentai.com""" category = "simplyhentai" archive_fmt = "{image_id}" + root = "https://www.simply-hentai.com" pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com" r"(?!/(?:album|gifs?|images?|series)(?:/|$))" r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)") @@ -23,7 +25,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): (("https://original-work.simply-hentai.com" "/amazon-no-hiyaku-amazon-elixir"), { "url": "258289249990502c3138719cb89e995a60861e49", - "keyword": "eba83ccdbab3022a2280c77aa747f9458196138b", + "keyword": "8b2400e4b466e8f46802fa5a6b917d2788bb7e8e", }), ("https://www.simply-hentai.com/notfound", { "exception": exception.GalleryDLException, @@ -40,144 +42,30 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): self.session.headers["Referer"] = url def metadata(self, page): - extr = text.extract_from(page) - split = text.split_html - - title = extr('<meta property="og:title" content="', '"') - if not title: + path = text.extract(page, '<a class="preview" href="', '"')[0] + if not path: raise exception.NotFoundError("gallery") - data = { - "title" : text.unescape(title), - "gallery_id": text.parse_int(extr('/Album/', '/')), - "parody" : split(extr('box-title">Series</div>', '</div>')), - "language" : text.remove_html(extr( - 'box-title">Language</div>', '</div>')) or None, - "characters": split(extr('box-title">Characters</div>', '</div>')), - "tags" : split(extr('box-title">Tags</div>', '</div>')), - "artist" : split(extr('box-title">Artists</div>', '</div>')), - "date" : text.parse_datetime(text.remove_html( - extr('Uploaded', '</div>')), "%d.%m.%Y"), + page = self.request(self.root + path).text + data = json.loads(text.unescape(text.extract( + page, 'data-react-class="Reader" data-react-props="', '"')[0])) + self.manga = manga = data["manga"] + + return { + "title" : manga["title"], + 
"parody" : manga["series"]["title"], + "language" : manga["language"]["name"], + "lang" : util.language_to_code(manga["language"]["name"]), + "characters": [x["name"] for x in manga["characters"]], + "tags" : [x["name"] for x in manga["tags"]], + "artist" : [x["name"] for x in manga["artists"]], + "gallery_id": text.parse_int(text.extract( + manga["images"][0]["sizes"]["full"], "/Album/", "/")[0]), + "date" : text.parse_datetime( + manga["publish_date"], "%Y-%m-%dT%H:%M:%S.%f%z"), } - data["lang"] = util.language_to_code(data["language"]) - return data def images(self, _): - url = self.chapter_url + "/all-pages" - headers = {"Accept": "application/json"} - images = self.request(url, headers=headers).json() return [ - (urls["full"], {"image_id": text.parse_int(image_id)}) - for image_id, urls in sorted(images.items()) + (image["sizes"]["full"], {"image_id": image["id"]}) + for image in self.manga["images"] ] - - -class SimplyhentaiImageExtractor(Extractor): - """Extractor for individual images from simply-hentai.com""" - category = "simplyhentai" - subcategory = "image" - directory_fmt = ("{category}", "{type}s") - filename_fmt = "{category}_{token}{title:?_//}.{extension}" - archive_fmt = "{token}" - pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com" - r"/(image|gif)/[^/?&#]+)") - test = ( - (("https://www.simply-hentai.com/image" - "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), { - "url": "0338eb137830ab6f81e5f410d3936ef785d063d9", - "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2", - }), - ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", { - "url": "11c060d7ec4dfd0bd105300b6e1fd454674a5af1", - "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page_url = "https://www." 
+ match.group(1) - self.type = match.group(2) - - def items(self): - extr = text.extract_from(self.request(self.page_url).text) - title = extr('"og:title" content="' , '"') - descr = extr('"og:description" content="', '"') - url = extr('"image":"' , '&') - url = extr(""content":"", "&") or url - - tags = text.extract(descr, " tagged with ", " online for free ")[0] - if tags: - tags = tags.split(", ") - tags[-1] = tags[-1].partition(" ")[2] - else: - tags = [] - - data = text.nameext_from_url(url, { - "title": text.unescape(title) if title else "", - "tags": tags, - "type": self.type, - }) - data["token"] = data["filename"].rpartition("_")[2] - - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, url, data - - -class SimplyhentaiVideoExtractor(Extractor): - """Extractor for hentai videos from simply-hentai.com""" - category = "simplyhentai" - subcategory = "video" - directory_fmt = ("{category}", "{type}s") - filename_fmt = "{title}{episode:?_//>02}.{extension}" - archive_fmt = "{title}_{episode}" - pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)" - test = ( - ("https://videos.simply-hentai.com/creamy-pie-episode-02", { - "pattern": r"https://www\.googleapis\.com/drive/v3/files" - r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+", - "keyword": "706790708b14773efc1e075ddd3b738a375348a5", - "count": 1, - }), - (("https://videos.simply-hentai.com" - "/1715-tifa-in-hentai-gang-bang-3d-movie"), { - "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0", - "keyword": "f9dad94fbde9c95859e631ff4f07297a9567b874", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.page_url = "https://" + match.group(1) - - def items(self): - page = self.request(self.page_url).text - - title, pos = text.extract(page, "<title>", "</title>") - tags , pos = text.extract(page, ">Tags</div>", "</div>", pos) - date , pos = text.extract(page, ">Upload Date</div>", "</div>", pos) - title = title.rpartition(" - ")[0] - - if 
"<video" in page: - video_url = text.extract(page, '<source src="', '"', pos)[0] - episode = 0 - else: - # video url from myhentai.tv embed - pos = page.index('<div class="video-frame-container">', pos) - embed_url = text.extract(page, 'src="', '"', pos)[0].replace( - "embedplayer.php?link=", "embed.php?name=") - embed_page = self.request(embed_url).text - video_url = text.extract(embed_page, '"file":"', '"')[0] - title, _, episode = title.rpartition(" Episode ") - - data = text.nameext_from_url(video_url, { - "title": text.unescape(title), - "episode": text.parse_int(episode), - "tags": text.split_html(tags)[::2], - "type": "video", - "date": text.parse_datetime(text.remove_html( - date), "%B %d, %Y %H:%M"), - }) - - yield Message.Version, 1 - yield Message.Directory, data - yield Message.Url, video_url, data diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index ccba640..3672a6d 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -54,6 +54,7 @@ class TwitterExtractor(Extractor): if self.videos and "-videoContainer" in tweet: data["num"] = 1 + data["extension"] = None url = "ytdl:{}/{}/status/{}".format( self.root, data["user"], data["tweet_id"]) yield Message.Url, url, data diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py index b9c223c..463733f 100644 --- a/gallery_dl/extractor/wikiart.py +++ b/gallery_dl/extractor/wikiart.py @@ -70,7 +70,7 @@ class WikiartArtistExtractor(WikiartExtractor): pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)" test = ("https://www.wikiart.org/en/thomas-cole", { "url": "f1eee8158f5b8b7380382ab730a8f53884715c8b", - "keyword": "b62678394ce645815963883d5c9642255307225f", + "keyword": "c61f5a4774b977106000e9554d19cfb9438a7032", }) def __init__(self, match): diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py index 9699806..23750db 100644 --- a/gallery_dl/extractor/xhamster.py +++ b/gallery_dl/extractor/xhamster.py @@ 
-13,13 +13,16 @@ from .. import text import json -BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)" +BASE_PATTERN = r"(?:https?://)?((?:[^.]+\.)?xhamster\d?\.(?:com|one|desi))" class XhamsterExtractor(Extractor): """Base class for xhamster extractors""" category = "xhamster" - root = "https://xhamster.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "https://" + match.group(1) class XhamsterGalleryExtractor(XhamsterExtractor): @@ -66,16 +69,21 @@ class XhamsterGalleryExtractor(XhamsterExtractor): }, }, }), + ("https://jp.xhamster2.com/photos/gallery/11748968", { + "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$", + "count": ">= 144", + }), ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"), ("https://xhamster.com/photos/gallery/11748968"), ("https://xhamster.one/photos/gallery/11748968"), ("https://xhamster.desi/photos/gallery/11748968"), + ("https://xhamster2.com/photos/gallery/11748968"), ("https://en.xhamster.com/photos/gallery/11748968"), ) def __init__(self, match): XhamsterExtractor.__init__(self, match) - self.path = match.group(1) + self.path = match.group(2) self.data = None def items(self): @@ -154,7 +162,7 @@ class XhamsterUserExtractor(XhamsterExtractor): def __init__(self, match): XhamsterExtractor.__init__(self, match) - self.user = match.group(1) + self.user = match.group(2) def items(self): yield Message.Version, 1 |
