summary refs log tree commit diff stats
path: root/gallery_dl/extractor
diff options
context:
space:
mode:
author [Libravatar] Unit 193 <unit193@ubuntu.com> 2019-08-26 19:34:45 -0400
committer [Libravatar] Unit 193 <unit193@ubuntu.com> 2019-08-26 19:34:45 -0400
commit b75d158d014d6c43d7d785c46c9372a9cf84d144 (patch)
tree 7dca4a7e61fe8b6e2bff2142fc19891e783a7d6d /gallery_dl/extractor
parent 64ad8e7bd15df71ab1116eede414558631bcad32 (diff)
New upstream version 1.10.2 (upstream/1.10.2)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r-- gallery_dl/extractor/adultempire.py 8
-rw-r--r-- gallery_dl/extractor/artstation.py 1
-rw-r--r-- gallery_dl/extractor/booru.py 8
-rw-r--r-- gallery_dl/extractor/common.py 3
-rw-r--r-- gallery_dl/extractor/deviantart.py 394
-rw-r--r-- gallery_dl/extractor/gelbooru.py 8
-rw-r--r-- gallery_dl/extractor/hitomi.py 2
-rw-r--r-- gallery_dl/extractor/imagebam.py 6
-rw-r--r-- gallery_dl/extractor/imgbb.py 33
-rw-r--r-- gallery_dl/extractor/imgur.py 67
-rw-r--r-- gallery_dl/extractor/instagram.py 169
-rw-r--r-- gallery_dl/extractor/luscious.py 2
-rw-r--r-- gallery_dl/extractor/newgrounds.py 2
-rw-r--r-- gallery_dl/extractor/patreon.py 130
-rw-r--r-- gallery_dl/extractor/pixiv.py 20
-rw-r--r-- gallery_dl/extractor/pururin.py 2
-rw-r--r-- gallery_dl/extractor/reactor.py 6
-rw-r--r-- gallery_dl/extractor/reddit.py 2
-rw-r--r-- gallery_dl/extractor/sankaku.py 7
-rw-r--r-- gallery_dl/extractor/sexcom.py 1
-rw-r--r-- gallery_dl/extractor/simplyhentai.py 162
-rw-r--r-- gallery_dl/extractor/twitter.py 1
-rw-r--r-- gallery_dl/extractor/wikiart.py 2
-rw-r--r-- gallery_dl/extractor/xhamster.py 16
24 files changed, 673 insertions, 379 deletions
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
index 5ea835f..5e2480a 100644
--- a/gallery_dl/extractor/adultempire.py
+++ b/gallery_dl/extractor/adultempire.py
@@ -21,12 +21,12 @@ class AdultempireGalleryExtractor(GalleryExtractor):
test = (
("https://www.adultempire.com/5998/gallery.html", {
"range": "1",
- "keyword": "0533ef1184892be8ac02b17286797c95f389ba63",
+ "keyword": "25c8171f5623678491a0d7bdf38a7a6ebfa4a361",
"content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
}),
("https://www.adultdvdempire.com/5683/gallery.html", {
"url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
- "keyword": "59fe5d95929efc5040a819a5f77aba7a022bb85a",
+ "keyword": "0fe9a6e3f0a331b95ba77f66a643705ca86e8ec5",
}),
)
@@ -42,8 +42,8 @@ class AdultempireGalleryExtractor(GalleryExtractor):
"studio" : extr(">studio</small>", "<").strip(),
"date" : text.parse_datetime(extr(
">released</small>", "<").strip(), "%m/%d/%Y"),
- "actors" : text.split_html(extr(
- '<ul class="item-details item-cast-list ', '</ul>'))[1:],
+ "actors" : sorted(text.split_html(extr(
+ '<ul class="item-details item-cast-list ', '</ul>'))[1:]),
}
def images(self, page):
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index f7b3bc1..2892bd4 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -41,6 +41,7 @@ class ArtstationExtractor(Extractor):
player = adict["player_embedded"]
url = text.extract(player, 'src="', '"')[0]
if not url.startswith(self.root):
+ asset["extension"] = None
yield Message.Url, "ytdl:" + url, asset
continue
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index c63085a..54a8878 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -41,10 +41,8 @@ class BooruExtractor(SharedConfigMixin, Extractor):
return pages * self.per_page
def items(self):
- data = self.get_metadata()
-
yield Message.Version, 1
- yield Message.Directory, data
+ data = self.get_metadata()
self.reset_page()
while True:
@@ -59,9 +57,11 @@ class BooruExtractor(SharedConfigMixin, Extractor):
if url.startswith("/"):
url = text.urljoin(self.api_url, url)
image.update(data)
+ text.nameext_from_url(url, image)
if self.extags:
self.extended_tags(image)
- yield Message.Url, url, text.nameext_from_url(url, image)
+ yield Message.Directory, image
+ yield Message.Url, url, image
if len(images) < self.per_page:
return
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 5c40e2a..a90af1c 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -87,7 +87,8 @@ class Extractor():
raise exception.HttpError(exc)
else:
code = response.status_code
- if 200 <= code < 400 or not fatal and \
+ if 200 <= code < 400 or fatal is None and \
+ (400 <= code < 500) or not fatal and \
(400 <= code < 429 or 431 <= code < 500):
if encoding:
response.encoding = encoding
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 63e2913..bd1299b 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -27,7 +27,7 @@ BASE_PATTERN = (
class DeviantartExtractor(Extractor):
- """Base class for deviantart extractors"""
+ """Base class for deviantart extractors using the OAuth API"""
category = "deviantart"
directory_fmt = ("{category}", "{author[username]!l}")
filename_fmt = "{category}_{index}_{title}.{extension}"
@@ -38,11 +38,15 @@ class DeviantartExtractor(Extractor):
self.offset = 0
self.flat = self.config("flat", True)
self.extra = self.config("extra", False)
+ self.quality = self.config("quality", "100")
self.original = self.config("original", True)
self.user = match.group(1) or match.group(2)
self.group = False
self.api = DeviantartAPI(self)
+ if self.quality:
+ self.quality = "q_{}".format(self.quality)
+
if self.original != "image":
self._update_content = self._update_content_default
else:
@@ -81,12 +85,15 @@ class DeviantartExtractor(Extractor):
text.ext_from_url(content["src"]) != "gif":
self._update_content(deviation, content)
- if deviation["index"] <= 790677560 and \
- content["src"].startswith("https://images-wixmp-"):
- # https://github.com/r888888888/danbooru/issues/4069
- content["src"] = re.sub(
- r"(/f/[^/]+/[^/]+)/v\d+/.*",
- r"/intermediary\1", content["src"])
+ if content["src"].startswith("https://images-wixmp-"):
+ if deviation["index"] <= 790677560:
+ # https://github.com/r888888888/danbooru/issues/4069
+ content["src"] = re.sub(
+ r"(/f/[^/]+/[^/]+)/v\d+/.*",
+ r"/intermediary\1", content["src"])
+ if self.quality:
+ content["src"] = re.sub(
+ r"q_\d+", self.quality, content["src"])
yield self.commit(deviation, content)
@@ -133,8 +140,16 @@ class DeviantartExtractor(Extractor):
@staticmethod
def commit(deviation, target):
url = target["src"]
- deviation["target"] = text.nameext_from_url(url, target.copy())
- deviation["extension"] = deviation["target"]["extension"]
+ thumb = deviation["thumbs"][0]["src"] if "thumbs" in deviation else url
+ target = text.nameext_from_url(thumb, target.copy())
+ if target["filename"].endswith("-150"):
+ target["filename"] = target["filename"][:-4]
+ if not target["filename"].count("-"):
+ name, _, hid = target["filename"].rpartition("_")
+ target["filename"] = name + "-" + hid
+ deviation["target"] = target
+ deviation["filename"] = target["filename"]
+ deviation["extension"] = target["extension"] = text.ext_from_url(url)
return Message.Url, url, deviation
def _commit_journal_html(self, deviation, journal):
@@ -225,14 +240,6 @@ class DeviantartExtractor(Extractor):
if mtype and mtype.startswith("image/"):
content.update(data)
- def _html_request(self, url, **kwargs):
- cookies = {"userinfo": (
- '__167217c8e6aac1a3331f;{"username":"","uniqueid":"ab2e8b184471bf0'
- 'e3f8ed3ee7a3220aa","vd":"Bc7vEx,BdC7Fy,A,J,A,,B,A,B,BdC7Fy,BdC7XU'
- ',J,J,A,BdC7XU,13,A,B,A,,A,A,B,A,A,,A","attr":56}'
- )}
- return self.request(url, cookies=cookies, **kwargs)
-
class DeviantartGalleryExtractor(DeviantartExtractor):
"""Extractor for all deviations from an artist's gallery"""
@@ -360,68 +367,6 @@ class DeviantartFolderExtractor(DeviantartExtractor):
deviation["folder"] = self.folder
-class DeviantartDeviationExtractor(DeviantartExtractor):
- """Extractor for single deviations"""
- subcategory = "deviation"
- archive_fmt = "{index}.{extension}"
- pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)"
- test = (
- (("https://www.deviantart.com/shimoda7/art/"
- "For-the-sake-of-a-memory-10073852"), {
- "options": (("original", 0),),
- "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
- }),
- ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
- "exception": exception.NotFoundError,
- }),
- (("https://www.deviantart.com/myria-moon/art/"
- "Aime-Moi-part-en-vadrouille-261986576"), {
- "pattern": (r"https?://s3\.amazonaws\.com/origin-orig\."
- r"deviantart\.net/a383/f/2013/135/e/7/[^.]+\.jpg\?"),
- }),
- # wixmp URL rewrite
- (("https://www.deviantart.com/citizenfresh/art/"
- "Hverarond-14-the-beauty-of-the-earth-789295466"), {
- "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
- r"/intermediary/f/[^/]+/[^.]+\.jpg$")
- }),
- # non-download URL for GIFs (#242)
- (("https://www.deviantart.com/skatergators/art/"
- "COM-Monique-Model-781571783"), {
- "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
- r"/f/[^/]+/[^.]+\.gif\?token="),
- }),
- # external URLs from description (#302)
- (("https://www.deviantart.com/uotapo/art/"
- "INANAKI-Memorial-Humane7-590297498"), {
- "options": (("extra", 1), ("original", 0)),
- "pattern": r"https?://sta\.sh/\w+$",
- "range": "2-",
- "count": 4,
- }),
- # old-style URLs
- ("https://shimoda7.deviantart.com"
- "/art/For-the-sake-of-a-memory-10073852"),
- ("https://myria-moon.deviantart.com"
- "/art/Aime-Moi-part-en-vadrouille-261986576"),
- ("https://zzz.deviantart.com/art/zzz-1234567890"),
- )
-
- skip = Extractor.skip
-
- def __init__(self, match):
- DeviantartExtractor.__init__(self, match)
- self.path = match.group(3)
-
- def deviations(self):
- url = "{}/{}/{}".format(self.root, self.user, self.path)
- response = self._html_request(url, fatal=False)
- deviation_id = text.extract(response.text, '//deviation/', '"')[0]
- if response.status_code >= 400 or not deviation_id:
- raise exception.NotFoundError("image")
- return (self.api.deviation(deviation_id),)
-
-
class DeviantartStashExtractor(DeviantartExtractor):
"""Extractor for sta.sh-ed deviations"""
subcategory = "stash"
@@ -558,54 +503,6 @@ class DeviantartJournalExtractor(DeviantartExtractor):
return self.api.browse_user_journals(self.user, self.offset)
-class DeviantartScrapsExtractor(DeviantartExtractor):
- """Extractor for an artist's scraps"""
- subcategory = "scraps"
- directory_fmt = ("{category}", "{username}", "Scraps")
- archive_fmt = "s_{username}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/gallery/\?catpath=scraps\b"
- test = (
- ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", {
- "count": 12,
- "options": (("original", False),),
- }),
- ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
- )
-
- def deviations(self):
- url = "{}/{}/gallery/?catpath=scraps".format(self.root, self.user)
- page = self._html_request(url).text
- csrf, pos = text.extract(page, '"csrf":"', '"')
- iid , pos = text.extract(page, '"requestid":"', '"', pos)
-
- url = "https://www.deviantart.com/dapi/v1/gallery/0"
- data = {
- "username": self.user,
- "offset": self.offset,
- "limit": "24",
- "catpath": "scraps",
- "_csrf": csrf,
- "dapiIid": iid + "-jsok7403-1.1"
- }
-
- while True:
- content = self.request(
- url, method="POST", data=data).json()["content"]
-
- for item in content["results"]:
- if item["html"].startswith('<div class="ad-container'):
- continue
- deviation_url = text.extract(item["html"], 'href="', '"')[0]
- page = self._html_request(deviation_url).text
- deviation_id = text.extract(page, '//deviation/', '"')[0]
- if deviation_id:
- yield self.api.deviation(deviation_id)
-
- if not content["has_more"]:
- return
- data["offset"] = content["next_offset"]
-
-
class DeviantartPopularExtractor(DeviantartExtractor):
"""Extractor for popular deviations"""
subcategory = "popular"
@@ -649,6 +546,247 @@ class DeviantartPopularExtractor(DeviantartExtractor):
deviation["popular"] = self.popular
+class DeviantartExtractorV2(Extractor):
+ """Base class for deviantart extractors using the NAPI"""
+ category = "deviantart"
+ directory_fmt = ("{category}", "{author[username]!l}")
+ filename_fmt = "{category}_{index}_{title}.{extension}"
+ root = "https://www.deviantart.com"
+
+ def __init__(self, match=None):
+ Extractor.__init__(self, match)
+ self.offset = 0
+ self.extra = self.config("extra", False)
+ self.quality = self.config("quality", "100")
+ self.user = match.group(1) or match.group(2)
+
+ if self.quality:
+ self.quality = "q_{}".format(self.quality)
+
+ def items(self):
+ url = (
+ self.root + "/_napi/da-browse/shared_api/deviation/extended_fetch"
+ )
+ params = {
+ "deviationid" : None,
+ "username" : None,
+ "type" : None,
+ "include_session": "false",
+ }
+ headers = {
+ "Referer": self.root,
+ }
+
+ yield Message.Version, 1
+ for deviation in self.deviations():
+ params["deviationid"] = deviation["deviationId"]
+ params["username"] = deviation["author"]["username"]
+ params["type"] = "journal" if deviation["isJournal"] else "art"
+ data = self.request(url, params=params, headers=headers).json()
+
+ if "deviation" not in data:
+ self.log.warning("Skipping %s", params["deviationid"])
+ continue
+ deviation = self._extract(data)
+
+ yield Message.Directory, deviation
+ yield Message.Url, deviation["target"]["src"], deviation
+ if self.extra:
+ for match in DeviantartStashExtractor.pattern.finditer(
+ deviation["description"]):
+ deviation["_extractor"] = DeviantartStashExtractor
+ yield Message.Queue, match.group(0), deviation
+
+ def _extract(self, data):
+ deviation = data["deviation"]
+ extended = deviation["extended"]
+ files = deviation["files"]
+ del deviation["extended"]
+ del deviation["files"]
+
+ # prepare deviation metadata
+ deviation["description"] = extended.get("description", "")
+ deviation["username"] = self.user.lower()
+ deviation["stats"] = extended["stats"]
+ deviation["stats"]["comments"] = data["comments"]["total"]
+ deviation["index"] = deviation["deviationId"]
+ deviation["tags"] = [t["name"] for t in extended.get("tags") or ()]
+ deviation["date"] = text.parse_datetime(
+ deviation["publishedTime"])
+ deviation["category_path"] = "/".join(
+ extended[key]["displayNameEn"]
+ for key in ("typeFacet", "contentFacet", "categoryFacet")
+ if key in extended
+ )
+
+ # extract download target
+ target = files[-1]
+ name = files[0]["src"]
+
+ if target["type"] == "gif":
+ pass
+ elif target["type"] == "video":
+ # select largest video
+ target = max(
+ files, key=lambda x: text.parse_int(x.get("quality", "")[:-1]))
+ name = target["src"]
+ elif target["type"] == "flash":
+ if target["src"].startswith("https://sandbox.deviantart.com"):
+ # extract SWF file from "sandbox"
+ target["src"] = text.extract(
+ self.request(target["src"]).text,
+ 'id="sandboxembed" src="', '"',
+ )[0]
+ elif "download" in extended:
+ target = extended["download"]
+ target["src"] = target["url"]
+ del target["url"]
+
+ # url rewrites
+ if target["src"].startswith("https://images-wixmp-"):
+ if deviation["index"] <= 790677560:
+ # https://github.com/r888888888/danbooru/issues/4069
+ target["src"] = re.sub(
+ r"(/f/[^/]+/[^/]+)/v\d+/.*",
+ r"/intermediary\1", target["src"])
+ if self.quality:
+ target["src"] = re.sub(
+ r"q_\d+", self.quality, target["src"])
+
+ text.nameext_from_url(name, target)
+ if target["filename"].endswith("-150"):
+ target["filename"] = target["filename"][:-4]
+ if not target["filename"].count("-"):
+ name, _, hid = target["filename"].rpartition("_")
+ target["filename"] = name + "-" + hid
+ deviation["target"] = target
+ deviation["filename"] = target["filename"]
+ deviation["extension"] = target["extension"] = (
+ text.ext_from_url(target["src"]))
+ return deviation
+
+
+class DeviantartDeviationExtractor(DeviantartExtractorV2):
+ """Extractor for single deviations"""
+ subcategory = "deviation"
+ archive_fmt = "{index}.{extension}"
+ pattern = BASE_PATTERN + r"/(art|journal)/(?:[^/?&#]+-)?(\d+)"
+ test = (
+ (("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), {
+ "options": (("original", 0),),
+ "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+ }),
+ ("https://www.deviantart.com/zzz/art/zzz-1234567890", {
+ "count": 0,
+ }),
+ (("https://www.deviantart.com/myria-moon/art/Aime-Moi-261986576"), {
+ "pattern": (r"https://www.deviantart.com/download/261986576"
+ r"/[\w-]+\.jpg\?token=\w+&ts=\d+"),
+ }),
+ # wixmp URL rewrite
+ (("https://www.deviantart.com/citizenfresh/art/Hverarond-789295466"), {
+ "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+ r"/intermediary/f/[^/]+/[^.]+\.jpg$")
+ }),
+ # wixmp URL rewrite v2 (#369)
+ (("https://www.deviantart.com/josephbiwald/art/Destiny-2-804940104"), {
+ "pattern": r"https://images-wixmp-\w+\.wixmp\.com/.*,q_100,"
+ }),
+ # non-download URL for GIFs (#242)
+ (("https://www.deviantart.com/skatergators/art/COM-Moni-781571783"), {
+ "pattern": (r"https://images-wixmp-\w+\.wixmp\.com"
+ r"/f/[^/]+/[^.]+\.gif\?token="),
+ }),
+ # external URLs from description (#302)
+ (("https://www.deviantart.com/uotapo/art/INANAKI-Memo-590297498"), {
+ "options": (("extra", 1), ("original", 0)),
+ "pattern": r"https?://sta\.sh/\w+$",
+ "range": "2-",
+ "count": 4,
+ }),
+ # video
+ ("https://www.deviantart.com/chi-u/art/-VIDEO-Brushes-330774593", {
+ "url": "3b6e6e761d2d393fa61a4dc3ed6e7db51b14d07b",
+ "keyword": {
+ "target": {
+ "duration": 306,
+ "extension": "mp4",
+ "filename": r"re:_video____brushes_\w+_by_chi_u-d5gxnb5",
+ "filesize": 9963639,
+ "quality": "1080p",
+ "src": str,
+ "type": "video",
+ },
+ }
+ }),
+ # archive
+ ("https://www.deviantart.com/itsvenue/art/-brush-pngs-14-763300948", {
+ "pattern": r"https://.+deviantart.com/download/763300948/.*\.rar",
+ }),
+ # swf
+ ("https://www.deviantart.com/ikatxfruti/art/Bang-Bang-528130222", {
+ "pattern": r"https://images-wixmp-.*wixmp.com/f/.*\.swf",
+ }),
+ # old-style URLs
+ ("https://shimoda7.deviantart.com"
+ "/art/For-the-sake-of-a-memory-10073852"),
+ ("https://myria-moon.deviantart.com"
+ "/art/Aime-Moi-part-en-vadrouille-261986576"),
+ ("https://zzz.deviantart.com/art/zzz-1234567890"),
+ )
+
+ skip = Extractor.skip
+
+ def __init__(self, match):
+ DeviantartExtractorV2.__init__(self, match)
+ self.type = match.group(3)
+ self.deviation_id = match.group(4)
+
+ def deviations(self):
+ return ({
+ "deviationId": self.deviation_id,
+ "author" : {"username": self.user},
+ "isJournal" : self.type == "journal",
+ },)
+
+
+class DeviantartScrapsExtractor(DeviantartExtractorV2):
+ """Extractor for an artist's scraps"""
+ subcategory = "scraps"
+ directory_fmt = ("{category}", "{username}", "Scraps")
+ archive_fmt = "s_{username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b"
+ test = (
+ ("https://www.deviantart.com/shimoda7/gallery/scraps", {
+ "count": 12,
+ }),
+ ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps"),
+ ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
+ )
+
+ def deviations(self):
+ url = self.root + "/_napi/da-user-profile/api/gallery/contents"
+ params = {
+ "username" : self.user,
+ "offset" : self.offset,
+ "limit" : "24",
+ "scraps_folder": "true",
+ }
+ headers = {
+ "Referer": "{}/{}/gallery/scraps".format(self.root, self.user),
+ }
+
+ while True:
+ data = self.request(url, params=params, headers=headers).json()
+
+ for obj in data["results"]:
+ yield obj["deviation"]
+
+ if not data["hasMore"]:
+ return
+ params["offset"] = data["nextOffset"]
+
+
class DeviantartAPI():
"""Minimal interface for the DeviantArt API
@@ -805,7 +943,7 @@ class DeviantartAPI():
self.authenticate(None if public else self.refresh_token)
response = self.extractor.request(
- url, headers=self.headers, params=params, fatal=False)
+ url, headers=self.headers, params=params, fatal=None)
data = response.json()
status = response.status_code
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index ce2e83b..4ec7f00 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -33,16 +33,16 @@ class GelbooruExtractor(booru.XmlParserMixin,
self.session.cookies["fringeBenefits"] = "yup"
def items_noapi(self):
- data = self.get_metadata()
-
yield Message.Version, 1
- yield Message.Directory, data
+ data = self.get_metadata()
for post in self.get_posts():
post = self.get_post_data(post)
url = post["file_url"]
post.update(data)
- yield Message.Url, url, text.nameext_from_url(url, post)
+ text.nameext_from_url(url, post)
+ yield Message.Directory, post
+ yield Message.Url, url, post
def get_posts(self):
"""Return an iterable containing all relevant post objects"""
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index c112465..e4f18b3 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -21,7 +21,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
test = (
("https://hitomi.la/galleries/867789.html", {
"url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
- "keyword": "067b5d9b9c0f98530cd5dd2444e0f5a5b4b00d38",
+ "keyword": "d097a8db8e810045131b4510c41714004f9eff3a",
}),
("https://hitomi.la/galleries/1036181.html", {
# "aa" subdomain for gallery-id ending in 1 (#142)
diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py
index 6980185..76b2c38 100644
--- a/gallery_dl/extractor/imagebam.py
+++ b/gallery_dl/extractor/imagebam.py
@@ -41,14 +41,14 @@ class ImagebamGalleryExtractor(ImagebamExtractor):
pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)"
test = (
("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", {
- "url": "fb01925129a1ff1941762eaa3a2783a66de6847f",
+ "url": "76d976788ae2757ac81694736b07b72356f5c4c8",
"keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a",
"content": "596e6bfa157f2c7169805d50075c2986549973a8",
}),
("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", {
# more than 100 images; see issue #219
"count": 107,
- "url": "f92ce5b17676b6ea69288f0aef26f4cdbea7fd8d",
+ "url": "32ae6fe5dc3e4ca73ff6252e522d16473595d1d1",
}),
("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", {
"exception": exception.NotFoundError,
@@ -108,7 +108,7 @@ class ImagebamImageExtractor(ImagebamExtractor):
r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)")
test = (
("http://www.imagebam.com/image/94d56c502511890", {
- "url": "b384893c35a01a09c58018db71ddc4cf2480be95",
+ "url": "5e9ba3b1451f8ded0ae3a1b84402888893915d4a",
"keyword": "4263d4840007524129792b8587a562b5d20c2687",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
}),
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index 442634b..4aa670b 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -17,6 +17,7 @@ import json
class ImgbbExtractor(Extractor):
"""Base class for imgbb extractors"""
category = "imgbb"
+ directory_fmt = ("{category}", "{user}")
filename_fmt = "{title} {id}.{extension}"
archive_fmt = "{id}"
root = "https://imgbb.com"
@@ -145,7 +146,6 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
class ImgbbUserExtractor(ImgbbExtractor):
"""Extractor for user profiles in imgbb.com"""
subcategory = "user"
- directory_fmt = ("{category}", "{user}")
pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
test = ("https://folkie.imgbb.com", {
"range": "1-80",
@@ -177,3 +177,34 @@ class ImgbbUserExtractor(ImgbbExtractor):
"params_hidden[userid]": user,
"params_hidden[from]" : "user",
})
+
+
+class ImgbbImageExtractor(ImgbbExtractor):
+ subcategory = "image"
+ pattern = r"(?:https?://)?ibb\.co/(?!album/)([^/?&#]+)"
+ test = ("https://ibb.co/NLZHgqS", {
+ "url": "fbca86bac09de6fc0304054b2170b423ca1e84fa",
+ "keyword": "5d70e779bad03b2dc5273b627638045168671157",
+ })
+
+ def __init__(self, match):
+ ImgbbExtractor.__init__(self, match)
+ self.image_id = match.group(1)
+
+ def items(self):
+ url = "https://ibb.co/" + self.image_id
+ extr = text.extract_from(self.request(url).text)
+
+ image = {
+ "id" : self.image_id,
+ "title" : text.unescape(extr('"og:title" content="', '"')),
+ "url" : extr('"og:image" content="', '"'),
+ "width" : text.parse_int(extr('"og:image:width" content="', '"')),
+ "height": text.parse_int(extr('"og:image:height" content="', '"')),
+ "user" : extr('rel="author">', '<').lower(),
+ }
+ image["extension"] = text.ext_from_url(image["url"])
+
+ yield Message.Version, 1
+ yield Message.Directory, image
+ yield Message.Url, image["url"], image
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index c5e3d17..8523523 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -20,13 +20,19 @@ class ImgurExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.item_id = match.group(1)
+ self.key = match.group(1)
self.mp4 = self.config("mp4", True)
- def _get_data(self, path):
+ def _extract_data(self, path):
response = self.request(self.root + path, notfound=self.subcategory)
- data = text.extract(response.text, "image : ", ",\n")[0]
- return self._clean(json.loads(data))
+ data = json.loads(text.extract(
+ response.text, "image : ", ",\n")[0])
+ try:
+ del data["adConfig"]
+ del data["isAd"]
+ except KeyError:
+ pass
+ return data
def _prepare(self, image):
image["ext"] = image["ext"].partition("?")[0]
@@ -37,18 +43,9 @@ class ImgurExtractor(Extractor):
image["extension"] = image["ext"][1:]
return url
- @staticmethod
- def _clean(data):
- try:
- del data["adConfig"]
- del data["isAd"]
- except KeyError:
- pass
- return data
-
class ImgurImageExtractor(ImgurExtractor):
- """Extractor for individual images from imgur.com"""
+ """Extractor for individual images on imgur.com"""
subcategory = "image"
filename_fmt = "{category}_{hash}{title:?_//}.{extension}"
archive_fmt = "{hash}"
@@ -101,22 +98,21 @@ class ImgurImageExtractor(ImgurExtractor):
)
def items(self):
- image = self._get_data("/" + self.item_id)
+ image = self._extract_data("/" + self.key)
url = self._prepare(image)
-
yield Message.Version, 1
yield Message.Directory, image
yield Message.Url, url, image
class ImgurAlbumExtractor(ImgurExtractor):
- """Extractor for image albums from imgur.com"""
+ """Extractor for imgur albums"""
subcategory = "album"
directory_fmt = ("{category}", "{album[hash]}{album[title]:? - //}")
filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}"
archive_fmt = "{album[hash]}_{hash}"
pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com"
- r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})")
+ r"/(?:a|t/unmuted)/(\w{7}|\w{5})")
test = (
("https://imgur.com/a/TcBmP", {
"url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563",
@@ -147,7 +143,7 @@ class ImgurAlbumExtractor(ImgurExtractor):
"width": int,
},
}),
- ("https://imgur.com/gallery/eD9CT", { # large album
+ ("https://imgur.com/a/eD9CT", { # large album
"url": "4ee94de31ff26be416271bc0b1ea27b9349c9937",
}),
("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash
@@ -164,13 +160,13 @@ class ImgurAlbumExtractor(ImgurExtractor):
)
def items(self):
- album = self._get_data("/a/" + self.item_id + "/all")
+ album = self._extract_data("/a/" + self.key + "/all")
images = album["album_images"]["images"]
del album["album_images"]
if int(album["num_images"]) > len(images):
url = "{}/ajaxalbums/getimages/{}/hit.json".format(
- self.root, self.item_id)
+ self.root, self.key)
images = self.request(url).json()["data"]["images"]
yield Message.Version, 1
@@ -180,3 +176,32 @@ class ImgurAlbumExtractor(ImgurExtractor):
image["num"] = num
image["album"] = album
yield Message.Url, url, image
+
+
+class ImgurGalleryExtractor(ImgurExtractor):
+ """Extractor for imgur galleries"""
+ subcategory = "gallery"
+ pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com"
+ r"/gallery/(\w{7}|\w{5})")
+ test = (
+ ("https://imgur.com/gallery/zf2fIms", { # non-album gallery (#380)
+ "pattern": "https://imgur.com/zf2fIms",
+ }),
+ ("https://imgur.com/gallery/eD9CT", {
+ "pattern": "https://imgur.com/a/eD9CT",
+ }),
+ )
+
+ def items(self):
+ url = self.root + "/a/" + self.key
+ with self.request(url, method="HEAD", fatal=False) as response:
+ code = response.status_code
+
+ if code < 400:
+ extr = ImgurAlbumExtractor
+ else:
+ extr = ImgurImageExtractor
+ url = self.root + "/" + self.key
+
+ yield Message.Version, 1
+ yield Message.Queue, url, {"_extractor": extr}
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index 475e24b..e5cfe8b 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -8,11 +8,10 @@
"""Extract images from https://www.instagram.com/"""
-import hashlib
-import json
from .common import Extractor, Message
from .. import text, exception
from ..cache import cache
+import json
class InstagramExtractor(Extractor):
@@ -37,10 +36,11 @@ class InstagramExtractor(Extractor):
data.update(metadata)
yield Message.Directory, data
- if data['typename'] == 'GraphImage':
+ if data['typename'] in ('GraphImage', 'GraphStoryImage', 'GraphStoryVideo'):
yield Message.Url, data['display_url'], \
text.nameext_from_url(data['display_url'], data)
elif data['typename'] == 'GraphVideo':
+ data["extension"] = None
yield Message.Url, \
'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data
@@ -140,33 +140,113 @@ class InstagramExtractor(Extractor):
return medias
+ def _extract_stories(self, url):
+ if self.highlight_id:
+ user_id = ''
+ highlight_id = '"{}"'.format(self.highlight_id)
+ query_hash = '30a89afdd826d78a5376008a7b81c205'
+ else:
+ page = self.request(url).text
+ shared_data = self._extract_shared_data(page)
+
+ # If no stories are present the URL redirects to `ProfilePage'
+ if 'StoriesPage' not in shared_data['entry_data']:
+ return []
+
+ user_id = '"{}"'.format(
+ shared_data['entry_data']['StoriesPage'][0]['user']['id'])
+ highlight_id = ''
+ query_hash = 'cda12de4f7fd3719c0569ce03589f4c4'
+
+ variables = (
+ '{{'
+ '"reel_ids":[{}],"tag_names":[],"location_ids":[],'
+ '"highlight_reel_ids":[{}],"precomposed_overlay":true,'
+ '"show_story_viewer_list":true,'
+ '"story_viewer_fetch_count":50,"story_viewer_cursor":"",'
+ '"stories_video_dash_manifest":false}}'
+ ).format(user_id, highlight_id)
+ headers = {
+ "X-Requested-With": "XMLHttpRequest",
+ }
+ url = '{}/graphql/query/?query_hash={}&variables={}'.format(
+ self.root,
+ query_hash,
+ variables,
+ )
+ shared_data = self.request(url, headers=headers).json()
+
+ # If there are stories present but the user is not authenticated or
+ # does not have permissions no stories are returned.
+ if not shared_data['data']['reels_media']:
+ return [] # no stories present
+
+ medias = []
+ for media in shared_data['data']['reels_media'][0]['items']:
+ media_data = {
+ 'owner_id': media['owner']['id'],
+ 'username': media['owner']['username'],
+ 'date': text.parse_timestamp(media['taken_at_timestamp']),
+ 'expires': text.parse_timestamp(media['expiring_at_timestamp']),
+ 'media_id': media['id'],
+ 'typename': media['__typename'],
+ }
+ if media['__typename'] == 'GraphStoryImage':
+ media_data.update({
+ 'display_url': media['display_url'],
+ 'height': text.parse_int(media['dimensions']['height']),
+ 'width': text.parse_int(media['dimensions']['width']),
+ })
+ elif media['__typename'] == 'GraphStoryVideo':
+ vr = media['video_resources'][0]
+ media_data.update({
+ 'duration': text.parse_float(media['video_duration']),
+ 'display_url': vr['src'],
+ 'height': text.parse_int(vr['config_height']),
+ 'width': text.parse_int(vr['config_width']),
+ })
+ medias.append(media_data)
+
+ return medias
+
def _extract_page(self, url, page_type):
shared_data_fields = {
'ProfilePage': {
+ 'page': 'ProfilePage',
'node': 'user',
'node_id': 'id',
'edge_to_medias': 'edge_owner_to_timeline_media',
'variables_id': 'id',
- 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41',
+ 'query_hash': 'f2405b236d85e8296cf30347c9f08c2a',
+ },
+ 'ProfileChannelPage': {
+ 'page': 'ProfilePage',
+ 'node': 'user',
+ 'node_id': 'id',
+ 'edge_to_medias': 'edge_felix_video_timeline',
+ 'variables_id': 'id',
+ 'query_hash': 'bc78b344a68ed16dd5d7f264681c4c76',
},
'TagPage': {
+ 'page': 'TagPage',
'node': 'hashtag',
'node_id': 'name',
'edge_to_medias': 'edge_hashtag_to_media',
'variables_id': 'tag_name',
- 'query_hash': 'f92f56d47dc7a55b606908374b43a314',
+ 'query_hash': 'f12c9ec5e46a3173b2969c712ad84744',
},
}
page = self.request(url).text
shared_data = self._extract_shared_data(page)
psdf = shared_data_fields[page_type]
+ csrf = shared_data["config"]["csrf_token"]
while True:
# Deal with different structure of pages: the first page
# has interesting data in `entry_data', next pages in `data'.
if 'entry_data' in shared_data:
- base_shared_data = shared_data['entry_data'][page_type][0]['graphql']
+ base_shared_data = shared_data['entry_data'][psdf['page']][0]['graphql']
# variables_id is available only in the first page
variables_id = base_shared_data[psdf['node']][psdf['node_id']]
@@ -192,7 +272,8 @@ class InstagramExtractor(Extractor):
)
headers = {
"X-Requested-With": "XMLHttpRequest",
- "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(),
+ "X-CSRFToken": csrf,
+ "X-IG-App-ID": "936619743392459",
}
url = '{}/graphql/query/?query_hash={}&variables={}'.format(
self.root,
@@ -204,14 +285,20 @@ class InstagramExtractor(Extractor):
def _extract_profilepage(self, url):
yield from self._extract_page(url, 'ProfilePage')
+ def _extract_profilechannelpage(self, url):
+ yield from self._extract_page(url, 'ProfileChannelPage')
+
def _extract_tagpage(self, url):
yield from self._extract_page(url, 'TagPage')
+ def _extract_storiespage(self, url):
+ yield from self._extract_stories(url)
+
class InstagramImageExtractor(InstagramExtractor):
"""Extractor for PostPage"""
subcategory = "image"
- pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)"
+ pattern = r"(?:https?://)?(?:www\.)?instagram\.com/(?:p|tv)/([^/?&#]+)"
test = (
# GraphImage
("https://www.instagram.com/p/BqvsDleB3lV/", {
@@ -258,6 +345,22 @@ class InstagramImageExtractor(InstagramExtractor):
}
}),
+ # GraphVideo (IGTV)
+ ("https://www.instagram.com/tv/BkQjCfsBIzi/", {
+ "url": "64208f408e11cbbca86c2df4488e90262ae9d9ec",
+ "keyword": {
+ "date": "type:datetime",
+ "description": str,
+ "height": int,
+ "likes": int,
+ "media_id": "1806097553666903266",
+ "shortcode": "BkQjCfsBIzi",
+ "typename": "GraphVideo",
+ "username": "instagram",
+ "width": int,
+ }
+ }),
+
# GraphSidecar with 2 embedded GraphVideo objects
("https://www.instagram.com/p/BtOvDOfhvRr/", {
"count": 2,
@@ -283,10 +386,11 @@ class InstagramUserExtractor(InstagramExtractor):
"""Extractor for ProfilePage"""
subcategory = "user"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)")
+ r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
+ r"([^/?&#]+)/?$")
test = ("https://www.instagram.com/instagram/", {
- "range": "1-12",
- "count": ">= 12",
+ "range": "1-16",
+ "count": ">= 16",
})
def __init__(self, match):
@@ -298,6 +402,26 @@ class InstagramUserExtractor(InstagramExtractor):
return self._extract_profilepage(url)
+class InstagramChannelExtractor(InstagramExtractor):
+ """Extractor for ProfilePage channel"""
+ subcategory = "channel"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/(?!p/|explore/|directory/|accounts/|stories/|tv/)"
+ r"([^/?&#]+)/channel")
+ test = ("https://www.instagram.com/instagram/channel/", {
+ "range": "1-16",
+ "count": ">= 16",
+ })
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.username = match.group(1)
+
+ def instagrams(self):
+ url = '{}/{}/channel/'.format(self.root, self.username)
+ return self._extract_profilechannelpage(url)
+
+
class InstagramTagExtractor(InstagramExtractor):
"""Extractor for TagPage"""
subcategory = "tag"
@@ -305,8 +429,8 @@ class InstagramTagExtractor(InstagramExtractor):
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
r"/explore/tags/([^/?&#]+)")
test = ("https://www.instagram.com/explore/tags/instagram/", {
- "range": "1-12",
- "count": ">= 12",
+ "range": "1-16",
+ "count": ">= 16",
})
def __init__(self, match):
@@ -319,3 +443,22 @@ class InstagramTagExtractor(InstagramExtractor):
def instagrams(self):
url = '{}/explore/tags/{}/'.format(self.root, self.tag)
return self._extract_tagpage(url)
+
+
+class InstagramStoriesExtractor(InstagramExtractor):
+ """Extractor for StoriesPage"""
+ subcategory = "stories"
+ pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
+ r"/stories/([^/?&#]+)(?:/(\d+))?")
+ test = (
+ ("https://www.instagram.com/stories/instagram/"),
+ ("https://www.instagram.com/stories/highlights/18042509488170095/"),
+ )
+
+ def __init__(self, match):
+ InstagramExtractor.__init__(self, match)
+ self.username, self.highlight_id = match.groups()
+
+ def instagrams(self):
+ url = '{}/stories/{}/'.format(self.root, self.username)
+ return self._extract_storiespage(url)
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 879d38b..a73eb86 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
test = (
("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
"url": "7e4984a271a1072ac6483e4228a045895aff86f3",
- "keyword": "ab4e5b71583fd439b4c8012a642aa8b58d8d0758",
+ "keyword": "07c0b915f2ab1cc3bbf28b76e7950fccee1213f3",
"content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
}),
("https://luscious.net/albums/virgin-killer-sweater_282582/", {
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 282c389..1ca1073 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -93,7 +93,7 @@ class NewgroundsUserExtractor(NewgroundsExtractor):
test = (
("https://blitzwuff.newgrounds.com/art", {
"url": "24b19c4a135a09889fac7b46a74e427e4308d02b",
- "keyword": "98566e0c8096a8099b8d71962fea7e31c8b098d4",
+ "keyword": "62981f7bdd66e1f1c72ab1d9b932423c156bc9a1",
}),
("https://blitzwuff.newgrounds.com/"),
)
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 4884497..ab5932d 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -11,6 +11,8 @@
from .common import Extractor, Message
from .. import text
from ..cache import memcache
+import collections
+import json
class PatreonExtractor(Extractor):
@@ -33,70 +35,92 @@ class PatreonExtractor(Extractor):
for post in self.posts():
yield Message.Directory, post
+ ids = set()
post["num"] = 0
content = post.get("content")
postfile = post.get("post_file")
- for url in text.extract_iter(content or "", 'src="', '"'):
+ for image in post["images"]:
+ url = image.get("download_url")
+ if not url:
+ continue
+ ids.add(url.split("/")[-2])
+ name = image.get("file_name") or self._filename(url) or url
+
post["num"] += 1
- yield Message.Url, url, text.nameext_from_url(url, post)
+ post["type"] = "image"
+ yield Message.Url, url, text.nameext_from_url(name, post)
- if postfile:
+ if postfile and postfile["url"].split("/")[-2] not in ids:
post["num"] += 1
+ post["type"] = "postfile"
text.nameext_from_url(postfile["name"], post)
yield Message.Url, postfile["url"], post
for attachment in post["attachments"]:
post["num"] += 1
+ post["type"] = "attachment"
text.nameext_from_url(attachment["name"], post)
yield Message.Url, attachment["url"], post
+ if content:
+ for url in text.extract_iter(content, 'src="', '"'):
+ post["num"] += 1
+ post["type"] = "content"
+ yield Message.Url, url, text.nameext_from_url(url, post)
+
def posts(self):
"""Return all relevant post objects"""
def _pagination(self, url):
headers = {"Referer": self.root}
- empty = []
while url:
posts = self.request(url, headers=headers).json()
- if "included" not in posts:
- return
-
- # collect attachments
- attachments = {}
- for inc in posts["included"]:
- if inc["type"] == "attachment":
- attachments[inc["id"]] = inc["attributes"]
-
- # update posts
- for post in posts["data"]:
- attr = post["attributes"]
- attr["id"] = text.parse_int(post["id"])
- attr["date"] = text.parse_datetime(
- attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
- attr["creator"] = self._user(
- post["relationships"]["user"]["links"]["related"])
-
- # add attachments to post attributes
- files = post["relationships"].get("attachments")
- if files:
- attr["attachments"] = [
- attachments[f["id"]]
- for f in files["data"]
- ]
- else:
- attr["attachments"] = empty
-
- yield attr
+ if "included" in posts:
+ included = self._transform(posts["included"])
+ for post in posts["data"]:
+ yield self._process(post, included)
if "links" not in posts:
return
url = posts["links"].get("next")
+ def _process(self, post, included):
+ """Process and extend a 'post' object"""
+ attr = post["attributes"]
+ attr["id"] = text.parse_int(post["id"])
+ attr["images"] = self._files(post, included, "images")
+ attr["attachments"] = self._files(post, included, "attachments")
+ attr["date"] = text.parse_datetime(
+ attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ attr["creator"] = self._user(
+ post["relationships"]["user"]["links"]["related"])
+ return attr
+
+ @staticmethod
+ def _transform(included):
+ """Transform 'included' into an easier to handle format"""
+ result = collections.defaultdict(dict)
+ for inc in included:
+ result[inc["type"]][inc["id"]] = inc["attributes"]
+ return result
+
+ @staticmethod
+ def _files(post, included, key):
+ """Build a list of files"""
+ files = post["relationships"].get(key)
+ if files and files.get("data"):
+ return [
+ included[file["type"]][file["id"]]
+ for file in files["data"]
+ ]
+ return []
+
@memcache(keyarg=1)
def _user(self, url):
+ """Fetch user information"""
user = self.request(url).json()["data"]
attr = user["attributes"]
attr["id"] = user["id"]
@@ -104,14 +128,21 @@ class PatreonExtractor(Extractor):
attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z")
return attr
+ def _filename(self, url):
+ """Fetch filename from its Content-Disposition header"""
+ response = self.request(url, method="HEAD", fatal=False)
+ cd = response.headers.get("Content-Disposition")
+ return text.extract(cd, 'filename="', '"')[0]
+
@staticmethod
def _build_url(endpoint, query):
return (
"https://www.patreon.com/api/" + endpoint +
- "?include=user,attachments,user_defined_tags,campaign,poll.choices"
- ",poll.current_user_responses.user,poll.current_user_responses.cho"
- "ice,poll.current_user_responses.poll,access_rules.tier.null"
+ "?include=user,images,attachments,user_defined_tags,campaign,poll."
+ "choices,poll.current_user_responses.user,poll.current_user_respon"
+ "ses.choice,poll.current_user_responses.poll,access_rules.tier.nul"
+ "l"
"&fields[post]=change_visibility_at,comment_count,content,current_"
"user_can_delete,current_user_can_view,current_user_has_liked,embe"
@@ -133,7 +164,8 @@ class PatreonCreatorExtractor(PatreonExtractor):
"""Extractor for a creator's works"""
subcategory = "creator"
pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
- r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?")
+ r"/(?!(?:home|join|posts|login|signup)(?:$|[/?&#]))"
+ r"([^/?&#]+)/?")
test = ("https://www.patreon.com/koveliana", {
"range": "1-25",
"count": ">= 25",
@@ -144,6 +176,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
"creator": dict,
"date": "type:datetime",
"id": int,
+ "images": list,
"like_count": int,
"post_type": str,
"published_at": str,
@@ -181,3 +214,26 @@ class PatreonUserExtractor(PatreonExtractor):
"&filter[is_following]=true"
))
return self._pagination(url)
+
+
+class PatreonPostExtractor(PatreonExtractor):
+ """Extractor for media from a single post"""
+ subcategory = "post"
+ pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
+ r"/posts/[^/?&#]*?(\d+)")
+ test = ("https://www.patreon.com/posts/precious-metal-23563293", {
+ "count": 4,
+ })
+
+ def __init__(self, match):
+ PatreonExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def posts(self):
+ url = "{}/posts/{}".format(self.root, self.post_id)
+ page = self.request(url).text
+ data = text.extract(page, "window.patreon.bootstrap,", "\n});")[0]
+ post = json.loads(data + "}")["post"]
+
+ included = self._transform(post["included"])
+ return (self._process(post["data"], included),)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 76d4dc4..4f8ee9c 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -18,8 +18,8 @@ class PixivExtractor(Extractor):
"""Base class for pixiv extractors"""
category = "pixiv"
directory_fmt = ("{category}", "{user[id]} {user[account]}")
- filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}"
- archive_fmt = "{id}{num}.{extension}"
+ filename_fmt = "{id}_p{num}.{extension}"
+ archive_fmt = "{id}{suffix}.{extension}"
def __init__(self, match):
Extractor.__init__(self, match)
@@ -40,9 +40,10 @@ class PixivExtractor(Extractor):
del work["meta_single_page"]
del work["image_urls"]
del work["meta_pages"]
- work["num"] = ""
+ work["num"] = 0
work["tags"] = [tag["name"] for tag in work["tags"]]
work["date"] = text.parse_datetime(work["create_date"])
+ work["suffix"] = ""
work.update(metadata)
yield Message.Directory, work
@@ -55,20 +56,17 @@ class PixivExtractor(Extractor):
url = ugoira["zip_urls"]["medium"].replace(
"_ugoira600x600", "_ugoira1920x1080")
work["frames"] = ugoira["frames"]
- work["extension"] = "zip"
- yield Message.Url, url, work
+ yield Message.Url, url, text.nameext_from_url(url, work)
elif work["page_count"] == 1:
url = meta_single_page["original_image_url"]
- work["extension"] = url.rpartition(".")[2]
- yield Message.Url, url, work
+ yield Message.Url, url, text.nameext_from_url(url, work)
else:
- for num, img in enumerate(meta_pages):
+ for work["num"], img in enumerate(meta_pages):
url = img["image_urls"]["original"]
- work["num"] = "_p{:02}".format(num)
- work["extension"] = url.rpartition(".")[2]
- yield Message.Url, url, work
+ work["suffix"] = "_p{:02}".format(work["num"])
+ yield Message.Url, url, text.nameext_from_url(url, work)
def works(self):
"""Return an iterable containing all relevant 'work'-objects"""
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
index fa4eb81..aa5c9c6 100644
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -29,7 +29,7 @@ class PururinGalleryExtractor(GalleryExtractor):
"artist" : ["Shoda Norihiro"],
"group" : ["Obsidian Order"],
"parody" : ["Kantai Collection"],
- "characters": ["Iowa", "Teitoku"],
+ "characters": ["Admiral", "Iowa"],
"tags" : list,
"type" : "Doujinshi",
"collection": "",
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 59d502a..f97454b 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -117,6 +117,8 @@ class ReactorExtractor(SharedConfigMixin, Extractor):
url = text.extract(image, ' src="', '"')[0]
if not url:
continue
+ if url.startswith("//"):
+ url = "http:" + url
width = text.extract(image, ' width="', '"')[0]
height = text.extract(image, ' height="', '"')[0]
image_id = url.rpartition("-")[2].partition(".")[0]
@@ -268,8 +270,8 @@ class JoyreactorPostExtractor(ReactorPostExtractor):
"keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47",
}),
("http://joyreactor.com/post/3668724", { # youtube embed
- "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a",
- "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651",
+ "url": "bf1666eddcff10c9b58f6be63fa94e4e13074214",
+ "keyword": "989112c7888e9cc80fd35870180c6c98165d953b",
}),
("http://joyreactor.cc/post/1299", { # "malformed" JSON
"url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde",
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 2ba4b99..94e95e8 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -234,7 +234,7 @@ class RedditAPI():
url = "https://oauth.reddit.com" + endpoint
params["raw_json"] = 1
self.authenticate()
- response = self.extractor.request(url, params=params, fatal=False)
+ response = self.extractor.request(url, params=params, fatal=None)
remaining = response.headers.get("x-ratelimit-remaining")
if remaining and float(remaining) < 2:
wait = int(response.headers["x-ratelimit-reset"])
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index da9735e..bb8a2ae 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -40,17 +40,18 @@ class SankakuExtractor(SharedConfigMixin, Extractor):
def items(self):
self.login()
- data = self.get_metadata()
yield Message.Version, 1
- yield Message.Directory, data
+ data = self.get_metadata()
for post_id in util.advance(self.get_posts(), self.start_post):
self.wait()
post = self.get_post_data(post_id)
url = post["file_url"]
post.update(data)
- yield Message.Url, url, text.nameext_from_url(url, post)
+ text.nameext_from_url(url, post)
+ yield Message.Directory, post
+ yield Message.Url, url, post
def skip(self, num):
self.start_post += num
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index afd4eaa..38b7813 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -78,6 +78,7 @@ class SexcomExtractor(Extractor):
path += "/hd"
data["url"] = self.root + path
else:
+ data["extension"] = None
data["url"] = "ytdl:" + text.extract(
extr('<iframe', '>'), ' src="', '"')[0]
else:
diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py
index 5ad372d..8567155 100644
--- a/gallery_dl/extractor/simplyhentai.py
+++ b/gallery_dl/extractor/simplyhentai.py
@@ -8,14 +8,16 @@
"""Extract hentai-manga from https://www.simply-hentai.com/"""
-from .common import GalleryExtractor, Extractor, Message
+from .common import GalleryExtractor
from .. import text, util, exception
+import json
class SimplyhentaiGalleryExtractor(GalleryExtractor):
"""Extractor for image galleries from simply-hentai.com"""
category = "simplyhentai"
archive_fmt = "{image_id}"
+ root = "https://www.simply-hentai.com"
pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com"
r"(?!/(?:album|gifs?|images?|series)(?:/|$))"
r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)")
@@ -23,7 +25,7 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
(("https://original-work.simply-hentai.com"
"/amazon-no-hiyaku-amazon-elixir"), {
"url": "258289249990502c3138719cb89e995a60861e49",
- "keyword": "eba83ccdbab3022a2280c77aa747f9458196138b",
+ "keyword": "8b2400e4b466e8f46802fa5a6b917d2788bb7e8e",
}),
("https://www.simply-hentai.com/notfound", {
"exception": exception.GalleryDLException,
@@ -40,144 +42,30 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
self.session.headers["Referer"] = url
def metadata(self, page):
- extr = text.extract_from(page)
- split = text.split_html
-
- title = extr('<meta property="og:title" content="', '"')
- if not title:
+ path = text.extract(page, '<a class="preview" href="', '"')[0]
+ if not path:
raise exception.NotFoundError("gallery")
- data = {
- "title" : text.unescape(title),
- "gallery_id": text.parse_int(extr('/Album/', '/')),
- "parody" : split(extr('box-title">Series</div>', '</div>')),
- "language" : text.remove_html(extr(
- 'box-title">Language</div>', '</div>')) or None,
- "characters": split(extr('box-title">Characters</div>', '</div>')),
- "tags" : split(extr('box-title">Tags</div>', '</div>')),
- "artist" : split(extr('box-title">Artists</div>', '</div>')),
- "date" : text.parse_datetime(text.remove_html(
- extr('Uploaded', '</div>')), "%d.%m.%Y"),
+ page = self.request(self.root + path).text
+ data = json.loads(text.unescape(text.extract(
+ page, 'data-react-class="Reader" data-react-props="', '"')[0]))
+ self.manga = manga = data["manga"]
+
+ return {
+ "title" : manga["title"],
+ "parody" : manga["series"]["title"],
+ "language" : manga["language"]["name"],
+ "lang" : util.language_to_code(manga["language"]["name"]),
+ "characters": [x["name"] for x in manga["characters"]],
+ "tags" : [x["name"] for x in manga["tags"]],
+ "artist" : [x["name"] for x in manga["artists"]],
+ "gallery_id": text.parse_int(text.extract(
+ manga["images"][0]["sizes"]["full"], "/Album/", "/")[0]),
+ "date" : text.parse_datetime(
+ manga["publish_date"], "%Y-%m-%dT%H:%M:%S.%f%z"),
}
- data["lang"] = util.language_to_code(data["language"])
- return data
def images(self, _):
- url = self.chapter_url + "/all-pages"
- headers = {"Accept": "application/json"}
- images = self.request(url, headers=headers).json()
return [
- (urls["full"], {"image_id": text.parse_int(image_id)})
- for image_id, urls in sorted(images.items())
+ (image["sizes"]["full"], {"image_id": image["id"]})
+ for image in self.manga["images"]
]
-
-
-class SimplyhentaiImageExtractor(Extractor):
- """Extractor for individual images from simply-hentai.com"""
- category = "simplyhentai"
- subcategory = "image"
- directory_fmt = ("{category}", "{type}s")
- filename_fmt = "{category}_{token}{title:?_//}.{extension}"
- archive_fmt = "{token}"
- pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com"
- r"/(image|gif)/[^/?&#]+)")
- test = (
- (("https://www.simply-hentai.com/image"
- "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), {
- "url": "0338eb137830ab6f81e5f410d3936ef785d063d9",
- "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2",
- }),
- ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", {
- "url": "11c060d7ec4dfd0bd105300b6e1fd454674a5af1",
- "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65",
- }),
- )
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.page_url = "https://www." + match.group(1)
- self.type = match.group(2)
-
- def items(self):
- extr = text.extract_from(self.request(self.page_url).text)
- title = extr('"og:title" content="' , '"')
- descr = extr('"og:description" content="', '"')
- url = extr('&quot;image&quot;:&quot;' , '&')
- url = extr("&quot;content&quot;:&quot;", "&") or url
-
- tags = text.extract(descr, " tagged with ", " online for free ")[0]
- if tags:
- tags = tags.split(", ")
- tags[-1] = tags[-1].partition(" ")[2]
- else:
- tags = []
-
- data = text.nameext_from_url(url, {
- "title": text.unescape(title) if title else "",
- "tags": tags,
- "type": self.type,
- })
- data["token"] = data["filename"].rpartition("_")[2]
-
- yield Message.Version, 1
- yield Message.Directory, data
- yield Message.Url, url, data
-
-
-class SimplyhentaiVideoExtractor(Extractor):
- """Extractor for hentai videos from simply-hentai.com"""
- category = "simplyhentai"
- subcategory = "video"
- directory_fmt = ("{category}", "{type}s")
- filename_fmt = "{title}{episode:?_//>02}.{extension}"
- archive_fmt = "{title}_{episode}"
- pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)"
- test = (
- ("https://videos.simply-hentai.com/creamy-pie-episode-02", {
- "pattern": r"https://www\.googleapis\.com/drive/v3/files"
- r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+",
- "keyword": "706790708b14773efc1e075ddd3b738a375348a5",
- "count": 1,
- }),
- (("https://videos.simply-hentai.com"
- "/1715-tifa-in-hentai-gang-bang-3d-movie"), {
- "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0",
- "keyword": "f9dad94fbde9c95859e631ff4f07297a9567b874",
- }),
- )
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.page_url = "https://" + match.group(1)
-
- def items(self):
- page = self.request(self.page_url).text
-
- title, pos = text.extract(page, "<title>", "</title>")
- tags , pos = text.extract(page, ">Tags</div>", "</div>", pos)
- date , pos = text.extract(page, ">Upload Date</div>", "</div>", pos)
- title = title.rpartition(" - ")[0]
-
- if "<video" in page:
- video_url = text.extract(page, '<source src="', '"', pos)[0]
- episode = 0
- else:
- # video url from myhentai.tv embed
- pos = page.index('<div class="video-frame-container">', pos)
- embed_url = text.extract(page, 'src="', '"', pos)[0].replace(
- "embedplayer.php?link=", "embed.php?name=")
- embed_page = self.request(embed_url).text
- video_url = text.extract(embed_page, '"file":"', '"')[0]
- title, _, episode = title.rpartition(" Episode ")
-
- data = text.nameext_from_url(video_url, {
- "title": text.unescape(title),
- "episode": text.parse_int(episode),
- "tags": text.split_html(tags)[::2],
- "type": "video",
- "date": text.parse_datetime(text.remove_html(
- date), "%B %d, %Y %H:%M"),
- })
-
- yield Message.Version, 1
- yield Message.Directory, data
- yield Message.Url, video_url, data
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ccba640..3672a6d 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -54,6 +54,7 @@ class TwitterExtractor(Extractor):
if self.videos and "-videoContainer" in tweet:
data["num"] = 1
+ data["extension"] = None
url = "ytdl:{}/{}/status/{}".format(
self.root, data["user"], data["tweet_id"])
yield Message.Url, url, data
diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py
index b9c223c..463733f 100644
--- a/gallery_dl/extractor/wikiart.py
+++ b/gallery_dl/extractor/wikiart.py
@@ -70,7 +70,7 @@ class WikiartArtistExtractor(WikiartExtractor):
pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)"
test = ("https://www.wikiart.org/en/thomas-cole", {
"url": "f1eee8158f5b8b7380382ab730a8f53884715c8b",
- "keyword": "b62678394ce645815963883d5c9642255307225f",
+ "keyword": "c61f5a4774b977106000e9554d19cfb9438a7032",
})
def __init__(self, match):
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 9699806..23750db 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -13,13 +13,16 @@ from .. import text
import json
-BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)"
+BASE_PATTERN = r"(?:https?://)?((?:[^.]+\.)?xhamster\d?\.(?:com|one|desi))"
class XhamsterExtractor(Extractor):
"""Base class for xhamster extractors"""
category = "xhamster"
- root = "https://xhamster.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.root = "https://" + match.group(1)
class XhamsterGalleryExtractor(XhamsterExtractor):
@@ -66,16 +69,21 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
},
},
}),
+ ("https://jp.xhamster2.com/photos/gallery/11748968", {
+ "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$",
+ "count": ">= 144",
+ }),
("https://xhamster.com/photos/gallery/make-the-world-better-11748968"),
("https://xhamster.com/photos/gallery/11748968"),
("https://xhamster.one/photos/gallery/11748968"),
("https://xhamster.desi/photos/gallery/11748968"),
+ ("https://xhamster2.com/photos/gallery/11748968"),
("https://en.xhamster.com/photos/gallery/11748968"),
)
def __init__(self, match):
XhamsterExtractor.__init__(self, match)
- self.path = match.group(1)
+ self.path = match.group(2)
self.data = None
def items(self):
@@ -154,7 +162,7 @@ class XhamsterUserExtractor(XhamsterExtractor):
def __init__(self, match):
XhamsterExtractor.__init__(self, match)
- self.user = match.group(1)
+ self.user = match.group(2)
def items(self):
yield Message.Version, 1