author    Unit 193 <unit193@ubuntu.com>  2019-08-04 17:52:59 -0400
committer Unit 193 <unit193@ubuntu.com>  2019-08-04 17:52:59 -0400
commit    64ad8e7bd15df71ab1116eede414558631bcad32 (patch)
tree      7416e191aedce591087903a943198aed13fa0b26 /gallery_dl/extractor
parent    2a63a9c9b7032a76894c48ac4d9cea732fcaee49 (diff)
New upstream version 1.10.1 (tag: upstream/1.10.1)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py         3
-rw-r--r--  gallery_dl/extractor/adultempire.py     58
-rw-r--r--  gallery_dl/extractor/behance.py         22
-rw-r--r--  gallery_dl/extractor/dynastyscans.py     2
-rw-r--r--  gallery_dl/extractor/exhentai.py       102
-rw-r--r--  gallery_dl/extractor/gelbooru.py         1
-rw-r--r--  gallery_dl/extractor/imgbb.py          179
-rw-r--r--  gallery_dl/extractor/luscious.py         2
-rw-r--r--  gallery_dl/extractor/ngomik.py           2
-rw-r--r--  gallery_dl/extractor/sankaku.py          4
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py   4
-rw-r--r--  gallery_dl/extractor/tsumino.py          2
-rw-r--r--  gallery_dl/extractor/vsco.py           176
13 files changed, 509 insertions(+), 48 deletions(-)
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 189c163..0b24111 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -17,6 +17,7 @@ modules = [
"500px",
"8chan",
"8muses",
+ "adultempire",
"artstation",
"behance",
"bobx",
@@ -42,6 +43,7 @@ modules = [
"idolcomplex",
"imagebam",
"imagefap",
+ "imgbb",
"imgbox",
"imgth",
"imgur",
@@ -95,6 +97,7 @@ modules = [
"tumblr",
"twitter",
"vanillarock",
+ "vsco",
"wallhaven",
"warosu",
"weibo",
diff --git a/gallery_dl/extractor/adultempire.py b/gallery_dl/extractor/adultempire.py
new file mode 100644
index 0000000..5ea835f
--- /dev/null
+++ b/gallery_dl/extractor/adultempire.py
@@ -0,0 +1,58 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.adultempire.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class AdultempireGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from www.adultempire.com"""
+ category = "adultempire"
+ root = "https://www.adultempire.com"
+ pattern = (r"(?:https?://)?(?:www\.)?adult(?:dvd)?empire\.com"
+ r"(/(\d+)/gallery\.html)")
+ test = (
+ ("https://www.adultempire.com/5998/gallery.html", {
+ "range": "1",
+ "keyword": "0533ef1184892be8ac02b17286797c95f389ba63",
+ "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
+ }),
+ ("https://www.adultdvdempire.com/5683/gallery.html", {
+ "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
+ "keyword": "59fe5d95929efc5040a819a5f77aba7a022bb85a",
+ }),
+ )
+
+ def __init__(self, match):
+ GalleryExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+
+ def metadata(self, page):
+ extr = text.extract_from(page, page.index('<div id="content">'))
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : text.unescape(extr('title="', '"')),
+ "studio" : extr(">studio</small>", "<").strip(),
+ "date" : text.parse_datetime(extr(
+ ">released</small>", "<").strip(), "%m/%d/%Y"),
+ "actors" : text.split_html(extr(
+ '<ul class="item-details item-cast-list ', '</ul>'))[1:],
+ }
+
+ def images(self, page):
+ params = {"page": 1}
+ while True:
+ urls = list(text.extract_iter(page, 'rel="L"><img src="', '"'))
+ for url in urls:
+ yield url.replace("_200.", "_9600."), None
+ if len(urls) < 24:
+ return
+ params["page"] += 1
+ page = self.request(self.chapter_url, params=params).text
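A note on the pagination scheme in images() above: the gallery is fetched page by page via a ?page=N query parameter, each thumbnail URL is upscaled by swapping its "_200." size suffix for "_9600.", and a page with fewer than 24 thumbnails marks the end. A minimal standalone sketch of the same loop, assuming only the markup seen in the diff (the helper name and the use of requests/re are illustrative, not part of gallery-dl):

import re
import requests

def adultempire_image_urls(gallery_id):
    """Yield full-size image URLs for one adultempire gallery."""
    url = "https://www.adultempire.com/{}/gallery.html".format(gallery_id)
    page_num = 1
    while True:
        html = requests.get(url, params={"page": page_num}).text
        thumbs = re.findall(r'rel="L"><img src="([^"]+)"', html)
        for thumb in thumbs:
            # the site serves multiple sizes; swap the 200px suffix
            # for the largest known variant
            yield thumb.replace("_200.", "_9600.")
        if len(thumbs) < 24:        # short page: this was the last one
            return
        page_num += 1
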
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 111d560..467a935 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -30,7 +30,8 @@ class BehanceExtractor(Extractor):
@staticmethod
def _update(data):
# compress data to simple lists
- data["fields"] = [field["name"] for field in data["fields"]]
+ if data["fields"] and isinstance(data["fields"][0], dict):
+ data["fields"] = [field["name"] for field in data["fields"]]
data["owners"] = [owner["display_name"] for owner in data["owners"]]
if "tags" in data:
data["tags"] = [tag["title"] for tag in data["tags"]]
@@ -140,11 +141,11 @@ class BehanceUserExtractor(BehanceExtractor):
def galleries(self):
url = "{}/{}/projects".format(self.root, self.user)
- headers = {"X-Requested-With": "XMLHttpRequest"}
params = {"offset": 0}
+ headers = {"X-Requested-With": "XMLHttpRequest"}
while True:
- data = self.request(url, headers=headers, params=params).json()
+ data = self.request(url, params=params, headers=headers).json()
work = data["profile"]["activeSection"]["work"]
yield from work["projects"]
if not work["hasMore"]:
@@ -157,8 +158,8 @@ class BehanceCollectionExtractor(BehanceExtractor):
subcategory = "collection"
categorytransfer = True
pattern = r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)"
- test = ("https://www.behance.net/collection/170615607/Sky", {
- "count": ">= 13",
+ test = ("https://www.behance.net/collection/71340149/inspiration", {
+ "count": ">= 145",
"pattern": BehanceGalleryExtractor.pattern,
})
@@ -168,12 +169,13 @@ class BehanceCollectionExtractor(BehanceExtractor):
def galleries(self):
url = "{}/collection/{}/a".format(self.root, self.collection_id)
+ params = {"offset": 0}
headers = {"X-Requested-With": "XMLHttpRequest"}
- params = {}
while True:
- data = self.request(url, headers=headers, params=params).json()
- yield from data["output"]
- if not data.get("offset"):
+ data = self.request(url, params=params, headers=headers).json()
+ for item in data["items"]:
+ yield item["project"]
+ if len(data["items"]) < 40:
return
- params["offset"] = data["offset"]
+ params["offset"] += len(data["items"])
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index b10bd35..9cc6738 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -100,7 +100,7 @@ class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
test = (
("https://dynasty-scans.com/images?with[]=4930&with[]=5211", {
"url": "6b570eedd8a741c2cd34fb98b22a49d772f84191",
- "keyword": "a1e2d05c1406a08b02f347389616a6babb1b50bf",
+ "keyword": "fa7ff94f82cdf942f7734741d758f160a6b0905a",
}),
("https://dynasty-scans.com/images", {
"range": "1",
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 20e0746..1833b1a 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from galleries at https://exhentai.org/"""
+"""Extractors for https://e-hentai.org/ and https://exhentai.org/"""
from .common import Extractor, Message
from .. import text, util, exception
@@ -23,16 +23,19 @@ BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org"
class ExhentaiExtractor(Extractor):
"""Base class for exhentai extractors"""
category = "exhentai"
- directory_fmt = ("{category}", "{gallery_id}")
+ directory_fmt = ("{category}", "{gallery_id} {title}")
filename_fmt = (
"{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}")
archive_fmt = "{gallery_id}_{num}"
- cookiedomain = ".exhentai.org"
cookienames = ("ipb_member_id", "ipb_pass_hash")
+ cookiedomain = ".exhentai.org"
root = "https://exhentai.org"
+ LIMIT = False
+
def __init__(self, match):
- if match.group(1) != "ex":
+ version = match.group(1)
+ if version != "ex":
self.root = "https://e-hentai.org"
self.cookiedomain = ".e-hentai.org"
Extractor.__init__(self, match)
@@ -45,6 +48,8 @@ class ExhentaiExtractor(Extractor):
if self.wait_max < self.wait_min:
self.wait_max = self.wait_min
self.session.headers["Referer"] = self.root + "/"
+ if version != "ex":
+ self.session.cookies.set("nw", "1", domain=self.cookiedomain)
def request(self, *args, **kwargs):
response = Extractor.request(self, *args, **kwargs)
@@ -63,6 +68,9 @@ class ExhentaiExtractor(Extractor):
def login(self):
"""Login and set necessary cookies"""
+ if self.LIMIT:
+ self.log.error("Image limit reached!")
+ raise exception.StopExtraction()
if self._check_cookies(self.cookienames):
return
username, password = self._get_auth_info()
@@ -92,7 +100,7 @@ class ExhentaiExtractor(Extractor):
}
response = self.request(url, method="POST", headers=headers, data=data)
- if "You are now logged in as:" not in response.text:
+ if b"You are now logged in as:" not in response.content:
raise exception.AuthenticationError()
return {c: response.cookies[c] for c in self.cookienames}
@@ -112,9 +120,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
r"(?:/g/(\d+)/([\da-f]{10})"
r"|/s/([\da-f]{10})/(\d+)-(\d+))")
test = (
- ("https://exhentai.org/g/960460/4f0e369d82/", {
- "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480",
- "content": "493d759de534355c9f55f8e365565b62411de146",
+ ("https://exhentai.org/g/1200119/d55c44d3d0/", {
+ "keyword": "1b353fad00dff0665b1746cdd151ab5cc326df23",
+ "content": "e9891a4c017ed0bb734cd1efba5cd03f594d31ff",
}),
("https://exhentai.org/g/960461/4f0e369d82/", {
"exception": exception.NotFoundError,
@@ -122,13 +130,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
("http://exhentai.org/g/962698/7f02358e00/", {
"exception": exception.AuthorizationError,
}),
- ("https://exhentai.org/s/3957343c3b/960460-5", {
+ ("https://exhentai.org/s/f68367b4c8/1200119-3", {
"count": 2,
}),
- ("https://e-hentai.org/s/3957343c3b/960460-5", {
+ ("https://e-hentai.org/s/f68367b4c8/1200119-3", {
"count": 2,
}),
- ("https://g.e-hentai.org/g/960460/4f0e369d82/"),
+ ("https://g.e-hentai.org/g/1200119/d55c44d3d0/"),
)
def __init__(self, match):
@@ -143,14 +151,25 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
def items(self):
self.login()
+ if self.limits:
+ self._init_limits()
+
if self.gallery_token:
gpage = self._gallery_page()
self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0]
+ if not self.image_token:
+ self.log.error("Failed to extract initial image token")
+ self.log.debug("Page content:\n%s", gpage)
+ return
self.wait()
ipage = self._image_page()
else:
ipage = self._image_page()
part = text.extract(ipage, 'hentai.org/g/', '"')[0]
+ if not part:
+ self.log.error("Failed to extract gallery token")
+ self.log.debug("Page content:\n%s", ipage)
+ return
self.gallery_token = part.split("/")[1]
self.wait()
gpage = self._gallery_page()
@@ -211,12 +230,17 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
iurl = extr('<img id="img" src="', '"')
orig = extr('hentai.org/fullimg.php', '"')
- if self.original and orig:
- url = self.root + "/fullimg.php" + text.unescape(orig)
- data = self._parse_original_info(extr('ownload original', '<'))
- else:
- url = iurl
- data = self._parse_image_info(url)
+ try:
+ if self.original and orig:
+ url = self.root + "/fullimg.php" + text.unescape(orig)
+ data = self._parse_original_info(extr('ownload original', '<'))
+ else:
+ url = iurl
+ data = self._parse_image_info(url)
+ except IndexError:
+ self.log.error("Unable to parse image info for '%s'", url)
+ self.log.debug("Page content:\n%s", page)
+ raise exception.StopExtraction()
data["num"] = self.image_num
data["image_token"] = self.key["start"] = extr('var startkey="', '";')
@@ -242,13 +266,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos)
origurl, pos = text.extract(page["i7"], '<a href="', '"')
- if self.original and origurl:
- url = text.unescape(origurl)
- data = self._parse_original_info(
- text.extract(page["i7"], "ownload original", "<", pos)[0])
- else:
- url = imgurl
- data = self._parse_image_info(url)
+ try:
+ if self.original and origurl:
+ url = text.unescape(origurl)
+ data = self._parse_original_info(text.extract(
+ page["i7"], "ownload original", "<", pos)[0])
+ else:
+ url = imgurl
+ data = self._parse_image_info(url)
+ except IndexError:
+ self.log.error("Unable to parse image info for '%s'", url)
+ self.log.debug("Page content:\n%s", page)
+ raise exception.StopExtraction()
data["num"] = request["page"]
data["image_token"] = imgkey
@@ -266,6 +295,8 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.AuthorizationError()
if page.startswith(("Key missing", "Gallery not found")):
raise exception.NotFoundError("gallery")
+ if "hentai.org/mpv/" in page:
+ self.log.warning("Enabled Multi-Page Viewer is not supported")
return page
def _image_page(self):
@@ -277,17 +308,24 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.NotFoundError("image page")
return page
+ def _init_limits(self):
+ self._update_limits()
+ if self._remaining <= 0:
+ self.log.error("Image limit reached!")
+ ExhentaiExtractor.LIMIT = True
+ raise exception.StopExtraction()
+
def _check_limits(self, data):
- if not self._remaining or data["num"] % 20 == 0:
+ if data["num"] % 20 == 0:
self._update_limits()
self._remaining -= data["cost"]
if self._remaining <= 0:
url = "{}/s/{}/{}-{}".format(
self.root, data["image_token"], self.gallery_id, data["num"])
- self.log.error(
- "Image limit reached! Reset it and continue with "
- "'%s' as URL.", url)
+ self.log.error("Image limit reached! Continue with "
+ "'%s' as URL after resetting it.", url)
+ ExhentaiExtractor.LIMIT = True
raise exception.StopExtraction()
def _update_limits(self):
@@ -301,6 +339,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
page = self.request(url, cookies=cookies).text
current, pos = text.extract(page, "<strong>", "</strong>")
maximum, pos = text.extract(page, "<strong>", "</strong>", pos)
+ self.log.debug("Image Limits: %s/%s", current, maximum)
self._remaining = text.parse_int(maximum) - text.parse_int(current)
@staticmethod
@@ -330,7 +369,7 @@ class ExhentaiSearchExtractor(ExhentaiExtractor):
subcategory = "search"
pattern = BASE_PATTERN + r"/?\?(.*)$"
test = (
- ("https://exhentai.org/?f_search=touhou"),
+ ("https://e-hentai.org/?f_search=touhou"),
(("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0"
"&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0"
"&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), {
@@ -372,7 +411,10 @@ class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor):
subcategory = "favorite"
pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?"
test = (
- ("https://exhentai.org/favorites.php"),
+ ("https://e-hentai.org/favorites.php", {
+ "count": 1,
+ "pattern": r"https?://e-hentai\.org/g/1200119/d55c44d3d0"
+ }),
("https://exhentai.org/favorites.php?favcat=1&f_search=touhou"
"&f_apply=Search+Favorites"),
)
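One detail of the exhentai changes worth spelling out: LIMIT is assigned on the class (ExhentaiExtractor.LIMIT = True), not on an instance, so once any gallery hits the image limit, every extractor constructed later in the same run aborts immediately in login(). A minimal sketch of that pattern, with illustrative names:

class Worker:
    LIMIT = False                 # class attribute, shared by all instances

    def run(self):
        if Worker.LIMIT:          # some earlier instance tripped the flag
            raise RuntimeError("image limit reached earlier in this run")
        # ... perform downloads; on exhausting the quota:
        Worker.LIMIT = True       # assign on the class, not on self
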
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 15bd0a8..ce2e83b 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -30,6 +30,7 @@ class GelbooruExtractor(booru.XmlParserMixin,
self.params.update({"page": "dapi", "s": "post", "q": "index"})
else:
self.items = self.items_noapi
+ self.session.cookies["fringeBenefits"] = "yup"
def items_noapi(self):
data = self.get_metadata()
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
new file mode 100644
index 0000000..442634b
--- /dev/null
+++ b/gallery_dl/extractor/imgbb.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://imgbb.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import json
+
+
+class ImgbbExtractor(Extractor):
+ """Base class for imgbb extractors"""
+ category = "imgbb"
+ filename_fmt = "{title} {id}.{extension}"
+ archive_fmt = "{id}"
+ root = "https://imgbb.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.page_url = self.sort = None
+
+ def items(self):
+ self.login()
+ page = self.request(self.page_url, params={"sort": self.sort}).text
+ data = self.metadata(page)
+ first = True
+
+ yield Message.Version, 1
+ for img in self.images(page):
+ image = {
+ "id" : img["url_viewer"].rpartition("/")[2],
+ "user" : img["user"]["username"],
+ "title" : text.unescape(img["title"]),
+ "url" : img["image"]["url"],
+ "extension": img["image"]["extension"],
+ "size" : text.parse_int(img["image"]["size"]),
+ "width" : text.parse_int(img["width"]),
+ "height" : text.parse_int(img["height"]),
+ }
+ image.update(data)
+ if first:
+ first = False
+ yield Message.Directory, data
+ yield Message.Url, image["url"], image
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+
+ @cache(maxage=360*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+
+ url = self.root + "/login"
+ page = self.request(url).text
+ token = text.extract(page, 'PF.obj.config.auth_token="', '"')[0]
+
+ headers = {"Referer": url}
+ data = {
+ "auth_token" : token,
+ "login-subject": username,
+ "password" : password,
+ }
+ response = self.request(url, method="POST", headers=headers, data=data)
+
+ if not response.history:
+ raise exception.AuthenticationError()
+ return self.session.cookies
+
+ def _pagination(self, page, endpoint, params):
+ params["page"] = 2
+ data = None
+
+ while True:
+ for img in text.extract_iter(page, "data-object='", "'"):
+ yield json.loads(text.unquote(img))
+ if data:
+ if params["seek"] == data["seekEnd"]:
+ return
+ params["seek"] = data["seekEnd"]
+ params["page"] += 1
+ data = self.request(endpoint, method="POST", data=params).json()
+ page = data["html"]
+
+
+class ImgbbAlbumExtractor(ImgbbExtractor):
+ """Extractor for albums on imgbb.com"""
+ subcategory = "album"
+ directory_fmt = ("{category}", "{user}", "{album_name} {album_id}")
+ pattern = r"(?:https?://)?ibb\.co/album/([^/?&#]+)/?(?:\?([^#]+))?"
+ test = (
+ ("https://ibb.co/album/c6p5Yv", {
+ "range": "1-80",
+ "url": "8adaf0f7dfc19ff8bc4712c97f534af8b1e06412",
+ "keyword": "155b665a53e83d359e914cab7c69d5b829444d64",
+ }),
+ ("https://ibb.co/album/c6p5Yv?sort=title_asc", {
+ "range": "1-80",
+ "url": "d6c45041d5c8323c435b183a976f3fde2af7c547",
+ "keyword": "30c3262214e2044bbcf6bf2dee8e3ca7ebd62b71",
+ }),
+ )
+
+ def __init__(self, match):
+ ImgbbExtractor.__init__(self, match)
+ self.album_name = None
+ self.album_id = match.group(1)
+ self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
+ self.page_url = "https://ibb.co/album/" + self.album_id
+
+ def metadata(self, page):
+ album, pos = text.extract(page, '"og:title" content="', '"')
+ user , pos = text.extract(page, 'rel="author">', '<', pos)
+ return {
+ "album_id" : self.album_id,
+ "album_name": text.unescape(album),
+ "user" : user.lower(),
+ }
+
+ def images(self, page):
+ seek, pos = text.extract(page, 'data-seek="', '"')
+ tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
+
+ return self._pagination(page, "https://ibb.co/json", {
+ "action" : "list",
+ "list" : "images",
+ "from" : "album",
+ "sort" : self.sort,
+ "albumid" : self.album_id,
+ "seek" : seek,
+ "auth_token": tokn,
+ "params_hidden[list]" : "images",
+ "params_hidden[from]" : "album",
+ "params_hidden[albumid]": self.album_id,
+ })
+
+
+class ImgbbUserExtractor(ImgbbExtractor):
+ """Extractor for user profiles in imgbb.com"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "{user}")
+ pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
+ test = ("https://folkie.imgbb.com", {
+ "range": "1-80",
+ "pattern": r"https?://i\.ibb\.co/\w+/[^/?&#]+",
+ })
+
+ def __init__(self, match):
+ ImgbbExtractor.__init__(self, match)
+ self.user = match.group(1)
+ self.sort = text.parse_query(match.group(2)).get("sort", "date_desc")
+ self.page_url = "https://{}.imgbb.com/".format(self.user)
+
+ def metadata(self, page):
+ return {"user": self.user}
+
+ def images(self, page):
+ seek, pos = text.extract(page, 'data-seek="', '"')
+ tokn, pos = text.extract(page, 'PF.obj.config.auth_token="', '"', pos)
+ user, pos = text.extract(page, '.obj.resource={"id":"', '"', pos)
+
+ return self._pagination(page, self.page_url + "json", {
+ "action" : "list",
+ "list" : "images",
+ "from" : "user",
+ "sort" : self.sort,
+ "seek" : seek,
+ "userid" : user,
+ "auth_token": tokn,
+ "params_hidden[userid]": user,
+ "params_hidden[from]" : "user",
+ })
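The _pagination() helper above drives imgbb's seek-token API: the first batch of images is embedded in the HTML as URL-encoded JSON inside data-object attributes, and further batches come from POSTing the seek token back to a JSON endpoint until seekEnd stops advancing. A condensed sketch of that flow (the field names data-object, seekEnd, and html come from the diff; the rest is illustrative):

import json
import re
import requests
from urllib.parse import unquote

def imgbb_items(first_page_html, endpoint, params):
    """Yield image objects across all pages of an album or user feed."""
    html, data = first_page_html, None
    params["page"] = 2
    while True:
        # each image is one URL-encoded JSON blob in a data-object attribute
        for obj in re.findall(r"data-object='([^']+)'", html):
            yield json.loads(unquote(obj))
        if data is not None:
            if params["seek"] == data["seekEnd"]:
                return            # seek token stopped advancing: last page
            params["seek"] = data["seekEnd"]
            params["page"] += 1
        data = requests.post(endpoint, data=params).json()
        html = data["html"]
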
diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py
index 65ae843..879d38b 100644
--- a/gallery_dl/extractor/luscious.py
+++ b/gallery_dl/extractor/luscious.py
@@ -62,7 +62,7 @@ class LusciousAlbumExtractor(LusciousBase, GalleryExtractor):
test = (
("https://luscious.net/albums/okinami-no-koigokoro_277031/", {
"url": "7e4984a271a1072ac6483e4228a045895aff86f3",
- "keyword": "c597c132834f4990f90bf5dee5de2a9d4ba263a4",
+ "keyword": "ab4e5b71583fd439b4c8012a642aa8b58d8d0758",
"content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3",
}),
("https://luscious.net/albums/virgin-killer-sweater_282582/", {
diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py
index 8135a8a..f3608b2 100644
--- a/gallery_dl/extractor/ngomik.py
+++ b/gallery_dl/extractor/ngomik.py
@@ -44,7 +44,7 @@ class NgomikChapterExtractor(ChapterExtractor):
@staticmethod
def images(page):
- readerarea = text.extract(page, 'id=readerarea', 'class=chnav')[0]
+ readerarea = text.extract(page, 'id="readerarea"', 'class="chnav"')[0]
return [
(text.unescape(url), None)
for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea)
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 012cb8b..da9735e 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -283,9 +283,9 @@ class SankakuPostExtractor(SankakuExtractor):
"options": (("tags", True),),
"keyword": {
"tags_artist": "bonocho",
- "tags_copyright": "batman_(series) the_dark_knight",
- "tags_medium": "sketch copyright_name",
"tags_studio": "dc_comics",
+ "tags_medium": "sketch copyright_name",
+ "tags_copyright": str,
"tags_character": str,
"tags_general": str,
},
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index 55eda9f..0189fc9 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -34,11 +34,11 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
test = (
("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
"url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
- "keyword": "4b3b5766b277a5d0acbec90fa8f2343262b07efd",
+ "keyword": "bfe08310e7d9a572f568f6900e0ed0eb295aa2b3",
}),
("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
"url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
- "keyword": "f47a416d680717855bbc3e4f0cd44479f61d9aa4",
+ "keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
}),
)
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
index 03ee144..66ad431 100644
--- a/gallery_dl/extractor/tsumino.py
+++ b/gallery_dl/extractor/tsumino.py
@@ -65,7 +65,7 @@ class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
"uploader" : "sehki",
"lang" : "en",
"language" : "English",
- "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
+ "thumbnail" : "re:https?://www.tsumino.com/Image/Thumb/40996",
},
}),
("https://www.tsumino.com/Read/View/45834"),
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
new file mode 100644
index 0000000..639ec82
--- /dev/null
+++ b/gallery_dl/extractor/vsco.py
@@ -0,0 +1,176 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://vsco.co/"""
+
+from .common import Extractor, Message
+from .. import text
+import json
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)"
+
+
+class VscoExtractor(Extractor):
+ """Base class for vsco extractors"""
+ category = "vsco"
+ root = "https://vsco.co"
+ directory_fmt = ("{category}", "{user}")
+ filename_fmt = "{id}.{extension}"
+ archive_fmt = "{id}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user = match.group(1).lower()
+
+ def items(self):
+ yield Message.Version, 1
+ yield Message.Directory, {"user": self.user}
+ for img in self.images():
+ url = "https://" + (img.get("video_url") or img["responsive_url"])
+ data = text.nameext_from_url(url, {
+ "id" : img["_id"],
+ "user" : self.user,
+ "grid" : img["grid_name"],
+ "meta" : img.get("image_meta") or {},
+ "tags" : [tag["text"] for tag in img.get("tags") or ()],
+ "date" : text.parse_timestamp(img["upload_date"] // 1000),
+ "video" : img["is_video"],
+ "width" : img["width"],
+ "height": img["height"],
+ "description": img["description"],
+ })
+ yield Message.Url, url, data
+
+ def images(self):
+ """Return an iterable with all relevant image objects"""
+
+ def _extract_preload_state(self, url):
+ page = self.request(url, notfound=self.subcategory).text
+ return json.loads(text.extract(page, "__PRELOADED_STATE__ = ", "<")[0])
+
+ def _pagination(self, url, params, token, key, extra):
+ headers = {
+ "Referer" : "{}/{}".format(self.root, self.user),
+ "Authorization" : "Bearer " + token,
+ "X-Client-Platform": "web",
+ "X-Client-Build" : "1",
+ }
+
+ yield from map(self._transform_media, extra)
+
+ while True:
+ data = self.request(url, params=params, headers=headers).json()
+ if not data.get(key):
+ return
+ yield from data[key]
+ params["page"] += 1
+
+ @staticmethod
+ def _transform_media(media):
+ media["_id"] = media["id"]
+ media["is_video"] = media["isVideo"]
+ media["grid_name"] = media["gridName"]
+ media["upload_date"] = media["uploadDate"]
+ media["responsive_url"] = media["responsiveUrl"]
+ media["video_url"] = media.get("videoUrl")
+ media["image_meta"] = media.get("imageMeta")
+ return media
+
+
+class VscoUserExtractor(VscoExtractor):
+ """Extractor for images from a user on vsco.co"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/images/"
+ test = ("https://vsco.co/missuri/images/1", {
+ "range": "1-80",
+ "count": 80,
+ "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+",
+ })
+
+ def images(self):
+ url = "{}/{}/images/1".format(self.root, self.user)
+ data = self._extract_preload_state(url)
+
+ tkn = data["users"]["currentUser"]["tkn"]
+ sid = str(data["sites"]["siteByUsername"][self.user]["site"]["id"])
+
+ url = "{}/api/2.0/medias".format(self.root)
+ params = {"page": 2, "size": "30", "site_id": sid}
+ return self._pagination(url, params, tkn, "media", (
+ data["medias"]["byId"][mid]["media"]
+ for mid in data["medias"]["bySiteId"][sid]["medias"]["1"]
+ ))
+
+
+class VscoCollectionExtractor(VscoExtractor):
+ """Extractor for images from a collection on vsco.co"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{user}", "collection")
+ archive_fmt = "c_{user}_{id}"
+ pattern = BASE_PATTERN + r"/collection/"
+ test = ("https://vsco.co/vsco/collection/1", {
+ "range": "1-80",
+ "count": 80,
+ "pattern": r"https://im\.vsco\.co/[^/]+/[0-9a-f/]+/vsco\w+\.\w+",
+ })
+
+ def images(self):
+ url = "{}/{}/collection/1".format(self.root, self.user)
+ data = self._extract_preload_state(url)
+
+ tkn = data["users"]["currentUser"]["tkn"]
+ cid = (data["sites"]["siteByUsername"][self.user]
+ ["site"]["siteCollectionId"])
+
+ url = "{}/api/2.0/collections/{}/medias".format(self.root, cid)
+ params = {"page": 2, "size": "20"}
+ return self._pagination(url, params, tkn, "medias", (
+ data["medias"]["byId"][mid]["media"]
+ for mid in data
+ ["collections"]["byCollectionId"][cid]["collection"]["1"]
+ ))
+
+
+class VscoImageExtractor(VscoExtractor):
+ """Extractor for individual images on vsco.co"""
+ subcategory = "image"
+ pattern = BASE_PATTERN + r"/media/([0-9a-fA-F]+)"
+ test = (
+ ("https://vsco.co/erenyildiz/media/5d34b93ef632433030707ce2", {
+ "url": "faa214d10f859f374ad91da3f7547d2439f5af08",
+ "content": "1394d070828d82078035f19a92f404557b56b83f",
+ "keyword": {
+ "id" : "5d34b93ef632433030707ce2",
+ "user" : "erenyildiz",
+ "grid" : "erenyildiz",
+ "meta" : dict,
+ "tags" : list,
+ "date" : "type:datetime",
+ "video" : False,
+ "width" : 1537,
+ "height": 1537,
+ "description": "re:Ni seviyorum. #vsco #vscox #vscochallenges",
+ },
+ }),
+ ("https://vsco.co/jimenalazof/media/5b4feec558f6c45c18c040fd", {
+ "url": "08e7eef3301756ce81206c0b47c1e9373756a74a",
+ "content": "e739f058d726ee42c51c180a505747972a7dfa47",
+ "keyword": {"video" : True},
+ }),
+ )
+
+ def __init__(self, match):
+ VscoExtractor.__init__(self, match)
+ self.media_id = match.group(2)
+
+ def images(self):
+ url = "{}/{}/media/{}".format(self.root, self.user, self.media_id)
+ data = self._extract_preload_state(url)
+ media = data["medias"]["byId"].popitem()[1]["media"]
+ return (self._transform_media(media),)
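
All three vsco extractors above bootstrap from the same trick: vsco.co pages embed their entire client-side store as a JSON literal assigned to __PRELOADED_STATE__ inside a script tag, so one string slice plus json.loads() recovers the auth token, site id, and the first page of media. A sketch of just that step, assuming the marker seen in _extract_preload_state() (the slicing here stands in for gallery-dl's text.extract):

import json
import requests

def vsco_preload_state(url):
    """Return the page's embedded __PRELOADED_STATE__ as a dict."""
    page = requests.get(url).text
    marker = "__PRELOADED_STATE__ = "
    start = page.index(marker) + len(marker)
    end = page.index("<", start)  # the JSON ends where the script tag closes
    return json.loads(page[start:end])

# e.g. state["medias"]["byId"] maps media ids to their metadata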