summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor
diff options
context:
space:
mode:
author: Unit 193 <unit193@ubuntu.com>  2019-10-11 20:28:32 -0400
committer: Unit 193 <unit193@ubuntu.com>  2019-10-11 20:28:32 -0400
commit: 40f5fe6edef268632d3bc484e85e5b37bad67bff (patch)
tree: 98817850b65f1d2877bd4ed63a3908f37d794f8d /gallery_dl/extractor
parent: 639d9ea4a667733aadc3ff83a1df2cc9f0add3a9 (diff)
New upstream version 1.10.6 (tag: upstream/1.10.6)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/8muses.py     |   2
-rw-r--r--  gallery_dl/extractor/behance.py    |   7
-rw-r--r--  gallery_dl/extractor/deviantart.py | 114
-rw-r--r--  gallery_dl/extractor/gfycat.py     |   2
-rw-r--r--  gallery_dl/extractor/hitomi.py     |  36
-rw-r--r--  gallery_dl/extractor/komikcast.py  |   2
-rw-r--r--  gallery_dl/extractor/xhamster.py   |   4
-rw-r--r--  gallery_dl/extractor/yaplog.py     |  41
8 files changed, 140 insertions, 68 deletions
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index f5ca9ce..089a0e9 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -53,7 +53,7 @@ class _8musesAlbumExtractor(Extractor):
"private": False,
},
}),
- ("https://www.8muses.com/comics/album/Fakku-Comics/6?sort=az", {
+ ("https://www.8muses.com/comics/album/Fakku-Comics/7?sort=az", {
"count": ">= 70",
"keyword": {"name": r"re:^[R-Zr-z]"},
}),
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 467a935..c701927 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -33,8 +33,11 @@ class BehanceExtractor(Extractor):
if data["fields"] and isinstance(data["fields"][0], dict):
data["fields"] = [field["name"] for field in data["fields"]]
data["owners"] = [owner["display_name"] for owner in data["owners"]]
- if "tags" in data:
- data["tags"] = [tag["title"] for tag in data["tags"]]
+
+ tags = data.get("tags") or ()
+ if tags and isinstance(tags[0], dict):
+ tags = [tag["title"] for tag in tags]
+ data["tags"] = tags
# backwards compatibility
data["gallery_id"] = data["id"]
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 525cc84..ab32a00 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -47,12 +47,6 @@ class DeviantartExtractor(Extractor):
if self.quality:
self.quality = "q_{}".format(self.quality)
- if self.original != "image":
- self._update_content = self._update_content_default
- else:
- self._update_content = self._update_content_image
- self.original = True
-
self.commit_journal = {
"html": self._commit_journal_html,
"text": self._commit_journal_text,
@@ -98,7 +92,8 @@ class DeviantartExtractor(Extractor):
yield self.commit(deviation, content)
elif deviation["is_downloadable"]:
- content = self.api.deviation_download(deviation["deviationid"])
+ content = {}
+ self._update_content(deviation, content)
yield self.commit(deviation, content)
if "videos" in deviation:
@@ -240,15 +235,29 @@ class DeviantartExtractor(Extractor):
url = "{}/{}/{}/0/".format(self.root, self.user, category)
return [(url + folder["name"], folder) for folder in folders]
- def _update_content_default(self, deviation, content):
- content.update(self.api.deviation_download(deviation["deviationid"]))
-
- def _update_content_image(self, deviation, content):
- data = self.api.deviation_download(deviation["deviationid"])
- url = data["src"].partition("?")[0]
- mtype = mimetypes.guess_type(url, False)[0]
- if mtype and mtype.startswith("image/"):
- content.update(data)
+ def _update_content(self, deviation, content):
+ try:
+ data = self.api.deviation_extended_fetch(
+ deviation["index"],
+ deviation["author"]["username"],
+ "journal" if "excerpt" in deviation else "art",
+ )
+ download = data["deviation"]["extended"]["download"]
+ download["src"] = download["url"]
+ except Exception as e:
+ self.log.warning(
+ "Unable to fetch original download URL for ID %s ('%s: %s')",
+ deviation["index"], e.__class__.__name__, e,
+ )
+ self.log.debug("Server response: %s", data)
+ else:
+ if self.original == "image":
+ url = data["src"].partition("?")[0]
+ mtype = mimetypes.guess_type(url, False)[0]
+ if not mtype or not mtype.startswith("image/"):
+ return
+ del download["url"]
+ content.update(download)
class DeviantartGalleryExtractor(DeviantartExtractor):
@@ -258,8 +267,8 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
pattern = BASE_PATTERN + r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$"
test = (
("https://www.deviantart.com/shimoda7/gallery/", {
- "pattern": r"https://(s3.amazonaws.com/origin-(img|orig)"
- r".deviantart.net/|images-wixmp-\w+.wixmp.com/)",
+ "pattern": r"https://(www.deviantart.com/download/\d+/"
+ r"|images-wixmp-[^.]+.wixmp.com/f/.+/.+.jpg\?token=.+)",
"count": ">= 30",
"keyword": {
"allows_comments": bool,
@@ -384,7 +393,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)"
test = (
("https://sta.sh/022c83odnaxc", {
- "pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net",
+ "pattern": r"https://sta.sh/download/7549925030122512/.+\?token=",
"count": 1,
}),
# multiple stash items
@@ -394,6 +403,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
}),
# downloadable, but no "content" field (#307)
("https://sta.sh/024t4coz16mi", {
+ "pattern": r"https://sta.sh/download/7800709982190282/.+\?token=",
"count": 1,
}),
("https://sta.sh/abcdefghijkl", {
@@ -411,16 +421,34 @@ class DeviantartStashExtractor(DeviantartExtractor):
def deviations(self):
url = "https://sta.sh/" + self.stash_id
page = self.request(url).text
- deviation_id = text.extract(page, '//deviation/', '"')[0]
+ deviation_id, pos = text.extract(page, '//deviation/', '"')
if deviation_id:
- yield self.api.deviation(deviation_id)
+ deviation = self.api.deviation(deviation_id)
+ pos = page.find("dev-page-download", pos)
+ if pos >= 0:
+ deviation["_download"] = {
+ "width" : text.parse_int(text.extract(
+ page, 'data-download_width="' , '"', pos)[0]),
+ "height": text.parse_int(text.extract(
+ page, 'data-download_height="', '"', pos)[0]),
+ "src" : text.unescape(text.extract(
+ page, 'data-download_url="' , '"', pos)[0]),
+ }
+ return (deviation,)
else:
data = {"_extractor": DeviantartStashExtractor}
page = text.extract(
- page, '<div id="stash-body"', '<div class="footer"')[0]
- for url in text.extract_iter(page, '<a href="', '"'):
- yield url, data
+ page, 'id="stash-body"', 'class="footer"', pos)[0]
+ return [
+ (url, data)
+ for url in text.extract_iter(page, '<a href="', '"')
+ ]
+
+ def _update_content(self, deviation, content):
+ if "_download" in deviation:
+ content.update(deviation["_download"])
+ del deviation["_download"]
class DeviantartFavoriteExtractor(DeviantartExtractor):
@@ -562,28 +590,17 @@ class DeviantartExtractorV2(DeviantartExtractor):
"""Base class for deviantart extractors using the NAPI"""
def items(self):
- url = (
- self.root + "/_napi/da-browse/shared_api/deviation/extended_fetch"
- )
- params = {
- "deviationid" : None,
- "username" : None,
- "type" : None,
- "include_session": "false",
- }
- headers = {
- "Referer": self.root,
- }
-
yield Message.Version, 1
for deviation in self.deviations():
- params["deviationid"] = deviation["deviationId"]
- params["username"] = deviation["author"]["username"]
- params["type"] = "journal" if deviation["isJournal"] else "art"
- data = self.request(url, params=params, headers=headers).json()
+ data = self.api.deviation_extended_fetch(
+ deviation["deviationId"],
+ deviation["author"]["username"],
+ "journal" if deviation["isJournal"] else "art",
+ )
if "deviation" not in data:
- self.log.warning("Skipping %s", params["deviationid"])
+ self.log.warning("Skipping ID %s", deviation["deviationId"])
+ self.log.debug("Server response: %s", data)
continue
deviation = self._extract(data)
@@ -887,6 +904,19 @@ class DeviantartAPI():
params = {"mature_content": self.mature}
return self._call(endpoint, params)
+ def deviation_extended_fetch(self, deviation_id, user, kind):
+ url = ("https://www.deviantart.com/_napi/da-browse/shared_api"
+ "/deviation/extended_fetch")
+ headers = {"Referer": "https://www.deviantart.com/"}
+ params = {
+ "deviationid" : deviation_id,
+ "username" : user,
+ "type" : kind,
+ "include_session": "false",
+ }
+ return self.extractor.request(
+ url, headers=headers, params=params, fatal=None).json()
+
def deviation_metadata(self, deviations):
""" Fetch deviation metadata for a set of deviations"""
if not deviations:
diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py
index 1dcb3c8..2ebbec8 100644
--- a/gallery_dl/extractor/gfycat.py
+++ b/gallery_dl/extractor/gfycat.py
@@ -14,7 +14,7 @@ from .common import Extractor, Message
class GfycatExtractor(Extractor):
"""Base class for gfycat extractors"""
category = "gfycat"
- filename_fmt = "{category}_{gfyName}.{extension}"
+ filename_fmt = "{category}_{gfyName}{title:?_//}.{extension}"
archive_fmt = "{gfyName}"
root = "https://gfycat.com"
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index e4f18b3..ef08d69 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -20,12 +20,9 @@ class HitomiGalleryExtractor(GalleryExtractor):
pattern = r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)"
test = (
("https://hitomi.la/galleries/867789.html", {
- "url": "cb759868d090fe0e2655c3e29ebf146054322b6d",
+ "pattern": r"https://aa.hitomi.la/galleries/867789/\d+.jpg",
"keyword": "d097a8db8e810045131b4510c41714004f9eff3a",
- }),
- ("https://hitomi.la/galleries/1036181.html", {
- # "aa" subdomain for gallery-id ending in 1 (#142)
- "pattern": r"https://aa\.hitomi\.la/",
+ "count": 16,
}),
("https://hitomi.la/galleries/1401410.html", {
# download test
@@ -37,18 +34,39 @@ class HitomiGalleryExtractor(GalleryExtractor):
"url": "c2a84185f467450b8b9b72fbe40c0649029ce007",
"count": 210,
}),
+ ("https://hitomi.la/galleries/1045954.html", {
+ # fallback for galleries only available through /reader/ URLs
+ "url": "055c898a36389719799d6bce76889cc4ea4421fc",
+ "count": 1413,
+ }),
("https://hitomi.la/reader/867789.html"),
)
def __init__(self, match):
- self.gallery_id = text.parse_int(match.group(1))
+ self.gallery_id = match.group(1)
+ self.fallback = False
url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
GalleryExtractor.__init__(self, match, url)
+ def request(self, url, **kwargs):
+ response = GalleryExtractor.request(self, url, fatal=False, **kwargs)
+ if response.status_code == 404:
+ self.fallback = True
+ url = url.replace("/galleries/", "/reader/")
+ response = GalleryExtractor.request(self, url, **kwargs)
+ return response
+
def metadata(self, page):
+ if self.fallback:
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title": text.unescape(text.extract(
+ page, "<title>", "<")[0].rpartition(" | ")[0]),
+ }
+
extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
data = {
- "gallery_id": self.gallery_id,
+ "gallery_id": text.parse_int(self.gallery_id),
"title" : text.unescape(extr('.html">', '<').strip()),
"artist" : self._prep(extr('<h2>', '</h2>')),
"group" : self._prep(extr('<td>Group</td><td>', '</td>')),
@@ -66,7 +84,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
def images(self, page):
# see https://ltn.hitomi.la/common.js
- offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0
+ offset = text.parse_int(self.gallery_id[-1]) % 3
subdomain = chr(97 + offset) + "a"
base = "https://" + subdomain + ".hitomi.la/galleries/"
@@ -79,6 +97,8 @@ class HitomiGalleryExtractor(GalleryExtractor):
url = "{}/reader/{}.html".format(self.root, self.gallery_id)
page = self.request(url).text
begin, end = ">//g.hitomi.la/galleries/", "</div>"
+ elif self.fallback:
+ begin, end = ">//g.hitomi.la/galleries/", "</div>"
else:
begin, end = "'//tn.hitomi.la/smalltn/", ".jpg',"
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index 8541e4f..6e7f139 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -73,7 +73,7 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
return [
(text.unescape(url), None)
for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
- if "/Banner-" not in url
+ if "/Banner-" not in url and "/WM-Sampingan." not in url
]
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 23750db..e125184 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -119,8 +119,8 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
"views" : text.parse_int(imgs["views"]),
"likes" : text.parse_int(imgs["rating"]["likes"]),
"dislikes" : text.parse_int(imgs["rating"]["dislikes"]),
- "title" : imgs["title"],
- "description": imgs["description"],
+ "title" : text.unescape(imgs["title"]),
+ "description": text.unescape(imgs["description"]),
"thumbnail" : imgs["thumbURL"],
},
"count": text.parse_int(imgs["quantity"]),
diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py
index b3c5501..b07ba4b 100644
--- a/gallery_dl/extractor/yaplog.py
+++ b/gallery_dl/extractor/yaplog.py
@@ -12,6 +12,9 @@ from .common import Extractor, Message, AsynchronousMixin
from .. import text, util
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?yaplog\.jp/([\w-]+)"
+
+
class YaplogExtractor(AsynchronousMixin, Extractor):
"""Base class for yaplog extractors"""
category = "yaplog"
@@ -31,11 +34,15 @@ class YaplogExtractor(AsynchronousMixin, Extractor):
for num, url in enumerate(urls, 1):
page = self.request(url).text if num > 1 else url
iurl = text.extract(page, '<img src="', '"')[0]
- iid, _, ext = iurl.rpartition("/")[2].rpartition(".")
+ if iurl[0] == "/":
+ iurl = text.urljoin(self.root, iurl)
+ name, _, ext = iurl.rpartition("/")[2].rpartition(".")
+ iid = name.rpartition("_")[0] or name
image = {
"url" : iurl,
"num" : num,
- "id" : text.parse_int(iid.partition("_")[0]),
+ "id" : text.parse_int(iid, iid),
+ "filename" : name,
"extension": ext,
"post" : post,
}
@@ -52,7 +59,8 @@ class YaplogExtractor(AsynchronousMixin, Extractor):
prev , pos = text.extract(page, 'class="last"><a href="', '"', pos)
urls = list(text.extract_iter(page, '<li><a href="', '"', pos))
- urls[0] = page # cache HTML of first page
+ if urls:
+ urls[0] = page # cache HTML of first page
if len(urls) == 24 and text.extract(page, '(1/', ')')[0] != '24':
# there are a maximum of 24 image entries in an /image/ page
@@ -69,14 +77,14 @@ class YaplogExtractor(AsynchronousMixin, Extractor):
"id" : text.parse_int(pid),
"title": text.unescape(title[:-3]),
"user" : self.user,
- "date" : date,
+ "date" : text.parse_datetime(date, "%B %d [%a], %Y, %H:%M"),
}
class YaplogBlogExtractor(YaplogExtractor):
"""Extractor for a user's blog on yaplog.jp"""
subcategory = "blog"
- pattern = r"(?:https?://)?(?:www\.)?yaplog\.jp/(\w+)/?(?:$|[?&#])"
+ pattern = BASE_PATTERN + r"/?(?:$|[?&#])"
test = ("https://yaplog.jp/omitakashi3", {
"pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/",
"count": ">= 2",
@@ -92,12 +100,23 @@ class YaplogBlogExtractor(YaplogExtractor):
class YaplogPostExtractor(YaplogExtractor):
"""Extractor for images from a blog post on yaplog.jp"""
subcategory = "post"
- pattern = (r"(?:https?://)?(?:www\.)?yaplog\.jp"
- r"/(\w+)/(?:archive|image)/(\d+)")
- test = ("https://yaplog.jp/imamiami0726/image/1299", {
- "url": "896cae20fa718735a57e723c48544e830ff31345",
- "keyword": "f8d8781e61c4c38238a7622d6df6c905f864e5d3",
- })
+ pattern = BASE_PATTERN + r"/(?:archive|image)/(\d+)"
+ test = (
+ ("https://yaplog.jp/imamiami0726/image/1299", {
+ "url": "896cae20fa718735a57e723c48544e830ff31345",
+ "keyword": "22df8ad6cb534514c6bb2ff000381d156769a620",
+ }),
+ # complete image URLs (#443)
+ ("https://yaplog.jp/msjane/archive/246", {
+ "pattern": r"https://yaplog.jp/cv/msjane/img/246/img\d+_t.jpg"
+ }),
+ # empty post (#443)
+ ("https://yaplog.jp/f_l_a_s_c_o/image/872", {
+ "count": 0,
+ }),
+ # blog names with '-' (#443)
+ ("https://yaplog.jp/a-pierrot-o/image/3946/22779"),
+ )
def __init__(self, match):
YaplogExtractor.__init__(self, match)