Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py          1
-rw-r--r--  gallery_dl/extractor/arcalive.py         14
-rw-r--r--  gallery_dl/extractor/bbc.py              33
-rw-r--r--  gallery_dl/extractor/bunkr.py             3
-rw-r--r--  gallery_dl/extractor/common.py           47
-rw-r--r--  gallery_dl/extractor/danbooru.py        101
-rw-r--r--  gallery_dl/extractor/deviantart.py       84
-rw-r--r--  gallery_dl/extractor/hentaifox.py       119
-rw-r--r--  gallery_dl/extractor/hitomi.py           69
-rw-r--r--  gallery_dl/extractor/imhentai.py         50
-rw-r--r--  gallery_dl/extractor/instagram.py        11
-rw-r--r--  gallery_dl/extractor/kemonoparty.py      68
-rw-r--r--  gallery_dl/extractor/mangapark.py       280
-rw-r--r--  gallery_dl/extractor/mastodon.py          3
-rw-r--r--  gallery_dl/extractor/nozomi.py           11
-rw-r--r--  gallery_dl/extractor/patreon.py           9
-rw-r--r--  gallery_dl/extractor/pinterest.py         3
-rw-r--r--  gallery_dl/extractor/sexcom.py          121
-rw-r--r--  gallery_dl/extractor/skeb.py              7
-rw-r--r--  gallery_dl/extractor/subscribestar.py     6
-rw-r--r--  gallery_dl/extractor/tiktok.py          110
-rw-r--r--  gallery_dl/extractor/zerochan.py         22
22 files changed, 615 insertions, 557 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 8198619..87c3798 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -68,7 +68,6 @@ modules = [
"hentai2read",
"hentaicosplays",
"hentaifoundry",
- "hentaifox",
"hentaihand",
"hentaihere",
"hentainexus",
diff --git a/gallery_dl/extractor/arcalive.py b/gallery_dl/extractor/arcalive.py
index 8e832fe..8c44256 100644
--- a/gallery_dl/extractor/arcalive.py
+++ b/gallery_dl/extractor/arcalive.py
@@ -41,7 +41,9 @@ class ArcalivePostExtractor(ArcaliveExtractor):
def items(self):
self.emoticons = self.config("emoticons", False)
- self.gifs = self.config("gifs", True)
+ self.gifs = gifs = self.config("gifs", True)
+ if gifs:
+ self.gifs_fallback = (gifs != "check")
post = self.api.post(self.groups[0])
files = self._extract_files(post)
@@ -90,11 +92,15 @@ class ArcalivePostExtractor(ArcaliveExtractor):
url = path + "." + orig
elif video and self.gifs:
url_gif = url.rpartition(".")[0] + ".gif"
- response = self.request(
- url_gif + "?type=orig", method="HEAD", fatal=False)
- if response.status_code < 400:
+ if self.gifs_fallback:
fallback = (url + "?type=orig",)
url = url_gif
+ else:
+ response = self.request(
+ url_gif + "?type=orig", method="HEAD", fatal=False)
+ if response.status_code < 400:
+ fallback = (url + "?type=orig",)
+ url = url_gif
files.append({
"url" : url + "?type=orig",
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index 113a669..b398152 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -27,7 +27,12 @@ class BbcGalleryExtractor(GalleryExtractor):
def metadata(self, page):
data = self._extract_jsonld(page)
+
return {
+ "title": text.unescape(text.extr(
+ page, "<h1>", "</h1>").rpartition("</span>")[2]),
+ "description": text.unescape(text.extr(
+ page, 'property="og:description" content="', '"')),
"programme": self.gallery_url.split("/")[4],
"path": list(util.unique_sequence(
element["name"]
@@ -40,11 +45,20 @@ class BbcGalleryExtractor(GalleryExtractor):
width = width - width % 16 if width else 1920
dimensions = "/{}xn/".format(width)
- return [
- (src.replace("/320x180_b/", dimensions),
- {"_fallback": self._fallback_urls(src, width)})
- for src in text.extract_iter(page, 'data-image-src="', '"')
- ]
+ results = []
+ for img in text.extract_iter(page, 'class="gallery__thumbnail', ">"):
+ src = text.extr(img, 'data-image-src="', '"')
+ results.append((
+ src.replace("/320x180_b/", dimensions),
+ {
+ "title_image": text.unescape(text.extr(
+ img, 'data-gallery-title="', '"')),
+ "synopsis": text.unescape(text.extr(
+ img, 'data-gallery-synopsis="', '"')),
+ "_fallback": self._fallback_urls(src, width),
+ },
+ ))
+ return results
@staticmethod
def _fallback_urls(src, max_width):
@@ -62,14 +76,11 @@ class BbcProgrammeExtractor(Extractor):
pattern = BASE_PATTERN + r"[^/?#]+/galleries)(?:/?\?page=(\d+))?"
example = "https://www.bbc.co.uk/programmes/ID/galleries"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.path, self.page = match.groups()
-
def items(self):
+ path, pnum = self.groups
data = {"_extractor": BbcGalleryExtractor}
- params = {"page": text.parse_int(self.page, 1)}
- galleries_url = self.root + self.path
+ params = {"page": text.parse_int(pnum, 1)}
+ galleries_url = self.root + path
while True:
page = self.request(galleries_url, params=params).text
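
Note: images() now iterates over whole gallery__thumbnail elements instead of
bare data-image-src attributes, so each image can carry its own title and
synopsis. A sketch with plain re over hypothetical sample markup:

    import re

    html = ('<div class="gallery__thumbnail" data-gallery-title="Title" '
            'data-gallery-synopsis="Synopsis" '
            'data-image-src="https://ichef.bbci.co.uk/images/ic'
            '/320x180_b/p0abcdef.jpg">')

    for img in re.findall(r'class="gallery__thumbnail[^>]*>', html):
        src = re.search(r'data-image-src="([^"]*)"', img).group(1)
        title = re.search(r'data-gallery-title="([^"]*)"', img).group(1)
        print(src.replace("/320x180_b/", "/1920xn/"), "-", title)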
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index d74f59c..481e962 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -189,8 +189,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
else:
file_url = data["url"]
- file_name = (text.extr(page, 'property="og:title" content="', '"') or
- text.extr(page, "<title>", " | Bunkr<"))
+ file_name = text.extr(page, "<h1", "<").rpartition(">")[2]
fallback = text.extr(page, 'property="og:url" content="', '"')
return {
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index a85eedd..995505f 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -539,7 +539,7 @@ class Extractor():
for name, value in cookiedict.items():
set_cookie(name, value, domain=domain)
- def cookies_check(self, cookies_names, domain=None):
+ def cookies_check(self, cookies_names, domain=None, subdomains=False):
"""Check if all 'cookies_names' are in the session's cookiejar"""
if not self.cookies:
return False
@@ -550,26 +550,31 @@ class Extractor():
now = time.time()
for cookie in self.cookies:
- if cookie.name in names and (
- not domain or cookie.domain == domain):
-
- if cookie.expires:
- diff = int(cookie.expires - now)
-
- if diff <= 0:
- self.log.warning(
- "Cookie '%s' has expired", cookie.name)
- continue
-
- elif diff <= 86400:
- hours = diff // 3600
- self.log.warning(
- "Cookie '%s' will expire in less than %s hour%s",
- cookie.name, hours + 1, "s" if hours else "")
-
- names.discard(cookie.name)
- if not names:
- return True
+ if cookie.name not in names:
+ continue
+
+ if not domain or cookie.domain == domain:
+ pass
+ elif not subdomains or not cookie.domain.endswith(domain):
+ continue
+
+ if cookie.expires:
+ diff = int(cookie.expires - now)
+
+ if diff <= 0:
+ self.log.warning(
+ "Cookie '%s' has expired", cookie.name)
+ continue
+
+ elif diff <= 86400:
+ hours = diff // 3600
+ self.log.warning(
+ "Cookie '%s' will expire in less than %s hour%s",
+ cookie.name, hours + 1, "s" if hours else "")
+
+ names.discard(cookie.name)
+ if not names:
+ return True
return False
def _extract_jsonld(self, page):
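
Note: the rewritten loop adds a subdomains flag so that a cookie stored for
".patreon.com" can satisfy a check against "patreon.com" (see the patreon.py
change below). The matching rule in isolation, as a self-contained sketch:

    def domain_matches(cookie_domain, domain, subdomains=False):
        # exact match (or no domain filter) always passes; with
        # subdomains=True a suffix match is enough
        if not domain or cookie_domain == domain:
            return True
        return bool(subdomains and cookie_domain.endswith(domain))

    assert domain_matches(".patreon.com", "patreon.com", subdomains=True)
    assert not domain_matches(".patreon.com", "patreon.com")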
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 8d00728..741800c 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -175,6 +175,51 @@ class DanbooruExtractor(BaseExtractor):
return [{"file": fmt(index), "delay": delay}
for index, delay in enumerate(delays)]
+ def _collection_posts(self, cid, ctype):
+ reverse = prefix = None
+
+ order = self.config("order-posts")
+ if not order or order in {"asc", "pool", "pool_asc", "asc_pool"}:
+ params = {"tags": "ord{}:{}".format(ctype, cid)}
+ elif order in {"id", "desc_id", "id_desc"}:
+ params = {"tags": "{}:{}".format(ctype, cid)}
+ prefix = "b"
+ elif order in {"desc", "desc_pool", "pool_desc"}:
+ params = {"tags": "ord{}:{}".format(ctype, cid)}
+ reverse = True
+ elif order in {"asc_id", "id_asc"}:
+ params = {"tags": "{}:{}".format(ctype, cid)}
+ reverse = True
+
+ posts = self._pagination("/posts.json", params, prefix)
+ if reverse:
+ self.log.info("Collecting posts of %s %s", ctype, cid)
+ return self._collection_enumerate_reverse(posts)
+ else:
+ return self._collection_enumerate(posts)
+
+ def _collection_metadata(self, cid, ctype, cname=None):
+ url = "{}/{}s/{}.json".format(self.root, cname or ctype, cid)
+ collection = self.request(url).json()
+ collection["name"] = collection["name"].replace("_", " ")
+ self.post_ids = collection.pop("post_ids", ())
+ return {ctype: collection}
+
+ def _collection_enumerate(self, posts):
+ pid_to_num = {pid: num for num, pid in enumerate(self.post_ids, 1)}
+ for post in posts:
+ post["num"] = pid_to_num[post["id"]]
+ yield post
+
+ def _collection_enumerate_reverse(self, posts):
+ posts = list(posts)
+ posts.reverse()
+
+ pid_to_num = {pid: num for num, pid in enumerate(self.post_ids, 1)}
+ for post in posts:
+ post["num"] = pid_to_num[post["id"]]
+ return posts
+
BASE_PATTERN = DanbooruExtractor.update({
"danbooru": {
@@ -228,7 +273,7 @@ class DanbooruTagExtractor(DanbooruExtractor):
class DanbooruPoolExtractor(DanbooruExtractor):
- """Extractor for posts from danbooru pools"""
+ """Extractor for Danbooru pools"""
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name]}")
filename_fmt = "{num:>04}_{id}_{filename}.{extension}"
@@ -237,50 +282,28 @@ class DanbooruPoolExtractor(DanbooruExtractor):
example = "https://danbooru.donmai.us/pools/12345"
def metadata(self):
- self.pool_id = self.groups[-1]
- url = "{}/pools/{}.json".format(self.root, self.pool_id)
- pool = self.request(url).json()
- pool["name"] = pool["name"].replace("_", " ")
- self.post_ids = pool.pop("post_ids", ())
- return {"pool": pool}
+ return self._collection_metadata(self.groups[-1], "pool")
def posts(self):
- reverse = prefix = None
+ return self._collection_posts(self.groups[-1], "pool")
- order = self.config("order-posts")
- if not order or order in ("asc", "pool", "pool_asc", "asc_pool"):
- params = {"tags": "ordpool:" + self.pool_id}
- elif order in ("id", "desc_id", "id_desc"):
- params = {"tags": "pool:" + self.pool_id}
- prefix = "b"
- elif order in ("desc", "desc_pool", "pool_desc"):
- params = {"tags": "ordpool:" + self.pool_id}
- reverse = True
- elif order in ("asc_id", "id_asc"):
- params = {"tags": "pool:" + self.pool_id}
- reverse = True
- posts = self._pagination("/posts.json", params, prefix)
- if reverse:
- return self._enumerate_posts_reverse(posts)
- else:
- return self._enumerate_posts(posts)
-
- def _enumerate_posts(self, posts):
- pid_to_num = {pid: num+1 for num, pid in enumerate(self.post_ids)}
- for post in posts:
- post["num"] = pid_to_num[post["id"]]
- yield post
+class DanbooruFavgroupExtractor(DanbooruExtractor):
+ """Extractor for Danbooru favorite groups"""
+ subcategory = "favgroup"
+ directory_fmt = ("{category}", "Favorite Groups",
+ "{favgroup[id]} {favgroup[name]}")
+ filename_fmt = "{num:>04}_{id}_{filename}.{extension}"
+ archive_fmt = "fg_{favgroup[id]}_{id}"
+ pattern = BASE_PATTERN + r"/favorite_group(?:s|/show)/(\d+)"
+ example = "https://danbooru.donmai.us/favorite_groups/12345"
- def _enumerate_posts_reverse(self, posts):
- self.log.info("Collecting posts of pool %s", self.pool_id)
- posts = list(posts)
- posts.reverse()
+ def metadata(self):
+ return self._collection_metadata(
+ self.groups[-1], "favgroup", "favorite_group")
- pid_to_num = {pid: num+1 for num, pid in enumerate(self.post_ids)}
- for post in posts:
- post["num"] = pid_to_num[post["id"]]
- return posts
+ def posts(self):
+ return self._collection_posts(self.groups[-1], "favgroup")
class DanbooruPostExtractor(DanbooruExtractor):
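
Note: the pool-specific ordering logic moved into generic _collection_*()
helpers so the new favgroup extractor can share it. The "order-posts" values
map onto Danbooru's tag syntax ("ordpool:ID"/"ordfavgroup:ID" preserve
collection order, "pool:ID"/"favgroup:ID" yield ID order); a sketch of that
mapping as a pure function:

    def order_params(order, ctype, cid):
        """Return (params, page_prefix, reverse) like _collection_posts()."""
        if not order or order in {"asc", "pool", "pool_asc", "asc_pool"}:
            return {"tags": "ord{}:{}".format(ctype, cid)}, None, False
        if order in {"id", "desc_id", "id_desc"}:
            return {"tags": "{}:{}".format(ctype, cid)}, "b", False
        if order in {"desc", "desc_pool", "pool_desc"}:
            return {"tags": "ord{}:{}".format(ctype, cid)}, None, True
        if order in {"asc_id", "id_asc"}:
            return {"tags": "{}:{}".format(ctype, cid)}, None, True

    print(order_params("id", "pool", 12345))
    # ({'tags': 'pool:12345'}, 'b', False)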
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 59b2d6d..3a862c1 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -687,10 +687,18 @@ x2="45.4107524%" y2="71.4898596%" id="app-root-3">\
for folder in folders:
if match(folder["name"]):
return folder
+ elif folder["has_subfolders"]:
+ for subfolder in folder["subfolders"]:
+ if match(subfolder["name"]):
+ return subfolder
else:
for folder in folders:
if folder["folderid"] == uuid:
return folder
+ elif folder["has_subfolders"]:
+ for subfolder in folder["subfolders"]:
+ if subfolder["folderid"] == uuid:
+ return subfolder
raise exception.NotFoundError("folder")
def _folder_urls(self, folders, category, extractor):
@@ -891,7 +899,8 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
"""Extractor for all deviations from an artist's gallery"""
subcategory = "gallery"
archive_fmt = "g_{_username}_{index}.{extension}"
- pattern = BASE_PATTERN + r"/gallery(?:/all|/?\?catpath=)?/?$"
+ pattern = (BASE_PATTERN + r"/gallery"
+ r"(?:/all|/recommended-for-you|/?\?catpath=)?/?$")
example = "https://www.deviantart.com/USER/gallery/"
def deviations(self):
@@ -987,13 +996,36 @@ class DeviantartFolderExtractor(DeviantartExtractor):
def deviations(self):
folders = self.api.gallery_folders(self.user)
folder = self._find_folder(folders, self.folder_name, self.folder_id)
+
+ # Leaving this here for backwards compatibility
self.folder = {
"title": folder["name"],
"uuid" : folder["folderid"],
"index": self.folder_id,
"owner": self.user,
+ "parent_uuid": folder["parent"],
}
- return self.api.gallery(self.user, folder["folderid"], self.offset)
+
+ if folder.get("subfolder"):
+ self.folder["parent_folder"] = folder["parent_folder"]
+ self.archive_fmt = "F_{folder[parent_uuid]}_{index}.{extension}"
+
+ if self.flat:
+ self.directory_fmt = ("{category}", "{username}",
+ "{folder[parent_folder]}")
+ else:
+ self.directory_fmt = ("{category}", "{username}",
+ "{folder[parent_folder]}",
+ "{folder[title]}")
+
+ if folder.get("has_subfolders") and self.config("subfolders", True):
+ for subfolder in folder["subfolders"]:
+ subfolder["parent_folder"] = folder["name"]
+ subfolder["subfolder"] = True
+ yield from self._folder_urls(
+ folder["subfolders"], "gallery", DeviantartFolderExtractor)
+
+ yield from self.api.gallery(self.user, folder["folderid"], self.offset)
def prepare(self, deviation):
DeviantartExtractor.prepare(self, deviation)
@@ -1004,7 +1036,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
"""Extractor for sta.sh-ed deviations"""
subcategory = "stash"
archive_fmt = "{index}.{extension}"
- pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)"
+ pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.s(h))"
r"/([a-z0-9]+)")
example = "https://www.deviantart.com/stash/abcde"
@@ -1016,9 +1048,18 @@ class DeviantartStashExtractor(DeviantartExtractor):
def deviations(self, stash_id=None):
if stash_id is None:
- stash_id = self.groups[0]
- url = "https://www.deviantart.com/stash/" + stash_id
- page = self._limited_request(url).text
+ legacy_url, stash_id = self.groups
+ else:
+ legacy_url = False
+
+ if legacy_url and stash_id[0] == "2":
+ url = "https://sta.sh/" + stash_id
+ response = self._limited_request(url)
+ stash_id = response.url.rpartition("/")[2]
+ page = response.text
+ else:
+ url = "https://www.deviantart.com/stash/" + stash_id
+ page = self._limited_request(url).text
if stash_id[0] == "0":
uuid = text.extr(page, '//deviation/', '"')
@@ -1235,7 +1276,34 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
deviation = self.api.deviation(uuid)
deviation["_page"] = page
- return (deviation,)
+
+ _dev_info = text.extr(
+ page, '\\"deviationExtended\\":', ',\\"deviation\\":', None)
+ # Clean up escaped quotes
+ _json_str = re.sub(
+ r'(?<!\\)\\{1}"', '"', _dev_info).replace("\\'", "'")
+ _extended_info = util.json_loads(_json_str)[self.deviation_id]
+ additional_media = _extended_info.get("additionalMedia") or ()
+
+ if additional_media:
+ self.filename_fmt = ("{category}_{index}_{index_file}_{title}_"
+ "{num:>02}.{extension}")
+ self.archive_fmt = ("g_{_username}_{index}{index_file:?_//}."
+ "{extension}")
+
+ deviation["index_file"] = 0
+ deviation["count"] = 1 + len(additional_media)
+ deviation["num"] = 1
+ yield deviation
+
+ for index, post in enumerate(additional_media):
+ uri = post["media"]["baseUri"].encode().decode("unicode-escape")
+ deviation["content"]["src"] = uri
+ deviation["num"] += 1
+ deviation["index_file"] = post["fileId"]
+ # Download only works on purchased materials - no way to check
+ deviation["is_downloadable"] = False
+ yield deviation
class DeviantartScrapsExtractor(DeviantartExtractor):
@@ -1366,7 +1434,7 @@ class DeviantartOAuthAPI():
def __init__(self, extractor):
self.extractor = extractor
self.log = extractor.log
- self.headers = {"dA-minor-version": "20200519"}
+ self.headers = {"dA-minor-version": "20210526"}
self._warn_429 = True
self.delay = extractor.config("wait-min", 0)
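
Note: in the stash pattern, the new capture group around the final "h" of
"sta.sh" doubles as a legacy-URL flag - it is only non-None for sta.sh links,
which is what deviations() unpacks into legacy_url:

    import re

    PATTERN = re.compile(
        r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.s(h))"
        r"/([a-z0-9]+)")

    print(PATTERN.match("https://sta.sh/2abcde").groups())
    # ('h', '2abcde')  ->  legacy sta.sh URL; IDs starting with "2" are
    #                      folders and get resolved via the redirect
    print(PATTERN.match("https://www.deviantart.com/stash/abcde").groups())
    # (None, 'abcde')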
diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py
deleted file mode 100644
index 31a302d..0000000
--- a/gallery_dl/extractor/hentaifox.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2019-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://hentaifox.com/"""
-
-from .common import GalleryExtractor, Extractor, Message
-from .. import text, util
-
-
-class HentaifoxBase():
- """Base class for hentaifox extractors"""
- category = "hentaifox"
- root = "https://hentaifox.com"
-
-
-class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
- """Extractor for image galleries on hentaifox.com"""
- pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))"
- example = "https://hentaifox.com/gallery/12345/"
-
- def __init__(self, match):
- GalleryExtractor.__init__(self, match)
- self.gallery_id = match.group(2)
-
- @staticmethod
- def _split(txt):
- return [
- text.remove_html(tag.partition(">")[2], "", "")
- for tag in text.extract_iter(
- txt, "class='tag_btn", "<span class='t_badge")
- ]
-
- def metadata(self, page):
- extr = text.extract_from(page)
- split = self._split
-
- return {
- "gallery_id": text.parse_int(self.gallery_id),
- "parody" : split(extr(">Parodies:" , "</ul>")),
- "characters": split(extr(">Characters:", "</ul>")),
- "tags" : split(extr(">Tags:" , "</ul>")),
- "artist" : split(extr(">Artists:" , "</ul>")),
- "group" : split(extr(">Groups:" , "</ul>")),
- "type" : text.remove_html(extr(">Category:", "<span")),
- "title" : text.unescape(extr(
- 'id="gallery_title" value="', '"')),
- "language" : "English",
- "lang" : "en",
- }
-
- def images(self, page):
- cover, pos = text.extract(page, '<img src="', '"')
- data , pos = text.extract(page, "$.parseJSON('", "');", pos)
- path = "/".join(cover.split("/")[3:-1])
-
- result = []
- append = result.append
- extmap = {"j": "jpg", "p": "png", "g": "gif"}
- urlfmt = ("/" + path + "/{}.{}").format
-
- server1 = "https://i.hentaifox.com"
- server2 = "https://i2.hentaifox.com"
-
- for num, image in util.json_loads(data).items():
- ext, width, height = image.split(",")
- path = urlfmt(num, extmap[ext])
- append((server1 + path, {
- "width" : width,
- "height" : height,
- "_fallback": (server2 + path,),
- }))
-
- return result
-
-
-class HentaifoxSearchExtractor(HentaifoxBase, Extractor):
- """Extractor for search results and listings on hentaifox.com"""
- subcategory = "search"
- pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com"
- r"(/(?:parody|tag|artist|character|search|group)/[^/?%#]+)")
- example = "https://hentaifox.com/tag/TAG/"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.path = match.group(1)
-
- def items(self):
- for gallery in self.galleries():
- yield Message.Queue, gallery["url"], gallery
-
- def galleries(self):
- num = 1
-
- while True:
- url = "{}{}/pag/{}/".format(self.root, self.path, num)
- page = self.request(url).text
-
- for info in text.extract_iter(
- page, 'class="g_title"><a href="', '</a>'):
- url, _, title = info.partition('">')
-
- yield {
- "url" : text.urljoin(self.root, url),
- "gallery_id": text.parse_int(
- url.strip("/").rpartition("/")[2]),
- "title" : text.unescape(title),
- "_extractor": HentaifoxGalleryExtractor,
- }
-
- pos = page.find(">Next<")
- url = text.rextract(page, "href=", ">", pos)[0]
- if pos == -1 or "/pag" not in url:
- return
- num += 1
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index e15e13c..086b77c 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -16,19 +16,25 @@ import string
import re
-class HitomiGalleryExtractor(GalleryExtractor):
- """Extractor for image galleries from hitomi.la"""
+class HitomiExtractor(Extractor):
+ """Base class for hitomi extractors"""
category = "hitomi"
root = "https://hitomi.la"
+ domain = "gold-usergeneratedcontent.net"
+
+
+class HitomiGalleryExtractor(HitomiExtractor, GalleryExtractor):
+ """Extractor for hitomi.la galleries"""
pattern = (r"(?:https?://)?hitomi\.la"
r"/(?:manga|doujinshi|cg|gamecg|imageset|galleries|reader)"
r"/(?:[^/?#]+-)?(\d+)")
example = "https://hitomi.la/manga/TITLE-867789.html"
def __init__(self, match):
- self.gid = match.group(1)
- url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gid)
- GalleryExtractor.__init__(self, match, url)
+ GalleryExtractor.__init__(self, match, False)
+ self.gid = gid = self.groups[0]
+ self.gallery_url = "https://ltn.{}/galleries/{}.js".format(
+ self.domain, gid)
def _init(self):
self.session.headers["Referer"] = "{}/reader/{}.html".format(
@@ -71,43 +77,34 @@ class HitomiGalleryExtractor(GalleryExtractor):
}
def images(self, _):
- # see https://ltn.hitomi.la/gg.js
+ # https://ltn.gold-usergeneratedcontent.net/gg.js
gg_m, gg_b, gg_default = _parse_gg(self)
- fmt = self.config("format") or "webp"
- if fmt == "original":
- subdomain, path, ext, check = "b", "images", None, False
- else:
- subdomain, path, ext, check = "a", fmt, fmt, (fmt != "webp")
+ fmt = ext = self.config("format") or "webp"
+ check = (fmt != "webp")
result = []
for image in self.info["files"]:
if check:
- if image.get("has" + fmt):
- path = ext = fmt
- else:
- path = ext = "webp"
+ ext = fmt if image.get("has" + fmt) else "webp"
ihash = image["hash"]
idata = text.nameext_from_url(image["name"])
idata["extension_original"] = idata["extension"]
- if ext:
- idata["extension"] = ext
+ idata["extension"] = ext
- # see https://ltn.hitomi.la/common.js
+ # https://ltn.gold-usergeneratedcontent.net/common.js
inum = int(ihash[-1] + ihash[-3:-1], 16)
- url = "https://{}{}.hitomi.la/{}/{}/{}/{}.{}".format(
- chr(97 + gg_m.get(inum, gg_default)),
- subdomain, path, gg_b, inum, ihash, idata["extension"],
+ url = "https://{}{}.{}/{}/{}/{}.{}".format(
+ ext[0], gg_m.get(inum, gg_default) + 1, self.domain,
+ gg_b, inum, ihash, ext,
)
result.append((url, idata))
return result
-class HitomiTagExtractor(Extractor):
+class HitomiTagExtractor(HitomiExtractor):
"""Extractor for galleries from tag searches on hitomi.la"""
- category = "hitomi"
subcategory = "tag"
- root = "https://hitomi.la"
pattern = (r"(?:https?://)?hitomi\.la"
r"/(tag|artist|group|series|type|character)"
r"/([^/?#]+)\.html")
@@ -126,8 +123,8 @@ class HitomiTagExtractor(Extractor):
"_extractor": HitomiGalleryExtractor,
"search_tags": text.unquote(self.tag.rpartition("-")[0]),
}
- nozomi_url = "https://ltn.hitomi.la/{}/{}.nozomi".format(
- self.type, self.tag)
+ nozomi_url = "https://ltn.{}/{}/{}.nozomi".format(
+ self.domain, self.type, self.tag)
headers = {
"Origin": self.root,
"Cache-Control": "max-age=0",
@@ -166,8 +163,8 @@ class HitomiIndexExtractor(HitomiTagExtractor):
def items(self):
data = {"_extractor": HitomiGalleryExtractor}
- nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(
- self.tag, self.language)
+ nozomi_url = "https://ltn.{}/{}-{}.nozomi".format(
+ self.domain, self.tag, self.language)
headers = {
"Origin": self.root,
"Cache-Control": "max-age=0",
@@ -194,11 +191,9 @@ class HitomiIndexExtractor(HitomiTagExtractor):
return
-class HitomiSearchExtractor(Extractor):
+class HitomiSearchExtractor(HitomiExtractor):
"""Extractor for galleries from multiple tag searches on hitomi.la"""
- category = "hitomi"
subcategory = "search"
- root = "https://hitomi.la"
pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)"
example = "https://hitomi.la/search.html?QUERY"
@@ -224,11 +219,11 @@ class HitomiSearchExtractor(Extractor):
area, tag, language = self.get_nozomi_args(full_tag)
if area:
- nozomi_url = "https://ltn.hitomi.la/n/{}/{}-{}.nozomi".format(
- area, tag, language)
+ nozomi_url = "https://ltn.{}/n/{}/{}-{}.nozomi".format(
+ self.domain, area, tag, language)
else:
- nozomi_url = "https://ltn.hitomi.la/n/{}-{}.nozomi".format(
- tag, language)
+ nozomi_url = "https://ltn.{}/n/{}-{}.nozomi".format(
+ self.domain, tag, language)
headers = {
"Origin": self.root,
@@ -257,7 +252,7 @@ class HitomiSearchExtractor(Extractor):
@memcache(maxage=1800)
def _parse_gg(extr):
- page = extr.request("https://ltn.hitomi.la/gg.js").text
+ page = extr.request("https://ltn.gold-usergeneratedcontent.net/gg.js").text
m = {}
@@ -280,4 +275,4 @@ def _parse_gg(extr):
d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page)
b = re.search(r"b:\s*[\"'](.+)[\"']", page)
- return m, b.group(1).strip("/"), int(d.group(1)) if d else 1
+ return m, b.group(1).strip("/"), int(d.group(1)) if d else 0
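
Note: image URLs moved from "<x><subdomain>.hitomi.la" to subdomains of
gold-usergeneratedcontent.net built from the format's first letter plus the
gg.js offset + 1 (e.g. "w2" for webp). A sketch of the new assembly, assuming
_parse_gg() returned (gg_m, gg_b, gg_default) as above; the gg_b value here is
made up:

    def image_url(ihash, ext, gg_m, gg_b, gg_default,
                  domain="gold-usergeneratedcontent.net"):
        # last hex digit + the two before it select the server offset
        inum = int(ihash[-1] + ihash[-3:-1], 16)
        return "https://{}{}.{}/{}/{}/{}.{}".format(
            ext[0], gg_m.get(inum, gg_default) + 1, domain,
            gg_b, inum, ihash, ext)

    print(image_url("abcdef0123456789", "webp", {}, "1700000000", 0))
    # https://w1.gold-usergeneratedcontent.net/1700000000/2424/abcdef0123456789.webp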
diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py
index 0439f5b..1b0fba3 100644
--- a/gallery_dl/extractor/imhentai.py
+++ b/gallery_dl/extractor/imhentai.py
@@ -22,10 +22,15 @@ class ImhentaiExtractor(BaseExtractor):
while True:
page = self.request(url).text
+
+ pos = page.find('class="ranking_list"')
+ if pos >= 0:
+ page = page[:pos]
+
extr = text.extract_from(page)
while True:
- gallery_id = extr('<a href="/gallery/', '"')
+ gallery_id = extr('href="/gallery/', '"')
if gallery_id == prev:
continue
if not gallery_id:
@@ -57,6 +62,18 @@ BASE_PATTERN = ImhentaiExtractor.update({
"root": "https://hentairox.com",
"pattern": r"(?:www\.)?hentairox\.com",
},
+ "hentaifox": {
+ "root": "https://hentaifox.com",
+ "pattern": r"(?:www\.)?hentaifox\.com",
+ },
+ "hentaienvy": {
+ "root": "https://hentaienvy.com",
+ "pattern": r"(?:www\.)?hentaienvy\.com",
+ },
+ "hentaizap": {
+ "root": "https://hentaizap.com",
+ "pattern": r"(?:www\.)?hentaizap\.com",
+ },
})
@@ -72,17 +89,20 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
def metadata(self, page):
extr = text.extract_from(page)
+ title = extr("<h1>", "<")
+ title_alt = extr('class="subtitle">', "<")
+ end = "</li>" if extr('<ul class="galleries_info', ">") else "</ul>"
data = {
"gallery_id": text.parse_int(self.gallery_id),
- "title" : text.unescape(extr("<h1>", "<")),
- "title_alt" : text.unescape(extr('class="subtitle">', "<")),
- "parody" : self._split(extr(">Parodies", "</li>")),
- "character" : self._split(extr(">Characters", "</li>")),
- "tags" : self._split(extr(">Tags", "</li>")),
- "artist" : self._split(extr(">Artists", "</li>")),
- "group" : self._split(extr(">Groups", "</li>")),
- "language" : self._split(extr(">Languages", "</li>")),
+ "title" : text.unescape(title),
+ "title_alt" : text.unescape(title_alt),
+ "parody" : self._split(extr(">Parodies", end)),
+ "character" : self._split(extr(">Characters", end)),
+ "tags" : self._split(extr(">Tags", end)),
+ "artist" : self._split(extr(">Artists", end)),
+ "group" : self._split(extr(">Groups", end)),
+ "language" : self._split(extr(">Languages", end)),
"type" : extr("href='/category/", "/"),
}
@@ -94,10 +114,12 @@ class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
def _split(self, html):
results = []
for tag in text.extract_iter(html, ">", "</a>"):
- tag = tag.partition(" <span class='badge'>")[0]
- if "<" in tag:
- tag = text.remove_html(tag)
+ badge = ("badge'>" in tag or "class='badge" in tag)
+ tag = text.remove_html(tag)
+ if badge:
+ tag = tag.rpartition(" ")[0]
results.append(tag)
+ results.sort()
return results
def images(self, page):
@@ -132,9 +154,9 @@ class ImhentaiTagExtractor(ImhentaiExtractor):
class ImhentaiSearchExtractor(ImhentaiExtractor):
"""Extractor for imhentai search results"""
subcategory = "search"
- pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
+ pattern = BASE_PATTERN + r"/search(/?\?[^#]+|/[^/?#]+/?)"
example = "https://imhentai.xxx/search/?key=QUERY"
def items(self):
- url = self.root + "/search/?" + self.groups[-1]
+ url = self.root + "/search" + self.groups[-1]
return self._pagination(url)
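
Note: _split() now strips HTML first and only then drops a trailing badge
count, instead of cutting at a hard-coded "<span class='badge'>" marker. A
self-contained sketch over a hypothetical tag anchor:

    import re

    html = "<a href='/tag/full-color/'>full color <span class='badge'>42</span></a>"
    for tag in re.findall(r">(.+?)</a>", html):
        badge = ("badge'>" in tag or "class='badge" in tag)
        tag = re.sub(r"<[^>]*>", "", tag)          # remove_html stand-in
        if badge:
            tag = tag.rpartition(" ")[0]           # drop the count
        print(tag)                                 # full color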
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index e344b2f..aa26408 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -56,9 +56,11 @@ class InstagramExtractor(Extractor):
data = self.metadata()
videos = self.config("videos", True)
+ if videos:
+ videos_dash = (videos != "merged")
+ videos_headers = {"User-Agent": "Mozilla/5.0"}
previews = self.config("previews", False)
max_posts = self.config("max-posts")
- video_headers = {"User-Agent": "Mozilla/5.0"}
order = self.config("order-files")
reverse = order[0] in ("r", "d") if order else False
@@ -92,8 +94,12 @@ class InstagramExtractor(Extractor):
url = file.get("video_url")
if url:
if videos:
- file["_http_headers"] = video_headers
+ file["_http_headers"] = videos_headers
text.nameext_from_url(url, file)
+ if videos_dash:
+ file["_fallback"] = (url,)
+ file["_ytdl_manifest"] = "dash"
+ url = "ytdl:dash"
yield Message.Url, url, file
if previews:
file["media_id"] += "p"
@@ -246,6 +252,7 @@ class InstagramExtractor(Extractor):
"video_url" : video["url"] if video else None,
"width" : media["width"],
"height" : media["height"],
+ "_ytdl_manifest_data": item.get("video_dash_manifest"),
}
if "expiring_at" in item:
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 788b5d9..860e771 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -57,11 +57,13 @@ class KemonopartyExtractor(Extractor):
find_hash = re.compile(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files"))
announcements = True if self.config("announcements") else None
+ archives = True if self.config("archives") else False
comments = True if self.config("comments") else False
duplicates = True if self.config("duplicates") else False
dms = True if self.config("dms") else None
max_posts = self.config("max-posts")
- creator_info = {} if self.config("metadata") else None
+ creator_info = {} if self.config("metadata", True) else None
+ exts_archive = {"zip", "rar", "7z"}
# prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"}
@@ -115,6 +117,7 @@ class KemonopartyExtractor(Extractor):
files = []
hashes = set()
+ post_archives = post["archives"] = []
for file in itertools.chain.from_iterable(
g(post) for g in generators):
@@ -129,31 +132,45 @@ class KemonopartyExtractor(Extractor):
continue
hashes.add(hash)
else:
- file["hash"] = ""
+ file["hash"] = hash = ""
+
+ if url[0] == "/":
+ url = self.root + "/data" + url
+ elif url.startswith(self.root):
+ url = self.root + "/data" + url[20:]
+ file["url"] = url
+
+ text.nameext_from_url(file.get("name", url), file)
+ ext = text.ext_from_url(url)
+ if not file["extension"]:
+ file["extension"] = ext
+ elif ext == "txt" and file["extension"] != "txt":
+ file["_http_validate"] = _validate
+ elif ext in exts_archive:
+ file["type"] = "archive"
+ if archives:
+ try:
+ data = self.api.posts_archives(file["hash"])
+ data.update(file)
+ post_archives.append(data)
+ except Exception as exc:
+ self.log.warning(
+ "%s: Failed to retrieve archive metadata of "
+ "'%s' (%s: %s)", post["id"], file.get("name"),
+ exc.__class__.__name__, exc)
+ post_archives.append(file.copy())
+ else:
+ post_archives.append(file.copy())
files.append(file)
post["count"] = len(files)
yield Message.Directory, post
-
for post["num"], file in enumerate(files, 1):
- post["_http_validate"] = None
- post["hash"] = file["hash"]
- post["type"] = file["type"]
- url = file["path"]
-
- text.nameext_from_url(file.get("name", url), post)
- ext = text.ext_from_url(url)
- if not post["extension"]:
- post["extension"] = ext
- elif ext == "txt" and post["extension"] != "txt":
- post["_http_validate"] = _validate
-
- if url[0] == "/":
- url = self.root + "/data" + url
- elif url.startswith(self.root):
- url = self.root + "/data" + url[20:]
- yield Message.Url, url, post
+ if "id" in file:
+ del file["id"]
+ post.update(file)
+ yield Message.Url, file["url"], post
def login(self):
username, password = self._get_auth_info()
@@ -368,17 +385,18 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
key = "id"
else:
key = "name"
+ else:
+ key = "id"
+ channel = channel_id
+ if not channel_name or not channel_id:
for ch in self.api.discord_server(server_id):
if ch[key] == channel:
break
else:
raise exception.NotFoundError("channel")
-
channel_id = ch["id"]
channel_name = ch["name"]
- elif channel_name is None:
- channel_name = ""
find_inline = re.compile(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
@@ -503,6 +521,10 @@ class KemonoAPI():
params = {"q": query, "o": offset, "tag": tags}
return self._pagination(endpoint, params, 50, "posts")
+ def posts_archives(self, file_hash):
+ endpoint = "/posts/archives/" + file_hash
+ return self._call(endpoint)["archive"]
+
def creator_posts(self, service, creator_id, offset=0, query=None):
endpoint = "/{}/user/{}".format(service, creator_id)
params = {"q": query, "o": offset}
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index 6f7a238..b11f81d 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -10,9 +10,13 @@
from .common import ChapterExtractor, Extractor, Message
from .. import text, util, exception
+from ..cache import memcache
import re
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangapark\.(?:net|com|org|io|me)"
+BASE_PATTERN = (r"(?:https?://)?(?:www\.)?(?:"
+ r"(?:manga|comic|read)park\.(?:com|net|org|me|io|to)|"
+ r"parkmanga\.(?:com|net|org)|"
+ r"mpark\.to)")
class MangaparkBase():
@@ -31,57 +35,87 @@ class MangaparkBase():
match = self._match_title(title)
return match.groups() if match else (0, 0, "", "")
+ @memcache(keyarg=1)
+ def _extract_manga(self, manga_id):
+ variables = {
+ "getComicNodeId": manga_id,
+ }
+ return self._request_graphql("Get_comicNode", variables)["data"]
+
+ def _extract_chapter(self, chapter_id):
+ variables = {
+ "getChapterNodeId": chapter_id,
+ }
+ return self._request_graphql("Get_chapterNode", variables)["data"]
+
+ def _extract_chapters_all(self, manga_id):
+ variables = {
+ "comicId": manga_id,
+ }
+ return self._request_graphql("Get_comicChapterList", variables)
+
+ def _extract_chapters_source(self, source_id):
+ variables = {
+ "sourceId": source_id,
+ }
+ return self._request_graphql(
+ "get_content_source_chapterList", variables)
+
+ def _request_graphql(self, opname, variables):
+ url = self.root + "/apo/"
+ data = {
+ "query" : QUERIES[opname],
+ "variables" : variables,
+ "operationName": opname,
+ }
+ return self.request(
+ url, method="POST", json=data).json()["data"].popitem()[1]
+
class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
"""Extractor for manga-chapters from mangapark.net"""
- pattern = BASE_PATTERN + r"/title/[^/?#]+/(\d+)"
+ pattern = (BASE_PATTERN +
+ r"/(?:title/[^/?#]+/|comic/\d+/[^/?#]+/[^/?#]+-i)(\d+)")
example = "https://mangapark.net/title/MANGA/12345-en-ch.01"
def __init__(self, match):
self.root = text.root_from_url(match.group(0))
- url = "{}/title/_/{}".format(self.root, match.group(1))
- ChapterExtractor.__init__(self, match, url)
-
- def metadata(self, page):
- data = self._extract_nextdata(page)
- chapter = (data["props"]["pageProps"]["dehydratedState"]
- ["queries"][0]["state"]["data"]["data"])
- manga = chapter["comicNode"]["data"]
- source = chapter["sourceNode"]["data"]
-
- self._urls = chapter["imageSet"]["httpLis"]
- self._params = chapter["imageSet"]["wordLis"]
+ ChapterExtractor.__init__(self, match, False)
+
+ def metadata(self, _):
+ chapter = self._extract_chapter(self.groups[0])
+ manga = self._extract_manga(chapter["comicNode"]["id"])
+
+ self._urls = chapter["imageFile"]["urlList"]
vol, ch, minor, title = self._parse_chapter_title(chapter["dname"])
+ lang = chapter.get("lang") or "en"
return {
"manga" : manga["name"],
- "manga_id" : manga["id"],
- "artist" : source["artists"],
- "author" : source["authors"],
- "genre" : source["genres"],
+ "manga_id" : text.parse_int(manga["id"]),
+ "artist" : manga["artists"],
+ "author" : manga["authors"],
+ "genre" : manga["genres"],
"volume" : text.parse_int(vol),
"chapter" : text.parse_int(ch),
"chapter_minor": minor,
- "chapter_id": chapter["id"],
- "title" : chapter["title"] or title or "",
- "lang" : chapter["lang"],
- "language" : util.code_to_language(chapter["lang"]),
- "source" : source["srcTitle"],
- "source_id" : source["id"],
+ "chapter_id": text.parse_int(chapter["id"]),
+ "title" : title or "",
+ "lang" : lang,
+ "language" : util.code_to_language(lang),
+ "source" : chapter["srcTitle"],
+ "source_id" : chapter["sourceId"],
"date" : text.parse_timestamp(chapter["dateCreate"] // 1000),
}
- def images(self, page):
- return [
- (url + "?" + params, None)
- for url, params in zip(self._urls, self._params)
- ]
+ def images(self, _):
+ return [(url, None) for url in self._urls]
class MangaparkMangaExtractor(MangaparkBase, Extractor):
"""Extractor for manga from mangapark.net"""
subcategory = "manga"
- pattern = BASE_PATTERN + r"/title/(\d+)(?:-[^/?#]*)?/?$"
+ pattern = BASE_PATTERN + r"/(?:title|comic)/(\d+)(?:[/-][^/?#]*)?/?$"
example = "https://mangapark.net/title/12345-MANGA"
def __init__(self, match):
@@ -95,6 +129,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
url = self.root + chapter["urlPath"]
vol, ch, minor, title = self._parse_chapter_title(chapter["dname"])
+ lang = chapter.get("lang") or "en"
+
data = {
"manga_id" : self.manga_id,
"volume" : text.parse_int(vol),
@@ -102,8 +138,8 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
"chapter_minor": minor,
"chapter_id": chapter["id"],
"title" : chapter["title"] or title or "",
- "lang" : chapter["lang"],
- "language" : util.code_to_language(chapter["lang"]),
+ "lang" : lang,
+ "language" : util.code_to_language(lang),
"source" : chapter["srcTitle"],
"source_id" : chapter["sourceId"],
"date" : text.parse_timestamp(
@@ -114,45 +150,12 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
def chapters(self):
source = self.config("source")
- if not source:
- return self.chapters_all()
-
- source_id = self._select_source(source)
- self.log.debug("Requesting chapters for source_id %s", source_id)
- return self.chapters_source(source_id)
-
- def chapters_all(self):
- pnum = 0
- variables = {
- "select": {
- "comicId": self.manga_id,
- "range" : None,
- "isAsc" : not self.config("chapter-reverse"),
- }
- }
-
- while True:
- data = self._request_graphql(
- "get_content_comicChapterRangeList", variables)
-
- for item in data["items"]:
- yield from item["chapterNodes"]
-
- if not pnum:
- pager = data["pager"]
- pnum += 1
-
- try:
- variables["select"]["range"] = pager[pnum]
- except IndexError:
- return
-
- def chapters_source(self, source_id):
- variables = {
- "sourceId": source_id,
- }
- chapters = self._request_graphql(
- "get_content_source_chapterList", variables)
+ if source:
+ source_id = self._select_source(source)
+ self.log.debug("Requesting chapters for source_id %s", source_id)
+ chapters = self._extract_chapters_source(source_id)
+ else:
+ chapters = self._extract_chapters_all(self.groups[0])
if self.config("chapter-reverse"):
chapters.reverse()
@@ -180,101 +183,58 @@ class MangaparkMangaExtractor(MangaparkBase, Extractor):
raise exception.StopExtraction(
"'%s' does not match any available source", source)
- def _request_graphql(self, opname, variables):
- url = self.root + "/apo/"
- data = {
- "query" : QUERIES[opname],
- "variables" : util.json_dumps(variables),
- "operationName": opname,
- }
- return self.request(
- url, method="POST", json=data).json()["data"][opname]
-
QUERIES = {
- "get_content_comicChapterRangeList": """
- query get_content_comicChapterRangeList($select: Content_ComicChapterRangeList_Select) {
- get_content_comicChapterRangeList(
- select: $select
- ) {
- reqRange{x y}
- missing
- pager {x y}
- items{
- serial
- chapterNodes {
-
- id
- data {
-
-
- id
- sourceId
-
- dbStatus
- isNormal
- isHidden
- isDeleted
- isFinal
-
- dateCreate
- datePublic
- dateModify
- lang
- volume
- serial
- dname
- title
- urlPath
-
- srcTitle srcColor
-
- count_images
-
- stat_count_post_child
- stat_count_post_reply
- stat_count_views_login
- stat_count_views_guest
-
- userId
- userNode {
-
- id
- data {
-
-id
-name
-uniq
-avatarUrl
-urlPath
-
-verified
-deleted
-banned
-
-dateCreate
-dateOnline
-
-stat_count_chapters_normal
-stat_count_chapters_others
-
-is_adm is_mod is_vip is_upr
-
- }
-
- }
-
- disqusId
-
-
- }
+ "Get_comicChapterList": """
+query Get_comicChapterList($comicId: ID!) {
+ get_comicChapterList(comicId: $comicId) {
+ data {
+ id
+ dname
+ title
+ lang
+ urlPath
+ srcTitle
+ sourceId
+ dateCreate
+ }
+ }
+}
+""",
- sser_read
+ "Get_chapterNode": """
+query Get_chapterNode($getChapterNodeId: ID!) {
+ get_chapterNode(id: $getChapterNodeId) {
+ data {
+ id
+ dname
+ lang
+ sourceId
+ srcTitle
+ dateCreate
+ comicNode{
+ id
+ }
+ imageFile {
+ urlList
+ }
}
- }
+ }
+}
+""",
+ "Get_comicNode": """
+query Get_comicNode($getComicNodeId: ID!) {
+ get_comicNode(id: $getComicNodeId) {
+ data {
+ id
+ name
+ artists
+ authors
+ genres
+ }
}
- }
+}
""",
"get_content_source_chapterList": """
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 5b354ac..5e78ad4 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -196,7 +196,8 @@ class MastodonFollowingExtractor(MastodonExtractor):
class MastodonStatusExtractor(MastodonExtractor):
"""Extractor for images from a status"""
subcategory = "status"
- pattern = BASE_PATTERN + r"/@[^/?#]+/(?!following)([^/?#]+)"
+ pattern = (BASE_PATTERN + r"/(?:@[^/?#]+|(?:users/[^/?#]+/)?statuses)"
+ r"/(?!following)([^/?#]+)")
example = "https://mastodon.social/@USER/12345"
def statuses(self):
diff --git a/gallery_dl/extractor/nozomi.py b/gallery_dl/extractor/nozomi.py
index 851f663..3d1722a 100644
--- a/gallery_dl/extractor/nozomi.py
+++ b/gallery_dl/extractor/nozomi.py
@@ -21,6 +21,7 @@ class NozomiExtractor(Extractor):
"""Base class for nozomi extractors"""
category = "nozomi"
root = "https://nozomi.la"
+ domain = "gold-usergeneratedcontent.net"
filename_fmt = "{postid} {dataid}.{extension}"
archive_fmt = "{dataid}"
@@ -31,8 +32,8 @@ class NozomiExtractor(Extractor):
data = self.metadata()
for post_id in map(str, self.posts()):
- url = "https://j.nozomi.la/post/{}/{}/{}.json".format(
- post_id[-1], post_id[-3:-1], post_id)
+ url = "https://j.{}/post/{}/{}/{}.json".format(
+ self.domain, post_id[-1], post_id[-3:-1], post_id)
response = self.request(url, fatal=False)
if response.status_code >= 400:
@@ -76,8 +77,8 @@ class NozomiExtractor(Extractor):
ext = "webp"
post["extension"] = ext
- post["url"] = url = "https://{}.nozomi.la/{}/{}/{}.{}".format(
- subdomain, did[-1], did[-3:-1], did, ext)
+ post["url"] = url = "https://{}.{}/{}/{}/{}.{}".format(
+ subdomain, self.domain, did[-1], did[-3:-1], did, ext)
yield Message.Url, url, post
def posts(self):
@@ -168,7 +169,7 @@ class NozomiSearchExtractor(NozomiExtractor):
negative = []
def nozomi(path):
- url = "https://j.nozomi.la/" + path + ".nozomi"
+ url = "https://j.{}/{}.nozomi".format(self.domain, path)
return decode_nozomi(self.request(url).content)
for tag in self.tags:
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index f5a33d5..b8c6acb 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -27,7 +27,7 @@ class PatreonExtractor(Extractor):
_warning = True
def _init(self):
- if not self.cookies_check(("session_id",)):
+ if not self.cookies_check(("session_id",), subdomains=True):
if self._warning:
PatreonExtractor._warning = False
self.log.warning("no 'session_id' cookie set")
@@ -329,10 +329,11 @@ class PatreonCreatorExtractor(PatreonExtractor):
"""Extractor for a creator's works"""
subcategory = "creator"
pattern = (r"(?:https?://)?(?:www\.)?patreon\.com"
- r"/(?!(?:home|join|posts|login|signup)(?:$|[/?#]))"
+ r"/(?!(?:home|create|login|signup|search|posts|messages)"
+ r"(?:$|[/?#]))"
r"(?:profile/creators|(?:c/)?([^/?#]+)(?:/posts)?)"
r"/?(?:\?([^#]+))?")
- example = "https://www.patreon.com/USER"
+ example = "https://www.patreon.com/c/USER"
def posts(self):
creator, query = self.groups
@@ -370,7 +371,7 @@ class PatreonCreatorExtractor(PatreonExtractor):
data = None
data = self._extract_bootstrap(page)
return data["campaign"]["data"]["id"]
- except (KeyError, ValueError) as exc:
+ except Exception as exc:
if data:
self.log.debug(data)
raise exception.StopExtraction(
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 121c7bf..1a299c1 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -132,6 +132,9 @@ class PinterestExtractor(Extractor):
"extension": "txt",
"media_id": block.get("id")}
+ elif type == "story_pin_static_sticker_block":
+ continue
+
else:
self.log.warning("%s: Unsupported story block '%s'",
pin.get("id"), type)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 7708b5c..9e7d75d 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -10,6 +10,9 @@
from .common import Extractor, Message
from .. import text
+from datetime import datetime
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com"
class SexcomExtractor(Extractor):
@@ -23,8 +26,20 @@ class SexcomExtractor(Extractor):
def items(self):
yield Message.Directory, self.metadata()
for pin in map(self._parse_pin, self.pins()):
- if pin:
- yield Message.Url, pin["url"], pin
+ if not pin:
+ continue
+
+ url = pin["url"]
+ parts = url.rsplit("/", 4)
+ try:
+ pin["date_url"] = dt = datetime(
+ int(parts[1]), int(parts[2]), int(parts[3]))
+ if "date" not in pin:
+ pin["date"] = dt
+ except Exception:
+ pass
+
+ yield Message.Url, url, pin
def metadata(self):
return {}
@@ -53,10 +68,18 @@ class SexcomExtractor(Extractor):
self.log.warning('Unable to fetch %s ("%s %s")',
url, response.status_code, response.reason)
return None
+
+ if "/pin/" in response.url:
+ return self._parse_pin_legacy(response)
+ if "/videos/" in response.url:
+ return self._parse_pin_video(response)
+ return self._parse_pin_gifs(response)
+
+ def _parse_pin_legacy(self, response):
extr = text.extract_from(response.text)
data = {}
- data["_http_headers"] = {"Referer": url}
+ data["_http_headers"] = {"Referer": response.url}
data["thumbnail"] = extr('itemprop="thumbnail" content="', '"')
data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower()
data["title"] = text.unescape(extr('itemprop="name">' , '<'))
@@ -82,7 +105,8 @@ class SexcomExtractor(Extractor):
src = (text.extr(iframe, ' src="', '"') or
text.extr(iframe, " src='", "'"))
if not src:
- self.log.warning("Unable to fetch media from %s", url)
+ self.log.warning(
+ "Unable to fetch media from %s", response.url)
return None
data["extension"] = None
data["url"] = "ytdl:" + src
@@ -100,27 +124,60 @@ class SexcomExtractor(Extractor):
return data
+ def _parse_pin_gifs(self, response):
+ extr = text.extract_from(response.text)
+
+ data = {
+ "_http_headers": {"Referer": response.url},
+ "type": "gif",
+ "url": extr(' href="', '"'),
+ "title": text.unescape(extr("<title>", " Gif | Sex.com<")),
+ "pin_id": text.parse_int(extr(
+ 'rel="canonical" href="', '"').rpartition("/")[2]),
+ "tags": text.split_html(extr("</h1>", "</section>")),
+ }
+
+ return text.nameext_from_url(data["url"], data)
+
+ def _parse_pin_video(self, response):
+ extr = text.extract_from(response.text)
+
+ if not self.cookies.get("CloudFront-Key-Pair-Id", domain=".sex.com"):
+ self.log.warning("CloudFront cookies required for video downloads")
+
+ data = {
+ "_ytdl_manifest": "hls",
+ "extension": "mp4",
+ "type": "video",
+ "title": text.unescape(extr("<title>", " | Sex.com<")),
+ "pin_id": text.parse_int(extr(
+ 'rel="canonical" href="', '"').rpartition("/")[2]),
+ "tags": text.split_html(extr(
+ 'event_name="video_tags_click"', "<div data-testid=")
+ .partition(">")[2]),
+ "url": "ytdl:" + extr('<source src="', '"'),
+ }
+
+ return data
+
class SexcomPinExtractor(SexcomExtractor):
"""Extractor for a pinned image or video on www.sex.com"""
subcategory = "pin"
directory_fmt = ("{category}",)
- pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)(?!.*#related$)"
+ pattern = (BASE_PATTERN +
+ r"(/(?:pin|\w\w/(?:gif|video)s)/\d+/?)(?!.*#related$)")
example = "https://www.sex.com/pin/12345-TITLE/"
- def __init__(self, match):
- SexcomExtractor.__init__(self, match)
- self.pin_id = match.group(1)
-
def pins(self):
- return ("{}/pin/{}/".format(self.root, self.pin_id),)
+ return (self.root + self.groups[0],)
class SexcomRelatedPinExtractor(SexcomPinExtractor):
"""Extractor for related pins on www.sex.com"""
subcategory = "related-pin"
directory_fmt = ("{category}", "related {original_pin[pin_id]}")
- pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$"
+ pattern = BASE_PATTERN + r"(/pin/(\d+)/?).*#related$"
example = "https://www.sex.com/pin/12345#related"
def metadata(self):
@@ -129,7 +186,7 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor):
def pins(self):
url = "{}/pin/related?pinId={}&limit=24&offset=0".format(
- self.root, self.pin_id)
+ self.root, self.groups[1])
return self._pagination(url)
@@ -137,18 +194,14 @@ class SexcomPinsExtractor(SexcomExtractor):
"""Extractor for a user's pins on www.sex.com"""
subcategory = "pins"
directory_fmt = ("{category}", "{user}")
- pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/pins/"
+ pattern = BASE_PATTERN + r"/user/([^/?#]+)/pins/"
example = "https://www.sex.com/user/USER/pins/"
- def __init__(self, match):
- SexcomExtractor.__init__(self, match)
- self.user = match.group(1)
-
def metadata(self):
- return {"user": text.unquote(self.user)}
+ return {"user": text.unquote(self.groups[0])}
def pins(self):
- url = "{}/user/{}/pins/".format(self.root, self.user)
+ url = "{}/user/{}/pins/".format(self.root, self.groups[0])
return self._pagination(url)
@@ -156,18 +209,14 @@ class SexcomLikesExtractor(SexcomExtractor):
"""Extractor for a user's liked pins on www.sex.com"""
subcategory = "likes"
directory_fmt = ("{category}", "{user}", "Likes")
- pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/"
+ pattern = BASE_PATTERN + r"/user/([^/?#]+)/likes/"
example = "https://www.sex.com/user/USER/likes/"
- def __init__(self, match):
- SexcomExtractor.__init__(self, match)
- self.user = match.group(1)
-
def metadata(self):
- return {"user": text.unquote(self.user)}
+ return {"user": text.unquote(self.groups[0])}
def pins(self):
- url = "{}/user/{}/likes/".format(self.root, self.user)
+ url = "{}/user/{}/likes/".format(self.root, self.groups[0])
return self._pagination(url)
@@ -175,15 +224,12 @@ class SexcomBoardExtractor(SexcomExtractor):
"""Extractor for pins from a board on www.sex.com"""
subcategory = "board"
directory_fmt = ("{category}", "{user}", "{board}")
- pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user"
+ pattern = (BASE_PATTERN + r"/user"
r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)")
example = "https://www.sex.com/user/USER/BOARD/"
- def __init__(self, match):
- SexcomExtractor.__init__(self, match)
- self.user, self.board = match.groups()
-
def metadata(self):
+ self.user, self.board = self.groups
return {
"user" : text.unquote(self.user),
"board": text.unquote(self.board),
@@ -198,19 +244,18 @@ class SexcomSearchExtractor(SexcomExtractor):
"""Extractor for search results on www.sex.com"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search[query]}")
- pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:"
+ pattern = (BASE_PATTERN + r"/((?:"
r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s"
r")/?(?:\?([^#]+))?)")
example = "https://www.sex.com/search/pics?query=QUERY"
- def __init__(self, match):
- SexcomExtractor.__init__(self, match)
- self.path = match.group(1)
+ def _init(self):
+ self.path, t1, query_alt, t2, query = self.groups
- self.search = text.parse_query(match.group(5))
- self.search["type"] = match.group(2) or match.group(4)
+ self.search = text.parse_query(query)
+ self.search["type"] = t1 or t2
if "query" not in self.search:
- self.search["query"] = match.group(3) or ""
+ self.search["query"] = query_alt or ""
def metadata(self):
return {"search": self.search}
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index 07c9b21..cdccd4c 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -48,7 +48,12 @@ class SkebExtractor(Extractor):
def items(self):
metadata = self.metadata()
for user_name, post_num in self.posts():
- response, post = self._get_post_data(user_name, post_num)
+ try:
+ response, post = self._get_post_data(user_name, post_num)
+ except Exception as exc:
+ self.log.error("@%s/%s: %s: %s", user_name, post_num,
+ exc.__class__.__name__, exc)
+ continue
if metadata:
post.update(metadata)
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index 6c43941..5d0ec46 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -39,6 +39,8 @@ class SubscribestarExtractor(Extractor):
for post_html in self.posts():
media = self._media_from_post(post_html)
data = self._data_from_post(post_html)
+ data["title"] = text.unescape(text.extr(
+ data["content"], "<h1>", "</h1>"))
yield Message.Directory, data
for num, item in enumerate(media, 1):
item.update(data)
@@ -55,7 +57,9 @@ class SubscribestarExtractor(Extractor):
while True:
response = Extractor.request(self, url, **kwargs)
- if response.history and "/verify_subscriber" in response.url:
+ if response.history and (
+ "/verify_subscriber" in response.url or
+ "/age_confirmation_warning" in response.url):
raise exception.StopExtraction(
"HTTP redirect to %s", response.url)
diff --git a/gallery_dl/extractor/tiktok.py b/gallery_dl/extractor/tiktok.py
index 30f310d..4c1da7a 100644
--- a/gallery_dl/extractor/tiktok.py
+++ b/gallery_dl/extractor/tiktok.py
@@ -25,14 +25,8 @@ class TiktokExtractor(Extractor):
def _init(self):
self.audio = self.config("audio", True)
self.video = self.config("videos", True)
- if not self.config("avatar", True):
- self.avatar = util.false
def items(self):
- # We assume that all of the URLs served by urls() come from the same
- # author.
- downloaded_avatar = not self.avatar()
-
for tiktok_url in self.urls():
tiktok_url = self._sanitize_url(tiktok_url)
data = self._extract_rehydration_data(tiktok_url)
@@ -49,18 +43,10 @@ class TiktokExtractor(Extractor):
post = video_detail["itemInfo"]["itemStruct"]
author = post["author"]
- post["user"] = user = author["uniqueId"]
+ post["user"] = author["uniqueId"]
post["date"] = text.parse_timestamp(post["createTime"])
original_title = title = post["desc"]
- if not downloaded_avatar:
- avatar_url = author["avatarLarger"]
- avatar = self._generate_avatar(
- avatar_url, post, user, author["id"])
- yield Message.Directory, avatar
- yield Message.Url, avatar_url, avatar
- downloaded_avatar = True
-
yield Message.Directory, post
ytdl_media = False
@@ -111,44 +97,29 @@ class TiktokExtractor(Extractor):
})
yield Message.Url, "ytdl:" + tiktok_url, post
- # If we couldn't download the avatar because the given user has no
- # posts, we'll need to make a separate request for the user's page
- # and download the avatar that way.
- if not downloaded_avatar:
- user_name = self.avatar()
- profile_url = "https://www.tiktok.com/@{}".format(user_name)
- data = self._extract_rehydration_data(profile_url)
- data = data["webapp.user-detail"]["userInfo"]["user"]
- data["user"] = user_name
- avatar_url = data["avatarLarger"]
- avatar = self._generate_avatar(
- avatar_url, data, user_name, data["id"])
- yield Message.Directory, avatar
- yield Message.Url, avatar_url, avatar
-
- def avatar(self):
- return False
-
- def _generate_avatar(self, avatar_url, data, user_name, user_id):
- avatar = text.nameext_from_url(avatar_url, data.copy())
- avatar.update({
- "type" : "avatar",
- "title" : "@" + user_name,
- "id" : user_id,
- "img_id": avatar["filename"].partition("~")[0],
- "num" : 0,
- })
- return avatar
-
def _sanitize_url(self, url):
return text.ensure_http_scheme(url.replace("/photo/", "/video/", 1))
def _extract_rehydration_data(self, url):
- html = self.request(url).text
- data = text.extr(
- html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
- 'type="application/json">', '</script>')
- return util.json_loads(data)["__DEFAULT_SCOPE__"]
+ tries = 0
+ while True:
+ try:
+ html = self.request(url).text
+ data = text.extr(
+ html, '<script id="__UNIVERSAL_DATA_FOR_REHYDRATION__" '
+ 'type="application/json">', '</script>')
+ return util.json_loads(data)["__DEFAULT_SCOPE__"]
+ except ValueError:
+ # We failed to retrieve rehydration data. This happens
+ # relatively frequently when making many requests, so
+ # retry.
+ if tries >= self._retries:
+ raise
+ tries += 1
+ self.log.warning("%s: Failed to retrieve rehydration data "
+ "(%s/%s)", url.rpartition("/")[2], tries,
+ self._retries)
+ self.sleep(self._timeout, "retry")
def _extract_audio(self, post):
audio = post["music"]
@@ -179,7 +150,7 @@ class TiktokExtractor(Extractor):
elif status == 10204:
self.log.error("%s: Requested post not available", url)
elif status == 10231:
- self.log.error("%s: Region locked - Try downloading with a"
+ self.log.error("%s: Region locked - Try downloading with a "
"VPN/proxy connection", url)
else:
self.log.error(
@@ -230,7 +201,10 @@ class TiktokUserExtractor(TiktokExtractor):
pattern = BASE_PATTERN + r"/@([\w_.-]+)/?(?:$|\?|#)"
example = "https://www.tiktok.com/@USER"
- def urls(self):
+ def _init(self):
+ self.avatar = self.config("avatar", True)
+
+ def items(self):
"""Attempt to use yt-dlp/youtube-dl to extract links from a
user's page"""
@@ -263,19 +237,39 @@ class TiktokUserExtractor(TiktokExtractor):
ytdl_instance = ytdl.construct_YoutubeDL(
module, self, user_opts, extr_opts)
- # transfer cookies to ytdl
+ # Transfer cookies to ytdl.
if self.cookies:
set_cookie = ytdl_instance.cookiejar.set_cookie
for cookie in self.cookies:
set_cookie(cookie)
+ user_name = self.groups[0]
+ profile_url = "{}/@{}".format(self.root, user_name)
+ if self.avatar:
+ avatar_url, avatar = self._generate_avatar(user_name, profile_url)
+ yield Message.Directory, avatar
+ yield Message.Url, avatar_url, avatar
+
with ytdl_instance as ydl:
info_dict = ydl._YoutubeDL__extract_info(
- "{}/@{}".format(self.root, self.groups[0]),
- ydl.get_info_extractor("TikTokUser"),
+ profile_url, ydl.get_info_extractor("TikTokUser"),
False, {}, True)
# This should include video and photo posts in /video/ URL form.
- return [video["url"] for video in info_dict["entries"]]
-
- def avatar(self):
- return self.groups[0]
+ for video in info_dict["entries"]:
+ data = {"_extractor": TiktokPostExtractor}
+ yield Message.Queue, video["url"].partition("?")[0], data
+
+ def _generate_avatar(self, user_name, profile_url):
+ data = self._extract_rehydration_data(profile_url)
+ data = data["webapp.user-detail"]["userInfo"]["user"]
+ data["user"] = user_name
+ avatar_url = data["avatarLarger"]
+ avatar = text.nameext_from_url(avatar_url, data.copy())
+ avatar.update({
+ "type" : "avatar",
+ "title" : "@" + user_name,
+ "id" : data["id"],
+ "img_id": avatar["filename"].partition("~")[0],
+ "num" : 0,
+ })
+ return (avatar_url, avatar)
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index bc135ad..ac1400e 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -64,16 +64,22 @@ class ZerochanExtractor(BooruExtractor):
def _parse_entry_html(self, entry_id):
url = "{}/{}".format(self.root, entry_id)
- extr = text.extract_from(self.request(url).text)
+ page = self.request(url).text
+ try:
+ jsonld = self._extract_jsonld(page)
+ except Exception:
+ return {"id": entry_id}
+
+ extr = text.extract_from(page)
data = {
"id" : text.parse_int(entry_id),
- "author" : text.parse_unicode_escapes(extr(' "name": "', '"')),
- "file_url": extr('"contentUrl": "', '"'),
- "date" : text.parse_datetime(extr('"datePublished": "', '"')),
- "width" : text.parse_int(extr('"width": "', ' ')),
- "height" : text.parse_int(extr('"height": "', ' ')),
- "size" : text.parse_bytes(extr('"contentSize": "', 'B')),
+ "author" : jsonld["author"]["name"],
+ "file_url": jsonld["contentUrl"],
+ "date" : text.parse_datetime(jsonld["datePublished"]),
+ "width" : text.parse_int(jsonld["width"][:-3]),
+ "height" : text.parse_int(jsonld["height"][:-3]),
+ "size" : text.parse_bytes(jsonld["contentSize"][:-1]),
"path" : text.split_html(extr(
'class="breadcrumbs', '</nav>'))[2:],
"uploader": extr('href="/user/', '"'),
@@ -86,7 +92,7 @@ class ZerochanExtractor(BooruExtractor):
tags = data["tags"] = []
for tag in html.split("<li class=")[1:]:
category = text.extr(tag, '"', '"')
- name = text.extr(tag, 'data-tag="', '"')
+ name = text.unescape(text.extr(tag, 'data-tag="', '"'))
tags.append(category.partition(" ")[0].capitalize() + ":" + name)
return data
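
Note: _parse_entry_html() now reads the schema.org JSON-LD block in one go
instead of extracting each field separately. A self-contained sketch; the
sample values are modeled on zerochan's markup, where width/height carry a
" px" suffix (hence [:-3]) and contentSize ends in "B":

    import json, re

    PAGE = '''<script type="application/ld+json">{
      "author": {"name": "ARTIST"},
      "contentUrl": "https://static.zerochan.net/full.12345.jpg",
      "width": "1200 px", "height": "1600 px", "contentSize": "512KB"
    }</script>'''

    jsonld = json.loads(re.search(
        r'<script type="application/ld\+json">(.+?)</script>',
        PAGE, re.S).group(1))
    print(jsonld["author"]["name"])      # ARTIST
    print(int(jsonld["width"][:-3]))     # 1200
    print(jsonld["contentSize"][:-1])    # 512K (fed to text.parse_bytes)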