Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/8chan.py         |   7
-rw-r--r--  gallery_dl/extractor/__init__.py      |   8
-rw-r--r--  gallery_dl/extractor/bilibili.py      | 116
-rw-r--r--  gallery_dl/extractor/blogger.py       |   2
-rw-r--r--  gallery_dl/extractor/bluesky.py       |  49
-rw-r--r--  gallery_dl/extractor/boosty.py        |  13
-rw-r--r--  gallery_dl/extractor/bunkr.py         |  11
-rw-r--r--  gallery_dl/extractor/civitai.py       |  22
-rw-r--r--  gallery_dl/extractor/common.py        |  31
-rw-r--r--  gallery_dl/extractor/danbooru.py      |  65
-rw-r--r--  gallery_dl/extractor/deviantart.py    |   7
-rw-r--r--  gallery_dl/extractor/e621.py          |   3
-rw-r--r--  gallery_dl/extractor/everia.py        |  99
-rw-r--r--  gallery_dl/extractor/exhentai.py      |   8
-rw-r--r--  gallery_dl/extractor/facebook.py      | 447
-rw-r--r--  gallery_dl/extractor/flickr.py        |  14
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py  |   6
-rw-r--r--  gallery_dl/extractor/hentaifoundry.py |  16
-rw-r--r--  gallery_dl/extractor/hiperdex.py      |  14
-rw-r--r--  gallery_dl/extractor/hitomi.py        | 109
-rw-r--r--  gallery_dl/extractor/imagechest.py    |  66
-rw-r--r--  gallery_dl/extractor/instagram.py     |  12
-rw-r--r--  gallery_dl/extractor/kemonoparty.py   | 352
-rw-r--r--  gallery_dl/extractor/koharu.py        |  49
-rw-r--r--  gallery_dl/extractor/lolisafe.py      |   7
-rw-r--r--  gallery_dl/extractor/mangadex.py      |  35
-rw-r--r--  gallery_dl/extractor/mastodon.py      |   1
-rw-r--r--  gallery_dl/extractor/motherless.py    | 167
-rw-r--r--  gallery_dl/extractor/newgrounds.py    |  19
-rw-r--r--  gallery_dl/extractor/nhentai.py       |   2
-rw-r--r--  gallery_dl/extractor/noop.py          |  27
-rw-r--r--  gallery_dl/extractor/patreon.py       |  13
-rw-r--r--  gallery_dl/extractor/philomena.py     |   2
-rw-r--r--  gallery_dl/extractor/piczel.py        |   8
-rw-r--r--  gallery_dl/extractor/pillowfort.py    |   1
-rw-r--r--  gallery_dl/extractor/pinterest.py     |  40
-rw-r--r--  gallery_dl/extractor/pixiv.py         | 123
-rw-r--r--  gallery_dl/extractor/poipiku.py       |   2
-rw-r--r--  gallery_dl/extractor/reddit.py        |  14
-rw-r--r--  gallery_dl/extractor/rule34vault.py   | 119
-rw-r--r--  gallery_dl/extractor/rule34xyz.py     | 143
-rw-r--r--  gallery_dl/extractor/saint.py         | 101
-rw-r--r--  gallery_dl/extractor/sankaku.py       |  11
-rw-r--r--  gallery_dl/extractor/scrolller.py     |   7
-rw-r--r--  gallery_dl/extractor/smugmug.py       |   1
-rw-r--r--  gallery_dl/extractor/steamgriddb.py   |  17
-rw-r--r--  gallery_dl/extractor/tumblr.py        |  77
-rw-r--r--  gallery_dl/extractor/tumblrgallery.py |   1
-rw-r--r--  gallery_dl/extractor/twitter.py       |   9
-rw-r--r--  gallery_dl/extractor/urlgalleries.py  |   4
-rw-r--r--  gallery_dl/extractor/webtoons.py      |   7
-rw-r--r--  gallery_dl/extractor/weibo.py         |  11
52 files changed, 2057 insertions, 438 deletions
diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py
index ce1c52a..3e30ddc 100644
--- a/gallery_dl/extractor/8chan.py
+++ b/gallery_dl/extractor/8chan.py
@@ -26,10 +26,6 @@ class _8chanExtractor(Extractor):
self.root = "https://8chan." + match.group(1)
Extractor.__init__(self, match)
- def _init(self):
- tos = self.cookies_tos_name()
- self.cookies.set(tos, "1", domain=self.root[8:])
-
@memcache()
def cookies_tos_name(self):
url = self.root + "/.static/pages/confirmed.html"
@@ -79,6 +75,7 @@ class _8chanThreadExtractor(_8chanExtractor):
def items(self):
_, board, thread = self.groups
+ self.cookies.set(self.cookies_tos_name(), "1", domain=self.root[8:])
# fetch thread data
url = "{}/{}/res/{}.".format(self.root, board, thread)
@@ -116,6 +113,8 @@ class _8chanBoardExtractor(_8chanExtractor):
def items(self):
_, board, pnum = self.groups
+ self.cookies.set(self.cookies_tos_name(), "1", domain=self.root[8:])
+
pnum = text.parse_int(pnum, 1)
url = "{}/{}/{}.json".format(self.root, board, pnum)
data = self.request(url).json()
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 4e9fa50..594ce41 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -30,6 +30,7 @@ modules = [
"batoto",
"bbc",
"behance",
+ "bilibili",
"blogger",
"bluesky",
"boosty",
@@ -47,7 +48,9 @@ modules = [
"dynastyscans",
"e621",
"erome",
+ "everia",
"exhentai",
+ "facebook",
"fanbox",
"fanleaks",
"fantia",
@@ -107,6 +110,7 @@ modules = [
"mangasee",
"mangoxo",
"misskey",
+ "motherless",
"myhentaigallery",
"myportfolio",
"naver",
@@ -139,6 +143,9 @@ modules = [
"reddit",
"redgifs",
"rule34us",
+ "rule34vault",
+ "rule34xyz",
+ "saint",
"sankaku",
"sankakucomplex",
"scrolller",
@@ -200,6 +207,7 @@ modules = [
"directlink",
"recursive",
"oauth",
+ "noop",
"ytdl",
"generic",
]
diff --git a/gallery_dl/extractor/bilibili.py b/gallery_dl/extractor/bilibili.py
new file mode 100644
index 0000000..d5c419e
--- /dev/null
+++ b/gallery_dl/extractor/bilibili.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.bilibili.com/"""
+
+from .common import Extractor, Message
+from .. import text, util, exception
+
+
+class BilibiliExtractor(Extractor):
+ """Base class for bilibili extractors"""
+ category = "bilibili"
+ root = "https://www.bilibili.com"
+ request_interval = (3.0, 6.0)
+
+ def _init(self):
+ self.api = BilibiliAPI(self)
+
+
+class BilibiliUserArticlesExtractor(BilibiliExtractor):
+ """Extractor for a bilibili user's articles"""
+ subcategory = "user-articles"
+ pattern = r"(?:https?://)?space\.bilibili\.com/(\d+)/article"
+ example = "https://space.bilibili.com/12345/article"
+
+ def items(self):
+ for article in self.api.user_articles(self.groups[0]):
+ article["_extractor"] = BilibiliArticleExtractor
+ url = "{}/opus/{}".format(self.root, article["opus_id"])
+ yield Message.Queue, url, article
+
+
+class BilibiliArticleExtractor(BilibiliExtractor):
+ """Extractor for a bilibili article"""
+ subcategory = "article"
+ pattern = (r"(?:https?://)?"
+ r"(?:t\.bilibili\.com|(?:www\.)?bilibili.com/opus)/(\d+)")
+ example = "https://www.bilibili.com/opus/12345"
+ directory_fmt = ("{category}", "{username}")
+ filename_fmt = "{id}_{num}.{extension}"
+ archive_fmt = "{id}_{num}"
+
+ def items(self):
+ article = self.api.article(self.groups[0])
+
+ # Flatten modules list
+ modules = {}
+ for module in article["detail"]["modules"]:
+ del module['module_type']
+ modules.update(module)
+ article["detail"]["modules"] = modules
+
+ article["username"] = modules["module_author"]["name"]
+
+ pics = []
+ for paragraph in modules['module_content']['paragraphs']:
+ if "pic" not in paragraph:
+ continue
+
+ try:
+ pics.extend(paragraph["pic"]["pics"])
+ except Exception:
+ pass
+
+ article["count"] = len(pics)
+ yield Message.Directory, article
+ for article["num"], pic in enumerate(pics, 1):
+ url = pic["url"]
+ article.update(pic)
+ yield Message.Url, url, text.nameext_from_url(url, article)
+
+
+class BilibiliAPI():
+ def __init__(self, extractor):
+ self.extractor = extractor
+
+ def _call(self, endpoint, params):
+ url = "https://api.bilibili.com/x/polymer/web-dynamic/v1" + endpoint
+ data = self.extractor.request(url, params=params).json()
+
+ if data["code"] != 0:
+ self.extractor.log.debug("Server response: %s", data)
+ raise exception.StopExtraction("API request failed")
+
+ return data
+
+ def user_articles(self, user_id):
+ endpoint = "/opus/feed/space"
+ params = {"host_mid": user_id}
+
+ while True:
+ data = self._call(endpoint, params)
+
+ for item in data["data"]["items"]:
+ params["offset"] = item["opus_id"]
+ yield item
+
+ if not data["data"]["has_more"]:
+ break
+
+ def article(self, article_id):
+ url = "https://www.bilibili.com/opus/" + article_id
+
+ while True:
+ page = self.extractor.request(url).text
+ try:
+ return util.json_loads(text.extr(
+ page, "window.__INITIAL_STATE__=", "};") + "}")
+ except Exception:
+ if "window._riskdata_" not in page:
+ raise exception.StopExtraction(
+ "%s: Unable to extract INITIAL_STATE data", article_id)
+ self.extractor.wait(seconds=300)
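
Note: the module-flattening step in BilibiliArticleExtractor.items() collapses the API's list of typed module objects into a single dict keyed by the remaining field names; a minimal sketch with made-up input values (the "module_type" strings are illustrative):

    modules_list = [
        {"module_type": "MODULE_TYPE_AUTHOR", "module_author": {"name": "USER"}},
        {"module_type": "MODULE_TYPE_CONTENT", "module_content": {"paragraphs": []}},
    ]

    flattened = {}
    for module in modules_list:
        del module["module_type"]   # drop the type discriminator
        flattened.update(module)    # keep "module_author", "module_content", ...

    print(flattened["module_author"]["name"])  # USER
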
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 37075ea..ef117da 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -159,7 +159,7 @@ class BloggerAPI():
def __init__(self, extractor):
self.extractor = extractor
- self.api_key = extractor.config("api-key", self.API_KEY)
+ self.api_key = extractor.config("api-key") or self.API_KEY
def blog_by_url(self, url):
return self._call("blogs/byurl", {"url": url}, "blog")
diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py
index a1a488e..bbff17c 100644
--- a/gallery_dl/extractor/bluesky.py
+++ b/gallery_dl/extractor/bluesky.py
@@ -12,7 +12,8 @@ from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache, memcache
-BASE_PATTERN = r"(?:https?://)?bsky\.app"
+BASE_PATTERN = (r"(?:https?://)?"
+ r"(?:(?:www\.)?(?:c|[fv]x)?bs[ky]y[ex]?\.app|main\.bsky\.dev)")
USER_PATTERN = BASE_PATTERN + r"/profile/([^/?#]+)"
@@ -60,8 +61,10 @@ class BlueskyExtractor(Extractor):
yield Message.Directory, post
if files:
- base = ("https://bsky.social/xrpc/com.atproto.sync.getBlob"
- "?did={}&cid=".format(post["author"]["did"]))
+ did = post["author"]["did"]
+ base = (
+ "{}/xrpc/com.atproto.sync.getBlob?did={}&cid=".format(
+ self.api.service_endpoint(did), did))
for post["num"], file in enumerate(files, 1):
post.update(file)
yield Message.Url, base + file["filename"], post
@@ -84,7 +87,14 @@ class BlueskyExtractor(Extractor):
def _pid(self, post):
return post["uri"].rpartition("/")[2]
+ @memcache(keyarg=1)
+ def _instance(self, handle):
+ return ".".join(handle.rsplit(".", 2)[-2:])
+
def _prepare(self, post):
+ author = post["author"]
+ author["instance"] = self._instance(author["handle"])
+
if self._metadata_facets:
if "facets" in post:
post["hashtags"] = tags = []
@@ -102,7 +112,7 @@ class BlueskyExtractor(Extractor):
post["hashtags"] = post["mentions"] = post["uris"] = ()
if self._metadata_user:
- post["user"] = self._user or post["author"]
+ post["user"] = self._user or author
post["instance"] = self.instance
post["post_id"] = self._pid(post)
@@ -317,6 +327,15 @@ class BlueskySearchExtractor(BlueskyExtractor):
return self.api.search_posts(self.user)
+class BlueskyHashtagExtractor(BlueskyExtractor):
+ subcategory = "hashtag"
+ pattern = BASE_PATTERN + r"/hashtag/([^/?#]+)(?:/(top|latest))?"
+ example = "https://bsky.app/hashtag/NAME"
+
+ def posts(self):
+ return self.api.search_posts("#"+self.user, self.groups[1])
+
+
class BlueskyAPI():
"""Interface for the Bluesky API
@@ -412,11 +431,28 @@ class BlueskyAPI():
params = {"handle": handle}
return self._call(endpoint, params)["did"]
- def search_posts(self, query):
+ @memcache(keyarg=1)
+ def service_endpoint(self, did):
+ if did.startswith('did:web:'):
+ url = "https://" + did[8:] + "/.well-known/did.json"
+ else:
+ url = "https://plc.directory/" + did
+
+ try:
+ data = self.extractor.request(url).json()
+ for service in data["service"]:
+ if service["type"] == "AtprotoPersonalDataServer":
+ return service["serviceEndpoint"]
+ except Exception:
+ pass
+ return "https://bsky.social"
+
+ def search_posts(self, query, sort=None):
endpoint = "app.bsky.feed.searchPosts"
params = {
"q" : query,
"limit": "100",
+ "sort" : sort,
}
return self._pagination(endpoint, params, "posts")
@@ -430,7 +466,8 @@ class BlueskyAPI():
if user_did and not extr.config("reposts", False):
extr._user_did = did
if extr._metadata_user:
- extr._user = self.get_profile(did)
+ extr._user = user = self.get_profile(did)
+ user["instance"] = extr._instance(user["handle"])
return did
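
Note: the new handle/DID helpers are easiest to see with concrete (illustrative) values:

    def _instance(handle):
        # same expression as BlueskyExtractor._instance() above:
        # keep only the last two labels of the account handle
        return ".".join(handle.rsplit(".", 2)[-2:])

    print(_instance("alice.bsky.social"))   # bsky.social
    print(_instance("alice.example.com"))   # example.com

    # Blob URLs are now built against the author's own PDS instead of a
    # hard-coded bsky.social:
    #   {service_endpoint(did)}/xrpc/com.atproto.sync.getBlob?did={did}&cid={cid}
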
diff --git a/gallery_dl/extractor/boosty.py b/gallery_dl/extractor/boosty.py
index 997de4a..33823be 100644
--- a/gallery_dl/extractor/boosty.py
+++ b/gallery_dl/extractor/boosty.py
@@ -35,8 +35,16 @@ class BoostyExtractor(Extractor):
if isinstance(videos, str):
videos = videos.split(",")
elif not isinstance(videos, (list, tuple)):
- videos = ("quad_hd", "ultra_hd", "full_hd",
- "high", "medium", "low")
+ # ultra_hd: 2160p
+ # quad_hd: 1440p
+ # full_hd: 1080p
+ # high: 720p
+ # medium: 480p
+ # low: 360p
+ # lowest: 240p
+ # tiny: 144p
+ videos = ("ultra_hd", "quad_hd", "full_hd",
+ "high", "medium", "low", "lowest", "tiny")
self.videos = videos
def items(self):
@@ -325,6 +333,7 @@ class BoostyAPI():
def _pagination(self, endpoint, params, transform=None, key=None):
if "is_only_allowed" not in params and self.extractor.only_allowed:
+ params["only_allowed"] = "true"
params["is_only_allowed"] = "true"
while True:
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 6c79d0a..3e12452 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -22,13 +22,14 @@ else:
BASE_PATTERN = (
r"(?:bunkr:(?:https?://)?([^/?#]+)|"
r"(?:https?://)?(?:app\.)?(bunkr+"
- r"\.(?:s[kiu]|[cf]i|p[hks]|ru|la|is|to|a[cx]"
+ r"\.(?:s[kiu]|c[ir]|fi|p[hks]|ru|la|is|to|a[cx]"
r"|black|cat|media|red|site|ws|org)))"
)
DOMAINS = [
"bunkr.ac",
"bunkr.ci",
+ "bunkr.cr",
"bunkr.fi",
"bunkr.ph",
"bunkr.pk",
@@ -110,13 +111,17 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
def fetch_album(self, album_id):
# album metadata
- page = self.request(self.root + "/a/" + self.album_id).text
+ page = self.request(self.root + "/a/" + album_id).text
title, size = text.split_html(text.extr(
page, "<h1", "</span>").partition(">")[2])
+ if "&" in title:
+ title = title.replace(
+ "&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
+ # files
items = list(text.extract_iter(page, "<!-- item -->", "<!-- -->"))
return self._extract_files(items), {
- "album_id" : self.album_id,
+ "album_id" : album_id,
"album_name" : title,
"album_size" : text.extr(size, "(", ")"),
"count" : len(items),
diff --git a/gallery_dl/extractor/civitai.py b/gallery_dl/extractor/civitai.py
index 0b1e44a..1e8cb42 100644
--- a/gallery_dl/extractor/civitai.py
+++ b/gallery_dl/extractor/civitai.py
@@ -44,6 +44,16 @@ class CivitaiExtractor(Extractor):
self._image_quality = "original=true"
self._image_ext = "png"
+ metadata = self.config("metadata")
+ if metadata:
+ if isinstance(metadata, str):
+ metadata = metadata.split(",")
+ elif not isinstance(metadata, (list, tuple)):
+ metadata = ("generation",)
+ self._meta_generation = ("generation" in metadata)
+ else:
+ self._meta_generation = False
+
def items(self):
models = self.models()
if models:
@@ -81,6 +91,9 @@ class CivitaiExtractor(Extractor):
if images:
for image in images:
url = self._url(image)
+ if self._meta_generation:
+ image["generation"] = self.api.image_generationdata(
+ image["id"])
image["date"] = text.parse_datetime(
image["createdAt"], "%Y-%m-%dT%H:%M:%S.%fZ")
text.nameext_from_url(url, image)
@@ -127,6 +140,8 @@ class CivitaiExtractor(Extractor):
data["extension"] = self._image_ext
if "id" not in file and data["filename"].isdecimal():
file["id"] = text.parse_int(data["filename"])
+ if self._meta_generation:
+ file["generation"] = self.api.image_generationdata(file["id"])
yield data
@@ -469,7 +484,7 @@ class CivitaiTrpcAPI():
self.root = extractor.root + "/api/trpc/"
self.headers = {
"content-type" : "application/json",
- "x-client-version": "5.0.185",
+ "x-client-version": "5.0.211",
"x-client-date" : "",
"x-client" : "web",
"x-fingerprint" : "undefined",
@@ -491,6 +506,11 @@ class CivitaiTrpcAPI():
params = {"id": int(image_id)}
return (self._call(endpoint, params),)
+ def image_generationdata(self, image_id):
+ endpoint = "image.getGenerationData"
+ params = {"id": int(image_id)}
+ return self._call(endpoint, params)
+
def images(self, params, defaults=True):
endpoint = "image.getInfinite"
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 2146fa6..f364124 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -11,7 +11,6 @@
import os
import re
import ssl
-import sys
import time
import netrc
import queue
@@ -23,7 +22,7 @@ import requests
import threading
from requests.adapters import HTTPAdapter
from .message import Message
-from .. import config, text, util, cache, exception
+from .. import config, output, text, util, cache, exception
urllib3 = requests.packages.urllib3
@@ -43,6 +42,8 @@ class Extractor():
ciphers = None
tls12 = True
browser = None
+ useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
+ "rv:128.0) Gecko/20100101 Firefox/128.0")
request_interval = 0.0
request_interval_min = 0.0
request_interval_429 = 60.0
@@ -289,13 +290,8 @@ class Extractor():
def _check_input_allowed(self, prompt=""):
input = self.config("input")
-
if input is None:
- try:
- input = sys.stdin.isatty()
- except Exception:
- input = False
-
+ input = output.TTY_STDIN
if not input:
raise exception.StopExtraction(
"User input required (%s)", prompt.strip(" :"))
@@ -351,6 +347,9 @@ class Extractor():
headers.clear()
ssl_options = ssl_ciphers = 0
+ # .netrc Authorization headers are always disabled
+ session.trust_env = True if self.config("proxy-env", False) else False
+
browser = self.config("browser")
if browser is None:
browser = self.browser
@@ -384,11 +383,13 @@ class Extractor():
ssl_ciphers = SSL_CIPHERS[browser]
else:
useragent = self.config("user-agent")
- if useragent is None:
- useragent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
- "rv:128.0) Gecko/20100101 Firefox/128.0")
+ if useragent is None or useragent == "auto":
+ useragent = self.useragent
elif useragent == "browser":
useragent = _browser_useragent()
+ elif useragent is config.get(("extractor",), "user-agent") and \
+ useragent == Extractor.useragent:
+ useragent = self.useragent
headers["User-Agent"] = useragent
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
@@ -660,6 +661,8 @@ class Extractor():
headers=(self._write_pages in ("all", "ALL")),
hide_auth=(self._write_pages != "ALL")
)
+ self.log.info("Writing '%s' response to '%s'",
+ response.url, path + ".txt")
except Exception as e:
self.log.warning("Failed to dump HTTP request (%s: %s)",
e.__class__.__name__, e)
@@ -1008,6 +1011,12 @@ SSL_CIPHERS = {
}
+# disable Basic Authorization header injection from .netrc data
+try:
+ requests.sessions.get_netrc_auth = lambda _: None
+except Exception:
+ pass
+
# detect brotli support
try:
BROTLI = urllib3.response.brotli is not None
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 1746647..c3dfd91 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -20,12 +20,22 @@ class DanbooruExtractor(BaseExtractor):
page_limit = 1000
page_start = None
per_page = 200
+ useragent = util.USERAGENT
request_interval = (0.5, 1.5)
def _init(self):
self.ugoira = self.config("ugoira", False)
self.external = self.config("external", False)
- self.includes = False
+
+ includes = self.config("metadata")
+ if includes:
+ if isinstance(includes, (list, tuple)):
+ includes = ",".join(includes)
+ elif not isinstance(includes, str):
+ includes = "artist_commentary,children,notes,parent,uploader"
+ self.includes = includes + ",id"
+ else:
+ self.includes = False
threshold = self.config("threshold")
if isinstance(threshold, int):
@@ -46,16 +56,6 @@ class DanbooruExtractor(BaseExtractor):
return pages * self.per_page
def items(self):
- self.session.headers["User-Agent"] = util.USERAGENT
-
- includes = self.config("metadata")
- if includes:
- if isinstance(includes, (list, tuple)):
- includes = ",".join(includes)
- elif not isinstance(includes, str):
- includes = "artist_commentary,children,notes,parent,uploader"
- self.includes = includes + ",id"
-
data = self.metadata()
for post in self.posts():
@@ -108,6 +108,13 @@ class DanbooruExtractor(BaseExtractor):
yield Message.Directory, post
yield Message.Url, url, post
+ def items_artists(self):
+ for artist in self.artists():
+ artist["_extractor"] = DanbooruTagExtractor
+ url = "{}/posts?tags={}".format(
+ self.root, text.quote(artist["name"]))
+ yield Message.Queue, url, artist
+
def metadata(self):
return ()
@@ -294,3 +301,39 @@ class DanbooruPopularExtractor(DanbooruExtractor):
def posts(self):
return self._pagination("/explore/posts/popular.json", self.params)
+
+
+class DanbooruArtistExtractor(DanbooruExtractor):
+ """Extractor for danbooru artists"""
+ subcategory = "artist"
+ pattern = BASE_PATTERN + r"/artists/(\d+)"
+ example = "https://danbooru.donmai.us/artists/12345"
+
+ items = DanbooruExtractor.items_artists
+
+ def artists(self):
+ url = "{}/artists/{}.json".format(self.root, self.groups[-1])
+ return (self.request(url).json(),)
+
+
+class DanbooruArtistSearchExtractor(DanbooruExtractor):
+ """Extractor for danbooru artist searches"""
+ subcategory = "artist-search"
+ pattern = BASE_PATTERN + r"/artists/?\?([^#]+)"
+ example = "https://danbooru.donmai.us/artists?QUERY"
+
+ items = DanbooruExtractor.items_artists
+
+ def artists(self):
+ url = self.root + "/artists.json"
+ params = text.parse_query(self.groups[-1])
+ params["page"] = text.parse_int(params.get("page"), 1)
+
+ while True:
+ artists = self.request(url, params=params).json()
+
+ yield from artists
+
+ if len(artists) < 20:
+ return
+ params["page"] += 1
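
Note: moving the "metadata" handling into _init() keeps the option normalization in one place; the accepted config shapes (values illustrative) all end up as a single includes string:

    # "metadata": "notes,parent"      -> self.includes = "notes,parent,id"
    # "metadata": ["notes", "parent"] -> self.includes = "notes,parent,id"
    # "metadata": true                -> self.includes = "artist_commentary,children,notes,parent,uploader,id"
    # "metadata": false / unset       -> self.includes = False  (no extra fields requested)
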
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 693def9..ea3f13d 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -31,7 +31,7 @@ class DeviantartExtractor(Extractor):
root = "https://www.deviantart.com"
directory_fmt = ("{category}", "{username}")
filename_fmt = "{category}_{index}_{title}.{extension}"
- cookies_domain = None
+ cookies_domain = ".deviantart.com"
cookies_names = ("auth", "auth_secure", "userinfo")
_last_request = 0
@@ -399,7 +399,7 @@ class DeviantartExtractor(Extractor):
def _textcontent_to_html(self, deviation, content):
html = content["html"]
- markup = html["markup"]
+ markup = html.get("markup")
if not markup or markup[0] != "{":
return markup
@@ -1144,7 +1144,6 @@ class DeviantartScrapsExtractor(DeviantartExtractor):
subcategory = "scraps"
directory_fmt = ("{category}", "{username}", "Scraps")
archive_fmt = "s_{_username}_{index}.{extension}"
- cookies_domain = ".deviantart.com"
pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b"
example = "https://www.deviantart.com/USER/gallery/scraps"
@@ -1161,7 +1160,6 @@ class DeviantartSearchExtractor(DeviantartExtractor):
subcategory = "search"
directory_fmt = ("{category}", "Search", "{search_tags}")
archive_fmt = "Q_{search_tags}_{index}.{extension}"
- cookies_domain = ".deviantart.com"
pattern = (r"(?:https?://)?www\.deviantart\.com"
r"/search(?:/deviations)?/?\?([^#]+)")
example = "https://www.deviantart.com/search?q=QUERY"
@@ -1213,7 +1211,6 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor):
"""Extractor for deviantart gallery searches"""
subcategory = "gallery-search"
archive_fmt = "g_{_username}_{index}.{extension}"
- cookies_domain = ".deviantart.com"
pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)"
example = "https://www.deviantart.com/USER/gallery?q=QUERY"
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index 553ec22..4a6624d 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -20,11 +20,10 @@ class E621Extractor(danbooru.DanbooruExtractor):
page_limit = 750
page_start = None
per_page = 320
+ useragent = util.USERAGENT + " (by mikf)"
request_interval_min = 1.0
def items(self):
- self.session.headers["User-Agent"] = util.USERAGENT + " (by mikf)"
-
includes = self.config("metadata") or ()
if includes:
if isinstance(includes, str):
diff --git a/gallery_dl/extractor/everia.py b/gallery_dl/extractor/everia.py
new file mode 100644
index 0000000..94444ff
--- /dev/null
+++ b/gallery_dl/extractor/everia.py
@@ -0,0 +1,99 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://everia.club"""
+
+from .common import Extractor, Message
+from .. import text
+import re
+
+BASE_PATTERN = r"(?:https?://)?everia\.club"
+
+
+class EveriaExtractor(Extractor):
+ category = "everia"
+ root = "https://everia.club"
+
+ def items(self):
+ data = {"_extractor": EveriaPostExtractor}
+ for url in self.posts():
+ yield Message.Queue, url, data
+
+ def posts(self):
+ return self._pagination(self.groups[0])
+
+ def _pagination(self, path, params=None, pnum=1):
+ find_posts = re.compile(r'thumbnail">\s*<a href="([^"]+)').findall
+
+ while True:
+ if pnum == 1:
+ url = "{}{}/".format(self.root, path)
+ else:
+ url = "{}{}/page/{}/".format(self.root, path, pnum)
+ response = self.request(url, params=params, allow_redirects=False)
+
+ if response.status_code >= 300:
+ return
+
+ yield from find_posts(response.text)
+ pnum += 1
+
+
+class EveriaPostExtractor(EveriaExtractor):
+ subcategory = "post"
+ directory_fmt = ("{category}", "{title}")
+ archive_fmt = "{post_url}_{num}"
+ pattern = BASE_PATTERN + r"(/\d{4}/\d{2}/\d{2}/[^/?#]+)"
+ example = "https://everia.club/0000/00/00/TITLE"
+
+ def items(self):
+ url = self.root + self.groups[0]
+ page = self.request(url).text
+ content = text.extr(page, 'itemprop="text">', "</div>")
+ urls = re.findall(r'img.*?src="([^"]+)', content)
+
+ data = {
+ "title": text.unescape(
+ text.extr(page, 'itemprop="headline">', "</h1>")),
+ "tags": list(text.extract_iter(page, 'rel="tag">', "</a>")),
+ "post_url": url,
+ "post_category": text.extr(
+ page, "post-in-category-", " ").capitalize(),
+ "count": len(urls),
+ }
+
+ yield Message.Directory, data
+ for data["num"], url in enumerate(urls, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class EveriaTagExtractor(EveriaExtractor):
+ subcategory = "tag"
+ pattern = BASE_PATTERN + r"(/tag/[^/?#]+)"
+ example = "https://everia.club/tag/TAG"
+
+
+class EveriaCategoryExtractor(EveriaExtractor):
+ subcategory = "category"
+ pattern = BASE_PATTERN + r"(/category/[^/?#]+)"
+ example = "https://everia.club/category/CATEGORY"
+
+
+class EveriaDateExtractor(EveriaExtractor):
+ subcategory = "date"
+ pattern = (BASE_PATTERN +
+ r"(/\d{4}(?:/\d{2})?(?:/\d{2})?)(?:/page/\d+)?/?$")
+ example = "https://everia.club/0000/00/00"
+
+
+class EveriaSearchExtractor(EveriaExtractor):
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/(?:page/\d+/)?\?s=([^&#]+)"
+ example = "https://everia.club/?s=SEARCH"
+
+ def posts(self):
+ params = {"s": self.groups[0]}
+ return self._pagination("", params)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 3e6d537..e7ba78e 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -11,6 +11,7 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
+import collections
import itertools
import math
@@ -227,6 +228,13 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
if self.config("metadata", False):
data.update(self.metadata_from_api())
data["date"] = text.parse_timestamp(data["posted"])
+ if self.config("tags", False):
+ tags = collections.defaultdict(list)
+ for tag in data["tags"]:
+ type, _, value = tag.partition(":")
+ tags[type].append(value)
+ for type, values in tags.items():
+ data["tags_" + type] = values
return data
def metadata_from_page(self, page):
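
Note: with the new "tags" option enabled, the namespaced API tags are additionally split into per-namespace lists; for example (tag values are made up):

    import collections

    tags = ["artist:NAME", "language:english", "language:translated"]
    grouped = collections.defaultdict(list)
    for tag in tags:
        type, _, value = tag.partition(":")
        grouped[type].append(value)

    # grouped -> {"artist": ["NAME"], "language": ["english", "translated"]}
    # stored as data["tags_artist"], data["tags_language"], ...
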
diff --git a/gallery_dl/extractor/facebook.py b/gallery_dl/extractor/facebook.py
new file mode 100644
index 0000000..04acfc5
--- /dev/null
+++ b/gallery_dl/extractor/facebook.py
@@ -0,0 +1,447 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.facebook.com/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?facebook\.com"
+
+
+class FacebookExtractor(Extractor):
+ """Base class for Facebook extractors"""
+ category = "facebook"
+ root = "https://www.facebook.com"
+ directory_fmt = ("{category}", "{username}", "{title} ({set_id})")
+ filename_fmt = "{id}.{extension}"
+ archive_fmt = "{id}.{extension}"
+
+ set_url_fmt = root + "/media/set/?set={set_id}"
+ photo_url_fmt = root + "/photo/?fbid={photo_id}&set={set_id}"
+
+ def _init(self):
+ headers = self.session.headers
+ headers["Accept"] = (
+ "text/html,application/xhtml+xml,application/xml;q=0.9,"
+ "image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8"
+ )
+ headers["Sec-Fetch-Dest"] = "empty"
+ headers["Sec-Fetch-Mode"] = "navigate"
+ headers["Sec-Fetch-Site"] = "same-origin"
+
+ self.fallback_retries = self.config("fallback-retries", 2)
+ self.videos = self.config("videos", True)
+ self.author_followups = self.config("author-followups", False)
+
+ @staticmethod
+ def decode_all(txt):
+ return text.unescape(
+ txt.encode("utf-8").decode("unicode_escape")
+ ).replace("\\/", "/")
+
+ @staticmethod
+ def parse_set_page(set_page):
+ directory = {
+ "set_id": text.extr(
+ set_page, '"mediaSetToken":"', '"'
+ ) or text.extr(
+ set_page, '"mediasetToken":"', '"'
+ ),
+ "username": FacebookExtractor.decode_all(
+ text.extr(
+ set_page, '"user":{"__isProfile":"User","name":"', '","'
+ ) or text.extr(
+ set_page, '"actors":[{"__typename":"User","name":"', '","'
+ )
+ ),
+ "user_id": text.extr(
+ set_page, '"owner":{"__typename":"User","id":"', '"'
+ ),
+ "title": FacebookExtractor.decode_all(text.extr(
+ set_page, '"title":{"text":"', '"'
+ )),
+ "first_photo_id": text.extr(
+ set_page,
+ '{"__typename":"Photo","__isMedia":"Photo","',
+ '","creation_story"'
+ ).rsplit('"id":"', 1)[-1] or
+ text.extr(
+ set_page, '{"__typename":"Photo","id":"', '"'
+ )
+ }
+
+ return directory
+
+ @staticmethod
+ def parse_photo_page(photo_page):
+ photo = {
+ "id": text.extr(
+ photo_page, '"__isNode":"Photo","id":"', '"'
+ ),
+ "set_id": text.extr(
+ photo_page,
+ '"url":"https:\\/\\/www.facebook.com\\/photo\\/?fbid=',
+ '"'
+ ).rsplit("&set=", 1)[-1],
+ "username": FacebookExtractor.decode_all(text.extr(
+ photo_page, '"owner":{"__typename":"User","name":"', '"'
+ )),
+ "user_id": text.extr(
+ photo_page, '"owner":{"__typename":"User","id":"', '"'
+ ),
+ "caption": FacebookExtractor.decode_all(text.extr(
+ photo_page,
+ '"message":{"delight_ranges"',
+ '"},"message_preferred_body"'
+ ).rsplit('],"text":"', 1)[-1]),
+ "date": text.parse_timestamp(text.extr(
+ photo_page, '\\"publish_time\\":', ','
+ )),
+ "url": FacebookExtractor.decode_all(text.extr(
+ photo_page, ',"image":{"uri":"', '","'
+ )),
+ "next_photo_id": text.extr(
+ photo_page,
+ '"nextMediaAfterNodeId":{"__typename":"Photo","id":"',
+ '"'
+ ) or text.extr(
+ photo_page,
+ '"nextMedia":{"edges":[{"node":{"__typename":"Photo","id":"',
+ '"'
+ )
+ }
+
+ text.nameext_from_url(photo["url"], photo)
+
+ photo["followups_ids"] = []
+ for comment_raw in text.extract_iter(
+ photo_page, '{"node":{"id"', '"cursor":null}'
+ ):
+ if ('"is_author_original_poster":true' in comment_raw and
+ '{"__typename":"Photo","id":"' in comment_raw):
+ photo["followups_ids"].append(text.extr(
+ comment_raw,
+ '{"__typename":"Photo","id":"',
+ '"'
+ ))
+
+ return photo
+
+ @staticmethod
+ def parse_post_page(post_page):
+ first_photo_url = text.extr(
+ text.extr(
+ post_page, '"__isMedia":"Photo"', '"target_group"'
+ ), '"url":"', ','
+ )
+
+ post = {
+ "set_id": text.extr(post_page, '{"mediaset_token":"', '"') or
+ text.extr(first_photo_url, 'set=', '"').rsplit("&", 1)[0]
+ }
+
+ return post
+
+ @staticmethod
+ def parse_video_page(video_page):
+ video = {
+ "id": text.extr(
+ video_page, '\\"video_id\\":\\"', '\\"'
+ ),
+ "username": FacebookExtractor.decode_all(text.extr(
+ video_page, '"actors":[{"__typename":"User","name":"', '","'
+ )),
+ "user_id": text.extr(
+ video_page, '"owner":{"__typename":"User","id":"', '"'
+ ),
+ "date": text.parse_timestamp(text.extr(
+ video_page, '\\"publish_time\\":', ','
+ )),
+ "type": "video"
+ }
+
+ if not video["username"]:
+ video["username"] = FacebookExtractor.decode_all(text.extr(
+ video_page,
+ '"__typename":"User","id":"' + video["user_id"] + '","name":"',
+ '","'
+ ))
+
+ first_video_raw = text.extr(
+ video_page, '"permalink_url"', '\\/Period>\\u003C\\/MPD>'
+ )
+
+ audio = {
+ **video,
+ "url": FacebookExtractor.decode_all(text.extr(
+ text.extr(
+ first_video_raw,
+ "AudioChannelConfiguration",
+ "BaseURL>\\u003C"
+ ),
+ "BaseURL>", "\\u003C\\/"
+ )),
+ "type": "audio"
+ }
+
+ video["urls"] = {}
+
+ for raw_url in text.extract_iter(
+ first_video_raw, 'FBQualityLabel=\\"', '\\u003C\\/BaseURL>'
+ ):
+ resolution = raw_url.split('\\"', 1)[0]
+ video["urls"][resolution] = FacebookExtractor.decode_all(
+ raw_url.split('BaseURL>', 1)[1]
+ )
+
+ if not video["urls"]:
+ return video, audio
+
+ video["url"] = max(
+ video["urls"].items(),
+ key=lambda x: text.parse_int(x[0][:-1])
+ )[1]
+
+ text.nameext_from_url(video["url"], video)
+ audio["filename"] = video["filename"]
+ audio["extension"] = "m4a"
+
+ return video, audio
+
+ def photo_page_request_wrapper(self, url, **kwargs):
+ LEFT_OFF_TXT = "" if url.endswith("&set=") else (
+ "\nYou can use this URL to continue from "
+ "where you left off (added \"&setextract\"): "
+ "\n" + url + "&setextract"
+ )
+
+ res = self.request(url, **kwargs)
+
+ if res.url.startswith(self.root + "/login"):
+ raise exception.AuthenticationError(
+ "You must be logged in to continue viewing images." +
+ LEFT_OFF_TXT
+ )
+
+ if b'{"__dr":"CometErrorRoot.react"}' in res.content:
+ raise exception.StopExtraction(
+ "You've been temporarily blocked from viewing images. "
+ "\nPlease try using a different account, "
+ "using a VPN or waiting before you retry." +
+ LEFT_OFF_TXT
+ )
+
+ return res
+
+ def extract_set(self, first_photo_id, set_id):
+ all_photo_ids = [first_photo_id]
+
+ retries = 0
+ i = 0
+
+ while i < len(all_photo_ids):
+ photo_id = all_photo_ids[i]
+ photo_url = self.photo_url_fmt.format(
+ photo_id=photo_id, set_id=set_id
+ )
+ photo_page = self.photo_page_request_wrapper(photo_url).text
+
+ photo = self.parse_photo_page(photo_page)
+ photo["set_id"] = set_id
+ photo["num"] = i + 1
+
+ if self.author_followups:
+ for followup_id in photo["followups_ids"]:
+ if followup_id not in all_photo_ids:
+ self.log.debug(
+ "Found a followup in comments: %s", followup_id
+ )
+ all_photo_ids.append(followup_id)
+
+ if not photo["url"]:
+ if retries < self.fallback_retries and self._interval_429:
+ seconds = self._interval_429()
+ self.log.warning(
+ "Failed to find photo download URL for %s. "
+ "Retrying in %s seconds.", photo_url, seconds,
+ )
+ self.wait(seconds=seconds, reason="429 Too Many Requests")
+ retries += 1
+ continue
+ else:
+ self.log.error(
+ "Failed to find photo download URL for " + photo_url +
+ ". Skipping."
+ )
+ retries = 0
+ else:
+ retries = 0
+ yield Message.Url, photo["url"], photo
+
+ if photo["next_photo_id"] == "":
+ self.log.debug(
+ "Can't find next image in the set. "
+ "Extraction is over."
+ )
+ elif photo["next_photo_id"] in all_photo_ids:
+ if photo["next_photo_id"] != photo["id"]:
+ self.log.debug(
+ "Detected a loop in the set, it's likely finished. "
+ "Extraction is over."
+ )
+ else:
+ all_photo_ids.append(photo["next_photo_id"])
+
+ i += 1
+
+
+class FacebookSetExtractor(FacebookExtractor):
+ """Base class for Facebook Set extractors"""
+ subcategory = "set"
+ pattern = (
+ BASE_PATTERN +
+ r"/(?:(?:media/set|photo)/?\?(?:[^&#]+&)*set=([^&#]+)"
+ r"[^/?#]*(?<!&setextract)$"
+ r"|([^/?#]+/posts/[^/?#]+)"
+ r"|photo/\?(?:[^&#]+&)*fbid=([^/?&#]+)&set=([^/?&#]+)&setextract)"
+ )
+ example = "https://www.facebook.com/media/set/?set=SET_ID"
+
+ def items(self):
+ set_id = self.groups[0] or self.groups[3]
+ path = self.groups[1]
+ if path:
+ post_url = self.root + "/" + path
+ post_page = self.request(post_url).text
+ set_id = self.parse_post_page(post_page)["set_id"]
+
+ set_url = self.set_url_fmt.format(set_id=set_id)
+ set_page = self.request(set_url).text
+
+ directory = self.parse_set_page(set_page)
+
+ yield Message.Directory, directory
+
+ yield from self.extract_set(
+ self.groups[2] or directory["first_photo_id"],
+ directory["set_id"]
+ )
+
+
+class FacebookPhotoExtractor(FacebookExtractor):
+ """Base class for Facebook Photo extractors"""
+ subcategory = "photo"
+ pattern = (BASE_PATTERN +
+ r"/(?:[^/?#]+/photos/[^/?#]+/|photo(?:.php)?/?\?"
+ r"(?:[^&#]+&)*fbid=)([^/?&#]+)[^/?#]*(?<!&setextract)$")
+ example = "https://www.facebook.com/photo/?fbid=PHOTO_ID"
+
+ def items(self):
+ photo_id = self.groups[0]
+ photo_url = self.photo_url_fmt.format(photo_id=photo_id, set_id="")
+ photo_page = self.photo_page_request_wrapper(photo_url).text
+
+ i = 1
+ photo = self.parse_photo_page(photo_page)
+ photo["num"] = i
+
+ set_page = self.request(
+ self.set_url_fmt.format(set_id=photo["set_id"])
+ ).text
+
+ directory = self.parse_set_page(set_page)
+
+ yield Message.Directory, directory
+ yield Message.Url, photo["url"], photo
+
+ if self.author_followups:
+ for comment_photo_id in photo["followups_ids"]:
+ comment_photo = self.parse_photo_page(
+ self.photo_page_request_wrapper(
+ self.photo_url_fmt.format(
+ photo_id=comment_photo_id, set_id=""
+ )
+ ).text
+ )
+ i += 1
+ comment_photo["num"] = i
+ yield Message.Url, comment_photo["url"], comment_photo
+
+
+class FacebookVideoExtractor(FacebookExtractor):
+ """Base class for Facebook Video extractors"""
+ subcategory = "video"
+ directory_fmt = ("{category}", "{username}", "{subcategory}")
+ pattern = BASE_PATTERN + r"/(?:[^/?#]+/videos/|watch/?\?v=)([^/?&#]+)"
+ example = "https://www.facebook.com/watch/?v=VIDEO_ID"
+
+ def items(self):
+ video_id = self.groups[0]
+ video_url = self.root + "/watch/?v=" + video_id
+ video_page = self.request(video_url).text
+
+ video, audio = self.parse_video_page(video_page)
+
+ if "url" not in video:
+ return
+
+ yield Message.Directory, video
+
+ if self.videos == "ytdl":
+ yield Message.Url, "ytdl:" + video_url, video
+ elif self.videos:
+ yield Message.Url, video["url"], video
+ if audio["url"]:
+ yield Message.Url, audio["url"], audio
+
+
+class FacebookProfileExtractor(FacebookExtractor):
+ """Base class for Facebook Profile Photos Set extractors"""
+ subcategory = "profile"
+ pattern = (
+ BASE_PATTERN +
+ r"/(?!media/|photo/|photo.php|watch/)"
+ r"(?:profile\.php\?id=|people/[^/?#]+/)?"
+ r"([^/?&#]+)(?:/photos(?:_by)?|/videos|/posts)?/?(?:$|\?|#)"
+ )
+ example = "https://www.facebook.com/USERNAME"
+
+ @staticmethod
+ def get_profile_photos_set_id(profile_photos_page):
+ set_ids_raw = text.extr(
+ profile_photos_page, '"pageItems"', '"page_info"'
+ )
+
+ set_id = text.extr(
+ set_ids_raw, 'set=', '"'
+ ).rsplit("&", 1)[0] or text.extr(
+ set_ids_raw, '\\/photos\\/', '\\/'
+ )
+
+ return set_id
+
+ def items(self):
+ profile_photos_url = (
+ self.root + "/" + self.groups[0] + "/photos_by"
+ )
+ profile_photos_page = self.request(profile_photos_url).text
+
+ set_id = self.get_profile_photos_set_id(profile_photos_page)
+
+ if set_id:
+ set_url = self.set_url_fmt.format(set_id=set_id)
+ set_page = self.request(set_url).text
+
+ directory = self.parse_set_page(set_page)
+
+ yield Message.Directory, directory
+
+ yield from self.extract_set(
+ directory["first_photo_id"], directory["set_id"]
+ )
+ else:
+ self.log.debug("Profile photos set ID not found.")
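
Note: decode_all() reverses the JSON string escaping found in Facebook's embedded page data; a worked example (input value is illustrative):

    FacebookExtractor.decode_all(r"Caf\u00e9 \/ Bar")
    # -> "Café / Bar"   (\uXXXX escapes decoded, HTML entities and \/ unescaped)
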
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index df252ee..e85a375 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -20,7 +20,6 @@ class FlickrExtractor(Extractor):
filename_fmt = "{category}_{id}.{extension}"
directory_fmt = ("{category}", "{user[username]}")
archive_fmt = "{id}"
- cookies_domain = None
request_interval = (1.0, 2.0)
request_interval_min = 0.5
@@ -45,7 +44,7 @@ class FlickrExtractor(Extractor):
self.log.debug("", exc_info=exc)
else:
photo.update(data)
- url = photo["url"]
+ url = self._file_url(photo)
yield Message.Directory, photo
yield Message.Url, url, text.nameext_from_url(url, photo)
@@ -57,6 +56,15 @@ class FlickrExtractor(Extractor):
def photos(self):
"""Return an iterable with all relevant photo objects"""
+ def _file_url(self, photo):
+ url = photo["url"]
+
+ if "/video/" in url:
+ return url
+
+ path, _, ext = url.rpartition(".")
+ return path + "_d." + ext
+
class FlickrImageExtractor(FlickrExtractor):
"""Extractor for individual images from flickr.com"""
@@ -98,7 +106,7 @@ class FlickrImageExtractor(FlickrExtractor):
if isinstance(value, dict):
location[key] = value["_content"]
- url = photo["url"]
+ url = self._file_url(photo)
yield Message.Directory, photo
yield Message.Url, url, text.nameext_from_url(url, photo)
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 0baad2f..aad5752 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -22,14 +22,14 @@ class GelbooruV02Extractor(booru.BooruExtractor):
def _init(self):
self.api_key = self.config("api-key")
self.user_id = self.config("user-id")
- self.api_root = self.config_instance("api_root") or self.root
+ self.root_api = self.config_instance("root-api") or self.root
if self.category == "realbooru":
self.items = self._items_realbooru
self._tags = self._tags_realbooru
def _api_request(self, params):
- url = self.api_root + "/index.php?page=dapi&s=post&q=index"
+ url = self.root_api + "/index.php?page=dapi&s=post&q=index"
return ElementTree.fromstring(self.request(url, params=params).text)
def _pagination(self, params):
@@ -191,8 +191,8 @@ BASE_PATTERN = GelbooruV02Extractor.update({
},
"rule34": {
"root": "https://rule34.xxx",
+ "root-api": "https://api.rule34.xxx",
"pattern": r"(?:www\.)?rule34\.xxx",
- "api_root": "https://api.rule34.xxx",
},
"safebooru": {
"root": "https://safebooru.org",
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index c75c90d..7e128a4 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -254,6 +254,22 @@ class HentaifoundryFavoriteExtractor(HentaifoundryExtractor):
self.root, self.user)
+class HentaifoundryTagExtractor(HentaifoundryExtractor):
+ """Extractor for tag searches on hentaifoundry.com"""
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{index}"
+ pattern = BASE_PATTERN + r"/pictures/tagged/([^/?#]+)"
+ example = "https://www.hentai-foundry.com/pictures/tagged/TAG"
+
+ def __init__(self, match):
+ HentaifoundryExtractor.__init__(self, match)
+ self.page_url = "{}/pictures/tagged/{}".format(self.root, self.user)
+
+ def metadata(self):
+ return {"search_tags": self.user}
+
+
class HentaifoundryRecentExtractor(HentaifoundryExtractor):
"""Extractor for 'Recent Pictures' on hentaifoundry.com"""
subcategory = "recent"
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index 4a9759f..c939a3c 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://hiperdex.top/"""
+"""Extractors for https://hipertoon.com/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
@@ -14,13 +14,13 @@ from ..cache import memcache
import re
BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
- r"(?:1st)?hiperdex\d?\.(?:com|net|info|top))")
+ r"(?:1st)?hiper(?:dex|toon)\d?\.(?:com|net|info|top))")
class HiperdexBase():
"""Base class for hiperdex extractors"""
category = "hiperdex"
- root = "https://hiperdex.top"
+ root = "https://hipertoon.com"
@memcache(keyarg=1)
def manga_data(self, manga, page=None):
@@ -49,7 +49,7 @@ class HiperdexBase():
"status" : extr(
'class="summary-content">', '<').strip(),
"description": text.remove_html(text.unescape(extr(
- 'class="description-summary">', '</div>'))),
+ "Summary </h5>", "</div>"))),
"language": "English",
"lang" : "en",
}
@@ -69,7 +69,7 @@ class HiperdexBase():
class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
"""Extractor for hiperdex manga chapters"""
pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))"
- example = "https://hiperdex.top/manga/MANGA/CHAPTER/"
+ example = "https://hipertoon.com/manga/MANGA/CHAPTER/"
def __init__(self, match):
root, path, self.manga, self.chapter = match.groups()
@@ -91,7 +91,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
"""Extractor for hiperdex manga"""
chapterclass = HiperdexChapterExtractor
pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$"
- example = "https://hiperdex.top/manga/MANGA/"
+ example = "https://hipertoon.com/manga/MANGA/"
def __init__(self, match):
root, path, self.manga = match.groups()
@@ -127,7 +127,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
chapterclass = HiperdexMangaExtractor
reverse = False
pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
- example = "https://hiperdex.top/manga-artist/NAME/"
+ example = "https://hipertoon.com/manga-artist/NAME/"
def __init__(self, match):
self.root = text.ensure_http_scheme(match.group(1))
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 18df9df..308b42c 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -108,9 +108,9 @@ class HitomiTagExtractor(Extractor):
category = "hitomi"
subcategory = "tag"
root = "https://hitomi.la"
- pattern = (r"(?:https?://)?hitomi\.la/"
- r"(tag|artist|group|series|type|character)/"
- r"([^/?#]+)\.html")
+ pattern = (r"(?:https?://)?hitomi\.la"
+ r"/(tag|artist|group|series|type|character)"
+ r"/([^/?#]+)\.html")
example = "https://hitomi.la/tag/TAG-LANG.html"
def __init__(self, match):
@@ -151,6 +151,109 @@ class HitomiTagExtractor(Extractor):
return
+class HitomiIndexExtractor(HitomiTagExtractor):
+ """Extractor for galleries from index searches on hitomi.la"""
+ subcategory = "index"
+ pattern = r"(?:https?://)?hitomi\.la/(\w+)-(\w+)\.html"
+ example = "https://hitomi.la/index-LANG.html"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.tag, self.language = match.groups()
+
+ def items(self):
+ data = {"_extractor": HitomiGalleryExtractor}
+ nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(
+ self.tag, self.language)
+ headers = {
+ "Origin": self.root,
+ "Cache-Control": "max-age=0",
+ }
+
+ offset = 0
+ total = None
+ while True:
+ headers["Referer"] = "{}/{}-{}.html?page={}".format(
+ self.root, self.tag, self.language, offset // 100 + 1)
+ headers["Range"] = "bytes={}-{}".format(offset, offset+99)
+ response = self.request(nozomi_url, headers=headers)
+
+ for gallery_id in decode_nozomi(response.content):
+ gallery_url = "{}/galleries/{}.html".format(
+ self.root, gallery_id)
+ yield Message.Queue, gallery_url, data
+
+ offset += 100
+ if total is None:
+ total = text.parse_int(
+ response.headers["content-range"].rpartition("/")[2])
+ if offset >= total:
+ return
+
+
+class HitomiSearchExtractor(Extractor):
+ """Extractor for galleries from multiple tag searches on hitomi.la"""
+ category = "hitomi"
+ subcategory = "search"
+ root = "https://hitomi.la"
+ pattern = r"(?:https?://)?hitomi\.la/search\.html\?([^/?#]+)"
+ example = "https://hitomi.la/search.html?QUERY"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1)
+ self.tags = text.unquote(self.query).split(" ")
+
+ def items(self):
+ data = {"_extractor": HitomiGalleryExtractor}
+
+ results = [self.get_nozomi_items(tag) for tag in self.tags]
+ intersects = set.intersection(*results)
+
+ for gallery_id in sorted(intersects, reverse=True):
+ gallery_url = "{}/galleries/{}.html".format(
+ self.root, gallery_id)
+ yield Message.Queue, gallery_url, data
+
+ def get_nozomi_items(self, full_tag):
+ area, tag, language = self.get_nozomi_args(full_tag)
+
+ if area:
+ referer_base = "{}/n/{}/{}-{}.html".format(
+ self.root, area, tag, language)
+ nozomi_url = "https://ltn.hitomi.la/{}/{}-{}.nozomi".format(
+ area, tag, language)
+ else:
+ referer_base = "{}/n/{}-{}.html".format(
+ self.root, tag, language)
+ nozomi_url = "https://ltn.hitomi.la/{}-{}.nozomi".format(
+ tag, language)
+
+ headers = {
+ "Origin": self.root,
+ "Cache-Control": "max-age=0",
+ "Referer": "{}/search.html?{}".format(referer_base, self.query),
+ }
+
+ response = self.request(nozomi_url, headers=headers)
+ return set(decode_nozomi(response.content))
+
+ def get_nozomi_args(self, query):
+ ns, _, tag = query.strip().partition(":")
+ area = ns
+ language = "all"
+
+ if ns == "female" or ns == "male":
+ area = "tag"
+ tag = query
+ elif ns == "language":
+ area = None
+ language = tag
+ tag = "index"
+
+ return area, tag, language
+
+
@memcache(maxage=1800)
def _parse_gg(extr):
page = extr.request("https://ltn.hitomi.la/gg.js").text
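
Note: both new extractors feed the raw .nozomi bytes through the module's existing decode_nozomi() helper (not part of this diff). Assuming the usual layout of packed 4-byte big-endian gallery IDs, it amounts to roughly:

    def decode_nozomi(data):
        # every 4 bytes of a .nozomi response form one big-endian gallery ID,
        # so the 100-byte Range requests above yield 25 IDs per request
        for i in range(0, len(data), 4):
            yield (data[i] << 24) | (data[i+1] << 16) | (data[i+2] << 8) | data[i+3]
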
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
index 115fff3..159feba 100644
--- a/gallery_dl/extractor/imagechest.py
+++ b/gallery_dl/extractor/imagechest.py
@@ -10,7 +10,7 @@
"""Extractors for https://imgchest.com/"""
from .common import GalleryExtractor, Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgchest\.com"
@@ -33,35 +33,23 @@ class ImagechestGalleryExtractor(GalleryExtractor):
self.api = ImagechestAPI(self, access_token)
self.gallery_url = None
self.metadata = self._metadata_api
- self.images = self._images_api
def metadata(self, page):
- if "Sorry, but the page you requested could not be found." in page:
- raise exception.NotFoundError("gallery")
-
- return {
- "gallery_id": self.gallery_id,
- "title": text.unescape(text.extr(
- page, 'property="og:title" content="', '"').strip())
- }
+ try:
+ data = util.json_loads(text.unescape(text.extr(
+ page, 'data-page="', '"')))
+ post = data["props"]["post"]
+ except Exception:
+ if "<title>Not Found</title>" in page:
+ raise exception.NotFoundError("gallery")
+ self.files = ()
+ return {}
+
+ self.files = post.pop("files", ())
+ post["gallery_id"] = self.gallery_id
+ post["tags"] = [tag["name"] for tag in post["tags"]]
- def images(self, page):
- if ' load-all">' in page:
- url = "{}/p/{}/loadAll".format(self.root, self.gallery_id)
- headers = {
- "X-Requested-With": "XMLHttpRequest",
- "Origin" : self.root,
- "Referer" : self.gallery_url,
- }
- csrf_token = text.extr(page, 'name="csrf-token" content="', '"')
- data = {"_token": csrf_token}
- page += self.request(
- url, method="POST", headers=headers, data=data).text
-
- return [
- (url, None)
- for url in text.extract_iter(page, 'data-url="', '"')
- ]
+ return post
def _metadata_api(self, page):
post = self.api.post(self.gallery_id)
@@ -74,15 +62,18 @@ class ImagechestGalleryExtractor(GalleryExtractor):
post["gallery_id"] = self.gallery_id
post.pop("image_count", None)
- self._image_list = post.pop("images")
+ self.files = post.pop("images")
return post
- def _images_api(self, page):
- return [
- (img["link"], img)
- for img in self._image_list
- ]
+ def images(self, page):
+ try:
+ return [
+ (file["link"], file)
+ for file in self.files
+ ]
+ except Exception:
+ return ()
class ImagechestUserExtractor(Extractor):
@@ -93,10 +84,6 @@ class ImagechestUserExtractor(Extractor):
pattern = BASE_PATTERN + r"/u/([^/?#]+)"
example = "https://imgchest.com/u/USER"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.user = match.group(1)
-
def items(self):
url = self.root + "/api/posts"
params = {
@@ -104,7 +91,7 @@ class ImagechestUserExtractor(Extractor):
"sort" : "new",
"tag" : "",
"q" : "",
- "username": text.unquote(self.user),
+ "username": text.unquote(self.groups[0]),
"nsfw" : "true",
}
@@ -114,6 +101,9 @@ class ImagechestUserExtractor(Extractor):
except (TypeError, KeyError):
return
+ if not data:
+ return
+
for gallery in data:
gallery["_extractor"] = ImagechestGalleryExtractor
yield Message.Queue, gallery["link"], gallery
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index dd1272f..a866f45 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -101,7 +101,10 @@ class InstagramExtractor(Extractor):
continue
url = file["display_url"]
- yield Message.Url, url, text.nameext_from_url(url, file)
+ text.nameext_from_url(url, file)
+ if file["extension"] == "webp" and "stp=dst-jpg" in url:
+ file["extension"] = "jpg"
+ yield Message.Url, url, file
def metadata(self):
return ()
@@ -390,10 +393,11 @@ class InstagramExtractor(Extractor):
def _init_cursor(self):
cursor = self.config("cursor", True)
- if not cursor:
+ if cursor is True:
+ return None
+ elif not cursor:
self._update_cursor = util.identity
- elif isinstance(cursor, str):
- return cursor
+ return cursor
def _update_cursor(self, cursor):
self.log.debug("Cursor: %s", cursor)
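
Note: after this change the three possible "cursor" settings behave as follows (the cursor string is illustrative):

    # "cursor": true     -> _init_cursor() returns None: start from the beginning,
    #                       newly seen cursors are still logged and stored
    # "cursor": false    -> _update_cursor() becomes a no-op; no pagination state is kept
    # "cursor": "XYZ=="  -> the string is returned as-is and pagination resumes there
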
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 6f2d5f3..3d04f75 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -10,7 +10,7 @@
from .common import Extractor, Message
from .. import text, util, exception
-from ..cache import cache, memcache
+from ..cache import cache
import itertools
import json
import re
@@ -38,6 +38,7 @@ class KemonopartyExtractor(Extractor):
Extractor.__init__(self, match)
def _init(self):
+ self.api = KemonoAPI(self)
self.revisions = self.config("revisions")
if self.revisions:
self.revisions_unique = (self.revisions == "unique")
@@ -53,48 +54,60 @@ class KemonopartyExtractor(Extractor):
sort_keys=True, separators=(",", ":")).encode
def items(self):
+ service = self.groups[2]
+ creator_id = self.groups[3]
+
find_hash = re.compile(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files"))
- duplicates = self.config("duplicates")
- comments = self.config("comments")
- username = dms = announcements = None
+ announcements = True if self.config("announcements") else None
+ comments = True if self.config("comments") else False
+ duplicates = True if self.config("duplicates") else False
+ dms = True if self.config("dms") else None
+ profile = username = None
# prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"}
if self.config("metadata"):
- username = text.unescape(text.extract(
- self.request(self.user_url).text,
- '<meta name="artist_name" content="', '"')[0])
- if self.config("dms"):
- dms = True
- if self.config("announcements"):
- announcements = True
+ profile = self.api.creator_profile(service, creator_id)
+ username = profile["name"]
posts = self.posts()
max_posts = self.config("max-posts")
if max_posts:
posts = itertools.islice(posts, max_posts)
+ if self.revisions:
+ posts = self._revisions(posts)
for post in posts:
-
headers["Referer"] = "{}/{}/user/{}/post/{}".format(
self.root, post["service"], post["user"], post["id"])
post["_http_headers"] = headers
post["date"] = self._parse_datetime(
post.get("published") or post.get("added") or "")
- if username:
+ if profile is not None:
post["username"] = username
+ post["user_profile"] = profile
if comments:
- post["comments"] = self._extract_comments(post)
+ try:
+ post["comments"] = self.api.creator_post_comments(
+ service, creator_id, post["id"])
+ except exception.HttpError:
+ post["comments"] = ()
if dms is not None:
if dms is True:
- dms = self._extract_cards(post, "dms")
+ dms = self.api.creator_dms(
+ post["service"], post["user"])
+ try:
+ dms = dms["props"]["dms"]
+ except Exception:
+ dms = ()
post["dms"] = dms
if announcements is not None:
if announcements is True:
- announcements = self._extract_cards(post, "announcements")
+ announcements = self.api.creator_announcements(
+ post["service"], post["user"])
post["announcements"] = announcements
files = []
@@ -145,20 +158,23 @@ class KemonopartyExtractor(Extractor):
self.cookies_update(self._login_impl(
(username, self.cookies_domain), password))
- @cache(maxage=28*86400, keyarg=1)
+ @cache(maxage=3650*86400, keyarg=1)
def _login_impl(self, username, password):
username = username[0]
self.log.info("Logging in as %s", username)
- url = self.root + "/account/login"
+ url = self.root + "/api/v1/authentication/login"
data = {"username": username, "password": password}
- response = self.request(url, method="POST", data=data)
- if response.url.endswith("/account/login") and \
- "Username or password is incorrect" in response.text:
- raise exception.AuthenticationError()
+ response = self.request(url, method="POST", json=data, fatal=False)
+ if response.status_code >= 400:
+ try:
+ msg = '"' + response.json()["error"] + '"'
+ except Exception:
+ msg = '"0/1 Username or password is incorrect"'
+ raise exception.AuthenticationError(msg)
- return {c.name: c.value for c in response.history[0].cookies}
+ return {c.name: c.value for c in response.cookies}
def _file(self, post):
file = post["file"]
@@ -188,56 +204,21 @@ class KemonopartyExtractor(Extractor):
filetypes = filetypes.split(",")
return [genmap[ft] for ft in filetypes]
- def _extract_comments(self, post):
- url = "{}/{}/user/{}/post/{}".format(
- self.root, post["service"], post["user"], post["id"])
- page = self.request(url).text
-
- comments = []
- for comment in text.extract_iter(page, "<article", "</article>"):
- extr = text.extract_from(comment)
- cid = extr('id="', '"')
- comments.append({
- "id" : cid,
- "user": extr('href="#' + cid + '"', '</').strip(" \n\r>"),
- "body": extr(
- '<section class="comment__body">', '</section>').strip(),
- "date": extr('datetime="', '"'),
- })
- return comments
-
- def _extract_cards(self, post, type):
- url = "{}/{}/user/{}/{}".format(
- self.root, post["service"], post["user"], type)
- page = self.request(url).text
-
- cards = []
- for card in text.extract_iter(page, "<article", "</article>"):
- footer = text.extr(card, "<footer", "</footer>")
- cards.append({
- "body": text.unescape(text.extr(
- card, "<pre>", "</pre></",
- ).strip()),
- "date": text.extr(footer, ': ', '\n'),
- })
- return cards
-
def _parse_datetime(self, date_string):
if len(date_string) > 19:
date_string = date_string[:19]
return text.parse_datetime(date_string, "%Y-%m-%dT%H:%M:%S")
- @memcache(keyarg=1)
- def _discord_channels(self, server):
- url = "{}/api/v1/discord/channel/lookup/{}".format(
- self.root, server)
- return self.request(url).json()
+ def _revisions(self, posts):
+ return itertools.chain.from_iterable(
+ self._revisions_post(post) for post in posts)
- def _revisions_post(self, post, url):
+ def _revisions_post(self, post):
post["revision_id"] = 0
try:
- revs = self.request(url + "/revisions").json()
+ revs = self.api.creator_post_revisions(
+ post["service"], post["user"], post["id"])
except exception.HttpError:
post["revision_hash"] = self._revision_hash(post)
post["revision_index"] = 1
@@ -268,8 +249,8 @@ class KemonopartyExtractor(Extractor):
return revs
- def _revisions_all(self, url):
- revs = self.request(url + "/revisions").json()
+ def _revisions_all(self, service, creator_id, post_id):
+ revs = self.api.creator_post_revisions(service, creator_id, post_id)
cnt = idx = len(revs)
for rev in revs:
@@ -305,50 +286,30 @@ def _validate(response):
class KemonopartyUserExtractor(KemonopartyExtractor):
"""Extractor for all posts from a kemono.su user listing"""
subcategory = "user"
- pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|[?#])"
+ pattern = USER_PATTERN + r"/?(?:\?([^#]+))?(?:$|\?|#)"
example = "https://kemono.su/SERVICE/user/12345"
def __init__(self, match):
- _, _, service, user_id, self.query = match.groups()
- self.subcategory = service
+ self.subcategory = match.group(3)
KemonopartyExtractor.__init__(self, match)
- self.api_url = "{}/api/v1/{}/user/{}".format(
- self.root, service, user_id)
- self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
def posts(self):
- url = self.api_url
- params = text.parse_query(self.query)
- params["o"] = text.parse_int(params.get("o"))
-
- while True:
- posts = self.request(url, params=params).json()
-
- if self.revisions:
- for post in posts:
- post_url = "{}/api/v1/{}/user/{}/post/{}".format(
- self.root, post["service"], post["user"], post["id"])
- yield from self._revisions_post(post, post_url)
- else:
- yield from posts
-
- if len(posts) < 50:
- break
- params["o"] += 50
+ _, _, service, creator_id, query = self.groups
+ params = text.parse_query(query)
+ return self.api.creator_posts(
+ service, creator_id, params.get("o"), params.get("q"))
class KemonopartyPostsExtractor(KemonopartyExtractor):
"""Extractor for kemono.su post listings"""
subcategory = "posts"
- pattern = BASE_PATTERN + r"/posts(?:/?\?([^#]+))?"
+ pattern = BASE_PATTERN + r"/posts()()(?:/?\?([^#]+))?"
example = "https://kemono.su/posts"
- def __init__(self, match):
- KemonopartyExtractor.__init__(self, match)
- self.query = match.group(3)
- self.api_url = self.root + "/api/v1/posts"
-
- posts = KemonopartyUserExtractor.posts
+ def posts(self):
+ params = text.parse_query(self.groups[4])
+ return self.api.posts(
+ params.get("o"), params.get("q"), params.get("tag"))
class KemonopartyPostExtractor(KemonopartyExtractor):
@@ -358,27 +319,23 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
example = "https://kemono.su/SERVICE/user/12345/post/12345"
def __init__(self, match):
- _, _, service, user_id, post_id, self.revision, self.revision_id = \
- match.groups()
- self.subcategory = service
+ self.subcategory = match.group(3)
KemonopartyExtractor.__init__(self, match)
- self.api_url = "{}/api/v1/{}/user/{}/post/{}".format(
- self.root, service, user_id, post_id)
- self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
def posts(self):
- if not self.revision:
- post = self.request(self.api_url).json()
- if self.revisions:
- return self._revisions_post(post, self.api_url)
- return (post,)
+ _, _, service, creator_id, post_id, revision, revision_id = self.groups
+ post = self.api.creator_post(service, creator_id, post_id)
+ if not revision:
+ return (post["post"],)
- revs = self._revisions_all(self.api_url)
- if not self.revision_id:
+ self.revisions = False
+
+ revs = self._revisions_all(service, creator_id, post_id)
+ if not revision_id:
return revs
for rev in revs:
- if str(rev["revision_id"]) == self.revision_id:
+ if str(rev["revision_id"]) == revision_id:
return (rev,)
raise exception.NotFoundError("revision")
@@ -391,40 +348,37 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
"{channel_name|channel}")
filename_fmt = "{id}_{num:>02}_{filename}.{extension}"
archive_fmt = "discord_{server}_{id}_{num}"
- pattern = BASE_PATTERN + r"/discord/server/(\d+)(?:/channel/(\d+))?#(.*)"
- example = "https://kemono.su/discord/server/12345#CHANNEL"
-
- def __init__(self, match):
- KemonopartyExtractor.__init__(self, match)
- _, _, self.server, self.channel_id, self.channel = match.groups()
- self.channel_name = ""
+ pattern = (BASE_PATTERN + r"/discord/server/(\d+)"
+ r"(?:/(?:channel/)?(\d+)(?:#(.+))?|#(.+))")
+ example = "https://kemono.su/discord/server/12345/12345"
def items(self):
self._prepare_ddosguard_cookies()
+ _, _, server_id, channel_id, channel_name, channel = self.groups
- if self.channel_id:
- self.channel_name = self.channel
- else:
- if self.channel.isdecimal() and len(self.channel) >= 16:
+ if channel_id is None:
+ if channel.isdecimal() and len(channel) >= 16:
key = "id"
else:
key = "name"
- for channel in self._discord_channels(self.server):
- if channel[key] == self.channel:
+ for ch in self.api.discord_server(server_id):
+ if ch[key] == channel:
break
else:
raise exception.NotFoundError("channel")
- self.channel_id = channel["id"]
- self.channel_name = channel["name"]
+ channel_id = ch["id"]
+ channel_name = ch["name"]
+ elif channel_name is None:
+ channel_name = ""
find_inline = re.compile(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
find_hash = re.compile(HASH_PATTERN).match
- posts = self.posts()
+ posts = self.api.discord_channel(channel_id)
max_posts = self.config("max-posts")
if max_posts:
posts = itertools.islice(posts, max_posts)
@@ -441,7 +395,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
append({"path": "https://cdn.discordapp.com" + path,
"name": path, "type": "inline", "hash": ""})
- post["channel_name"] = self.channel_name
+ post["channel_name"] = channel_name
post["date"] = self._parse_datetime(post["published"])
post["count"] = len(files)
yield Message.Directory, post
@@ -461,33 +415,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
url = self.root + "/data" + url[20:]
yield Message.Url, url, post
- def posts(self):
- url = "{}/api/v1/discord/channel/{}".format(
- self.root, self.channel_id)
- params = {"o": 0}
-
- while True:
- posts = self.request(url, params=params).json()
- yield from posts
-
- if len(posts) < 150:
- break
- params["o"] += 150
-
class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
subcategory = "discord-server"
pattern = BASE_PATTERN + r"/discord/server/(\d+)$"
example = "https://kemono.su/discord/server/12345"
- def __init__(self, match):
- KemonopartyExtractor.__init__(self, match)
- self.server = match.group(3)
-
def items(self):
- for channel in self._discord_channels(self.server):
- url = "{}/discord/server/{}/channel/{}#{}".format(
- self.root, self.server, channel["id"], channel["name"])
+ server_id = self.groups[2]
+ for channel in self.api.discord_server(server_id):
+ url = "{}/discord/server/{}/{}#{}".format(
+ self.root, server_id, channel["id"], channel["name"])
channel["_extractor"] = KemonopartyDiscordExtractor
yield Message.Queue, url, channel
@@ -495,26 +433,21 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
class KemonopartyFavoriteExtractor(KemonopartyExtractor):
"""Extractor for kemono.su favorites"""
subcategory = "favorite"
- pattern = BASE_PATTERN + r"/favorites(?:/?\?([^#]+))?"
+ pattern = BASE_PATTERN + r"/favorites()()(?:/?\?([^#]+))?"
example = "https://kemono.su/favorites"
- def __init__(self, match):
- KemonopartyExtractor.__init__(self, match)
- self.params = text.parse_query(match.group(3))
- self.favorites = (self.params.get("type") or
- self.config("favorites") or
- "artist")
-
def items(self):
self._prepare_ddosguard_cookies()
self.login()
- sort = self.params.get("sort")
- order = self.params.get("order") or "desc"
+ params = text.parse_query(self.groups[4])
+ type = params.get("type") or self.config("favorites") or "artist"
- if self.favorites == "artist":
- users = self.request(
- self.root + "/api/v1/account/favorites?type=artist").json()
+ sort = params.get("sort")
+ order = params.get("order") or "desc"
+
+ if type == "artist":
+ users = self.api.account_favorites("artist")
if not sort:
sort = "updated"
@@ -527,9 +460,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
self.root, user["service"], user["id"])
yield Message.Queue, url, user
- elif self.favorites == "post":
- posts = self.request(
- self.root + "/api/v1/account/favorites?type=post").json()
+ elif type == "post":
+ posts = self.api.account_favorites("post")
if not sort:
sort = "faved_seq"
@@ -541,3 +473,95 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
url = "{}/{}/user/{}/post/{}".format(
self.root, post["service"], post["user"], post["id"])
yield Message.Queue, url, post
+
+
+class KemonoAPI():
+ """Interface for the Kemono API v1.1.0
+
+ https://kemono.su/documentation/api
+ """
+
+ def __init__(self, extractor):
+ self.extractor = extractor
+ self.root = extractor.root + "/api/v1"
+
+ def posts(self, offset=0, query=None, tags=None):
+ endpoint = "/posts"
+ params = {"q": query, "o": offset, "tags": tags}
+ return self._pagination(endpoint, params, 50, "posts")
+
+ def creator_posts(self, service, creator_id, offset=0, query=None):
+ endpoint = "/{}/user/{}".format(service, creator_id)
+ params = {"q": query, "o": offset}
+ return self._pagination(endpoint, params, 50)
+
+ def creator_announcements(self, service, creator_id):
+ endpoint = "/{}/user/{}/announcements".format(service, creator_id)
+ return self._call(endpoint)
+
+ def creator_dms(self, service, creator_id):
+ endpoint = "/{}/user/{}/dms".format(service, creator_id)
+ return self._call(endpoint)
+
+ def creator_fancards(self, service, creator_id):
+ endpoint = "/{}/user/{}/fancards".format(service, creator_id)
+ return self._call(endpoint)
+
+ def creator_post(self, service, creator_id, post_id):
+ endpoint = "/{}/user/{}/post/{}".format(service, creator_id, post_id)
+ return self._call(endpoint)
+
+ def creator_post_comments(self, service, creator_id, post_id):
+ endpoint = "/{}/user/{}/post/{}/comments".format(
+ service, creator_id, post_id)
+ return self._call(endpoint)
+
+ def creator_post_revisions(self, service, creator_id, post_id):
+ endpoint = "/{}/user/{}/post/{}/revisions".format(
+ service, creator_id, post_id)
+ return self._call(endpoint)
+
+ def creator_profile(self, service, creator_id):
+ endpoint = "/{}/user/{}/profile".format(service, creator_id)
+ return self._call(endpoint)
+
+ def creator_links(self, service, creator_id):
+ endpoint = "/{}/user/{}/links".format(service, creator_id)
+ return self._call(endpoint)
+
+ def creator_tags(self, service, creator_id):
+ endpoint = "/{}/user/{}/tags".format(service, creator_id)
+ return self._call(endpoint)
+
+ def discord_channel(self, channel_id):
+ endpoint = "/discord/channel/{}".format(channel_id)
+ return self._pagination(endpoint, {}, 150)
+
+ def discord_server(self, server_id):
+ endpoint = "/discord/channel/lookup/{}".format(server_id)
+ return self._call(endpoint)
+
+ def account_favorites(self, type):
+ endpoint = "/account/favorites"
+ params = {"type": type}
+ return self._call(endpoint, params)
+
+ def _call(self, endpoint, params=None):
+ url = self.root + endpoint
+ response = self.extractor.request(url, params=params)
+ return response.json()
+
+ def _pagination(self, endpoint, params, batch=50, key=False):
+ params["o"] = text.parse_int(params.get("o")) % 50
+
+ while True:
+ data = self._call(endpoint, params)
+
+ if key:
+ yield from data[key]
+ else:
+ yield from data
+
+ if len(data) < batch:
+ return
+ params["o"] += batch
diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py
index cacf504..b60157e 100644
--- a/gallery_dl/extractor/koharu.py
+++ b/gallery_dl/extractor/koharu.py
@@ -6,20 +6,27 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://koharu.to/"""
+"""Extractors for https://niyaniya.moe/"""
from .common import GalleryExtractor, Extractor, Message
from .. import text, exception
from ..cache import cache
+import collections
-BASE_PATTERN = r"(?i)(?:https?://)?(?:koharu|anchira)\.to"
+BASE_PATTERN = (
+ r"(?i)(?:https?://)?("
+ r"(?:niyaniya|shupogaki)\.moe|"
+ r"(?:koharu|anchira|seia)\.to|"
+ r"(?:hoshino)\.one"
+ r")"
+)
class KoharuExtractor(Extractor):
"""Base class for koharu extractors"""
category = "koharu"
- root = "https://koharu.to"
- root_api = "https://api.koharu.to"
+ root = "https://niyaniya.moe"
+ root_api = "https://api.schale.network"
request_interval = (0.5, 1.5)
def _init(self):
@@ -62,7 +69,7 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
archive_fmt = "{id}_{num}"
request_interval = 0.0
pattern = BASE_PATTERN + r"/(?:g|reader)/(\d+)/(\w+)"
- example = "https://koharu.to/g/12345/67890abcde/"
+ example = "https://niyaniya.moe/g/12345/67890abcde/"
TAG_TYPES = {
0 : "general",
@@ -100,16 +107,26 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
def metadata(self, _):
url = "{}/books/detail/{}/{}".format(
- self.root_api, self.groups[0], self.groups[1])
+ self.root_api, self.groups[1], self.groups[2])
self.data = data = self.request(url, headers=self.headers).json()
+ data["date"] = text.parse_timestamp(data["created_at"] // 1000)
tags = []
- for tag in data["tags"]:
+ types = self.TAG_TYPES
+ tags_data = data["tags"]
+
+ for tag in tags_data:
name = tag["name"]
namespace = tag.get("namespace", 0)
- tags.append(self.TAG_TYPES[namespace] + ":" + name)
+ tags.append(types[namespace] + ":" + name)
data["tags"] = tags
- data["date"] = text.parse_timestamp(data["created_at"] // 1000)
+
+ if self.config("tags", False):
+ tags = collections.defaultdict(list)
+            for tag in tags_data:
+ tags[tag.get("namespace", 0)].append(tag["name"])
+ for type, values in tags.items():
+ data["tags_" + types[type]] = values
try:
if self.cbz:
@@ -179,11 +196,11 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
break
except KeyError:
self.log.debug("%s: Format %s is not available",
- self.groups[0], fmtid)
+ self.groups[1], fmtid)
else:
raise exception.NotFoundError("format")
- self.log.debug("%s: Selected format %s", self.groups[0], fmtid)
+ self.log.debug("%s: Selected format %s", self.groups[1], fmtid)
fmt["w"] = fmtid
return fmt
@@ -192,10 +209,10 @@ class KoharuSearchExtractor(KoharuExtractor):
"""Extractor for koharu search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"/\?([^#]*)"
- example = "https://koharu.to/?s=QUERY"
+ example = "https://niyaniya.moe/?s=QUERY"
def items(self):
- params = text.parse_query(self.groups[0])
+ params = text.parse_query(self.groups[1])
params["page"] = text.parse_int(params.get("page"), 1)
return self._pagination("/books", params)
@@ -204,12 +221,12 @@ class KoharuFavoriteExtractor(KoharuExtractor):
"""Extractor for koharu favorites"""
subcategory = "favorite"
pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
- example = "https://koharu.to/favorites"
+ example = "https://niyaniya.moe/favorites"
def items(self):
self.login()
- params = text.parse_query(self.groups[0])
+ params = text.parse_query(self.groups[1])
params["page"] = text.parse_int(params.get("page"), 1)
return self._pagination("/favorites", params)
@@ -226,7 +243,7 @@ class KoharuFavoriteExtractor(KoharuExtractor):
def _login_impl(self, username, password):
self.log.info("Logging in as %s", username)
- url = "https://auth.koharu.to/login"
+ url = "https://auth.schale.network/login"
data = {"uname": username, "passwd": password}
response = self.request(
url, method="POST", headers=self.headers, data=data)
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 044f4f5..295b9c4 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -46,12 +46,17 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
for data["num"], file in enumerate(files, 1):
url = file["file"]
file.update(data)
- text.nameext_from_url(url, file)
+
+ if "extension" not in file:
+ text.nameext_from_url(url, file)
if "name" in file:
name = file["name"]
file["name"] = name.rpartition(".")[0] or name
file["id"] = file["filename"].rpartition("-")[2]
+ elif "id" in file:
+ file["name"] = file["filename"]
+ file["filename"] = "{}-{}".format(file["name"], file["id"])
else:
file["name"], sep, file["id"] = \
file["filename"].rpartition("-")
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 1f24593..7f87cff 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -26,6 +26,7 @@ class MangadexExtractor(Extractor):
"{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}")
archive_fmt = "{chapter_id}_{page}"
root = "https://mangadex.org"
+ useragent = util.USERAGENT
_cache = {}
def __init__(self, match):
@@ -33,7 +34,6 @@ class MangadexExtractor(Extractor):
self.uuid = match.group(1)
def _init(self):
- self.session.headers["User-Agent"] = util.USERAGENT
self.api = MangadexAPI(self)
def items(self):
@@ -221,7 +221,7 @@ class MangadexAPI():
return self._call("/list/" + uuid)["data"]
def list_feed(self, uuid):
- return self._pagination("/list/" + uuid + "/feed")
+ return self._pagination_chapters("/list/" + uuid + "/feed")
@memcache(keyarg=1)
def manga(self, uuid):
@@ -230,7 +230,7 @@ class MangadexAPI():
def manga_author(self, uuid_author):
params = {"authorOrArtist": uuid_author}
- return self._pagination("/manga", params)
+ return self._pagination_manga("/manga", params)
def manga_feed(self, uuid):
order = "desc" if self.extractor.config("chapter-reverse") else "asc"
@@ -238,11 +238,11 @@ class MangadexAPI():
"order[volume]" : order,
"order[chapter]": order,
}
- return self._pagination("/manga/" + uuid + "/feed", params)
+ return self._pagination_chapters("/manga/" + uuid + "/feed", params)
def user_follows_manga_feed(self):
params = {"order[publishAt]": "desc"}
- return self._pagination("/user/follows/manga/feed", params)
+ return self._pagination_chapters("/user/follows/manga/feed", params)
def authenticate(self):
self.headers["Authorization"] = \
@@ -289,22 +289,31 @@ class MangadexAPI():
raise exception.StopExtraction(
"%s %s (%s)", response.status_code, response.reason, msg)
- def _pagination(self, endpoint, params=None):
+ def _pagination_chapters(self, endpoint, params=None):
if params is None:
params = {}
+ lang = self.extractor.config("lang")
+ if isinstance(lang, str) and "," in lang:
+ lang = lang.split(",")
+ params["translatedLanguage[]"] = lang
+ params["includes[]"] = ("scanlation_group",)
+
+ return self._pagination(endpoint, params)
+
+ def _pagination_manga(self, endpoint, params=None):
+ if params is None:
+ params = {}
+
+ return self._pagination(endpoint, params)
+
+ def _pagination(self, endpoint, params):
config = self.extractor.config
+
ratings = config("ratings")
if ratings is None:
ratings = ("safe", "suggestive", "erotica", "pornographic")
-
- lang = config("lang")
- if isinstance(lang, str) and "," in lang:
- lang = lang.split(",")
-
params["contentRating[]"] = ratings
- params["translatedLanguage[]"] = lang
- params["includes[]"] = ("scanlation_group",)
params["offset"] = 0
api_params = config("api-parameters")
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index cb7f701..5b354ac 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -19,7 +19,6 @@ class MastodonExtractor(BaseExtractor):
directory_fmt = ("mastodon", "{instance}", "{account[username]}")
filename_fmt = "{category}_{id}_{media[id]}.{extension}"
archive_fmt = "{media[id]}"
- cookies_domain = None
def __init__(self, match):
BaseExtractor.__init__(self, match)
diff --git a/gallery_dl/extractor/motherless.py b/gallery_dl/extractor/motherless.py
new file mode 100644
index 0000000..c5b9322
--- /dev/null
+++ b/gallery_dl/extractor/motherless.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://motherless.com/"""
+
+from .common import Extractor, Message
+from .. import text, util
+from ..cache import memcache
+from datetime import timedelta
+
+BASE_PATTERN = r"(?:https?://)?motherless\.com"
+
+
+class MotherlessExtractor(Extractor):
+ """Base class for motherless extractors"""
+ category = "motherless"
+ root = "https://motherless.com"
+ filename_fmt = "{id} {title}.{extension}"
+ archive_fmt = "{id}"
+
+
+class MotherlessMediaExtractor(MotherlessExtractor):
+ """Extractor for a single image/video from motherless.com"""
+ subcategory = "media"
+ pattern = (BASE_PATTERN +
+ r"/((?:g/[^/?#]+/|G[IV]?[A-Z0-9]+/)?"
+ r"(?!G)[A-Z0-9]+)")
+ example = "https://motherless.com/ABC123"
+
+ def items(self):
+ file = self._extract_media(self.groups[0])
+ url = file["url"]
+ yield Message.Directory, file
+ yield Message.Url, url, text.nameext_from_url(url, file)
+
+ def _extract_media(self, path):
+ url = self.root + "/" + path
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ path, _, media_id = path.rpartition("/")
+ data = {
+ "id" : media_id,
+ "type" : extr("__mediatype = '", "'"),
+ "group": extr("__group = '", "'"),
+ "url" : extr("__fileurl = '", "'"),
+ "tags" : [
+ text.unescape(tag)
+ for tag in text.extract_iter(
+ extr('class="media-meta-tags">', "</div>"), ">#", "<")
+ ],
+ "title": text.unescape(extr("<h1>", "<")),
+ "views": text.parse_int(extr(
+ 'class="count">', " ").replace(",", "")),
+ "favorites": text.parse_int(extr(
+ 'class="count">', " ").replace(",", "")),
+ "date" : self._parse_datetime(extr('class="count">', "<")),
+ "uploader": text.unescape(extr('class="username">', "<").strip()),
+ }
+
+ if path and path[0] == "G":
+ data["gallery_id"] = path[1:]
+ data["gallery_title"] = self._extract_gallery_title(
+ page, data["gallery_id"])
+
+ return data
+
+ def _parse_datetime(self, dt):
+ if " ago" not in dt:
+ return text.parse_datetime(dt, "%d %b %Y")
+
+ value = text.parse_int(dt[:-5])
+ delta = timedelta(0, value*3600) if dt[-5] == "h" else timedelta(value)
+ return (util.datetime_utcnow() - delta).replace(
+ hour=0, minute=0, second=0)
+
+ @memcache(keyarg=2)
+ def _extract_gallery_title(self, page, gallery_id):
+ title = text.extr(
+ text.extr(page, '<h1 class="content-title">', "</h1>"),
+ "From the gallery:", "<")
+ if title:
+ return text.unescape(title.strip())
+
+ pos = page.find(' href="/G' + gallery_id + '"')
+ if pos >= 0:
+ return text.unescape(text.extract(
+ page, ' title="', '"', pos)[0])
+
+ return ""
+
+
+class MotherlessGalleryExtractor(MotherlessExtractor):
+ """Extractor for a motherless.com gallery"""
+ subcategory = "gallery"
+ directory_fmt = ("{category}", "{uploader}",
+ "{gallery_id} {gallery_title}")
+ archive_fmt = "{gallery_id}_{id}"
+ pattern = BASE_PATTERN + "/G([IVG])?([A-Z0-9]+)/?$"
+ example = "https://motherless.com/GABC123"
+
+ def items(self):
+ type, gid = self.groups
+
+ if not type:
+ data = {"_extractor": MotherlessGalleryExtractor}
+ yield Message.Queue, self.root + "/GI" + gid, data
+ yield Message.Queue, self.root + "/GV" + gid, data
+ return
+
+ url = "{}/G{}{}".format(self.root, type, gid)
+ page = self.request(url).text
+ data = self._extract_gallery_data(page)
+
+ for num, thumb in enumerate(self._pagination(page), 1):
+ file = self._parse_thumb_data(thumb)
+ file.update(data)
+ file["num"] = num
+ url = file["url"]
+ yield Message.Directory, file
+ yield Message.Url, url, text.nameext_from_url(url, file)
+
+ def _pagination(self, page):
+ while True:
+ for thumb in text.extract_iter(
+ page, 'class="thumb-container', "</div>"):
+ yield thumb
+
+ url = text.extr(page, '<link rel="next" href="', '"')
+ if not url:
+ return
+ page = self.request(text.unescape(url)).text
+
+ def _extract_gallery_data(self, page):
+ extr = text.extract_from(page)
+ return {
+ "gallery_id": self.groups[-1],
+ "gallery_title": text.unescape(extr(
+ "<title>", "<").rpartition(" | ")[0]),
+ "uploader": text.remove_html(extr(
+ 'class="gallery-member-username">', "</")),
+ "count": text.parse_int(
+ extr('<span class="active">', ")")
+ .rpartition("(")[2].replace(",", "")),
+ }
+
+ def _parse_thumb_data(self, thumb):
+ extr = text.extract_from(thumb)
+ data = {
+ "id" : extr('data-codename="', '"'),
+ "type" : extr('data-mediatype="', '"'),
+ "thumbnail": extr('class="static" src="', '"'),
+ "title" : extr(' alt="', '"'),
+ }
+
+ type = data["type"]
+ url = data["thumbnail"].replace("thumb", type)
+ if type == "video":
+ url = "{}/{}.mp4".format(url.rpartition("/")[0], data["id"])
+ data["url"] = url
+
+ return data
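
The motherless extractor has to handle both absolute upload dates ("12 Jan 2024") and relative ones ("3h ago", "10d ago"). The following is a stdlib-only sketch of the same parsing logic as _parse_datetime above, kept separate from gallery-dl's text and util helpers.

# Standalone sketch of MotherlessExtractor._parse_datetime: absolute
# dates use "%d %b %Y"; relative values like "3h ago" / "10d ago" are
# subtracted from the current UTC time and truncated to midnight.
from datetime import datetime, timedelta, timezone

def parse_motherless_date(value):
    if " ago" not in value:
        return datetime.strptime(value, "%d %b %Y")
    number = int(value[:-5])              # strip the unit char and " ago"
    if value[-5] == "h":
        delta = timedelta(hours=number)
    else:                                 # assume days otherwise
        delta = timedelta(days=number)
    now = datetime.now(timezone.utc)
    return (now - delta).replace(hour=0, minute=0, second=0, microsecond=0)

print(parse_motherless_date("12 Jan 2024"))
print(parse_motherless_date("3h ago"))
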
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 61ffdee..8ffa14b 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -193,7 +193,8 @@ class NewgroundsExtractor(Extractor):
data["_comment"] = extr(
'id="author_comments"', '</div>').partition(">")[2]
data["comment"] = text.unescape(text.remove_html(
- data["_comment"], "", ""))
+ data["_comment"]
+ .replace("<p><br></p>", "\n\n").replace("<br>", "\n"), "", ""))
data["favorites"] = text.parse_int(extr(
'id="faves_load">', '<').replace(",", ""))
data["score"] = text.parse_float(extr('id="score_number">', '<'))
@@ -214,7 +215,7 @@ class NewgroundsExtractor(Extractor):
data = {
"title" : text.unescape(extr('"og:title" content="', '"')),
"description": text.unescape(extr(':description" content="', '"')),
- "type" : extr('og:type" content="', '"'),
+ "type" : "art",
"_type" : "i",
"date" : text.parse_datetime(extr(
'itemprop="datePublished" content="', '"')),
@@ -231,7 +232,7 @@ class NewgroundsExtractor(Extractor):
if image_data:
data["_multi"] = self._extract_images_multi(image_data)
else:
- art_images = extr('<div class="art-images', '\n</div>')
+ art_images = extr('<div class="art-images', '\n\t\t</div>')
if art_images:
data["_multi"] = self._extract_images_art(art_images, data)
@@ -263,7 +264,7 @@ class NewgroundsExtractor(Extractor):
return {
"title" : text.unescape(extr('"og:title" content="', '"')),
"description": text.unescape(extr(':description" content="', '"')),
- "type" : extr('og:type" content="', '"'),
+ "type" : "audio",
"_type" : "a",
"date" : text.parse_datetime(extr(
'itemprop="datePublished" content="', '"')),
@@ -283,8 +284,13 @@ class NewgroundsExtractor(Extractor):
if src:
src = src.replace("\\/", "/")
formats = ()
+ type = extr(',"description":"', '"')
date = text.parse_datetime(extr(
'itemprop="datePublished" content="', '"'))
+ if type:
+ type = type.rpartition(" ")[2].lower()
+ else:
+ type = "flash" if text.ext_from_url(url) == "swf" else "game"
else:
url = self.root + "/portal/video/" + index
headers = {
@@ -295,6 +301,7 @@ class NewgroundsExtractor(Extractor):
formats = self._video_formats(sources)
src = next(formats, "")
date = text.parse_timestamp(src.rpartition("?")[2])
+ type = "movie"
return {
"title" : text.unescape(title),
@@ -513,7 +520,9 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
"""Extractor for a newgrounds user's favorited users"""
subcategory = "following"
- pattern = USER_PATTERN + r"/favorites/(following)"
+ pattern = (USER_PATTERN + r"/favorites/(following)"
+ r"(?:(?:/page/|/?\?page=)(\d+))?")
+
example = "https://USER.newgrounds.com/favorites/following"
def items(self):
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index 09b2b16..90c5420 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -61,7 +61,7 @@ class NhentaiGalleryExtractor(GalleryExtractor):
def images(self, _):
ufmt = ("https://i.nhentai.net/galleries/" +
self.data["media_id"] + "/{}.{}")
- extdict = {"j": "jpg", "p": "png", "g": "gif"}
+ extdict = {"j": "jpg", "p": "png", "g": "gif", "w": "webp"}
return [
(ufmt.format(num, extdict.get(img["t"], "jpg")), {
diff --git a/gallery_dl/extractor/noop.py b/gallery_dl/extractor/noop.py
new file mode 100644
index 0000000..df2316c
--- /dev/null
+++ b/gallery_dl/extractor/noop.py
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""noop extractor"""
+
+from .common import Extractor, Message
+
+
+class NoopExtractor(Extractor):
+ category = "noop"
+ pattern = r"(?i)noo?p$"
+ example = "noop"
+
+ def items(self):
+ # yield *something* to prevent a 'No results' message
+ yield Message.Version, 1
+
+ # Save cookies manually, since it happens automatically only after
+ # extended extractor initialization, i.e. Message.Directory, which
+ # itself might cause some unintended effects.
+ if self.cookies:
+ self.cookies_store()
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 0b64ea3..3eacf1a 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -26,12 +26,15 @@ class PatreonExtractor(Extractor):
_warning = True
def _init(self):
- self.session.headers["User-Agent"] = \
- "Patreon/72.2.28 (Android; Android 14; Scale/2.10)"
- if self._warning:
- if not self.cookies_check(("session_id",)):
+ if self.cookies_check(("session_id",)):
+ self.session.headers["User-Agent"] = \
+ "Patreon/72.2.28 (Android; Android 14; Scale/2.10)"
+ else:
+ if self._warning:
+ PatreonExtractor._warning = False
self.log.warning("no 'session_id' cookie set")
- PatreonExtractor._warning = False
+ self.session.headers["User-Agent"] = \
+ "Patreon/7.6.28 (Android; Android 11; Scale/2.10)"
def items(self):
generators = self._build_file_generators(self.config("files"))
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 150efed..1b67272 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -46,7 +46,7 @@ BASE_PATTERN = PhilomenaExtractor.update({
"ponybooru": {
"root": "https://ponybooru.org",
"pattern": r"(?:www\.)?ponybooru\.org",
- "filter_id": "2",
+ "filter_id": "3",
},
"furbooru": {
"root": "https://furbooru.org",
diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py
index 422325f..fe26704 100644
--- a/gallery_dl/extractor/piczel.py
+++ b/gallery_dl/extractor/piczel.py
@@ -19,7 +19,7 @@ class PiczelExtractor(Extractor):
filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}"
archive_fmt = "{id}_{num}"
root = "https://piczel.tv"
- api_root = root
+ root_api = root
def items(self):
for post in self.posts():
@@ -75,7 +75,7 @@ class PiczelUserExtractor(PiczelExtractor):
self.user = match.group(1)
def posts(self):
- url = "{}/api/users/{}/gallery".format(self.api_root, self.user)
+ url = "{}/api/users/{}/gallery".format(self.root_api, self.user)
return self._pagination(url)
@@ -93,7 +93,7 @@ class PiczelFolderExtractor(PiczelExtractor):
self.user, self.folder_id = match.groups()
def posts(self):
- url = "{}/api/users/{}/gallery".format(self.api_root, self.user)
+ url = "{}/api/users/{}/gallery".format(self.root_api, self.user)
return self._pagination(url, int(self.folder_id))
@@ -108,5 +108,5 @@ class PiczelImageExtractor(PiczelExtractor):
self.image_id = match.group(1)
def posts(self):
- url = "{}/api/gallery/{}".format(self.api_root, self.image_id)
+ url = "{}/api/gallery/{}".format(self.root_api, self.image_id)
return (self.request(url).json(),)
diff --git a/gallery_dl/extractor/pillowfort.py b/gallery_dl/extractor/pillowfort.py
index 5362f13..5749240 100644
--- a/gallery_dl/extractor/pillowfort.py
+++ b/gallery_dl/extractor/pillowfort.py
@@ -52,6 +52,7 @@ class PillowfortExtractor(Extractor):
post["date"] = text.parse_datetime(
post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z")
post["post_id"] = post.pop("id")
+ post["count"] = len(files)
yield Message.Directory, post
post["num"] = 0
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 499c579..121c7bf 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -117,11 +117,16 @@ class PinterestExtractor(Extractor):
else:
media = self._extract_image(page, block)
- elif type == "story_pin_video_block":
+ elif type == "story_pin_video_block" or "video" in block:
video = block["video"]
media = self._extract_video(video)
media["media_id"] = video.get("id") or ""
+ elif type == "story_pin_music_block" or "audio" in block:
+ media = block["audio"]
+ media["url"] = media["audio_url"]
+ media["media_id"] = media.get("id") or ""
+
elif type == "story_pin_paragraph_block":
media = {"url": "text:" + block["text"],
"extension": "txt",
@@ -130,7 +135,10 @@ class PinterestExtractor(Extractor):
else:
self.log.warning("%s: Unsupported story block '%s'",
pin.get("id"), type)
- continue
+ try:
+ media = self._extract_image(page, block)
+ except Exception:
+ continue
media["story_id"] = story_id
media["page_id"] = page_id
@@ -397,14 +405,19 @@ class PinterestAPI():
self.root = extractor.root
self.cookies = {"csrftoken": csrf_token}
self.headers = {
- "Accept" : "application/json, text/javascript, "
- "*/*, q=0.01",
- "Accept-Language" : "en-US,en;q=0.5",
- "X-Requested-With" : "XMLHttpRequest",
- "X-APP-VERSION" : "0c4af40",
- "X-CSRFToken" : csrf_token,
- "X-Pinterest-AppState": "active",
- "Origin" : self.root,
+ "Accept" : "application/json, text/javascript, "
+ "*/*, q=0.01",
+ "X-Requested-With" : "XMLHttpRequest",
+ "X-APP-VERSION" : "a89153f",
+ "X-Pinterest-AppState" : "active",
+ "X-Pinterest-Source-Url" : None,
+ "X-Pinterest-PWS-Handler": "www/[username].js",
+ "Alt-Used" : "www.pinterest.com",
+ "Connection" : "keep-alive",
+ "Cookie" : None,
+ "Sec-Fetch-Dest" : "empty",
+ "Sec-Fetch-Mode" : "cors",
+ "Sec-Fetch-Site" : "same-origin",
}
def pin(self, pin_id):
@@ -437,7 +450,12 @@ class PinterestAPI():
def board_pins(self, board_id):
"""Yield all pins of a specific board"""
- options = {"board_id": board_id}
+ options = {
+ "board_id": board_id,
+ "field_set_key": "react_grid_pin",
+ "prepend": False,
+ "bookmarks": None,
+ }
return self._pagination("BoardFeed", options)
def board_section(self, section_id):
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8c6e6d8..8ad061d 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -26,13 +26,14 @@ class PixivExtractor(Extractor):
directory_fmt = ("{category}", "{user[id]} {user[account]}")
filename_fmt = "{id}_p{num}.{extension}"
archive_fmt = "{id}{suffix}.{extension}"
- cookies_domain = None
+ cookies_domain = ".pixiv.net"
sanity_url = "https://s.pximg.net/common/images/limit_sanity_level_360.png"
mypixiv_url = "https://s.pximg.net/common/images/limit_mypixiv_360.png"
def _init(self):
self.api = PixivAppAPI(self)
self.load_ugoira = self.config("ugoira", True)
+ self.load_ugoira_original = (self.load_ugoira == "original")
self.max_posts = self.config("max-posts", 0)
self.sanity_workaround = self.config("sanity", True)
self.meta_user = self.config("metadata")
@@ -105,34 +106,7 @@ class PixivExtractor(Extractor):
del work["image_urls"]
del work["meta_pages"]
- if work["type"] == "ugoira":
- if self.load_ugoira:
- try:
- return self._extract_ugoira(work)
- except Exception as exc:
- self.log.warning(
- "%s: Unable to retrieve Ugoira metatdata (%s - %s)",
- work["id"], exc.__class__.__name__, exc)
-
- elif work["page_count"] == 1:
- url = meta_single_page["original_image_url"]
- if url == self.sanity_url:
- if self.sanity_workaround:
- self.log.warning("%s: 'sanity_level' warning", work["id"])
- body = self._request_ajax("/illust/" + str(work["id"]))
- return self._extract_ajax(work, body)
- else:
- self.log.warning(
- "%s: Unable to download work ('sanity_level' warning)",
- work["id"])
- elif url == self.mypixiv_url:
- work["_mypixiv"] = True
- self.log.warning("%s: 'My pixiv' locked", work["id"])
- return ()
- else:
- return ({"url": url},)
-
- else:
+ if meta_pages:
return [
{
"url" : img["image_urls"]["original"],
@@ -141,30 +115,58 @@ class PixivExtractor(Extractor):
for num, img in enumerate(meta_pages)
]
+ url = meta_single_page["original_image_url"]
+ if url == self.sanity_url:
+ work["_ajax"] = True
+ self.log.warning("%s: 'limit_sanity_level' warning", work["id"])
+ if self.sanity_workaround:
+ body = self._request_ajax("/illust/" + str(work["id"]))
+ return self._extract_ajax(work, body)
+
+ elif url == self.mypixiv_url:
+ work["_mypixiv"] = True
+ self.log.warning("%s: 'My pixiv' locked", work["id"])
+
+ elif work["type"] != "ugoira":
+ return ({"url": url},)
+
+ elif self.load_ugoira:
+ try:
+ return self._extract_ugoira(work, url)
+ except Exception as exc:
+ self.log.warning(
+ "%s: Unable to retrieve Ugoira metatdata (%s - %s)",
+ work["id"], exc.__class__.__name__, exc)
+
return ()
- def _extract_ugoira(self, work):
+ def _extract_ugoira(self, work, img_url):
ugoira = self.api.ugoira_metadata(work["id"])
- url = ugoira["zip_urls"]["medium"]
work["_ugoira_frame_data"] = work["frames"] = frames = ugoira["frames"]
- work["date_url"] = self._date_from_url(url)
+ work["_ugoira_original"] = self.load_ugoira_original
work["_http_adjust_extension"] = False
- if self.load_ugoira == "original":
- work["_ugoira_original"] = True
- base, sep, _ = url.rpartition("_ugoira")
- base = base.replace("/img-zip-ugoira/", "/img-original/", 1) + sep
+ if self.load_ugoira_original:
+ work["date_url"] = self._date_from_url(img_url)
- for ext in ("jpg", "png", "gif"):
- try:
- url = "{}0.{}".format(base, ext)
- self.request(url, method="HEAD")
- break
- except exception.HttpError:
- pass
+ base, sep, ext = img_url.rpartition("_ugoira0.")
+ if sep:
+ base += "_ugoira"
else:
- self.log.warning(
- "Unable to find Ugoira frame URLs (%s)", work["id"])
+ base, sep, _ = img_url.rpartition("_ugoira")
+ base = base.replace(
+ "/img-zip-ugoira/", "/img-original/", 1) + sep
+
+ for ext in ("jpg", "png", "gif"):
+ try:
+ url = "{}0.{}".format(base, ext)
+ self.request(url, method="HEAD")
+ break
+ except exception.HttpError:
+ pass
+ else:
+ self.log.warning(
+ "Unable to find Ugoira frame URLs (%s)", work["id"])
return [
{
@@ -174,9 +176,11 @@ class PixivExtractor(Extractor):
}
for num in range(len(frames))
]
+
else:
- work["_ugoira_original"] = False
- url = url.replace("_ugoira600x600", "_ugoira1920x1080", 1)
+ zip_url = ugoira["zip_urls"]["medium"]
+ work["date_url"] = self._date_from_url(zip_url)
+ url = zip_url.replace("_ugoira600x600", "_ugoira1920x1080", 1)
return ({"url": url},)
def _request_ajax(self, endpoint):
@@ -333,12 +337,12 @@ class PixivUserExtractor(PixivExtractor):
class PixivArtworksExtractor(PixivExtractor):
"""Extractor for artworks of a pixiv user"""
subcategory = "artworks"
- _warning = True
pattern = (BASE_PATTERN + r"/(?:"
r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)"
r"(?:/([^/?#]+))?/?(?:$|[?#])"
r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)")
example = "https://www.pixiv.net/en/users/12345/artworks"
+ _warn_phpsessid = True
def _init(self):
PixivExtractor._init(self)
@@ -352,12 +356,13 @@ class PixivArtworksExtractor(PixivExtractor):
self.tag = t1 or t2
if self.sanity_workaround:
- self.cookies_domain = d = ".pixiv.net"
+ self.cookies_domain = domain = ".pixiv.net"
self._init_cookies()
- if self._warning and not self.cookies.get("PHPSESSID", domain=d):
- PixivArtworksExtractor._warning = False
- self.log.warning("No 'PHPSESSID' cookie set. Can detect only "
- "non R-18 'sanity_level' works.")
+ if self._warn_phpsessid:
+ PixivArtworksExtractor._warn_phpsessid = False
+ if not self.cookies.get("PHPSESSID", domain=domain):
+ self.log.warning("No 'PHPSESSID' cookie set. Can detect on"
+ "ly non R-18 'limit_sanity_level' works.")
def metadata(self):
if self.config("metadata"):
@@ -601,7 +606,10 @@ class PixivRankingExtractor(PixivExtractor):
self.mode = self.date = None
def works(self):
- return self.api.illust_ranking(self.mode, self.date)
+ ranking = self.ranking
+ for ranking["rank"], work in enumerate(
+ self.api.illust_ranking(self.mode, self.date), 1):
+ yield work
def metadata(self):
query = text.parse_query(self.query)
@@ -640,10 +648,12 @@ class PixivRankingExtractor(PixivExtractor):
date = (now - timedelta(days=1)).strftime("%Y-%m-%d")
self.date = date
- return {"ranking": {
+ self.ranking = ranking = {
"mode": mode,
"date": self.date,
- }}
+ "rank": 0,
+ }
+ return {"ranking": ranking}
class PixivSearchExtractor(PixivExtractor):
@@ -734,7 +744,6 @@ class PixivPixivisionExtractor(PixivExtractor):
directory_fmt = ("{category}", "pixivision",
"{pixivision_id} {pixivision_title}")
archive_fmt = "V{pixivision_id}_{id}{suffix}.{extension}"
- cookies_domain = ".pixiv.net"
pattern = r"(?:https?://)?(?:www\.)?pixivision\.net/(?:en/)?a/(\d+)"
example = "https://www.pixivision.net/en/a/12345"
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index bd22283..e09a7aa 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -41,7 +41,7 @@ class PoipikuExtractor(Extractor):
post = {
"post_category": extr("<title>[", "]"),
- "count" : extr("(", " "),
+ "count" : text.parse_int(extr("(", " ")),
"post_id" : parts[-1].partition(".")[0],
"user_id" : parts[-2],
"user_name" : text.unescape(extr(
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 8577e74..89eafc8 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -340,18 +340,16 @@ class RedditRedirectExtractor(Extractor):
category = "reddit"
subcategory = "redirect"
pattern = (r"(?:https?://)?(?:"
- r"(?:\w+\.)?reddit\.com/(?:(?:r)/([^/?#]+)))"
+ r"(?:\w+\.)?reddit\.com/(?:(r|u|user)/([^/?#]+)))"
r"/s/([a-zA-Z0-9]{10})")
example = "https://www.reddit.com/r/SUBREDDIT/s/abc456GHIJ"
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.subreddit = match.group(1)
- self.share_url = match.group(2)
-
def items(self):
- url = "https://www.reddit.com/r/" + self.subreddit + "/s/" + \
- self.share_url
+ sub_type, subreddit, share_url = self.groups
+ if sub_type == "u":
+ sub_type = "user"
+ url = "https://www.reddit.com/{}/{}/s/{}".format(
+ sub_type, subreddit, share_url)
data = {"_extractor": RedditSubmissionExtractor}
response = self.request(url, method="HEAD", allow_redirects=False,
notfound="submission")
diff --git a/gallery_dl/extractor/rule34vault.py b/gallery_dl/extractor/rule34vault.py
new file mode 100644
index 0000000..8c8abfa
--- /dev/null
+++ b/gallery_dl/extractor/rule34vault.py
@@ -0,0 +1,119 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://rule34vault.com/"""
+
+from .booru import BooruExtractor
+from .. import text
+import collections
+
+BASE_PATTERN = r"(?:https?://)?rule34vault\.com"
+
+
+class Rule34vaultExtractor(BooruExtractor):
+ category = "rule34vault"
+ root = "https://rule34vault.com"
+ root_cdn = "https://r34xyz.b-cdn.net"
+ filename_fmt = "{category}_{id}.{extension}"
+ per_page = 100
+
+ TAG_TYPES = {
+ 1: "general",
+ 2: "copyright",
+ 4: "character",
+ 8: "artist",
+ }
+
+ def _file_url(self, post):
+ post_id = post["id"]
+ extension = "jpg" if post["type"] == 0 else "mp4"
+ post["file_url"] = url = "{}/posts/{}/{}/{}.{}".format(
+ self.root_cdn, post_id // 1000, post_id, post_id, extension)
+ return url
+
+ def _prepare(self, post):
+ post.pop("files", None)
+ post["date"] = text.parse_datetime(
+ post["created"], "%Y-%m-%dT%H:%M:%S.%fZ")
+ if "tags" in post:
+ post["tags"] = [t["value"] for t in post["tags"]]
+
+ def _tags(self, post, _):
+ if "tags" not in post:
+ post.update(self._fetch_post(post["id"]))
+
+ tags = collections.defaultdict(list)
+ for tag in post["tags"]:
+ tags[tag["type"]].append(tag["value"])
+ types = self.TAG_TYPES
+ for type, values in tags.items():
+ post["tags_" + types[type]] = values
+
+ def _fetch_post(self, post_id):
+ url = "{}/api/v2/post/{}".format(self.root, post_id)
+ return self.request(url).json()
+
+ def _pagination(self, endpoint, params=None):
+ url = "{}/api{}".format(self.root, endpoint)
+
+ if params is None:
+ params = {}
+ params["CountTotal"] = False
+ params["Skip"] = self.page_start * self.per_page
+ params["take"] = self.per_page
+ threshold = self.per_page
+
+ while True:
+ data = self.request(url, method="POST", json=params).json()
+
+ yield from data["items"]
+
+ if len(data["items"]) < threshold:
+ return
+ params["cursor"] = data.get("cursor")
+ params["Skip"] += params["take"]
+
+
+class Rule34vaultPostExtractor(Rule34vaultExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
+ example = "https://rule34vault.com/post/12345"
+
+ def posts(self):
+ return (self._fetch_post(self.groups[0]),)
+
+
+class Rule34vaultPlaylistExtractor(Rule34vaultExtractor):
+ subcategory = "playlist"
+ directory_fmt = ("{category}", "{playlist_id}")
+ archive_fmt = "p_{playlist_id}_{id}"
+ pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
+ example = "https://rule34vault.com/playlists/view/12345"
+
+ def metadata(self):
+ return {"playlist_id": self.groups[0]}
+
+ def posts(self):
+ endpoint = "/v2/post/search/playlist/" + self.groups[0]
+ return self._pagination(endpoint)
+
+
+class Rule34vaultTagExtractor(Rule34vaultExtractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/(?!p(?:ost|laylists)/)([^/?#]+)"
+ example = "https://rule34vault.com/TAG"
+
+ def metadata(self):
+        self.tags = text.unquote(self.groups[0]).split("|")
+ return {"search_tags": " ".join(self.tags)}
+
+ def posts(self):
+ endpoint = "/v2/post/search/root"
+ params = {"includeTags": [t.replace("_", " ") for t in self.tags]}
+ return self._pagination(endpoint, params)
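
rule34vault's search API is paginated with POST requests: a JSON body carries the tag filter, a Skip/take window, and the cursor returned by the previous page. A hedged sketch of that loop outside the BooruExtractor machinery, with the endpoint and field names taken from the new extractor above:

# Sketch of the POST-based search pagination in Rule34vaultExtractor:
# each response's cursor is fed back into the next request together
# with an increasing "Skip" offset until a short page is returned.
import requests

def search_posts(tags, per_page=100):
    url = "https://rule34vault.com/api/v2/post/search/root"
    body = {
        "includeTags": tags,
        "CountTotal": False,
        "Skip": 0,
        "take": per_page,
    }
    while True:
        data = requests.post(url, json=body, timeout=30).json()
        yield from data["items"]
        if len(data["items"]) < per_page:
            return
        body["cursor"] = data.get("cursor")   # pass the cursor along
        body["Skip"] += per_page
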
diff --git a/gallery_dl/extractor/rule34xyz.py b/gallery_dl/extractor/rule34xyz.py
new file mode 100644
index 0000000..f1e7518
--- /dev/null
+++ b/gallery_dl/extractor/rule34xyz.py
@@ -0,0 +1,143 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://rule34.xyz/"""
+
+from .booru import BooruExtractor
+from .. import text
+import collections
+
+BASE_PATTERN = r"(?:https?://)?rule34\.xyz"
+
+
+class Rule34xyzExtractor(BooruExtractor):
+ category = "rule34xyz"
+ root = "https://rule34.xyz"
+ root_cdn = "https://rule34xyz.b-cdn.net"
+ filename_fmt = "{category}_{id}.{extension}"
+ per_page = 60
+
+ TAG_TYPES = {
+ 0: "general",
+ 1: "copyright",
+ 2: "character",
+ 3: "artist",
+ }
+
+ def _init(self):
+ formats = self.config("format")
+ if formats:
+ if isinstance(formats, str):
+ formats = formats.split(",")
+ self.formats = formats
+ else:
+ self.formats = ("10", "40", "41", "2")
+
+ def _file_url(self, post):
+ post["files"] = files = {
+ str(link["type"]): link["url"]
+ for link in post.pop("imageLinks")
+ }
+
+ for fmt in self.formats:
+ if fmt in files:
+ break
+ else:
+ fmt = "2"
+ self.log.warning("%s: Requested format not available", post["id"])
+
+ post["file_url"] = url = files[fmt]
+ post["format_id"] = fmt
+ post["format"] = url.rsplit(".", 2)[1]
+ return url
+
+ def _prepare(self, post):
+ post.pop("filesPreview", None)
+ post.pop("tagsWithType", None)
+ post["date"] = text.parse_datetime(
+ post["created"], "%Y-%m-%dT%H:%M:%S.%f")
+
+ def _tags(self, post, _):
+ if post.get("tagsWithType") is None:
+ post.update(self._fetch_post(post["id"]))
+
+ tags = collections.defaultdict(list)
+ for tag in post["tagsWithType"]:
+ tags[tag["type"]].append(tag["value"])
+ types = self.TAG_TYPES
+ for type, values in tags.items():
+ post["tags_" + types[type]] = values
+
+ def _fetch_post(self, post_id):
+ url = "{}/api/post/{}".format(self.root, post_id)
+ return self.request(url).json()
+
+ def _pagination(self, endpoint, params=None):
+ url = "{}/api{}".format(self.root, endpoint)
+
+ if params is None:
+ params = {}
+ params["IncludeLinks"] = "true"
+ params["IncludeTags"] = "true"
+ params["OrderBy"] = "0"
+ params["Skip"] = self.page_start * self.per_page
+ params["Take"] = self.per_page
+ params["DisableTotal"] = "true"
+ threshold = self.per_page
+
+ while True:
+ data = self.request(url, params=params).json()
+
+ yield from data["items"]
+
+ if len(data["items"]) < threshold:
+ return
+ params["Skip"] += params["Take"]
+
+
+class Rule34xyzPostExtractor(Rule34xyzExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
+ example = "https://rule34.xyz/post/12345"
+
+ def posts(self):
+ return (self._fetch_post(self.groups[0]),)
+
+
+class Rule34xyzPlaylistExtractor(Rule34xyzExtractor):
+ subcategory = "playlist"
+ directory_fmt = ("{category}", "{playlist_id}")
+ archive_fmt = "p_{playlist_id}_{id}"
+ pattern = BASE_PATTERN + r"/playlists/view/(\d+)"
+ example = "https://rule34.xyz/playlists/view/12345"
+
+ def metadata(self):
+ return {"playlist_id": self.groups[0]}
+
+ def posts(self):
+ endpoint = "/playlist-item"
+ params = {"PlaylistId": self.groups[0]}
+ return self._pagination(endpoint, params)
+
+
+class Rule34xyzTagExtractor(Rule34xyzExtractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = BASE_PATTERN + r"/([^/?#]+)$"
+ example = "https://rule34.xyz/TAG"
+
+ def metadata(self):
+ self.tags = text.unquote(self.groups[0]).replace("_", " ")
+ return {"search_tags": self.tags}
+
+ def posts(self):
+ endpoint = "/post/search"
+ params = {"Tag": self.tags}
+ return self._pagination(endpoint, params)
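
rule34.xyz posts expose several pre-rendered files per post; _file_url above reshapes imageLinks into a {type: url} map and picks the first format from the configured preference list, falling back to "2". A small sketch of that selection with made-up link data:

# Sketch of the format selection in Rule34xyzExtractor._file_url; the
# format ids follow the defaults in the hunk, the sample links are fake.
def pick_format(image_links, preferred=("10", "40", "41", "2")):
    files = {str(link["type"]): link["url"] for link in image_links}
    for fmt in preferred:
        if fmt in files:
            return fmt, files[fmt]
    raise KeyError("no supported format available")

links = [{"type": 2, "url": "https://rule34xyz.b-cdn.net/x.jpg"},
         {"type": 10, "url": "https://rule34xyz.b-cdn.net/x.webp"}]
print(pick_format(links))   # ('10', 'https://rule34xyz.b-cdn.net/x.webp')
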
diff --git a/gallery_dl/extractor/saint.py b/gallery_dl/extractor/saint.py
new file mode 100644
index 0000000..784cdc0
--- /dev/null
+++ b/gallery_dl/extractor/saint.py
@@ -0,0 +1,101 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://saint2.su/"""
+
+from .lolisafe import LolisafeAlbumExtractor
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?saint\d*\.(?:su|pk|to)"
+
+
+class SaintAlbumExtractor(LolisafeAlbumExtractor):
+ """Extractor for saint albums"""
+ category = "saint"
+ root = "https://saint2.su"
+ pattern = BASE_PATTERN + r"/a/([^/?#]+)"
+ example = "https://saint2.su/a/ID"
+
+ def fetch_album(self, album_id):
+ # album metadata
+ response = self.request(self.root + "/a/" + album_id)
+ extr = text.extract_from(response.text)
+
+ title = extr("<title>", "<")
+ descr = extr('name="description" content="', '"')
+ files = []
+
+ while True:
+ id2 = extr("/thumbs/", "-")
+ if not id2:
+ break
+ files.append({
+ "id2" : id2,
+ "date" : text.parse_timestamp(extr("", ".")),
+ "id" : extr("/embed/", '"'),
+ "size" : text.parse_int(extr('data="', '"')),
+ "file" : text.unescape(extr(
+ "onclick=\"play(", ")").strip("\"'")),
+ "id_dl": extr("/d/", ")").rstrip("\"'"),
+ })
+
+ return files, {
+ "album_id" : album_id,
+ "album_name" : text.unescape(title.rpartition(" - ")[0]),
+ "album_size" : sum(file["size"] for file in files),
+ "description" : text.unescape(descr),
+ "count" : len(files),
+ "_http_headers": {"Referer": response.url}
+ }
+
+
+class SaintMediaExtractor(SaintAlbumExtractor):
+ """Extractor for saint media links"""
+ subcategory = "media"
+ directory_fmt = ("{category}",)
+ pattern = BASE_PATTERN + r"(/(embe)?d/([^/?#]+))"
+ example = "https://saint2.su/embed/ID"
+
+ def fetch_album(self, album_id):
+ try:
+ path, embed, _ = self.groups
+
+ url = self.root + path
+ response = self.request(url)
+ extr = text.extract_from(response.text)
+
+ if embed:
+ file = {
+ "id" : album_id,
+ "id2" : extr("/thumbs/", "-"),
+ "date" : text.parse_timestamp(extr("", ".")),
+ "file" : text.unescape(extr('<source src="', '"')),
+ "id_dl": extr("/d/", "'"),
+ }
+
+ else: # /d/
+ file = {
+ "file" : text.unescape(extr('<a href="', '"')),
+ "id_dl" : album_id,
+ "name" : album_id,
+ "filename" : album_id,
+ "extension": "mp4",
+ }
+
+ file["_http_headers"] = {"Referer": response.url}
+ except Exception as exc:
+ self.log.error("%s: %s", exc.__class__.__name__, exc)
+ return (), {}
+
+ return (file,), {
+ "album_id" : "",
+ "album_name" : "",
+ "album_size" : -1,
+ "description": "",
+ "count" : 1,
+ }
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 7db8172..d5309dc 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -76,14 +76,15 @@ class SankakuExtractor(BooruExtractor):
def _tags(self, post, page):
tags = collections.defaultdict(list)
- types = self.TAG_TYPES
for tag in post["tags"]:
name = tag["name"]
if name:
- tags[types[tag["type"]]].append(name.lower().replace(" ", "_"))
- for key, value in tags.items():
- post["tags_" + key] = value
- post["tag_string_" + key] = " ".join(value)
+ tags[tag["type"]].append(name.lower().replace(" ", "_"))
+ types = self.TAG_TYPES
+ for type, values in tags.items():
+ name = types[type]
+ post["tags_" + name] = values
+ post["tag_string_" + name] = " ".join(values)
def _notes(self, post, page):
if post.get("has_notes"):
diff --git a/gallery_dl/extractor/scrolller.py b/gallery_dl/extractor/scrolller.py
index 9f9f0c4..c818c98 100644
--- a/gallery_dl/extractor/scrolller.py
+++ b/gallery_dl/extractor/scrolller.py
@@ -32,7 +32,12 @@ class ScrolllerExtractor(Extractor):
for post in self.posts():
- src = max(post["mediaSources"], key=self._sort_key)
+ media_sources = post.get("mediaSources")
+ if not media_sources:
+ self.log.warning("%s: No media files", post.get("id"))
+ continue
+
+ src = max(media_sources, key=self._sort_key)
post.update(src)
url = src["url"]
text.nameext_from_url(url, post)
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 3639c0b..48bd918 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -21,7 +21,6 @@ class SmugmugExtractor(Extractor):
category = "smugmug"
filename_fmt = ("{category}_{User[NickName]:?/_/}"
"{Image[UploadKey]}_{Image[ImageKey]}.{extension}")
- cookies_domain = None
empty_user = {
"Uri": "",
"ResponseLevel": "Public",
diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py
index 8582824..c120ee5 100644
--- a/gallery_dl/extractor/steamgriddb.py
+++ b/gallery_dl/extractor/steamgriddb.py
@@ -56,14 +56,19 @@ class SteamgriddbExtractor(Extractor):
download_fake_png = self.config("download-fake-png", True)
for asset in self.assets():
- if download_fake_png and asset.get("fake_png"):
- urls = (asset["url"], asset["fake_png"])
- else:
- urls = (asset["url"],)
+ fake_png = download_fake_png and asset.get("fake_png")
- asset["count"] = len(urls)
+ asset["count"] = 2 if fake_png else 1
yield Message.Directory, asset
- for asset["num"], url in enumerate(urls, 1):
+
+ asset["num"] = 1
+ url = asset["url"]
+ yield Message.Url, url, text.nameext_from_url(url, asset)
+
+ if fake_png:
+ asset["num"] = 2
+ asset["_http_adjust_extension"] = False
+ url = fake_png
yield Message.Url, url, text.nameext_from_url(url, asset)
def _call(self, endpoint, **kwargs):
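The unrolled loop above yields the primary asset first and, when present, the alternate fake_png URL as a second file with extension adjustment disabled. A hedged sketch of that flow with invented example URLs (the real code also runs text.nameext_from_url on each URL before yielding):

    # Sketch: one or two files per asset, numbered explicitly.
    def files(asset, download_fake_png=True):
        fake_png = download_fake_png and asset.get("fake_png")
        asset["count"] = 2 if fake_png else 1

        asset["num"] = 1
        yield asset["url"], dict(asset)

        if fake_png:
            asset["num"] = 2
            asset["_http_adjust_extension"] = False
            yield fake_png, dict(asset)

    asset = {"url": "https://cdn.example/grid/1.png",
             "fake_png": "https://cdn.example/grid/1.fake.png"}
    for url, data in files(asset):
        print(data["num"], "/", data["count"], url)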
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 73455d2..8d1fcde 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -21,8 +21,8 @@ BASE_PATTERN = (
r"([\w-]+\.tumblr\.com)))"
)
-POST_TYPES = frozenset((
- "text", "quote", "link", "answer", "video", "audio", "photo", "chat"))
+POST_TYPES = frozenset(("text", "quote", "link", "answer", "video",
+ "audio", "photo", "chat", "search"))
class TumblrExtractor(Extractor):
@@ -31,7 +31,6 @@ class TumblrExtractor(Extractor):
directory_fmt = ("{category}", "{blog_name}")
filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}"
archive_fmt = "{id}_{num}"
- cookies_domain = None
def __init__(self, match):
Extractor.__init__(self, match)
@@ -83,14 +82,21 @@ class TumblrExtractor(Extractor):
return
if post["type"] not in self.types:
continue
- if not blog:
- blog = self.api.info(self.blog)
- blog["uuid"] = self.blog
- if self.avatar:
- url = self.api.avatar(self.blog)
- yield Message.Directory, {"blog": blog}
- yield self._prepare_avatar(url, post.copy(), blog)
+ if "blog" in post:
+ blog = post["blog"]
+ self.blog = blog["name"] + ".tumblr.com"
+ else:
+ if not blog:
+ blog = self.api.info(self.blog)
+ blog["uuid"] = self.blog
+
+ if self.avatar:
+ url = self.api.avatar(self.blog)
+ yield Message.Directory, {"blog": blog}
+ yield self._prepare_avatar(url, post.copy(), blog)
+
+ post["blog"] = blog
reblog = "reblogged_from_id" in post
if reblog and self._skip_reblog(post):
@@ -99,7 +105,6 @@ class TumblrExtractor(Extractor):
if "trail" in post:
del post["trail"]
- post["blog"] = blog
post["date"] = text.parse_timestamp(post["timestamp"])
posts = []
@@ -349,6 +354,19 @@ class TumblrLikesExtractor(TumblrExtractor):
return self.api.likes(self.blog)
+class TumblrSearchExtractor(TumblrExtractor):
+ """Extractor for a Tumblr search"""
+ subcategory = "search"
+ pattern = (BASE_PATTERN + r"/search/([^/?#]+)"
+ r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")
+ example = "https://www.tumblr.com/search/QUERY"
+
+ def posts(self):
+ _, _, _, search, mode, post_type, query = self.groups
+ params = text.parse_query(query)
+ return self.api.search(text.unquote(search), params, mode, post_type)
+
+
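The new TumblrSearchExtractor pulls the search term, mode, post type, and query string straight out of the URL groups. A simplified, stdlib-only illustration of that mapping (the real BASE_PATTERN also matches blog subdomains; this one does not, and the sample URL is invented):

    import re
    from urllib.parse import parse_qs, unquote

    pattern = re.compile(
        r"(?:https?://)?(?:www\.)?tumblr\.com/search/([^/?#]+)"
        r"(?:/([^/?#]+)(?:/([^/?#]+))?)?(?:/?\?([^#]+))?")

    url = "https://www.tumblr.com/search/cat%20gifs/recent/gif?t=7"
    search, mode, post_type, query = pattern.match(url).groups()
    params = {k: v[0] for k, v in parse_qs(query or "").items()}
    print(unquote(search), mode, post_type, params)  # cat gifs recent gif {'t': '7'}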
class TumblrAPI(oauth.OAuth1API):
"""Interface for the Tumblr API v2
@@ -394,7 +412,8 @@ class TumblrAPI(oauth.OAuth1API):
if self.before and params["offset"]:
self.log.warning("'offset' and 'date-max' cannot be used together")
- return self._pagination(blog, "/posts", params, cache=True)
+ endpoint = "/v2/blog/{}/posts".format(blog)
+ return self._pagination(endpoint, params, blog=blog, cache=True)
def likes(self, blog):
"""Retrieve liked posts"""
@@ -410,6 +429,20 @@ class TumblrAPI(oauth.OAuth1API):
yield from posts
params["before"] = posts[-1]["liked_timestamp"]
+ def search(self, query, params, mode="top", post_type=None):
+ """Retrieve search results"""
+ endpoint = "/v2/timeline/search"
+
+ params["limit"] = "50"
+ params["days"] = params.pop("t", None)
+ params["query"] = query
+ params["mode"] = mode
+ params["reblog_info"] = "true" if self.extractor.reblogs else "false"
+ if post_type:
+ params["post_type_filter"] = post_type
+
+ return self._pagination(endpoint, params)
+
def _call(self, endpoint, params, **kwargs):
url = self.ROOT + endpoint
kwargs["params"] = params
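search() above translates the URL's query parameters into the timeline/search API's names, e.g. ?t=7 becomes days=7, and only filters by post type when one was given. A small sketch of that parameter assembly; the defaults mirror the diff, the helper name is illustrative:

    def build_search_params(query, params, mode="top", post_type=None, reblogs=False):
        params = dict(params)
        params["limit"] = "50"
        params["days"] = params.pop("t", None)   # URL uses ?t=..., API expects "days"
        params["query"] = query
        params["mode"] = mode
        params["reblog_info"] = "true" if reblogs else "false"
        if post_type:
            params["post_type_filter"] = post_type
        return params

    print(build_search_params("cat gifs", {"t": "7"}, mode="recent", post_type="gif"))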
@@ -478,20 +511,28 @@ class TumblrAPI(oauth.OAuth1API):
raise exception.StopExtraction(data)
- def _pagination(self, blog, endpoint, params, key="posts", cache=False):
- endpoint = "/v2/blog/{}{}".format(blog, endpoint)
+ def _pagination(self, endpoint, params,
+ blog=None, key="posts", cache=False):
if self.api_key:
params["api_key"] = self.api_key
strategy = self.extractor.config("pagination")
+ if not strategy and "offset" not in params:
+ strategy = "api"
+
while True:
data = self._call(endpoint, params)
- if cache:
- self.BLOG_CACHE[blog] = data["blog"]
- cache = False
+ if "timeline" in data:
+ data = data["timeline"]
+ posts = data["elements"]
+
+ else:
+ if cache:
+ self.BLOG_CACHE[blog] = data["blog"]
+ cache = False
+ posts = data[key]
- posts = data[key]
yield from posts
if strategy == "api":
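_pagination() now accepts both response shapes: blog endpoints return posts under "posts", while timeline endpoints such as search wrap them in {"timeline": {"elements": [...]}}. A minimal sketch of that branch with made-up response data:

    def extract_posts(data, key="posts"):
        # Timeline responses nest results one level deeper than blog responses.
        if "timeline" in data:
            return data["timeline"]["elements"]
        return data[key]

    blog_response = {"blog": {"name": "example"}, "posts": [{"id": 1}, {"id": 2}]}
    timeline_response = {"timeline": {"elements": [{"id": 3}]}}

    print(extract_posts(blog_response))      # [{'id': 1}, {'id': 2}]
    print(extract_posts(timeline_response))  # [{'id': 3}]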
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
index 27cc9d0..448625e 100644
--- a/gallery_dl/extractor/tumblrgallery.py
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -18,6 +18,7 @@ class TumblrgalleryExtractor(GalleryExtractor):
filename_fmt = "{category}_{gallery_id}_{num:>03}_{id}.{extension}"
directory_fmt = ("{category}", "{gallery_id} {title}")
root = "https://tumblrgallery.xyz"
+ referer = False
@staticmethod
def _urls_from_page(page):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 9c9d505..090b11a 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -305,6 +305,7 @@ class TwitterExtractor(Extractor):
legacy["created_at"], "%a %b %d %H:%M:%S %z %Y")
except Exception:
date = util.NONE
+ source = tweet.get("source")
tdata = {
"tweet_id" : tweet_id,
@@ -320,7 +321,7 @@ class TwitterExtractor(Extractor):
"author" : author,
"user" : self._user or author,
"lang" : legacy["lang"],
- "source" : text.extr(tweet["source"], ">", "<"),
+ "source" : text.extr(source, ">", "<") if source else "",
"sensitive" : tget("possibly_sensitive"),
"favorite_count": tget("favorite_count"),
"quote_count" : tget("quote_count"),
@@ -538,12 +539,6 @@ class TwitterExtractor(Extractor):
if username:
return self.cookies_update(_login_impl(self, username, password))
- for cookie in self.cookies:
- if cookie.domain == ".twitter.com":
- self.cookies.set(
- cookie.name, cookie.value, domain=self.cookies_domain,
- expires=cookie.expires, secure=cookie.secure)
-
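In the twitter.py changes above, the "source" field is now read with .get() before the surrounding HTML tag is stripped, so tweets without it fall back to an empty string. A tiny stand-in using plain string slicing instead of text.extr(); the sample tweet is invented:

    def parse_source(tweet):
        source = tweet.get("source")
        return source.partition(">")[2].partition("<")[0] if source else ""

    print(parse_source({"source": '<a href="https://example.com">Twitter Web App</a>'}))
    print(parse_source({}))  # ""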
class TwitterUserExtractor(TwitterExtractor):
"""Extractor for a Twitter user"""
diff --git a/gallery_dl/extractor/urlgalleries.py b/gallery_dl/extractor/urlgalleries.py
index f7ce44b..bb80055 100644
--- a/gallery_dl/extractor/urlgalleries.py
+++ b/gallery_dl/extractor/urlgalleries.py
@@ -13,8 +13,8 @@ from .. import text, exception
class UrlgalleriesGalleryExtractor(GalleryExtractor):
"""Base class for Urlgalleries extractors"""
category = "urlgalleries"
- root = "urlgalleries.net"
- request_interval = (0.5, 1.0)
+ root = "https://urlgalleries.net"
+ request_interval = (0.5, 1.5)
pattern = r"(?:https?://)(?:(\w+)\.)?urlgalleries\.net/(?:[\w-]+-)?(\d+)"
example = "https://BLOG.urlgalleries.net/gallery-12345/TITLE"
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 949c7cb..70ab259 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -155,7 +155,10 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
def items(self):
page = None
- data = {"_extractor": WebtoonsEpisodeExtractor}
+ data = {
+ "_extractor": WebtoonsEpisodeExtractor,
+ "title_no" : text.parse_int(self.title_no),
+ }
while True:
path = "/{}/list?title_no={}&page={}".format(
@@ -173,6 +176,8 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
data["page"] = self.page_no
for url in self.get_episode_urls(page):
+ params = text.parse_query(url.rpartition("?")[2])
+ data["episode_no"] = text.parse_int(params.get("episode_no"))
yield Message.Queue, url, data
self.page_no += 1
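Each queued episode URL now carries its episode_no, parsed out of the URL's query string. A stdlib equivalent of that parsing with an illustrative URL (the diff itself uses text.parse_query on everything after the last "?"):

    from urllib.parse import parse_qs, urlsplit

    url = ("https://www.webtoons.com/en/fantasy/title/ep-12/viewer"
           "?title_no=1234&episode_no=12")
    params = parse_qs(urlsplit(url).query)
    episode_no = int(params.get("episode_no", ["0"])[0])
    print(episode_no)  # 12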
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 83b1642..9885d79 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -126,11 +126,7 @@ class WeiboExtractor(Extractor):
elif pic_type == "livephoto" and self.livephoto:
append(pic["largest"].copy())
-
- file = {"url": pic["video"]}
- file["filename"], _, file["extension"] = \
- pic["video"].rpartition("%2F")[2].rpartition(".")
- append(file)
+ append({"url": pic["video"]})
else:
append(pic["largest"].copy())
@@ -251,6 +247,11 @@ class WeiboUserExtractor(WeiboExtractor):
pattern = USER_PATTERN + r"(?:$|#)"
example = "https://weibo.com/USER"
+ # do NOT override 'initialize()'
+ # it is needed for 'self._user_id()'
+ # def initialize(self):
+ # pass
+
def items(self):
base = "{}/u/{}?tabtype=".format(self.root, self._user_id())
return self._dispatch_extractors((