Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/__init__.py                  2
-rw-r--r--  gallery_dl/extractor/2ch.py            91
-rw-r--r--  gallery_dl/extractor/__init__.py        5
-rw-r--r--  gallery_dl/extractor/batoto.py         15
-rw-r--r--  gallery_dl/extractor/blogger.py         2
-rw-r--r--  gallery_dl/extractor/bunkr.py          14
-rw-r--r--  gallery_dl/extractor/chevereto.py       4
-rw-r--r--  gallery_dl/extractor/common.py          8
-rw-r--r--  gallery_dl/extractor/deviantart.py     63
-rw-r--r--  gallery_dl/extractor/erome.py          20
-rw-r--r--  gallery_dl/extractor/fuskator.py       15
-rw-r--r--  gallery_dl/extractor/gelbooru.py       13
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py   40
-rw-r--r--  gallery_dl/extractor/hatenablog.py    167
-rw-r--r--  gallery_dl/extractor/hbrowse.py        92
-rw-r--r--  gallery_dl/extractor/issuu.py           3
-rw-r--r--  gallery_dl/extractor/kemonoparty.py    26
-rw-r--r--  gallery_dl/extractor/mangadex.py       32
-rw-r--r--  gallery_dl/extractor/mastodon.py       12
-rw-r--r--  gallery_dl/extractor/nijie.py           3
-rw-r--r--  gallery_dl/extractor/nitter.py          4
-rw-r--r--  gallery_dl/extractor/oauth.py           4
-rw-r--r--  gallery_dl/extractor/paheal.py          2
-rw-r--r--  gallery_dl/extractor/patreon.py        22
-rw-r--r--  gallery_dl/extractor/philomena.py      11
-rw-r--r--  gallery_dl/extractor/pixiv.py           6
-rw-r--r--  gallery_dl/extractor/sankaku.py         2
-rw-r--r--  gallery_dl/extractor/shimmie2.py       39
-rw-r--r--  gallery_dl/extractor/steamgriddb.py   211
-rw-r--r--  gallery_dl/extractor/twitter.py         2
-rw-r--r--  gallery_dl/extractor/urlshortener.py   11
-rw-r--r--  gallery_dl/extractor/vk.py              8
-rw-r--r--  gallery_dl/extractor/webtoons.py       48
-rw-r--r--  gallery_dl/extractor/wikimedia.py     181
-rw-r--r--  gallery_dl/version.py                   2
35 files changed, 905 insertions, 275 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index fff53eb..19ea77b 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -45,7 +45,7 @@ def main():
elif filename.startswith("\\f"):
filename = "\f" + filename[2:]
config.set((), "filename", filename)
- if args.directory:
+ if args.directory is not None:
config.set((), "base-directory", args.directory)
config.set((), "directory", ())
if args.postprocessors:
diff --git a/gallery_dl/extractor/2ch.py b/gallery_dl/extractor/2ch.py
new file mode 100644
index 0000000..dbbf21b
--- /dev/null
+++ b/gallery_dl/extractor/2ch.py
@@ -0,0 +1,91 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://2ch.hk/"""
+
+from .common import Extractor, Message
+from .. import text, util
+
+
+class _2chThreadExtractor(Extractor):
+ """Extractor for 2ch threads"""
+ category = "2ch"
+ subcategory = "thread"
+ root = "https://2ch.hk"
+ directory_fmt = ("{category}", "{board}", "{thread} {title}")
+ filename_fmt = "{tim}{filename:? //}.{extension}"
+ archive_fmt = "{board}_{thread}_{tim}"
+ pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/res/(\d+)"
+ example = "https://2ch.hk/a/res/12345.html"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board, self.thread = match.groups()
+
+ def items(self):
+ url = "{}/{}/res/{}.json".format(self.root, self.board, self.thread)
+ posts = self.request(url).json()["threads"][0]["posts"]
+
+ op = posts[0]
+ title = op.get("subject") or text.remove_html(op["comment"])
+
+ thread = {
+ "board" : self.board,
+ "thread": self.thread,
+ "title" : text.unescape(title)[:50],
+ }
+
+ yield Message.Directory, thread
+ for post in posts:
+ files = post.get("files")
+ if files:
+ post["post_name"] = post["name"]
+ post["date"] = text.parse_timestamp(post["timestamp"])
+ del post["files"]
+ del post["name"]
+
+ for file in files:
+ file.update(thread)
+ file.update(post)
+
+ file["filename"] = file["fullname"].rpartition(".")[0]
+ file["tim"], _, file["extension"] = \
+ file["name"].rpartition(".")
+
+ yield Message.Url, self.root + file["path"], file
+
+
+class _2chBoardExtractor(Extractor):
+ """Extractor for 2ch boards"""
+ category = "2ch"
+ subcategory = "board"
+ root = "https://2ch.hk"
+ pattern = r"(?:https?://)?2ch\.hk/([^/?#]+)/?$"
+ example = "https://2ch.hk/a/"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.board = match.group(1)
+
+ def items(self):
+ # index page
+ url = "{}/{}/index.json".format(self.root, self.board)
+ index = self.request(url).json()
+ index["_extractor"] = _2chThreadExtractor
+ for thread in index["threads"]:
+ url = "{}/{}/res/{}.html".format(
+ self.root, self.board, thread["thread_num"])
+ yield Message.Queue, url, index
+
+ # pages 1..n
+ for n in util.advance(index["pages"], 1):
+ url = "{}/{}/{}.json".format(self.root, self.board, n)
+ page = self.request(url).json()
+ page["_extractor"] = _2chThreadExtractor
+ for thread in page["threads"]:
+ url = "{}/{}/res/{}.html".format(
+ self.root, self.board, thread["thread_num"])
+ yield Message.Queue, url, page
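Note on the new 2ch extractor: the thread JSON reports each file's original upload name in "fullname" and its server-side name (the upload timestamp plus extension) in "name"; items() splits the two with rpartition. A minimal sketch of that derivation, using a hypothetical file entry:

    # hypothetical file entry from 2ch.hk's thread JSON
    file = {
        "fullname": "original name.jpg",             # name as uploaded
        "name"    : "16992123456789.jpg",            # server name: <tim>.<ext>
        "path"    : "/a/src/12345/16992123456789.jpg",
    }
    file["filename"] = file["fullname"].rpartition(".")[0]
    file["tim"], _, file["extension"] = file["name"].rpartition(".")
    # filename='original name', tim='16992123456789', extension='jpg'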
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 9e33f2c..d624736 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -10,6 +10,7 @@ import sys
import re
modules = [
+ "2ch",
"2chan",
"2chen",
"35photo",
@@ -53,7 +54,7 @@ modules = [
"gelbooru_v01",
"gelbooru_v02",
"gofile",
- "hbrowse",
+ "hatenablog",
"hentai2read",
"hentaicosplays",
"hentaifoundry",
@@ -145,6 +146,7 @@ modules = [
"smugmug",
"soundgasm",
"speakerdeck",
+ "steamgriddb",
"subscribestar",
"szurubooru",
"tapas",
@@ -175,6 +177,7 @@ modules = [
"weibo",
"wikiart",
"wikifeet",
+ "wikimedia",
"xhamster",
"xvideos",
"zerochan",
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index cd6302e..e82cd09 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -10,8 +10,11 @@ from .common import Extractor, ChapterExtractor, MangaExtractor
from .. import text, exception
import re
-BASE_PATTERN = (r"(?:https?://)?"
- r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)")
+BASE_PATTERN = (r"(?:https?://)?(?:"
+ r"(?:ba|d|h|m|w)to\.to|"
+ r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|"
+ r"comiko\.(?:net|org)|"
+ r"bat(?:otoo|o?two)\.com)")
class BatotoBase():
@@ -38,7 +41,8 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
def metadata(self, page):
extr = text.extract_from(page)
manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
- manga_id = extr("/title/", "/")
+ manga_id = text.extr(
+ extr('rel="canonical" href="', '"'), "/title/", "/")
match = re.match(
r"(?:Volume\s+(\d+) )?"
@@ -76,12 +80,13 @@ class BatotoMangaExtractor(BatotoBase, MangaExtractor):
"""Extractor for bato.to manga"""
reverse = False
chapterclass = BatotoChapterExtractor
- pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$"
+ pattern = (BASE_PATTERN +
+ r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")
example = "https://bato.to/title/12345-MANGA/"
def __init__(self, match):
self.root = text.root_from_url(match.group(0))
- self.manga_id = match.group(1)
+ self.manga_id = match.group(1) or match.group(2)
url = "{}/title/{}".format(self.root, self.manga_id)
MangaExtractor.__init__(self, match, url)
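The widened BASE_PATTERN and the split title/series capture groups can be checked in isolation; a quick sketch (URLs are illustrative) of how both URL forms resolve to the same manga ID:

    import re

    BASE_PATTERN = (r"(?:https?://)?(?:"
                    r"(?:ba|d|h|m|w)to\.to|"
                    r"(?:(?:manga|read)toto|batocomic|[xz]bato)\.(?:com|net|org)|"
                    r"comiko\.(?:net|org)|"
                    r"bat(?:otoo|o?two)\.com)")
    pattern = re.compile(
        BASE_PATTERN +
        r"/(?:title/(\d+)[^/?#]*|series/(\d+)(?:/[^/?#]*)?)/?$")

    for url in ("https://bato.to/title/12345-manga",
                "https://zbato.org/series/12345/manga-name"):
        m = pattern.match(url)
        print(m.group(1) or m.group(2))   # -> 12345 in both cases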
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 58ae59d..402408e 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -37,7 +37,7 @@ class BloggerExtractor(BaseExtractor):
findall_image = re.compile(
r'src="(https?://(?:'
r'blogger\.googleusercontent\.com/img|'
- r'lh\d+\.googleusercontent\.com/|'
+ r'lh\d+(?:-\w+)?\.googleusercontent\.com|'
r'\d+\.bp\.blogspot\.com)/[^"]+)').findall
findall_video = re.compile(
r'src="(https?://www\.blogger\.com/video\.g\?token=[^"]+)').findall
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 26123b8..e7fc14b 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,13 +6,13 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://bunkrr.su/"""
+"""Extractors for https://bunkrr.ru/"""
from .lolisafe import LolisafeAlbumExtractor
from .. import text
from urllib.parse import urlsplit, urlunsplit
-BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:la|[sr]u|is|to)"
+BASE_PATTERN = r"(?:https?://)?(?:app\.)?bunkr+\.(?:[rs]u|la|is|to)"
MEDIA_DOMAIN_OVERRIDES = {
"cdn9.bunkr.ru" : "c9.bunkr.ru",
@@ -27,11 +27,11 @@ CDN_HOSTED_EXTENSIONS = (
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
- """Extractor for bunkrr.su albums"""
+ """Extractor for bunkrr.ru albums"""
category = "bunkr"
- root = "https://bunkrr.su"
+ root = "https://bunkrr.ru"
pattern = BASE_PATTERN + r"/a/([^/?#]+)"
- example = "https://bunkrr.su/a/ID"
+ example = "https://bunkrr.ru/a/ID"
def fetch_album(self, album_id):
# album metadata
@@ -84,11 +84,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
class BunkrMediaExtractor(BunkrAlbumExtractor):
- """Extractor for bunkrr.su media links"""
+ """Extractor for bunkrr.ru media links"""
subcategory = "media"
directory_fmt = ("{category}",)
pattern = BASE_PATTERN + r"/[vid]/([^/?#]+)"
- example = "https://bunkrr.su/v/FILENAME"
+ example = "https://bunkrr.ru/v/FILENAME"
def fetch_album(self, album_id):
try:
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 2bf200b..ef5a44c 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -38,10 +38,6 @@ BASE_PATTERN = CheveretoExtractor.update({
"root": "https://jpg4.su",
"pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
},
- "pixl": {
- "root": "https://pixl.li",
- "pattern": r"pixl\.(?:li|is)",
- },
"imgkiwi": {
"root": "https://img.kiwi",
"pattern": r"img\.kiwi",
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 0dd05ef..cf0f8c9 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -102,6 +102,9 @@ class Extractor():
def config_accumulate(self, key):
return config.accumulate(self._cfgpath, key)
+ def config_instance(self, key, default=None):
+ return default
+
def _config_shared(self, key, default=None):
return config.interpolate_common(
("extractor",), self._cfgpath, key, default)
@@ -735,9 +738,10 @@ class BaseExtractor(Extractor):
for index, group in enumerate(match.groups()):
if group is not None:
if index:
- self.category, self.root = self.instances[index-1]
+ self.category, self.root, info = self.instances[index-1]
if not self.root:
self.root = text.root_from_url(match.group(0))
+ self.config_instance = info.get
else:
self.root = group
self.category = group.partition("://")[2]
@@ -757,7 +761,7 @@ class BaseExtractor(Extractor):
root = info["root"]
if root:
root = root.rstrip("/")
- instance_list.append((category, root))
+ instance_list.append((category, root, info))
pattern = info.get("pattern")
if not pattern:
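With this change each entry of BaseExtractor.instances carries the full info dict, and a matched extractor rebinds config_instance to that dict's .get method. A standalone sketch of the lookup, with a hypothetical instance entry:

    # hypothetical instance entry mirroring what update() now stores
    instances = (
        ("realbooru", "https://realbooru.com",
         {"root": "https://realbooru.com",
          "api_root": "https://api.realbooru.example"}),
    )

    category, root, info = instances[0]
    config_instance = info.get            # as done in BaseExtractor.__init__
    assert config_instance("api_root") == "https://api.realbooru.example"
    assert config_instance("cookies") is None   # absent keys return None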
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 4b5f1d7..bcfbe73 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = (match.group(1) or match.group(2)).lower()
+ self.user = (match.group(1) or match.group(2) or "").lower()
self.offset = 0
def _init(self):
@@ -452,9 +452,11 @@ class DeviantartExtractor(Extractor):
return None
dev = self.api.deviation(deviation["deviationid"], False)
- folder = dev["premium_folder_data"]
+ folder = deviation["premium_folder_data"]
username = dev["author"]["username"]
- has_access = folder["has_access"]
+
+ # premium_folder_data is no longer present when user has access (#5063)
+ has_access = ("premium_folder_data" not in dev) or folder["has_access"]
if not has_access and folder["type"] == "watchers" and \
self.config("auto-watch"):
@@ -547,22 +549,45 @@ class DeviantartAvatarExtractor(DeviantartExtractor):
example = "https://www.deviantart.com/USER/avatar/"
def deviations(self):
- profile = self.api.user_profile(self.user.lower())
- if profile:
- url = profile["user"]["usericon"]
- return ({
- "author" : profile["user"],
- "category" : "avatar",
- "index" : text.parse_int(url.rpartition("?")[2]),
- "is_deleted" : False,
- "is_downloadable": False,
- "published_time" : 0,
- "title" : "avatar",
- "content" : {
- "src": url.replace("/avatars/", "/avatars-big/", 1),
- },
- },)
- return ()
+ name = self.user.lower()
+ profile = self.api.user_profile(name)
+ if not profile:
+ return ()
+
+ user = profile["user"]
+ icon = user["usericon"]
+ index = icon.rpartition("?")[2]
+
+ formats = self.config("formats")
+ if not formats:
+ url = icon.replace("/avatars/", "/avatars-big/", 1)
+ return (self._make_deviation(url, user, index, ""),)
+
+ if isinstance(formats, str):
+ formats = formats.replace(" ", "").split(",")
+
+ results = []
+ for fmt in formats:
+ fmt, _, ext = fmt.rpartition(".")
+ if fmt:
+ fmt = "-" + fmt
+ url = "https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
+ fmt, name[0], name[1], name, ext, index)
+ results.append(self._make_deviation(url, user, index, fmt))
+ return results
+
+ def _make_deviation(self, url, user, index, fmt):
+ return {
+ "author" : user,
+ "category" : "avatar",
+ "index" : text.parse_int(index),
+ "is_deleted" : False,
+ "is_downloadable": False,
+ "published_time" : 0,
+ "title" : "avatar" + fmt,
+ "stats" : {"comments": 0},
+ "content" : {"src": url},
+ }
class DeviantartBackgroundExtractor(DeviantartExtractor):
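The "formats" handling above assembles avatar URLs from a fixed template. A sketch of the construction with a hypothetical username and cache-busting index:

    name, index = "someuser", "3"
    for fmt in ("original.jpg", "big.gif"):
        fmt, _, ext = fmt.rpartition(".")
        if fmt:
            fmt = "-" + fmt
        print("https://a.deviantart.net/avatars{}/{}/{}/{}.{}?{}".format(
            fmt, name[0], name[1], name, ext, index))
    # https://a.deviantart.net/avatars-original/s/o/someuser.jpg?3
    # https://a.deviantart.net/avatars-big/s/o/someuser.gif?3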
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 6a0e069..8c9da2f 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -44,24 +44,26 @@ class EromeExtractor(Extractor):
pos = page.index('<div class="user-profile', pos)
user, pos = text.extract(
page, 'href="https://www.erome.com/', '"', pos)
- count, pos = text.extract(
- page, 'fa-camera"></i>', '</span>', pos)
+
+ urls = []
+ groups = page.split('<div class="media-group"')
+ for group in util.advance(groups, 1):
+ url = (text.extr(group, '<source src="', '"') or
+ text.extr(group, 'data-src="', '"'))
+ if url:
+ urls.append(url)
data = {
"album_id" : album_id,
"title" : text.unescape(title),
"user" : text.unquote(user),
+ "count" : len(urls),
"_http_headers": {"Referer": url},
- "count" : text.parse_int(count),
}
yield Message.Directory, data
- groups = page.split('<div class="media-group"')
- for data["num"], group in enumerate(util.advance(groups, 1), 1):
- url = (text.extr(group, '<source src="', '"') or
- text.extr(group, 'data-src="', '"'))
- if url:
- yield Message.Url, url, text.nameext_from_url(url, data)
+ for data["num"], url in enumerate(urls, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
def albums(self):
return ()
diff --git a/gallery_dl/extractor/fuskator.py b/gallery_dl/extractor/fuskator.py
index 20afb5a..beecbff 100644
--- a/gallery_dl/extractor/fuskator.py
+++ b/gallery_dl/extractor/fuskator.py
@@ -22,7 +22,7 @@ class FuskatorGalleryExtractor(GalleryExtractor):
def __init__(self, match):
self.gallery_hash = match.group(1)
- url = "{}/thumbs/{}/".format(self.root, self.gallery_hash)
+ url = "{}/thumbs/{}/index.html".format(self.root, self.gallery_hash)
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
@@ -50,15 +50,16 @@ class FuskatorGalleryExtractor(GalleryExtractor):
"gallery_id" : text.parse_int(gallery_id),
"gallery_hash": self.gallery_hash,
"title" : text.unescape(title[:-15]),
- "views" : data["hits"],
- "score" : data["rating"],
- "tags" : data["tags"].split(","),
- "count" : len(data["images"]),
+ "views" : data.get("hits"),
+ "score" : data.get("rating"),
+ "tags" : (data.get("tags") or "").split(","),
}
def images(self, page):
- for image in self.data["images"]:
- yield "https:" + image["imageUrl"], image
+ return [
+ ("https:" + image["imageUrl"], image)
+ for image in self.data["images"]
+ ]
class FuskatorSearchExtractor(Extractor):
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index eba1539..83f1392 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -23,7 +23,7 @@ class GelbooruBase():
root = "https://gelbooru.com"
offset = 0
- def _api_request(self, params, key="post"):
+ def _api_request(self, params, key="post", log=False):
if "s" not in params:
params["s"] = "post"
params["api_key"] = self.api_key
@@ -35,8 +35,9 @@ class GelbooruBase():
try:
posts = data[key]
except KeyError:
- self.log.error("Incomplete API response (missing '%s')", key)
- self.log.debug("%s", data)
+ if log:
+ self.log.error("Incomplete API response (missing '%s')", key)
+ self.log.debug("%s", data)
return []
if not isinstance(posts, list):
@@ -117,7 +118,7 @@ class GelbooruBase():
class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
- pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]+)"
+ pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]*)"
example = "https://gelbooru.com/index.php?page=post&s=list&tags=TAG"
@@ -169,7 +170,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
"limit": "1",
}
- count = self._api_request(params, "@attributes")[0]["count"]
+ count = self._api_request(params, "@attributes", True)[0]["count"]
if count <= self.offset:
return
@@ -186,7 +187,7 @@ class GelbooruFavoriteExtractor(GelbooruBase,
params["limit"] = self.per_page
while True:
- favs = self._api_request(params, "favorite")
+ favs = self._api_request(params, "favorite", True)
favs.reverse()
if skip:
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 0c8af3d..7ab6d02 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -22,14 +22,10 @@ class GelbooruV02Extractor(booru.BooruExtractor):
def _init(self):
self.api_key = self.config("api-key")
self.user_id = self.config("user-id")
-
- try:
- self.api_root = INSTANCES[self.category]["api_root"]
- except KeyError:
- self.api_root = self.root
+ self.api_root = self.config_instance("api_root") or self.root
if self.category == "realbooru":
- self.items = self._items_realbooru
+ self._file_url = self._file_url_realbooru
self._tags = self._tags_realbooru
def _api_request(self, params):
@@ -128,28 +124,6 @@ class GelbooruV02Extractor(booru.BooruExtractor):
self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
return url
- def _items_realbooru(self):
- from .common import Message
- data = self.metadata()
-
- for post in self.posts():
- try:
- html = self._html(post)
- url = post["file_url"] = text.rextract(
- html, 'href="', '"', html.index(">Original<"))[0]
- except Exception:
- self.log.debug("Unable to fetch download URL for post %s "
- "(md5: %s)", post.get("id"), post.get("md5"))
- continue
-
- text.nameext_from_url(url, post)
- post.update(data)
- self._prepare(post)
- self._tags(post, html)
-
- yield Message.Directory, post
- yield Message.Url, url, post
-
def _tags_realbooru(self, post, page):
tag_container = text.extr(page, 'id="tagLink"', '</div>')
tags = collections.defaultdict(list)
@@ -161,7 +135,7 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["tags_" + key] = " ".join(value)
-INSTANCES = {
+BASE_PATTERN = GelbooruV02Extractor.update({
"realbooru": {
"root": "https://realbooru.com",
"pattern": r"realbooru\.com",
@@ -187,16 +161,14 @@ INSTANCES = {
"root": "https://xbooru.com",
"pattern": r"xbooru\.com",
},
-}
-
-BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
+})
class GelbooruV02TagExtractor(GelbooruV02Extractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]+)"
+ pattern = BASE_PATTERN + r"/index\.php\?page=post&s=list&tags=([^&#]*)"
example = "https://safebooru.org/index.php?page=post&s=list&tags=TAG"
def __init__(self, match):
@@ -208,6 +180,8 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
return {"search_tags": self.tags}
def posts(self):
+ if self.tags == "all":
+ self.tags = ""
return self._pagination({"tags": self.tags})
diff --git a/gallery_dl/extractor/hatenablog.py b/gallery_dl/extractor/hatenablog.py
new file mode 100644
index 0000000..792f666
--- /dev/null
+++ b/gallery_dl/extractor/hatenablog.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://hatenablog.com"""
+
+import re
+from .common import Extractor, Message
+from .. import text
+
+
+BASE_PATTERN = (
+ r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
+ r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
+ r"|hatenadiary\.com|hateblo\.jp)))"
+)
+QUERY_RE = r"(?:\?([^#]*))?(?:#.*)?$"
+
+
+class HatenablogExtractor(Extractor):
+ """Base class for HatenaBlog extractors"""
+ category = "hatenablog"
+ directory_fmt = ("{category}", "{domain}")
+ filename_fmt = "{category}_{domain}_{entry}_{num:>02}.{extension}"
+ archive_fmt = "{filename}"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.domain = match.group(1) or match.group(2)
+
+ def _init(self):
+ self._find_img = re.compile(r'<img +([^>]+)').finditer
+
+ def _handle_article(self, article: str):
+ extr = text.extract_from(article)
+ date = text.parse_datetime(extr('<time datetime="', '"'))
+ entry_link = text.unescape(extr('<a href="', '"'))
+ entry = entry_link.partition("/entry/")[2]
+ title = text.unescape(extr('>', '<'))
+ content = extr(
+ '<div class="entry-content hatenablog-entry">', '</div>')
+
+ images = []
+ for i in self._find_img(content):
+ attributes = i.group(1)
+ if 'class="hatena-fotolife"' not in attributes:
+ continue
+ image = text.unescape(text.extr(attributes, 'src="', '"'))
+ images.append(image)
+
+ data = {
+ "domain": self.domain,
+ "date": date,
+ "entry": entry,
+ "title": title,
+ "count": len(images),
+ }
+ yield Message.Directory, data
+ for data["num"], url in enumerate(images, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+
+class HatenablogEntriesExtractor(HatenablogExtractor):
+ """Base class for a list of entries"""
+ allowed_parameters = ()
+
+ def __init__(self, match):
+ HatenablogExtractor.__init__(self, match)
+ self.path = match.group(3)
+ self.query = {key: value for key, value in text.parse_query(
+ match.group(4)).items() if self._acceptable_query(key)}
+
+ def _init(self):
+ HatenablogExtractor._init(self)
+ self._find_pager_url = re.compile(
+ r' class="pager-next">\s*<a href="([^"]+)').search
+
+ def items(self):
+ url = "https://" + self.domain + self.path
+ query = self.query
+
+ while url:
+ page = self.request(url, params=query).text
+
+ extr = text.extract_from(page)
+ attributes = extr('<body ', '>')
+ if "page-archive" in attributes:
+ yield from self._handle_partial_articles(extr)
+ else:
+ yield from self._handle_full_articles(extr)
+
+ match = self._find_pager_url(page)
+ url = text.unescape(match.group(1)) if match else None
+ query = None
+
+ def _handle_partial_articles(self, extr):
+ while True:
+ section = extr('<section class="archive-entry', '</section>')
+ if not section:
+ break
+
+ url = "hatenablog:" + text.unescape(text.extr(
+ section, '<a class="entry-title-link" href="', '"'))
+ data = {"_extractor": HatenablogEntryExtractor}
+ yield Message.Queue, url, data
+
+ def _handle_full_articles(self, extr):
+ while True:
+ attributes = extr('<article ', '>')
+ if not attributes:
+ break
+ if "no-entry" in attributes:
+ continue
+
+ article = extr('', '</article>')
+ yield from self._handle_article(article)
+
+ def _acceptable_query(self, key):
+ return key == "page" or key in self.allowed_parameters
+
+
+class HatenablogEntryExtractor(HatenablogExtractor):
+ """Extractor for a single entry URL"""
+ subcategory = "entry"
+ pattern = BASE_PATTERN + r"/entry/([^?#]+)" + QUERY_RE
+ example = "https://BLOG.hatenablog.com/entry/PATH"
+
+ def __init__(self, match):
+ HatenablogExtractor.__init__(self, match)
+ self.path = match.group(3)
+
+ def items(self):
+ url = "https://" + self.domain + "/entry/" + self.path
+ page = self.request(url).text
+
+ extr = text.extract_from(page)
+ while True:
+ attributes = extr('<article ', '>')
+ if "no-entry" in attributes:
+ continue
+ article = extr('', '</article>')
+ return self._handle_article(article)
+
+
+class HatenablogHomeExtractor(HatenablogEntriesExtractor):
+ """Extractor for a blog's home page"""
+ subcategory = "home"
+ pattern = BASE_PATTERN + r"(/?)" + QUERY_RE
+ example = "https://BLOG.hatenablog.com"
+
+
+class HatenablogArchiveExtractor(HatenablogEntriesExtractor):
+ """Extractor for a blog's archive page"""
+ subcategory = "archive"
+ pattern = (BASE_PATTERN + r"(/archive(?:/\d+(?:/\d+(?:/\d+)?)?"
+ r"|/category/[^?#]+)?)" + QUERY_RE)
+ example = "https://BLOG.hatenablog.com/archive/2024"
+
+
+class HatenablogSearchExtractor(HatenablogEntriesExtractor):
+ """Extractor for a blog's search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"(/search)" + QUERY_RE
+ example = "https://BLOG.hatenablog.com/search?q=QUERY"
+ allowed_parameters = ("q",)
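The first alternative of BASE_PATTERN exists for the "hatenablog:"-prefixed URLs queued by _handle_partial_articles(); it accepts any domain, so archive entries on custom domains still reach HatenablogEntryExtractor. A quick check of both alternatives (domains are illustrative):

    import re

    BASE_PATTERN = (
        r"(?:hatenablog:https?://([^/?#]+)|(?:https?://)?"
        r"([\w-]+\.(?:hatenablog\.(?:com|jp)"
        r"|hatenadiary\.com|hateblo\.jp)))"
    )
    pattern = re.compile(BASE_PATTERN + r"/entry/([^?#]+)")

    m = pattern.match("hatenablog:https://blog.example.org/entry/2024/01/01/x")
    assert m.group(1) == "blog.example.org"     # custom domain, via prefix
    m = pattern.match("https://someone.hateblo.jp/entry/2024/01/01/x")
    assert m.group(2) == "someone.hateblo.jp"   # regular hosted domain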
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
deleted file mode 100644
index a522140..0000000
--- a/gallery_dl/extractor/hbrowse.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2015-2023 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://www.hbrowse.com/"""
-
-from .common import ChapterExtractor, MangaExtractor
-from .. import text, util, exception
-
-
-class HbrowseBase():
- """Base class for hbrowse extractors"""
- category = "hbrowse"
- root = "https://www.hbrowse.com"
-
- def parse_page(self, page, data):
- """Parse metadata on 'page' and add it to 'data'"""
- data, pos = text.extract_all(page, (
- ('manga' , '<td class="listLong">', '</td>'),
- ('artist', '<td class="listLong">', '</td>'),
- ('total' , '<td class="listLong">', ' '),
- ('origin', '<td class="listLong">', '</td>'),
- ), values=data)
-
- if not data["manga"] and "<b>Warning</b>" in page:
- msg = page.rpartition(">")[2].strip()
- raise exception.StopExtraction("Site is not accessible: '%s'", msg)
-
- tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]
-
- data["manga"] = text.unescape(data["manga"])
- data["total"] = text.parse_int(data["total"])
- data["artist"] = text.remove_html(data["artist"])
- data["origin"] = text.remove_html(data["origin"])
- data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
- return data
-
-
-class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
- """Extractor for manga-chapters from hbrowse.com"""
- directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
- filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
- "{page:>03}.{extension}")
- archive_fmt = "{manga_id}_{chapter}_{page}"
- pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
- example = "https://www.hbrowse.com/12345/c00000"
-
- def __init__(self, match):
- self.path, self.gid, self.chapter = match.groups()
- self.path += "/"
- ChapterExtractor.__init__(self, match)
-
- def metadata(self, page):
- return self.parse_page(page, {
- "manga_id": text.parse_int(self.gid),
- "chapter": text.parse_int(self.chapter)
- })
-
- def images(self, page):
- base = self.root + "/data" + self.path
- json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
- return [(base + name, None) for name in util.json_loads(json_data)]
-
-
-class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
- """Extractor for manga from hbrowse.com"""
- chapterclass = HbrowseChapterExtractor
- reverse = False
- pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
- example = "https://www.hbrowse.com/12345"
-
- def chapters(self, page):
- results = []
- data = self.parse_page(page, {
- "manga_id": text.parse_int(
- self.manga_url.rstrip("/").rpartition("/")[2])
- })
-
- pos = 0
- needle = '<td class="listMiddle">\n<a class="listLink" href="'
- while True:
- url, pos = text.extract(page, needle, '"', pos)
- if not url:
- return results
- title, pos = text.extract(page, '>View ', '<', pos)
- data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
- data["title"] = title
- results.append((text.urljoin(self.root, url), data.copy()))
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index f6170c2..54c6539 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -29,8 +29,9 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
example = "https://issuu.com/issuu/docs/TITLE/"
def metadata(self, page):
+ pos = page.rindex('id="initial-data"')
data = util.json_loads(text.rextract(
- page, '<script data-json="', '"')[0].replace("&quot;", '"'))
+ page, '<script data-json="', '"', pos)[0].replace("&quot;", '"'))
doc = data["initialDocumentData"]["document"]
doc["date"] = text.parse_datetime(
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index c24e57d..10228b5 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -9,9 +9,10 @@
"""Extractors for https://kemono.party/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import cache, memcache
import itertools
+import json
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
@@ -37,10 +38,14 @@ class KemonopartyExtractor(Extractor):
Extractor.__init__(self, match)
def _init(self):
+ self.revisions = self.config("revisions")
self._prepare_ddosguard_cookies()
self._find_inline = re.compile(
r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
+ self._json_dumps = json.JSONEncoder(
+ ensure_ascii=False, check_circular=False,
+ sort_keys=True, separators=(",", ":")).encode
def items(self):
find_hash = re.compile(HASH_PATTERN).match
@@ -223,11 +228,23 @@ class KemonopartyExtractor(Extractor):
idx = len(revs)
for rev in revs:
+ rev["revision_hash"] = self._revision_hash(rev)
rev["revision_index"] = idx
idx -= 1
return revs
+ def _revision_hash(self, revision):
+ rev = revision.copy()
+ rev.pop("revision_id", None)
+ rev.pop("added", None)
+ rev.pop("next", None)
+ rev.pop("prev", None)
+ rev["file"].pop("name", None)
+ for a in rev["attachments"]:
+ a.pop("name", None)
+ return util.sha1(self._json_dumps(rev))
+
def _validate(response):
return (response.headers["content-length"] != "9" or
@@ -252,13 +269,13 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
url = self.api_url
params = text.parse_query(self.query)
params["o"] = text.parse_int(params.get("o"))
- revisions = self.config("revisions")
while True:
posts = self.request(url, params=params).json()
- if revisions:
+ if self.revisions:
for post in posts:
+ post["revision_hash"] = self._revision_hash(post)
post["revision_id"] = 0
post_url = "{}/post/{}".format(self.api_url, post["id"])
try:
@@ -296,7 +313,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
def posts(self):
if not self.revision:
post = self.request(self.api_url).json()
- if self.config("revisions"):
+ if self.revisions:
+ post["revision_hash"] = self._revision_hash(post)
post["revision_id"] = 0
try:
revs = self._post_revisions(self.api_url)
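_revision_hash() above derives a content fingerprint: volatile keys and display names are dropped, the rest is serialized as canonical JSON (sorted keys, compact separators) and hashed, so two revisions with identical content map to the same hash. A standalone sketch, with hashlib.sha1 standing in for gallery_dl.util.sha1:

    import hashlib, json

    _json_dumps = json.JSONEncoder(
        ensure_ascii=False, check_circular=False,
        sort_keys=True, separators=(",", ":")).encode

    def revision_hash(revision):
        rev = revision.copy()   # shallow copy, as in the original
        for key in ("revision_id", "added", "next", "prev"):
            rev.pop(key, None)
        rev["file"].pop("name", None)
        for a in rev["attachments"]:
            a.pop("name", None)
        return hashlib.sha1(_json_dumps(rev).encode()).hexdigest()

    a = {"revision_id": 1, "file": {"name": "x", "path": "/p"},
         "attachments": []}
    b = {"revision_id": 2, "file": {"name": "y", "path": "/p"},
         "attachments": []}
    assert revision_hash(a) == revision_hash(b)   # same content, same hash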
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 94bea57..bca7e4d 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -148,6 +148,32 @@ class MangadexFeedExtractor(MangadexExtractor):
return self.api.user_follows_manga_feed()
+class MangadexListExtractor(MangadexExtractor):
+ """Extractor for mangadex lists"""
+ subcategory = "list"
+ pattern = (BASE_PATTERN +
+ r"/list/([0-9a-f-]+)(?:/[^/?#]*)?(?:\?tab=(\w+))?")
+ example = ("https://mangadex.org/list"
+ "/01234567-89ab-cdef-0123-456789abcdef/NAME")
+
+ def __init__(self, match):
+ MangadexExtractor.__init__(self, match)
+ if match.group(2) == "feed":
+ self.subcategory = "list-feed"
+ else:
+ self.items = self._items_titles
+
+ def chapters(self):
+ return self.api.list_feed(self.uuid)
+
+ def _items_titles(self):
+ data = {"_extractor": MangadexMangaExtractor}
+ for item in self.api.list(self.uuid)["relationships"]:
+ if item["type"] == "manga":
+ url = "{}/title/{}".format(self.root, item["id"])
+ yield Message.Queue, url, data
+
+
class MangadexAPI():
"""Interface for the MangaDex API v5
@@ -173,6 +199,12 @@ class MangadexAPI():
params = {"includes[]": ("scanlation_group",)}
return self._call("/chapter/" + uuid, params)["data"]
+ def list(self, uuid):
+ return self._call("/list/" + uuid)["data"]
+
+ def list_feed(self, uuid):
+ return self._pagination("/list/" + uuid + "/feed")
+
@memcache(keyarg=1)
def manga(self, uuid):
params = {"includes[]": ("artist", "author")}
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 0b63d6c..68b4196 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -75,7 +75,7 @@ class MastodonExtractor(BaseExtractor):
account["acct"], account["moved"]["acct"])
-INSTANCES = {
+BASE_PATTERN = MastodonExtractor.update({
"mastodon.social": {
"root" : "https://mastodon.social",
"pattern" : r"mastodon\.social",
@@ -100,9 +100,7 @@ INSTANCES = {
"client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
"client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
}
-}
-
-BASE_PATTERN = MastodonExtractor.update(INSTANCES) + "(?:/web)?"
+}) + "(?:/web)?"
class MastodonUserExtractor(MastodonExtractor):
@@ -174,10 +172,8 @@ class MastodonAPI():
if access_token is None or access_token == "cache":
access_token = _access_token_cache(extractor.instance)
if not access_token:
- try:
- access_token = INSTANCES[extractor.category]["access-token"]
- except (KeyError, TypeError):
- pass
+ access_token = extractor.config_instance("access-token")
+
if access_token:
self.headers = {"Authorization": "Bearer " + access_token}
else:
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index b991705..9614513 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -116,7 +116,8 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
yield from text.extract_iter(
page, 'href="javascript:void(0);"><img src="', '"')
else:
- yield text.extr(page, 'itemprop="image" src="', '"')
+ pos = page.find('id="view-center"') + 1
+ yield text.extract(page, 'itemprop="image" src="', '"', pos)[0]
@staticmethod
def _extract_user_name(page):
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index bc7b308..d36f509 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -235,10 +235,6 @@ BASE_PATTERN = NitterExtractor.update({
"root": "https://nitter.net",
"pattern": r"nitter\.net",
},
- "nitter.lacontrevoie.fr": {
- "root": "https://nitter.lacontrevoie.fr",
- "pattern": r"nitter\.lacontrevoie\.fr",
- },
"nitter.1d4.us": {
"root": "https://nitter.1d4.us",
"pattern": r"nitter\.1d4\.us",
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 1690160..8c8a5a9 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -358,8 +358,8 @@ class OAuthMastodon(OAuthBase):
yield Message.Version, 1
from . import mastodon
- for application in mastodon.INSTANCES.values():
- if self.instance == application["root"].partition("://")[2]:
+ for _, root, application in mastodon.MastodonExtractor.instances:
+ if self.instance == root.partition("://")[2]:
break
else:
application = self._register(self.instance)
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 89c0d2f..5226724 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -56,7 +56,7 @@ class PahealExtractor(Extractor):
"date" : text.parse_datetime(
extr("datetime='", "'"), "%Y-%m-%dT%H:%M:%S%z"),
"source" : text.unescape(text.extr(
- extr(">Source&nbsp;Link<", "</td>"), "href='", "'")),
+ extr(">Source Link<", "</td>"), "href='", "'")),
}
dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 6c2f39d..62d11f2 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -52,19 +52,29 @@ class PatreonExtractor(Extractor):
post["hash"] = fhash
post["type"] = kind
post["num"] += 1
- yield Message.Url, url, text.nameext_from_url(name, post)
+ text.nameext_from_url(name, post)
+ if text.ext_from_url(url) == "m3u8":
+ url = "ytdl:" + url
+ post["extension"] = "mp4"
+ yield Message.Url, url, post
else:
self.log.debug("skipping %s (%s %s)", url, fhash, kind)
- @staticmethod
- def _postfile(post):
+ def _postfile(self, post):
postfile = post.get("post_file")
if postfile:
- return (("postfile", postfile["url"], postfile["name"]),)
+ url = postfile["url"]
+ name = postfile.get("name")
+ if not name:
+ if url.startswith("https://stream.mux.com/"):
+ name = url
+ else:
+ name = self._filename(url) or url
+ return (("postfile", url, name),)
return ()
def _images(self, post):
- for image in post["images"]:
+ for image in post.get("images") or ():
url = image.get("download_url")
if url:
name = image.get("file_name") or self._filename(url) or url
@@ -80,7 +90,7 @@ class PatreonExtractor(Extractor):
return ()
def _attachments(self, post):
- for attachment in post["attachments"]:
+ for attachment in post.get("attachments") or ():
url = self.request(
attachment["url"], method="HEAD",
allow_redirects=False, fatal=False,
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index ac6a391..339646f 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -32,7 +32,7 @@ class PhilomenaExtractor(BooruExtractor):
post["date"] = text.parse_datetime(post["created_at"])
-INSTANCES = {
+BASE_PATTERN = PhilomenaExtractor.update({
"derpibooru": {
"root": "https://derpibooru.org",
"pattern": r"(?:www\.)?derpibooru\.org",
@@ -48,9 +48,7 @@ INSTANCES = {
"pattern": r"furbooru\.org",
"filter_id": "2",
},
-}
-
-BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
+})
class PhilomenaPostExtractor(PhilomenaExtractor):
@@ -176,10 +174,7 @@ class PhilomenaAPI():
if filter_id:
params["filter_id"] = filter_id
elif not api_key:
- try:
- params["filter_id"] = INSTANCES[extr.category]["filter_id"]
- except (KeyError, TypeError):
- params["filter_id"] = "2"
+ params["filter_id"] = extr.config_instance("filter_id") or "2"
params["page"] = extr.page_start
params["per_page"] = extr.per_page
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 4414c71..b9821f2 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -826,9 +826,9 @@ class PixivAppAPI():
extractor.session.headers.update({
"App-OS" : "ios",
- "App-OS-Version": "13.1.2",
- "App-Version" : "7.7.6",
- "User-Agent" : "PixivIOSApp/7.7.6 (iOS 13.1.2; iPhone11,8)",
+ "App-OS-Version": "16.7.2",
+ "App-Version" : "7.19.1",
+ "User-Agent" : "PixivIOSApp/7.19.1 (iOS 16.7.2; iPhone12,8)",
"Referer" : "https://app-api.pixiv.net/",
})
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 602895c..b3b7a9c 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor):
"""Extractor for single posts from sankaku.app"""
subcategory = "post"
archive_fmt = "{id}"
- pattern = BASE_PATTERN + r"/posts?(?:/show)?/([0-9a-f]+)"
+ pattern = BASE_PATTERN + r"/posts?(?:/show)?/(\w+)"
example = "https://sankaku.app/post/show/12345"
def __init__(self, match):
diff --git a/gallery_dl/extractor/shimmie2.py b/gallery_dl/extractor/shimmie2.py
index 8a08fab..67f38c4 100644
--- a/gallery_dl/extractor/shimmie2.py
+++ b/gallery_dl/extractor/shimmie2.py
@@ -19,17 +19,12 @@ class Shimmie2Extractor(BaseExtractor):
archive_fmt = "{id}"
def _init(self):
- try:
- instance = INSTANCES[self.category]
- except KeyError:
- return
-
- cookies = instance.get("cookies")
+ cookies = self.config_instance("cookies")
if cookies:
domain = self.root.rpartition("/")[2]
self.cookies_update_dict(cookies, domain=domain)
- file_url = instance.get("file_url")
+ file_url = self.config_instance("file_url")
if file_url:
self.file_url_fmt = file_url
@@ -73,15 +68,15 @@ class Shimmie2Extractor(BaseExtractor):
return "'"
-INSTANCES = {
+BASE_PATTERN = Shimmie2Extractor.update({
"loudbooru": {
"root": "https://loudbooru.com",
"pattern": r"loudbooru\.com",
"cookies": {"ui-tnc-agreed": "true"},
},
"giantessbooru": {
- "root": "https://giantessbooru.com",
- "pattern": r"giantessbooru\.com",
+ "root": "https://sizechangebooru.com",
+ "pattern": r"(?:sizechange|giantess)booru\.com",
"cookies": {"agreed": "true"},
},
"tentaclerape": {
@@ -97,9 +92,7 @@ INSTANCES = {
"root": "https://rule34hentai.net",
"pattern": r"rule34hentai\.net",
},
-}
-
-BASE_PATTERN = Shimmie2Extractor.update(INSTANCES) + r"/(?:index\.php\?q=/?)?"
+}) + r"/(?:index\.php\?q=/?)?"
class Shimmie2TagExtractor(Shimmie2Extractor):
@@ -183,25 +176,25 @@ class Shimmie2TagExtractor(Shimmie2Extractor):
extr = text.extract_from(self.request(url).text)
while True:
- pid = extr('href="./index.php?q=/post/view/', '&')
+ pid = extr("href='./index.php?q=/post/view/", "&")
if not pid:
break
- tags, dimensions, size = extr('title="', '"').split(" // ")
+ tags, dimensions, size = extr("title='", "'").split(" // ")
width, _, height = dimensions.partition("x")
yield {
"file_url": file_url_fmt(pid),
- "id": pid,
- "md5": "",
- "tags": tags,
- "width": width,
- "height": height,
- "size": text.parse_bytes(size[:-1]),
+ "id" : pid,
+ "md5" : "",
+ "tags" : tags,
+ "width" : width,
+ "height" : height,
+ "size" : text.parse_bytes(size[:-1]),
}
pnum += 1
- if not extr('/{}">{}<'.format(pnum, pnum), ">"):
+ if not extr("/{0}'>{0}<".format(pnum), ">"):
return
@@ -248,7 +241,7 @@ class Shimmie2PostExtractor(Shimmie2Extractor):
"id" : self.post_id,
"tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"),
"md5" : "",
- "file_url": self.root + extr('id="main_image" src=".', '"'),
+ "file_url": self.root + extr("id='main_image' src='.", "'"),
"width" : extr("orig_width =", ";"),
"height" : 0,
"size" : 0,
diff --git a/gallery_dl/extractor/steamgriddb.py b/gallery_dl/extractor/steamgriddb.py
new file mode 100644
index 0000000..9d46fd6
--- /dev/null
+++ b/gallery_dl/extractor/steamgriddb.py
@@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.steamgriddb.com"""
+
+from .common import Extractor, Message
+from .. import text, exception
+
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?steamgriddb\.com"
+LANGUAGE_CODES = (
+ "aa", "ab", "ae", "af", "ak", "am", "an", "ar", "as", "av", "ay", "az",
+ "ba", "be", "bg", "bh", "bi", "bm", "bn", "bo", "br", "bs", "ca", "ce",
+ "ch", "co", "cr", "cs", "cu", "cv", "cy", "da", "de", "dv", "dz", "ee",
+ "el", "en", "eo", "es", "et", "eu", "fa", "ff", "fi", "fj", "fo", "fr",
+ "fy", "ga", "gd", "gl", "gn", "gu", "gv", "ha", "he", "hi", "ho", "hr",
+ "ht", "hu", "hy", "hz", "ia", "id", "ie", "ig", "ii", "ik", "io", "is",
+ "it", "iu", "ja", "jv", "ka", "kg", "ki", "kj", "kk", "kl", "km", "kn",
+ "ko", "kr", "ks", "ku", "kv", "kw", "ky", "la", "lb", "lg", "li", "ln",
+ "lo", "lt", "lu", "lv", "mg", "mh", "mi", "mk", "ml", "mn", "mr", "ms",
+ "mt", "my", "na", "nb", "nd", "ne", "ng", "nl", "nn", "no", "nr", "nv",
+ "ny", "oc", "oj", "om", "or", "os", "pa", "pi", "pl", "ps", "pt", "qu",
+ "rm", "rn", "ro", "ru", "rw", "sa", "sc", "sd", "se", "sg", "si", "sk",
+ "sl", "sm", "sn", "so", "sq", "sr", "ss", "st", "su", "sv", "sw", "ta",
+ "te", "tg", "th", "ti", "tk", "tl", "tn", "to", "tr", "ts", "tt", "tw",
+ "ty", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi",
+ "yo", "za", "zh", "zu",
+)
+FILE_EXT_TO_MIME = {
+ "png": "image/png",
+ "jpeg": "image/jpeg",
+ "jpg": "image/jpeg",
+ "webp": "image/webp",
+ "ico": "image/vnd.microsoft.icon",
+ "all": "all",
+}
+
+
+class SteamgriddbExtractor(Extractor):
+ """Base class for SteamGridDB"""
+ category = "steamgriddb"
+ directory_fmt = ("{category}", "{subcategory}", "{game[id]}")
+ filename_fmt = "{game[id]}_{id}_{num:>02}.{extension}"
+ archive_fmt = "{filename}"
+ root = "https://www.steamgriddb.com"
+
+ def _init(self):
+ self.cookies_update({
+ "userprefs": "%7B%22adult%22%3Afalse%7D",
+ })
+
+ def items(self):
+ download_fake_png = self.config("download-fake-png", True)
+
+ for asset in self.assets():
+ if download_fake_png and asset.get("fake_png"):
+ urls = (asset["url"], asset["fake_png"])
+ else:
+ urls = (asset["url"],)
+
+ asset["count"] = len(urls)
+ yield Message.Directory, asset
+ for asset["num"], url in enumerate(urls, 1):
+ yield Message.Url, url, text.nameext_from_url(url, asset)
+
+ def _call(self, endpoint, **kwargs):
+ data = self.request(self.root + endpoint, **kwargs).json()
+ if not data["success"]:
+ raise exception.StopExtraction(data["error"])
+ return data["data"]
+
+
+class SteamgriddbAssetsExtractor(SteamgriddbExtractor):
+ """Base class for extracting a list of assets"""
+
+ def __init__(self, match):
+ SteamgriddbExtractor.__init__(self, match)
+ list_type = match.group(1)
+ id = int(match.group(2))
+ self.game_id = id if list_type == "game" else None
+ self.collection_id = id if list_type == "collection" else None
+ self.page = int(match.group(3) or 1)
+
+ def assets(self):
+ limit = 48
+ page = max(self.page - 1, 0)
+
+ sort = self.config("sort", "score_desc")
+ if sort not in ("score_desc", "score_asc", "score_old_desc",
+ "score_old_asc", "age_desc", "age_asc"):
+ raise exception.StopExtraction("Invalid sort '%s'", sort)
+
+ json = {
+ "static" : self.config("static", True),
+ "animated": self.config("animated", True),
+ "humor" : self.config("humor", True),
+ "nsfw" : self.config("nsfw", True),
+ "epilepsy": self.config("epilepsy", True),
+ "untagged": self.config("untagged", True),
+
+ "asset_type": self.asset_type,
+ "limit": limit,
+ "order": sort,
+ }
+ if self.valid_dimensions:
+ json["dimensions"] = self.config_list(
+ "dimensions", "dimension", self.valid_dimensions)
+ json["styles"] = self.config_list("styles", "style", self.valid_styles)
+ json["languages"] = self.config_list(
+ "languages", "language", LANGUAGE_CODES)
+ file_types = self.config_list(
+ "file-types", "file type", self.valid_file_types)
+ json["mime"] = [FILE_EXT_TO_MIME[i] for i in file_types]
+
+ if self.game_id:
+ json["game_id"] = [self.game_id]
+ else:
+ json["collection_id"] = self.collection_id
+
+ while True:
+ json["page"] = page
+
+ data = self._call(
+ "/api/public/search/assets", method="POST", json=json)
+ for asset in data["assets"]:
+ if not asset.get("game"):
+ asset["game"] = data["game"]
+ yield asset
+
+ if data["total"] <= limit * page:
+ break
+ page += 1
+
+ def config_list(self, key, type_name, valid_values):
+ value = self.config(key)
+ if isinstance(value, str):
+ value = value.split(",")
+
+ if value is None or "all" in value:
+ return ["all"]
+
+ for i in value:
+ if i not in valid_values:
+ raise exception.StopExtraction("Invalid %s '%s'", type_name, i)
+
+ return value
+
+
+class SteamgriddbAssetExtractor(SteamgriddbExtractor):
+ """Extractor for a single asset"""
+ subcategory = "asset"
+ pattern = BASE_PATTERN + r"/(grid|hero|logo|icon)/(\d+)"
+ example = "https://www.steamgriddb.com/grid/1234"
+
+ def __init__(self, match):
+ SteamgriddbExtractor.__init__(self, match)
+ self.asset_type = match.group(1)
+ self.asset_id = match.group(2)
+
+ def assets(self):
+ endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
+ asset = self._call(endpoint)["asset"]
+ return (asset,)
+
+
+class SteamgriddbGridsExtractor(SteamgriddbAssetsExtractor):
+ subcategory = "grids"
+ asset_type = "grid"
+ pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/grids(?:/(\d+))?"
+ example = "https://www.steamgriddb.com/game/1234/grids"
+ valid_dimensions = ("460x215", "920x430", "600x900", "342x482", "660x930",
+ "512x512", "1024x1024")
+ valid_styles = ("alternate", "blurred", "no_logo", "material",
+ "white_logo")
+ valid_file_types = ("png", "jpeg", "jpg", "webp")
+
+
+class SteamgriddbHeroesExtractor(SteamgriddbAssetsExtractor):
+ subcategory = "heroes"
+ asset_type = "hero"
+ pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/heroes(?:/(\d+))?"
+ example = "https://www.steamgriddb.com/game/1234/heroes"
+ valid_dimensions = ("1920x620", "3840x1240", "1600x650")
+ valid_styles = ("alternate", "blurred", "material")
+ valid_file_types = ("png", "jpeg", "jpg", "webp")
+
+
+class SteamgriddbLogosExtractor(SteamgriddbAssetsExtractor):
+ subcategory = "logos"
+ asset_type = "logo"
+ pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/logos(?:/(\d+))?"
+ example = "https://www.steamgriddb.com/game/1234/logos"
+ valid_dimensions = None
+ valid_styles = ("official", "white", "black", "custom")
+ valid_file_types = ("png", "webp")
+
+
+class SteamgriddbIconsExtractor(SteamgriddbAssetsExtractor):
+ subcategory = "icons"
+ asset_type = "icon"
+ pattern = BASE_PATTERN + r"/(game|collection)/(\d+)/icons(?:/(\d+))?"
+ example = "https://www.steamgriddb.com/game/1234/icons"
+ valid_dimensions = ["{0}x{0}".format(i) for i in (8, 10, 14, 16, 20, 24,
+ 28, 32, 35, 40, 48, 54, 56, 57, 60, 64, 72, 76, 80, 90,
+ 96, 100, 114, 120, 128, 144, 150, 152, 160, 180, 192,
+ 194, 256, 310, 512, 768, 1024)]
+ valid_styles = ("official", "custom")
+ valid_file_types = ("png", "ico")
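config_list() above normalizes an option that may arrive as a comma-separated string, a list, or not at all, and validates every entry. A standalone sketch of the same logic, with ValueError standing in for exception.StopExtraction:

    def config_list(value, type_name, valid_values):
        if isinstance(value, str):
            value = value.split(",")
        if value is None or "all" in value:
            return ["all"]
        for i in value:
            if i not in valid_values:
                raise ValueError("Invalid {} '{}'".format(type_name, i))
        return value

    assert config_list(None, "style", ("official", "custom")) == ["all"]
    assert config_list("png,ico", "file type", ("png", "ico")) == ["png", "ico"]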
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index aa9ab9f..cf759e0 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -546,7 +546,7 @@ class TwitterTimelineExtractor(TwitterExtractor):
def _select_tweet_source(self):
strategy = self.config("strategy")
if strategy is None or strategy == "auto":
- if self.retweets or self.replies or self.textonly:
+ if self.retweets or self.textonly:
return self.api.user_tweets
else:
return self.api.user_media
diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py
index f2e6521..49a3deb 100644
--- a/gallery_dl/extractor/urlshortener.py
+++ b/gallery_dl/extractor/urlshortener.py
@@ -15,7 +15,7 @@ class UrlshortenerExtractor(BaseExtractor):
basecategory = "urlshortener"
-INSTANCES = {
+BASE_PATTERN = UrlshortenerExtractor.update({
"bitly": {
"root": "https://bit.ly",
"pattern": r"bit\.ly",
@@ -26,9 +26,7 @@ INSTANCES = {
"root": "https://t.co",
"pattern": r"t\.co",
},
-}
-
-BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES)
+})
class UrlshortenerLinkExtractor(UrlshortenerExtractor):
@@ -42,10 +40,7 @@ class UrlshortenerLinkExtractor(UrlshortenerExtractor):
self.id = match.group(match.lastindex)
def _init(self):
- try:
- self.headers = INSTANCES[self.category]["headers"]
- except Exception:
- self.headers = None
+ self.headers = self.config_instance("headers")
def items(self):
response = self.request(
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index c22e67e..95eeafe 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -10,6 +10,7 @@
from .common import Extractor, Message
from .. import text, exception
+import re
BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com"
@@ -24,6 +25,7 @@ class VkExtractor(Extractor):
request_interval = (0.5, 1.5)
def items(self):
+ sub = re.compile(r"/imp[fg]/").sub
sizes = "wzyxrqpo"
data = self.metadata()
@@ -40,11 +42,15 @@ class VkExtractor(Extractor):
continue
try:
- photo["url"] = photo[size + "src"]
+ url = photo[size + "src"]
except KeyError:
self.log.warning("no photo URL found (%s)", photo.get("id"))
continue
+ photo["url"] = sub("/", url.partition("?")[0])
+ # photo["url"] = url
+ photo["_fallback"] = (url,)
+
try:
_, photo["width"], photo["height"] = photo[size]
except ValueError:
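The substitution above removes VK's "/impf/" or "/impg/" resizing-proxy segment (and the size query string) to address the original file, keeping the proxied URL as a fallback. With a hypothetical photo URL:

    import re

    sub = re.compile(r"/imp[fg]/").sub

    url = "https://sun9-1.userapi.com/impg/abc/photo.jpg?size=1280x960"
    print(sub("/", url.partition("?")[0]))
    # -> https://sun9-1.userapi.com/abc/photo.jpg
    fallback = (url,)   # original proxied URL kept as _fallback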
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index 3f2f410..949c7cb 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -87,23 +87,41 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
self.episode_no = params.get("episode_no")
def metadata(self, page):
- keywords, pos = text.extract(
- page, '<meta name="keywords" content="', '"')
- title, pos = text.extract(
- page, '<meta property="og:title" content="', '"', pos)
- descr, pos = text.extract(
- page, '<meta property="og:description" content="', '"', pos)
+ extr = text.extract_from(page)
+ title = extr('<meta property="og:title" content="', '"')
+ descr = extr('<meta property="og:description" content="', '"')
+
+ if extr('<div class="subj_info"', '\n'):
+ comic_name = extr('>', '<')
+ episode_name = extr('<h1 class="subj_episode" title="', '"')
+ else:
+ comic_name = episode_name = ""
+
+ if extr('<span class="tx _btnOpenEpisodeList ', '"'):
+ episode = extr('>#', '<')
+ else:
+ episode = ""
+
+ if extr('<div class="author_area"', '\n'):
+ username = extr('/creator/', '"')
+ author_name = extr('<span>', '</span>')
+ else:
+ username = author_name = ""
return {
- "genre" : self.genre,
- "comic" : self.comic,
- "title_no" : self.title_no,
- "episode_no" : self.episode_no,
- "title" : text.unescape(title),
- "episode" : keywords.split(", ")[1],
- "description": text.unescape(descr),
- "lang" : self.lang,
- "language" : util.code_to_language(self.lang),
+ "genre" : self.genre,
+ "comic" : self.comic,
+ "title_no" : self.title_no,
+ "episode_no" : self.episode_no,
+ "title" : text.unescape(title),
+ "episode" : episode,
+ "comic_name" : text.unescape(comic_name),
+ "episode_name": text.unescape(episode_name),
+ "username" : username,
+ "author_name" : text.unescape(author_name),
+ "description" : text.unescape(descr),
+ "lang" : self.lang,
+ "language" : util.code_to_language(self.lang),
}
@staticmethod
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
new file mode 100644
index 0000000..1eafc29
--- /dev/null
+++ b/gallery_dl/extractor/wikimedia.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2022 Ailothaen
+# Copyright 2024 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Wikimedia sites"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class WikimediaExtractor(BaseExtractor):
+ """Base class for wikimedia extractors"""
+ basecategory = "wikimedia"
+ filename_fmt = "{filename} ({sha1[:8]}).{extension}"
+ directory_fmt = ("{category}", "{page}")
+ archive_fmt = "{sha1}"
+ request_interval = (1.0, 2.0)
+
+ def __init__(self, match):
+ BaseExtractor.__init__(self, match)
+ path = match.group(match.lastindex)
+
+ if self.category == "fandom":
+ self.category = \
+ "fandom-" + self.root.partition(".")[0].rpartition("/")[2]
+
+ if path.startswith("wiki/"):
+ path = path[5:]
+ self.api_path = "/w/api.php"
+ else:
+ self.api_path = "/api.php"
+
+ pre, sep, _ = path.partition(":")
+ prefix = pre.lower() if sep else None
+
+ self.title = path = text.unquote(path)
+ if prefix:
+ self.subcategory = prefix
+
+ if prefix == "category":
+ self.params = {
+ "generator": "categorymembers",
+ "gcmtitle" : path,
+ "gcmtype" : "file",
+ }
+ elif prefix == "file":
+ self.params = {
+ "titles" : path,
+ }
+ else:
+ self.params = {
+ "generator": "images",
+ "titles" : path,
+ }
+
+ def _init(self):
+ api_path = self.config_instance("api-path")
+ if api_path:
+ if api_path[0] == "/":
+ self.api_url = self.root + api_path
+ else:
+ self.api_url = api_path
+ else:
+ self.api_url = self.root + self.api_path
+
+ def items(self):
+ for info in self._pagination(self.params):
+ image = info["imageinfo"][0]
+
+ image["metadata"] = {
+ m["name"]: m["value"]
+ for m in image["metadata"]}
+ image["commonmetadata"] = {
+ m["name"]: m["value"]
+ for m in image["commonmetadata"]}
+
+ filename = image["canonicaltitle"]
+ image["filename"], _, image["extension"] = \
+ filename.partition(":")[2].rpartition(".")
+ image["date"] = text.parse_datetime(
+ image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+ image["page"] = self.title
+
+ yield Message.Directory, image
+ yield Message.Url, image["url"], image
+
+ def _pagination(self, params):
+ """
+ https://www.mediawiki.org/wiki/API:Query
+ https://opendata.stackexchange.com/questions/13381
+ """
+
+ url = self.api_url
+ params["action"] = "query"
+ params["format"] = "json"
+ params["prop"] = "imageinfo"
+ params["iiprop"] = (
+ "timestamp|user|userid|comment|canonicaltitle|url|size|"
+ "sha1|mime|metadata|commonmetadata|extmetadata|bitdepth"
+ )
+
+ while True:
+ data = self.request(url, params=params).json()
+
+ try:
+ pages = data["query"]["pages"]
+ except KeyError:
+ pass
+ else:
+ yield from pages.values()
+
+ try:
+ continuation = data["continue"]
+ except KeyError:
+ break
+ params.update(continuation)
+
+
+BASE_PATTERN = WikimediaExtractor.update({
+ "wikipedia": {
+ "root": None,
+ "pattern": r"[a-z]{2,}\.wikipedia\.org",
+ },
+ "wiktionary": {
+ "root": None,
+ "pattern": r"[a-z]{2,}\.wiktionary\.org",
+ },
+ "wikiquote": {
+ "root": None,
+ "pattern": r"[a-z]{2,}\.wikiquote\.org",
+ },
+ "wikibooks": {
+ "root": None,
+ "pattern": r"[a-z]{2,}\.wikibooks\.org",
+ },
+ "wikisource": {
+ "root": None,
+ "pattern": r"[a-z]{2,}\.wikisource\.org",
+ },
+ "wikinews": {
+ "root": None,
+ "pattern": r"[a-z]{2,}\.wikinews\.org",
+ },
+ "wikiversity": {
+ "root": None,
+ "pattern": r"[a-z]{2,}\.wikiversity\.org",
+ },
+ "wikispecies": {
+ "root": "https://species.wikimedia.org",
+ "pattern": r"species\.wikimedia\.org",
+ },
+ "wikimediacommons": {
+ "root": "https://commons.wikimedia.org",
+ "pattern": r"commons\.wikimedia\.org",
+ },
+ "mediawiki": {
+ "root": "https://www.mediawiki.org",
+ "pattern": r"(?:www\.)?mediawiki\.org",
+ },
+ "fandom": {
+ "root": None,
+ "pattern": r"[\w-]+\.fandom\.com",
+ "api-path": "/api.php",
+ },
+ "mariowiki": {
+ "root": "https://www.mariowiki.com",
+ "pattern": r"(?:www\.)?mariowiki\.com",
+ },
+})
+
+
+class WikimediaArticleExtractor(WikimediaExtractor):
+ """Extractor for wikimedia articles"""
+ subcategory = "article"
+ pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
+ example = "https://en.wikipedia.org/wiki/TITLE"
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index 15905d6..f99beaa 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.26.6"
+__version__ = "1.26.7"