author     Unit 193 <unit193@unit193.net>  2023-03-13 02:07:49 -0400
committer  Unit 193 <unit193@unit193.net>  2023-03-13 02:07:49 -0400
commit     10987f08f8b6c510ba64f4b42d95ba67eec6e5b0 (patch)
tree       1af82cad9ac859a70cafc976a980280b939cfcc7 /gallery_dl/extractor
parent     919f8ba16a7b82ba1099bd25b2c61c7881a05aa2 (diff)

New upstream version 1.25.0 (upstream/1.25.0)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/500px.py         |   6
-rw-r--r--  gallery_dl/extractor/8muses.py        |   7
-rw-r--r--  gallery_dl/extractor/__init__.py      |  40
-rw-r--r--  gallery_dl/extractor/bbc.py           |   5
-rw-r--r--  gallery_dl/extractor/bcy.py           |   9
-rw-r--r--  gallery_dl/extractor/behance.py       |   5
-rw-r--r--  gallery_dl/extractor/blogger.py       |   7
-rw-r--r--  gallery_dl/extractor/bunkr.py         |  92
-rw-r--r--  gallery_dl/extractor/catbox.py        |  27
-rw-r--r--  gallery_dl/extractor/common.py        |  15
-rw-r--r--  gallery_dl/extractor/danbooru.py      | 178
-rw-r--r--  gallery_dl/extractor/deviantart.py    | 259
-rw-r--r--  gallery_dl/extractor/directlink.py    |   5
-rw-r--r--  gallery_dl/extractor/dynastyscans.py  |   7
-rw-r--r--  gallery_dl/extractor/e621.py          | 254
-rw-r--r--  gallery_dl/extractor/erome.py         |   3
-rw-r--r--  gallery_dl/extractor/fallenangels.py  |   5
-rw-r--r--  gallery_dl/extractor/fanbox.py        |   2
-rw-r--r--  gallery_dl/extractor/fantia.py        |   5
-rw-r--r--  gallery_dl/extractor/foolslide.py     |   5
-rw-r--r--  gallery_dl/extractor/gelbooru.py      |  51
-rw-r--r--  gallery_dl/extractor/generic.py       |  24
-rw-r--r--  gallery_dl/extractor/hbrowse.py       |   9
-rw-r--r--  gallery_dl/extractor/hentai2read.py   |   5
-rw-r--r--  gallery_dl/extractor/hentaifox.py     |   7
-rw-r--r--  gallery_dl/extractor/hentaihand.py    |   5
-rw-r--r--  gallery_dl/extractor/hentaihere.py    |   7
-rw-r--r--  gallery_dl/extractor/hitomi.py        |   5
-rw-r--r--  gallery_dl/extractor/imagefap.py      |  37
-rw-r--r--  gallery_dl/extractor/imagehosts.py    |  19
-rw-r--r--  gallery_dl/extractor/imgbb.py         |   5
-rw-r--r--  gallery_dl/extractor/instagram.py     |  20
-rw-r--r--  gallery_dl/extractor/issuu.py         |   7
-rw-r--r--  gallery_dl/extractor/lightroom.py     |   7
-rw-r--r--  gallery_dl/extractor/mangadex.py      |   8
-rw-r--r--  gallery_dl/extractor/manganelo.py     | 124
-rw-r--r--  gallery_dl/extractor/mangapark.py     |   7
-rw-r--r--  gallery_dl/extractor/mangasee.py      |  53
-rw-r--r--  gallery_dl/extractor/misskey.py       | 202
-rw-r--r--  gallery_dl/extractor/nana.py          |   5
-rw-r--r--  gallery_dl/extractor/newgrounds.py    |  13
-rw-r--r--  gallery_dl/extractor/nhentai.py       |   5
-rw-r--r--  gallery_dl/extractor/nitter.py        |  33
-rw-r--r--  gallery_dl/extractor/oauth.py         |  19
-rw-r--r--  gallery_dl/extractor/patreon.py       |   7
-rw-r--r--  gallery_dl/extractor/photobucket.py   |   4
-rw-r--r--  gallery_dl/extractor/pinterest.py     |  13
-rw-r--r--  gallery_dl/extractor/plurk.py         |  10
-rw-r--r--  gallery_dl/extractor/poipiku.py       |   2
-rw-r--r--  gallery_dl/extractor/pornpics.py      | 173
-rw-r--r--  gallery_dl/extractor/pururin.py       |   5
-rw-r--r--  gallery_dl/extractor/reactor.py       |   9
-rw-r--r--  gallery_dl/extractor/reddit.py        |  58
-rw-r--r--  gallery_dl/extractor/redgifs.py       | 102
-rw-r--r--  gallery_dl/extractor/shopify.py       |   7
-rw-r--r--  gallery_dl/extractor/slideshare.py    |   7
-rw-r--r--  gallery_dl/extractor/soundgasm.py     |  93
-rw-r--r--  gallery_dl/extractor/subscribestar.py |   7
-rw-r--r--  gallery_dl/extractor/szurubooru.py    | 144
-rw-r--r--  gallery_dl/extractor/telegraph.py     |  20
-rw-r--r--  gallery_dl/extractor/tumblr.py        |  20
-rw-r--r--  gallery_dl/extractor/twitter.py       |   8
-rw-r--r--  gallery_dl/extractor/vsco.py          |   7
-rw-r--r--  gallery_dl/extractor/weibo.py         |   7
-rw-r--r--  gallery_dl/extractor/wikifeet.py      |   5
-rw-r--r--  gallery_dl/extractor/xhamster.py      |   6
-rw-r--r--  gallery_dl/extractor/xvideos.py       |   5
67 files changed, 1732 insertions, 600 deletions
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index b2ae963..1213194 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
"""Extractors for https://500px.com/"""
from .common import Extractor, Message
-import json
+from .. import util
BASE_PATTERN = r"(?:https?://)?(?:web\.)?500px\.com"
@@ -86,7 +86,7 @@ class _500pxExtractor(Extractor):
}
data = {
"operationName": opname,
- "variables" : json.dumps(variables),
+ "variables" : util.json_dumps(variables),
"query" : QUERIES[opname],
}
return self.request(
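The change above swaps the stdlib json module for a shared util.json_dumps helper. A minimal sketch of such a wrapper (the exact options gallery-dl pins are an assumption here, not shown in this diff):

import json

def json_dumps(obj):
    # compact separators keep request payloads small; ensure_ascii=False
    # leaves non-ASCII text intact (both options assumed for illustration)
    return json.dumps(obj, separators=(",", ":"), ensure_ascii=False)

payload = {"operationName": "GalleryQuery", "variables": {"page": 1}}
print(json_dumps(payload))
# {"operationName":"GalleryQuery","variables":{"page":1}}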
diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py
index fed4991..26ac8b2 100644
--- a/gallery_dl/extractor/8muses.py
+++ b/gallery_dl/extractor/8muses.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://comics.8muses.com/"""
from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
class _8musesAlbumExtractor(Extractor):
@@ -131,7 +130,7 @@ class _8musesAlbumExtractor(Extractor):
@staticmethod
def _unobfuscate(data):
- return json.loads("".join([
+ return util.json_loads("".join([
chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
for c in text.unescape(data.strip("\t\n\r !"))
]))
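The _unobfuscate transform above is the classic ROT47 rotation over printable ASCII, which is its own inverse; decoding the payload is just applying the same rotation again. A standalone round-trip demo:

def rot47(s):
    # rotate the printable ASCII range 33..126 by 47 positions (self-inverse)
    return "".join(
        chr(33 + (ord(c) + 14) % 94) if "!" <= c <= "~" else c
        for c in s
    )

scrambled = rot47('{"id": 1}')
print(scrambled)         # LQ:5Qi `N
print(rot47(scrambled))  # {"id": 1}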
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 6140c2c..3968d72 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -1,11 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
+import sys
import re
modules = [
@@ -34,6 +35,7 @@ modules = [
"desktopography",
"deviantart",
"dynastyscans",
+ "e621",
"erome",
"exhentai",
"fallenangels",
@@ -92,6 +94,7 @@ modules = [
"mangasee",
"mangoxo",
"mememuseum",
+ "misskey",
"myhentaigallery",
"myportfolio",
"nana",
@@ -118,6 +121,7 @@ modules = [
"plurk",
"poipiku",
"pornhub",
+ "pornpics",
"pururin",
"reactor",
"readcomiconline",
@@ -137,6 +141,7 @@ modules = [
"soundgasm",
"speakerdeck",
"subscribestar",
+ "szurubooru",
"tapas",
"tcbscans",
"telegraph",
@@ -217,20 +222,33 @@ def extractors():
# --------------------------------------------------------------------
# internals
-_cache = []
-_module_iter = iter(modules)
-
def _list_classes():
- """Yield all available extractor classes"""
+ """Yield available extractor classes"""
yield from _cache
- globals_ = globals()
- for module_name in _module_iter:
- module = __import__(module_name, globals_, None, (), 1)
+ for module in _module_iter:
yield from add_module(module)
- globals_["_list_classes"] = lambda : _cache
+ globals()["_list_classes"] = lambda : _cache
+
+
+def _modules_internal():
+ globals_ = globals()
+ for module_name in modules:
+ yield __import__(module_name, globals_, None, (), 1)
+
+
+def _modules_path(path, files):
+ sys.path.insert(0, path)
+ try:
+ return [
+ __import__(name[:-3])
+ for name in files
+ if name.endswith(".py")
+ ]
+ finally:
+ del sys.path[0]
def _get_classes(module):
@@ -240,3 +258,7 @@ def _get_classes(module):
hasattr(cls, "pattern") and cls.__module__ == module.__name__
)
]
+
+
+_cache = []
+_module_iter = _modules_internal()
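The restructured __init__.py makes extractor imports lazy: _modules_internal() is a generator, so each module is imported only when _list_classes() actually reaches it, and _list_classes replaces itself with a plain cache lookup once the iterator is exhausted. _modules_path() additionally allows loading extra extractor modules from a filesystem directory. A minimal sketch of the lazy-import part (module names are stand-ins):

import importlib

MODULES = ["json", "csv", "re"]   # stand-ins for extractor module names

_cache = []
_module_iter = (importlib.import_module(name) for name in MODULES)

def list_classes():
    yield from _cache
    for module in _module_iter:   # import happens here, on demand
        _cache.append(module)
        yield module

first = next(list_classes())      # imports only the first module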
diff --git a/gallery_dl/extractor/bbc.py b/gallery_dl/extractor/bbc.py
index 1b49d6a..638fedc 100644
--- a/gallery_dl/extractor/bbc.py
+++ b/gallery_dl/extractor/bbc.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import GalleryExtractor, Extractor, Message
from .. import text, util
-import json
BASE_PATTERN = r"(?:https?://)?(?:www\.)?bbc\.co\.uk(/programmes/"
@@ -38,7 +37,7 @@ class BbcGalleryExtractor(GalleryExtractor):
)
def metadata(self, page):
- data = json.loads(text.extr(
+ data = util.json_loads(text.extr(
page, '<script type="application/ld+json">', '</script>'))
return {
"programme": self.gallery_url.split("/")[4],
diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py
index 44d6065..d6adb4e 100644
--- a/gallery_dl/extractor/bcy.py
+++ b/gallery_dl/extractor/bcy.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2022 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://bcy.net/"""
from .common import Extractor, Message
-from .. import text, exception
-import json
+from .. import text, util, exception
import re
@@ -100,9 +99,9 @@ class BcyExtractor(Extractor):
.replace('\\\\u002F', '/')
.replace('\\"', '"'))
try:
- return json.loads(data)["detail"]
+ return util.json_loads(data)["detail"]
except ValueError:
- return json.loads(data.replace('\\"', '"'))["detail"]
+ return util.json_loads(data.replace('\\"', '"'))["detail"]
class BcyUserExtractor(BcyExtractor):
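The try/except around util.json_loads above implements a two-stage recovery for inconsistently escaped page data: parse the extracted payload as-is, and on failure retry with the remaining escaped quotes unescaped. The same idea with stdlib json:

import json

def loads_lenient(data):
    try:
        return json.loads(data)
    except ValueError:
        # retry with \" unescaped to plain "
        return json.loads(data.replace('\\"', '"'))

print(loads_lenient('{"detail": "ok"}'))          # parses directly
print(loads_lenient('{\\"detail\\": \\"ok\\"}'))  # parses after unescaping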
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index 6da6175..1469aad 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -9,8 +9,7 @@
"""Extractors for https://www.behance.net/"""
from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
class BehanceExtractor(Extractor):
@@ -120,7 +119,7 @@ class BehanceGalleryExtractor(BehanceExtractor):
}
page = self.request(url, cookies=cookies).text
- data = json.loads(text.extr(
+ data = util.json_loads(text.extr(
page, 'id="beconfig-store_state">', '</script>'))
return self._update(data["project"]["project"])
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 8a1a42e..56010c2 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for Blogger blogs"""
from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
import re
BASE_PATTERN = (
@@ -61,7 +60,7 @@ class BloggerExtractor(Extractor):
page = self.request(post["url"]).text
for url in findall_video(page):
page = self.request(url).text
- video_config = json.loads(text.extr(
+ video_config = util.json_loads(text.extr(
page, 'var VIDEO_CONFIG =', '\n'))
files.append(max(
video_config["streams"],
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 1c339a9..17d066d 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,20 +6,19 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://bunkr.ru/"""
+"""Extractors for https://bunkr.su/"""
from .lolisafe import LolisafeAlbumExtractor
from .. import text
-import json
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
- """Extractor for bunkr.ru albums"""
+ """Extractor for bunkr.su albums"""
category = "bunkr"
- root = "https://bunkr.ru"
- pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:ru|is|to)/a/([^/?#]+)"
+ root = "https://bunkr.su"
+ pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:[sr]u|is|to)/a/([^/?#]+)"
test = (
- ("https://bunkr.ru/a/Lktg9Keq", {
+ ("https://bunkr.su/a/Lktg9Keq", {
"pattern": r"https://cdn\.bunkr\.ru/test-テスト-\"&>-QjgneIQv\.png",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
"keyword": {
@@ -33,7 +32,7 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
},
}),
# mp4 (#2239)
- ("https://app.bunkr.is/a/ptRHaCn2", {
+ ("https://app.bunkr.ru/a/ptRHaCn2", {
"pattern": r"https://media-files\.bunkr\.ru/_-RnHoW69L\.mp4",
"content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
}),
@@ -41,44 +40,57 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
("https://bunkr.is/a/iXTTc1o2", {
"pattern": r"https://(cdn|media-files)4\.bunkr\.ru/",
"content": "da29aae371b7adc8c5ef8e6991b66b69823791e8",
+ "keyword": {
+ "album_id": "iXTTc1o2",
+ "album_name": "test2",
+ "album_size": "691.1 KB",
+ "count": 2,
+ "description": "072022",
+ "filename": "re:video-wFO9FtxG|image-sZrQUeOx",
+ "id": "re:wFO9FtxG|sZrQUeOx",
+ "name": "re:video|image",
+ "num": int,
+ },
}),
("https://bunkr.to/a/Lktg9Keq"),
)
def fetch_album(self, album_id):
- root = self.root
+ # album metadata
+ page = self.request(self.root + "/a/" + self.album_id).text
+ info = text.split_html(text.extr(
+ page, "<h1", "</div>").partition(">")[2])
+ count, _, size = info[1].split(None, 2)
+
+ # files
+ cdn = None
+ files = []
+ append = files.append
+ headers = {"Referer": self.root.replace("://", "://stream.", 1) + "/"}
- try:
- data = json.loads(text.extr(
- self.request(root + "/a/" + self.album_id).text,
- 'id="__NEXT_DATA__" type="application/json">', '<'))
- album = data["props"]["pageProps"]["album"]
- files = album["files"]
- except Exception as exc:
- self.log.debug("%s: %s", exc.__class__.__name__, exc)
- self.log.debug("Falling back to lolisafe API")
- self.root = root.replace("://", "://app.", 1)
- files, data = LolisafeAlbumExtractor.fetch_album(self, album_id)
- # fix file URLs (bunkr..ru -> bunkr.ru) (#3481)
- for file in files:
- file["file"] = file["file"].replace("bunkr..", "bunkr.", 1)
- else:
- for file in files:
- file["file"] = file["cdn"] + "/" + file["name"]
- data = {
- "album_id" : self.album_id,
- "album_name" : text.unescape(album["name"]),
- "description": text.unescape(album["description"]),
- "count" : len(files),
- }
+ pos = page.index('class="grid-images')
+ for url in text.extract_iter(page, '<a href="', '"', pos):
+ if url.startswith("/"):
+ if not cdn:
+ # fetch cdn root from download page
+ durl = "{}/d/{}".format(self.root, url[3:])
+ cdn = text.extr(self.request(
+ durl).text, 'link.href = "', '"')
+ cdn = cdn[:cdn.index("/", 8)]
+ url = cdn + url[2:]
- headers = {"Referer": root.replace("://", "://stream.", 1) + "/"}
- for file in files:
- if file["file"].endswith(
- (".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts",
- ".zip", ".rar", ".7z")):
- file["_http_headers"] = headers
- file["file"] = file["file"].replace(
- "://cdn", "://media-files", 1)
+ url = text.unescape(url)
+ if url.endswith((".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts",
+ ".zip", ".rar", ".7z")):
+ append({"file": url.replace("://cdn", "://media-files", 1),
+ "_http_headers": headers})
+ else:
+ append({"file": url})
- return files, data
+ return files, {
+ "album_id" : self.album_id,
+ "album_name" : text.unescape(info[0]),
+ "album_size" : size[1:-1],
+ "description": text.unescape(info[2]) if len(info) > 2 else "",
+ "count" : len(files),
+ }
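The rewritten fetch_album no longer parses the embedded __NEXT_DATA__ JSON; it scrapes the album page itself, resolves relative links against a CDN root fetched once from a download page, and rewrites video/archive hosts from cdn to media-files with a stream-subdomain Referer. The link scan relies on text.extract_iter, which behaves roughly like this simplified helper (a sketch, not the library's exact code):

def extract_iter(txt, begin, end, pos=0):
    # yield every substring enclosed by 'begin' and 'end', left to right
    while True:
        first = txt.find(begin, pos)
        if first < 0:
            return
        first += len(begin)
        last = txt.find(end, first)
        if last < 0:
            return
        pos = last + len(end)
        yield txt[first:last]

page = '<a href="/d/one.png">x</a> <a href="/d/two.mp4">y</a>'
print(list(extract_iter(page, '<a href="', '"')))
# ['/d/one.png', '/d/two.mp4']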
diff --git a/gallery_dl/extractor/catbox.py b/gallery_dl/extractor/catbox.py
index 509108f..7a21d2a 100644
--- a/gallery_dl/extractor/catbox.py
+++ b/gallery_dl/extractor/catbox.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022 Mike Fährmann
+# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -8,7 +8,7 @@
"""Extractors for https://catbox.moe/"""
-from .common import GalleryExtractor
+from .common import GalleryExtractor, Extractor, Message
from .. import text
@@ -54,3 +54,26 @@ class CatboxAlbumExtractor(GalleryExtractor):
for path in text.extract_iter(
page, ">https://files.catbox.moe/", "<")
]
+
+
+class CatboxFileExtractor(Extractor):
+ """Extractor for catbox files"""
+ category = "catbox"
+ subcategory = "file"
+ archive_fmt = "{filename}"
+ pattern = r"(?:https?://)?(?:files|litter|de)\.catbox\.moe/([^/?#]+)"
+ test = (
+ ("https://files.catbox.moe/8ih3y7.png", {
+ "pattern": r"^https://files\.catbox\.moe/8ih3y7\.png$",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ "count": 1,
+ }),
+ ("https://litter.catbox.moe/t8v3n9.png"),
+ ("https://de.catbox.moe/bjdmz1.jpg"),
+ )
+
+ def items(self):
+ url = text.ensure_http_scheme(self.url)
+ file = text.nameext_from_url(url, {"url": url})
+ yield Message.Directory, file
+ yield Message.Url, url, file
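The new CatboxFileExtractor needs only filename/extension metadata for each direct link, which text.nameext_from_url derives from the URL path, assuming behavior equivalent to this simplified version:

from urllib.parse import urlsplit

def nameext_from_url(url, data):
    # split the last path segment into filename and extension
    name = urlsplit(url).path.rpartition("/")[2]
    filename, _, ext = name.rpartition(".")
    if not filename:
        filename, ext = name, ""
    data["filename"], data["extension"] = filename, ext
    return data

print(nameext_from_url("https://files.catbox.moe/8ih3y7.png", {}))
# {'filename': '8ih3y7', 'extension': 'png'}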
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 4cefa1c..8024be9 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -791,15 +791,21 @@ HTTP_HEADERS = {
("TE", "trailers"),
),
"chrome": (
+ ("Connection", "keep-alive"),
("Upgrade-Insecure-Requests", "1"),
("User-Agent", "Mozilla/5.0 ({}) AppleWebKit/537.36 (KHTML, "
- "like Gecko) Chrome/92.0.4515.131 Safari/537.36"),
+ "like Gecko) Chrome/111.0.0.0 Safari/537.36"),
("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,"
- "image/webp,image/apng,*/*;q=0.8"),
+ "image/avif,image/webp,image/apng,*/*;q=0.8,"
+ "application/signed-exchange;v=b3;q=0.7"),
("Referer", None),
+ ("Sec-Fetch-Site", "same-origin"),
+ ("Sec-Fetch-Mode", "no-cors"),
+ ("Sec-Fetch-Dest", "empty"),
("Accept-Encoding", None),
("Accept-Language", "en-US,en;q=0.9"),
- ("Cookie", None),
+ ("cookie", None),
+ ("content-length", None),
),
}
@@ -838,8 +844,7 @@ SSL_CIPHERS = {
"AES128-GCM-SHA256:"
"AES256-GCM-SHA384:"
"AES128-SHA:"
- "AES256-SHA:"
- "DES-CBC3-SHA"
+ "AES256-SHA"
),
}
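The "chrome" profile above is an ordered tuple rather than a dict because header order is part of a browser fingerprint, and None entries mark headers that must be omitted entirely. A sketch of applying such a profile to a requests.Session (values shortened; requests drops headers whose value merges to None):

import requests

CHROME_HEADERS = (
    ("Connection", "keep-alive"),
    ("Upgrade-Insecure-Requests", "1"),
    ("User-Agent", "Mozilla/5.0 (...) Chrome/111.0.0.0 Safari/537.36"),
    ("Referer", None),
    ("Accept-Language", "en-US,en;q=0.9"),
)

session = requests.Session()
session.headers.clear()
for name, value in CHROME_HEADERS:
    session.headers[name] = value   # None suppresses the header on send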
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 7b0e572..f104556 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -9,8 +9,7 @@
"""Extractors for https://danbooru.donmai.us/ and other Danbooru instances"""
from .common import BaseExtractor, Message
-from ..version import __version__
-from .. import text
+from .. import text, util
import datetime
@@ -21,36 +20,13 @@ class DanbooruExtractor(BaseExtractor):
page_limit = 1000
page_start = None
per_page = 200
+ request_interval = 1.0
def __init__(self, match):
- self._init_category(match)
-
- instance = INSTANCES.get(self.category) or {}
- iget = instance.get
-
- self.headers = iget("headers")
- self.page_limit = iget("page-limit", 1000)
- self.page_start = iget("page-start")
- self.per_page = iget("per-page", 200)
- self.request_interval_min = iget("request-interval-min", 0.0)
- self._pools = iget("pools")
- self._popular_endpoint = iget("popular", "/explore/posts/popular.json")
-
BaseExtractor.__init__(self, match)
-
self.ugoira = self.config("ugoira", False)
self.external = self.config("external", False)
- metadata = self.config("metadata", False)
- if metadata:
- if isinstance(metadata, (list, tuple)):
- metadata = ",".join(metadata)
- elif not isinstance(metadata, str):
- metadata = "artist_commentary,children,notes,parent,uploader"
- self.metadata_includes = metadata
- else:
- self.metadata_includes = None
-
threshold = self.config("threshold")
if isinstance(threshold, int):
self.threshold = 1 if threshold < 1 else threshold
@@ -62,10 +38,6 @@ class DanbooruExtractor(BaseExtractor):
self.log.debug("Using HTTP Basic Auth for user '%s'", username)
self.session.auth = (username, api_key)
- def request(self, url, **kwargs):
- kwargs["headers"] = self.headers
- return BaseExtractor.request(self, url, **kwargs)
-
def skip(self, num):
pages = num // self.per_page
if pages >= self.page_limit:
@@ -74,32 +46,28 @@ class DanbooruExtractor(BaseExtractor):
return pages * self.per_page
def items(self):
+ self.session.headers["User-Agent"] = util.USERAGENT
+
+ includes = self.config("metadata")
+ if includes:
+ if isinstance(includes, (list, tuple)):
+ includes = ",".join(includes)
+ elif not isinstance(includes, str):
+ includes = "artist_commentary,children,notes,parent,uploader"
+
data = self.metadata()
for post in self.posts():
- file = post.get("file")
- if file:
- url = file["url"]
- if not url:
- md5 = file["md5"]
- url = file["url"] = (
- "https://static1.{}/data/{}/{}/{}.{}".format(
- self.root[8:], md5[0:2], md5[2:4], md5, file["ext"]
- ))
- post["filename"] = file["md5"]
- post["extension"] = file["ext"]
+ try:
+ url = post["file_url"]
+ except KeyError:
+ if self.external and post["source"]:
+ post.update(data)
+ yield Message.Directory, post
+ yield Message.Queue, post["source"], post
+ continue
- else:
- try:
- url = post["file_url"]
- except KeyError:
- if self.external and post["source"]:
- post.update(data)
- yield Message.Directory, post
- yield Message.Queue, post["source"], post
- continue
-
- text.nameext_from_url(url, post)
+ text.nameext_from_url(url, post)
if post["extension"] == "zip":
if self.ugoira:
@@ -109,9 +77,9 @@ class DanbooruExtractor(BaseExtractor):
url = post["large_file_url"]
post["extension"] = "webm"
- if self.metadata_includes:
+ if includes:
meta_url = "{}/posts/{}.json?only={}".format(
- self.root, post["id"], self.metadata_includes)
+ self.root, post["id"], includes)
post.update(self.request(meta_url).json())
if url[0] == "/":
@@ -127,7 +95,7 @@ class DanbooruExtractor(BaseExtractor):
def posts(self):
return ()
- def _pagination(self, endpoint, params, pagenum=False):
+ def _pagination(self, endpoint, params, pages=False):
url = self.root + endpoint
params["limit"] = self.per_page
params["page"] = self.page_start
@@ -141,7 +109,7 @@ class DanbooruExtractor(BaseExtractor):
if len(posts) < self.threshold:
return
- if pagenum:
+ if pages:
params["page"] += 1
else:
for post in reversed(posts):
@@ -163,34 +131,20 @@ class DanbooruExtractor(BaseExtractor):
for index, delay in enumerate(delays)]
-INSTANCES = {
+BASE_PATTERN = DanbooruExtractor.update({
"danbooru": {
"root": None,
"pattern": r"(?:danbooru|hijiribe|sonohara|safebooru)\.donmai\.us",
},
- "e621": {
- "root": None,
- "pattern": r"e(?:621|926)\.net",
- "headers": {"User-Agent": "gallery-dl/{} (by mikf)".format(
- __version__)},
- "pools": "sort",
- "popular": "/popular.json",
- "page-limit": 750,
- "per-page": 320,
- "request-interval-min": 1.0,
- },
"atfbooru": {
"root": "https://booru.allthefallen.moe",
"pattern": r"booru\.allthefallen\.moe",
- "page-limit": 5000,
},
"aibooru": {
"root": None,
"pattern": r"(?:safe.)?aibooru\.online",
}
-}
-
-BASE_PATTERN = DanbooruExtractor.update(INSTANCES)
+})
class DanbooruTagExtractor(DanbooruExtractor):
@@ -213,10 +167,6 @@ class DanbooruTagExtractor(DanbooruExtractor):
"pattern": r"https://i\.pximg\.net/img-original/img"
r"/2008/08/28/02/35/48/1476533_p0\.jpg",
}),
- ("https://e621.net/posts?tags=anry", {
- "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
- "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
- }),
("https://booru.allthefallen.moe/posts?tags=yume_shokunin", {
"count": 12,
}),
@@ -228,7 +178,6 @@ class DanbooruTagExtractor(DanbooruExtractor):
("https://hijiribe.donmai.us/posts?tags=bonocho"),
("https://sonohara.donmai.us/posts?tags=bonocho"),
("https://safebooru.donmai.us/posts?tags=bonocho"),
- ("https://e926.net/posts?tags=anry"),
("https://safe.aibooru.online/posts?tags=center_frills"),
)
@@ -254,23 +203,17 @@ class DanbooruPoolExtractor(DanbooruExtractor):
("https://danbooru.donmai.us/pools/7659", {
"content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99",
}),
- ("https://e621.net/pools/73", {
- "url": "1bd09a72715286a79eea3b7f09f51b3493eb579a",
- "content": "91abe5d5334425d9787811d7f06d34c77974cd22",
- }),
("https://booru.allthefallen.moe/pools/9", {
"url": "902549ffcdb00fe033c3f63e12bc3cb95c5fd8d5",
"count": 6,
}),
("https://aibooru.online/pools/1"),
("https://danbooru.donmai.us/pool/show/7659"),
- ("https://e621.net/pool/show/73"),
)
def __init__(self, match):
DanbooruExtractor.__init__(self, match)
self.pool_id = match.group(match.lastindex)
- self.post_ids = ()
def metadata(self):
url = "{}/pools/{}.json".format(self.root, self.pool_id)
@@ -280,29 +223,8 @@ class DanbooruPoolExtractor(DanbooruExtractor):
return {"pool": pool}
def posts(self):
- if self._pools == "sort":
- self.log.info("Fetching posts of pool %s", self.pool_id)
-
- id_to_post = {
- post["id"]: post
- for post in self._pagination(
- "/posts.json", {"tags": "pool:" + self.pool_id})
- }
-
- posts = []
- append = posts.append
- for num, pid in enumerate(self.post_ids, 1):
- if pid in id_to_post:
- post = id_to_post[pid]
- post["num"] = num
- append(post)
- else:
- self.log.warning("Post %s is unavailable", pid)
- return posts
-
- else:
- params = {"tags": "pool:" + self.pool_id}
- return self._pagination("/posts.json", params)
+ params = {"tags": "pool:" + self.pool_id}
+ return self._pagination("/posts.json", params)
class DanbooruPostExtractor(DanbooruExtractor):
@@ -318,10 +240,6 @@ class DanbooruPostExtractor(DanbooruExtractor):
"pattern": r"https?://.+\.zip$",
"options": (("ugoira", True),)
}),
- ("https://e621.net/posts/535", {
- "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
- "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
- }),
("https://booru.allthefallen.moe/posts/22", {
"content": "21dda68e1d7e0a554078e62923f537d8e895cac8",
}),
@@ -329,7 +247,6 @@ class DanbooruPostExtractor(DanbooruExtractor):
"content": "54d548743cd67799a62c77cbae97cfa0fec1b7e9",
}),
("https://danbooru.donmai.us/post/show/294929"),
- ("https://e621.net/post/show/535"),
)
def __init__(self, match):
@@ -338,8 +255,7 @@ class DanbooruPostExtractor(DanbooruExtractor):
def posts(self):
url = "{}/posts/{}.json".format(self.root, self.post_id)
- post = self.request(url).json()
- return (post["post"] if "post" in post else post,)
+ return (self.request(url).json(),)
class DanbooruPopularExtractor(DanbooruExtractor):
@@ -355,12 +271,6 @@ class DanbooruPopularExtractor(DanbooruExtractor):
"range": "1-120",
"count": 120,
}),
- ("https://e621.net/popular"),
- (("https://e621.net/explore/posts/popular"
- "?date=2019-06-01&scale=month"), {
- "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
- "count": ">= 70",
- }),
("https://booru.allthefallen.moe/explore/posts/popular"),
("https://aibooru.online/explore/posts/popular"),
)
@@ -385,31 +295,5 @@ class DanbooruPopularExtractor(DanbooruExtractor):
def posts(self):
if self.page_start is None:
self.page_start = 1
- return self._pagination(self._popular_endpoint, self.params, True)
-
-
-class DanbooruFavoriteExtractor(DanbooruExtractor):
- """Extractor for e621 favorites"""
- subcategory = "favorite"
- directory_fmt = ("{category}", "Favorites", "{user_id}")
- archive_fmt = "f_{user_id}_{id}"
- pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
- test = (
- ("https://e621.net/favorites"),
- ("https://e621.net/favorites?page=2&user_id=53275", {
- "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
- "count": "> 260",
- }),
- )
-
- def __init__(self, match):
- DanbooruExtractor.__init__(self, match)
- self.query = text.parse_query(match.group(match.lastindex))
-
- def metadata(self):
- return {"user_id": self.query.get("user_id", "")}
-
- def posts(self):
- if self.page_start is None:
- self.page_start = 1
- return self._pagination("/favorites.json", self.query, True)
+ return self._pagination(
+ "/explore/posts/popular.json", self.params, True)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index a3187fa..37475df 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.deviantart.com/"""
+"""Extractors for https://www.deviantart.com/"""
from .common import Extractor, Message
from .. import text, util, exception
@@ -21,29 +21,30 @@ import re
BASE_PATTERN = (
r"(?:https?://)?(?:"
- r"(?:www\.)?deviantart\.com/(?!watch/)([\w-]+)|"
- r"(?!www\.)([\w-]+)\.deviantart\.com)"
+ r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|"
+ r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)"
)
class DeviantartExtractor(Extractor):
"""Base class for deviantart extractors"""
category = "deviantart"
+ root = "https://www.deviantart.com"
directory_fmt = ("{category}", "{username}")
filename_fmt = "{category}_{index}_{title}.{extension}"
cookiedomain = None
- root = "https://www.deviantart.com"
+ cookienames = ("auth", "auth_secure", "userinfo")
_last_request = 0
def __init__(self, match):
Extractor.__init__(self, match)
- self.offset = 0
self.flat = self.config("flat", True)
self.extra = self.config("extra", False)
self.original = self.config("original", True)
self.comments = self.config("comments", False)
self.user = match.group(1) or match.group(2)
self.group = False
+ self.offset = 0
self.api = None
unwatch = self.config("auto-unwatch")
@@ -69,6 +70,14 @@ class DeviantartExtractor(Extractor):
self.offset += num
return num
+ def login(self):
+ if not self._check_cookies(self.cookienames):
+ username, password = self._get_auth_info()
+ if not username:
+ return False
+ self._update_cookies(_login_impl(self, username, password))
+ return True
+
def items(self):
self.api = DeviantartOAuthAPI(self)
@@ -87,6 +96,13 @@ class DeviantartExtractor(Extractor):
yield Message.Queue, url, data
continue
+ if deviation["is_deleted"]:
+ # prevent crashing in case the deviation really is
+ # deleted
+ self.log.debug(
+ "Skipping %s (deleted)", deviation["deviationid"])
+ continue
+
if "premium_folder_data" in deviation:
data = self._fetch_premium(deviation)
if not data:
@@ -346,9 +362,7 @@ class DeviantartExtractor(Extractor):
kwargs["fatal"] = None
diff = time.time() - DeviantartExtractor._last_request
if diff < 2.0:
- delay = 2.0 - diff
- self.log.debug("Sleeping %.2f seconds", delay)
- time.sleep(delay)
+ self.sleep(2.0 - diff, "request")
while True:
response = self.request(url, **kwargs)
@@ -406,6 +420,16 @@ class DeviantartExtractor(Extractor):
self.log.info("Unwatching %s", username)
self.api.user_friends_unwatch(username)
+ def _eclipse_to_oauth(self, eclipse_api, deviations):
+ for obj in deviations:
+ deviation = obj["deviation"] if "deviation" in obj else obj
+ deviation_uuid = eclipse_api.deviation_extended_fetch(
+ deviation["deviationId"],
+ deviation["author"]["username"],
+ "journal" if deviation["isJournal"] else "art",
+ )["deviation"]["extended"]["deviationUuid"]
+ yield self.api.deviation(deviation_uuid)
+
class DeviantartUserExtractor(DeviantartExtractor):
"""Extractor for an artist's user profile"""
@@ -676,15 +700,9 @@ class DeviantartFavoriteExtractor(DeviantartExtractor):
)
def deviations(self):
- folders = self.api.collections_folders(self.user)
if self.flat:
- deviations = itertools.chain.from_iterable(
- self.api.collections(self.user, folder["folderid"])
- for folder in folders
- )
- if self.offset:
- deviations = util.advance(deviations, self.offset)
- return deviations
+ return self.api.collections_all(self.user, self.offset)
+ folders = self.api.collections_folders(self.user)
return self._folder_urls(
folders, "favourites", DeviantartCollectionExtractor)
@@ -796,6 +814,14 @@ class DeviantartStatusExtractor(DeviantartExtractor):
"url" : "re:^https://sta.sh",
},
}),
+ # "deleted" deviations in 'items'
+ ("https://www.deviantart.com/AndrejSKalin/posts/statuses", {
+ "options": (("journals", "none"), ("original", 0),
+ ("image-filter", "deviationid[:8] == '147C8B03'")),
+ "count": 2,
+ "archive": False,
+ "keyword": {"deviationid": "147C8B03-7D34-AE93-9241-FA3C6DBBC655"}
+ }),
("https://www.deviantart.com/justgalym/posts/statuses", {
"options": (("journals", "text"),),
"url": "c8744f7f733a3029116607b826321233c5ca452d",
@@ -861,8 +887,7 @@ class DeviantartPopularExtractor(DeviantartExtractor):
"{popular[range]}", "{popular[search]}")
archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}"
pattern = (r"(?:https?://)?www\.deviantart\.com/(?:"
- r"search(?:/deviations)?"
- r"|(?:deviations/?)?\?order=(popular-[^/?#]+)"
+ r"(?:deviations/?)?\?order=(popular-[^/?#]+)"
r"|((?:[\w-]+/)*)(popular-[^/?#]+)"
r")/?(?:\?([^#]*))?")
test = (
@@ -876,8 +901,6 @@ class DeviantartPopularExtractor(DeviantartExtractor):
"range": "1-30",
"count": 30,
}),
- ("https://www.deviantart.com/search?q=tree"),
- ("https://www.deviantart.com/search/deviations?order=popular-1-week"),
("https://www.deviantart.com/artisan/popular-all-time/?q=tree"),
)
@@ -974,7 +997,7 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
subcategory = "deviation"
archive_fmt = "g_{_username}_{index}.{extension}"
pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)"
- r"|(?:https?://)?(?:www\.)?deviantart\.com/"
+ r"|(?:https?://)?(?:www\.)?(?:fx)?deviantart\.com/"
r"(?:view/|deviation/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)"
r"(\d+)" # bare deviation ID without slug
r"|(?:https?://)?fav\.me/d([0-9a-z]+)") # base36
@@ -1068,6 +1091,9 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
# old /view/ URLs from the Wayback Machine
("https://www.deviantart.com/view.php?id=14864502"),
("http://www.deviantart.com/view-full.php?id=100842"),
+
+ ("https://www.fxdeviantart.com/zzz/art/zzz-1234567890"),
+ ("https://www.fxdeviantart.com/view/1234567890"),
)
skip = Extractor.skip
@@ -1094,6 +1120,7 @@ class DeviantartScrapsExtractor(DeviantartExtractor):
subcategory = "scraps"
directory_fmt = ("{category}", "{username}", "Scraps")
archive_fmt = "s_{_username}_{index}.{extension}"
+ cookiedomain = ".deviantart.com"
pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b"
test = (
("https://www.deviantart.com/shimoda7/gallery/scraps", {
@@ -1102,34 +1129,109 @@ class DeviantartScrapsExtractor(DeviantartExtractor):
("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps"),
("https://shimoda7.deviantart.com/gallery/?catpath=scraps"),
)
+
+ def deviations(self):
+ self.login()
+
+ eclipse_api = DeviantartEclipseAPI(self)
+ return self._eclipse_to_oauth(
+ eclipse_api, eclipse_api.gallery_scraps(self.user, self.offset))
+
+
+class DeviantartSearchExtractor(DeviantartExtractor):
+ """Extractor for deviantart search results"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "Search", "{search_tags}")
+ archive_fmt = "Q_{search_tags}_{index}.{extension}"
cookiedomain = ".deviantart.com"
- cookienames = ("auth", "auth_secure", "userinfo")
- _warning = True
+ pattern = (r"(?:https?://)?www\.deviantart\.com"
+ r"/search(?:/deviations)?/?\?([^#]+)")
+ test = (
+ ("https://www.deviantart.com/search?q=tree"),
+ ("https://www.deviantart.com/search/deviations?order=popular-1-week"),
+ )
+
+ skip = Extractor.skip
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.query = text.parse_query(self.user)
+ self.search = self.query.get("q", "")
+ self.user = ""
+
+ def deviations(self):
+ logged_in = self.login()
+
+ eclipse_api = DeviantartEclipseAPI(self)
+ search = (eclipse_api.search_deviations
+ if logged_in else self._search_html)
+ return self._eclipse_to_oauth(eclipse_api, search(self.query))
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["search_tags"] = self.search
+
+ def _search_html(self, params):
+ url = self.root + "/search"
+ deviation = {
+ "deviationId": None,
+ "author": {"username": "u"},
+ "isJournal": False,
+ }
+
+ while True:
+ page = self.request(url, params=params).text
+
+ items , pos = text.rextract(page, r'\"items\":[', ']')
+ cursor, pos = text.extract(page, r'\"cursor\":\"', '\\', pos)
+
+ for deviation_id in items.split(","):
+ deviation["deviationId"] = deviation_id
+ yield deviation
+
+ if not cursor:
+ return
+ params["cursor"] = cursor
+
+
+class DeviantartGallerySearchExtractor(DeviantartExtractor):
+ """Extractor for deviantart gallery searches"""
+ subcategory = "gallery-search"
+ archive_fmt = "g_{_username}_{index}.{extension}"
+ cookiedomain = ".deviantart.com"
+ pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)"
+ test = (
+ ("https://www.deviantart.com/shimoda7/gallery?q=memory", {
+ "options": (("original", 0),),
+ "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e",
+ }),
+ ("https://www.deviantart.com/shimoda7/gallery?q=memory&sort=popular"),
+ )
+
+ def __init__(self, match):
+ DeviantartExtractor.__init__(self, match)
+ self.query = match.group(3)
def deviations(self):
self.login()
eclipse_api = DeviantartEclipseAPI(self)
- for obj in eclipse_api.gallery_scraps(self.user, self.offset):
- deviation = obj["deviation"]
- deviation_uuid = eclipse_api.deviation_extended_fetch(
- deviation["deviationId"],
- deviation["author"]["username"],
- "journal" if deviation["isJournal"] else "art",
- )["deviation"]["extended"]["deviationUuid"]
+ info = eclipse_api.user_info(self.user)
- yield self.api.deviation(deviation_uuid)
+ query = text.parse_query(self.query)
+ self.search = query["q"]
- def login(self):
- """Login and obtain session cookies"""
- if not self._check_cookies(self.cookienames):
- username, password = self._get_auth_info()
- if username:
- self._update_cookies(_login_impl(self, username, password))
- elif self._warning:
- self.log.warning(
- "No session cookies set: Unable to fetch mature scraps.")
- DeviantartScrapsExtractor._warning = False
+ return self._eclipse_to_oauth(
+ eclipse_api, eclipse_api.galleries_search(
+ info["user"]["userId"],
+ self.search,
+ self.offset,
+ query.get("sort", "most-recent"),
+ ))
+
+ def prepare(self, deviation):
+ DeviantartExtractor.prepare(self, deviation)
+ deviation["search_tags"] = self.search
class DeviantartFollowingExtractor(DeviantartExtractor):
@@ -1261,6 +1363,13 @@ class DeviantartOAuthAPI():
"mature_content": self.mature}
return self._pagination(endpoint, params)
+ def collections_all(self, username, offset=0):
+ """Yield all deviations in a user's collection"""
+ endpoint = "/collections/all"
+ params = {"username": username, "offset": offset, "limit": 24,
+ "mature_content": self.mature}
+ return self._pagination(endpoint, params)
+
@memcache(keyarg=1)
def collections_folders(self, username, offset=0):
"""Yield all collection folders of a specific user"""
@@ -1411,7 +1520,7 @@ class DeviantartOAuthAPI():
while True:
if self.delay:
- time.sleep(self.delay)
+ self.extractor.sleep(self.delay, "api")
self.authenticate(None if public else self.refresh_token_key)
kwargs["headers"] = self.headers
@@ -1480,6 +1589,15 @@ class DeviantartOAuthAPI():
self._metadata(results)
if self.folders:
self._folders(results)
+ else: # attempt to fix "deleted" deviations
+ for dev in self._shared_content(results):
+ if not dev["is_deleted"]:
+ continue
+ patch = self._call(
+ "/deviation/" + dev["deviationid"], fatal=False)
+ if patch:
+ dev.update(patch)
+
yield from results
if not data["has_more"] and (
@@ -1497,6 +1615,14 @@ class DeviantartOAuthAPI():
return
params["offset"] = int(params["offset"]) + len(results)
+ @staticmethod
+ def _shared_content(results):
+ """Return an iterable of shared deviations in 'results'"""
+ for result in results:
+ for item in result.get("items") or ():
+ if "deviation" in item:
+ yield item["deviation"]
+
def _pagination_list(self, endpoint, params, key="results"):
result = []
result.extend(self._pagination(endpoint, params, False, key=key))
@@ -1585,6 +1711,29 @@ class DeviantartEclipseAPI():
}
return self._pagination(endpoint, params)
+ def galleries_search(self, user_id, query,
+ offset=None, order="most-recent"):
+ endpoint = "/shared_api/galleries/search"
+ params = {
+ "userid": user_id,
+ "order" : order,
+ "q" : query,
+ "offset": offset,
+ "limit" : 24,
+ }
+ return self._pagination(endpoint, params)
+
+ def search_deviations(self, params):
+ endpoint = "/da-browse/api/networkbar/search/deviations"
+ return self._pagination(endpoint, params, key="deviations")
+
+ def user_info(self, user, expand=False):
+ endpoint = "/shared_api/user/info"
+ params = {"username": user}
+ if expand:
+ params["expand"] = "user.stats,user.profile,user.watch"
+ return self._call(endpoint, params)
+
def user_watching(self, user, offset=None):
endpoint = "/da-user-profile/api/module/watching"
params = {
@@ -1611,23 +1760,37 @@ class DeviantartEclipseAPI():
except Exception:
return {"error": response.text}
- def _pagination(self, endpoint, params):
+ def _pagination(self, endpoint, params, key="results"):
+ limit = params.get("limit", 24)
+ warn = True
+
while True:
data = self._call(endpoint, params)
- results = data.get("results")
+ results = data.get(key)
if results is None:
return
+ if len(results) < limit and warn and data.get("hasMore"):
+ warn = False
+ self.log.warning(
+ "Private deviations detected! "
+ "Provide login credentials or session cookies "
+ "to be able to access them.")
yield from results
if not data.get("hasMore"):
return
- next_offset = data.get("nextOffset")
- if next_offset:
- params["offset"] = next_offset
+ if "nextCursor" in data:
+ params["offset"] = None
+ params["cursor"] = data["nextCursor"]
+ elif "nextOffset" in data:
+ params["offset"] = data["nextOffset"]
+ params["cursor"] = None
+ elif params.get("offset") is None:
+ return
else:
- params["offset"] += params["limit"]
+ params["offset"] = int(params["offset"]) + len(results)
def _module_id_watching(self, user):
url = "{}/{}/about".format(self.extractor.root, user)
diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py
index 8b90250..e85eb8d 100644
--- a/gallery_dl/extractor/directlink.py
+++ b/gallery_dl/extractor/directlink.py
@@ -44,6 +44,11 @@ class DirectlinkExtractor(Extractor):
("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
"mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
+ # internationalized domain name
+ ("https://räksmörgås.josefsson.org/raksmorgas.jpg", {
+ "url": "a65667f670b194afbd1e3ea5e7a78938d36747da",
+ "keyword": "fd5037fe86eebd4764e176cbaf318caec0f700be",
+ }),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index d78f25b..59e8c90 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://dynasty-scans.com/"""
from .common import ChapterExtractor, MangaExtractor, Extractor, Message
-from .. import text
-import json
+from .. import text, util
import re
BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com"
@@ -86,7 +85,7 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
data = text.extr(page, "var pages = ", ";\n")
return [
(self.root + img["image"], None)
- for img in json.loads(data)
+ for img in util.json_loads(data)
]
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
new file mode 100644
index 0000000..8f2994e
--- /dev/null
+++ b/gallery_dl/extractor/e621.py
@@ -0,0 +1,254 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://e621.net/ and other e621 instances"""
+
+from .common import Message
+from . import danbooru
+from .. import text, util
+
+
+class E621Extractor(danbooru.DanbooruExtractor):
+ """Base class for e621 extractors"""
+ basecategory = "E621"
+ page_limit = 750
+ page_start = None
+ per_page = 320
+ request_interval_min = 1.0
+
+ def items(self):
+ self.session.headers["User-Agent"] = util.USERAGENT + " (by mikf)"
+
+ includes = self.config("metadata") or ()
+ if includes:
+ if isinstance(includes, str):
+ includes = includes.split(",")
+ elif not isinstance(includes, (list, tuple)):
+ includes = ("notes", "pools")
+
+ notes = ("notes" in includes)
+ pools = ("pools" in includes)
+
+ data = self.metadata()
+ for post in self.posts():
+ file = post["file"]
+
+ if not file["url"]:
+ md5 = file["md5"]
+ file["url"] = "https://static1.{}/data/{}/{}/{}.{}".format(
+ self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])
+
+ if notes and post.get("has_notes"):
+ url = "{}/notes.json?search[post_id]={}".format(
+ self.root, post["id"])
+ post["notes"] = self.request(url).json()
+
+ if pools and post["pools"]:
+ url = "{}/pools.json?search[id]={}".format(
+ self.root, ",".join(map(str, post["pools"])))
+ post["pools"] = _pools = self.request(url).json()
+ for pool in _pools:
+ pool["name"] = pool["name"].replace("_", " ")
+
+ post["filename"] = file["md5"]
+ post["extension"] = file["ext"]
+
+ post.update(data)
+ yield Message.Directory, post
+ yield Message.Url, file["url"], post
+
+
+BASE_PATTERN = E621Extractor.update({
+ "e621": {
+ "root": "https://e621.net",
+ "pattern": r"e621\.net",
+ },
+ "e926": {
+ "root": "https://e926.net",
+ "pattern": r"e926\.net",
+ },
+})
+
+
+class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor):
+ """Extractor for e621 posts from tag searches"""
+ pattern = BASE_PATTERN + r"/posts?(?:\?.*?tags=|/index/\d+/)([^&#]+)"
+ test = (
+ ("https://e621.net/posts?tags=anry", {
+ "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
+ "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
+ }),
+ ("https://e621.net/post/index/1/anry"),
+ ("https://e621.net/post?tags=anry"),
+
+ ("https://e926.net/posts?tags=anry", {
+ "url": "12198b275c62ffe2de67cca676c8e64de80c425d",
+ "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
+ }),
+ ("https://e926.net/post/index/1/anry"),
+ ("https://e926.net/post?tags=anry"),
+ )
+
+
+class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor):
+ """Extractor for e621 pools"""
+ pattern = BASE_PATTERN + r"/pool(?:s|/show)/(\d+)"
+ test = (
+ ("https://e621.net/pools/73", {
+ "url": "1bd09a72715286a79eea3b7f09f51b3493eb579a",
+ "content": "91abe5d5334425d9787811d7f06d34c77974cd22",
+ }),
+ ("https://e621.net/pool/show/73"),
+
+ ("https://e926.net/pools/73", {
+ "url": "6936f1b6a18c5c25bee7cad700088dbc2503481b",
+ "content": "91abe5d5334425d9787811d7f06d34c77974cd22",
+ }),
+ ("https://e926.net/pool/show/73"),
+ )
+
+ def posts(self):
+ self.log.info("Fetching posts of pool %s", self.pool_id)
+
+ id_to_post = {
+ post["id"]: post
+ for post in self._pagination(
+ "/posts.json", {"tags": "pool:" + self.pool_id})
+ }
+
+ posts = []
+ append = posts.append
+ for num, pid in enumerate(self.post_ids, 1):
+ if pid in id_to_post:
+ post = id_to_post[pid]
+ post["num"] = num
+ append(post)
+ else:
+ self.log.warning("Post %s is unavailable", pid)
+ return posts
+
+
+class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor):
+ """Extractor for single e621 posts"""
+ pattern = BASE_PATTERN + r"/post(?:s|/show)/(\d+)"
+ test = (
+ ("https://e621.net/posts/535", {
+ "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
+ "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+ }),
+ ("https://e621.net/posts/3181052", {
+ "options": (("metadata", "notes,pools"),),
+ "pattern": r"https://static\d\.e621\.net/data/c6/8c"
+ r"/c68cca0643890b615f75fb2719589bff\.png",
+ "keyword": {
+ "notes": [
+ {
+ "body": "Little Legends 2",
+ "created_at": "2022-05-16T13:58:38.877-04:00",
+ "creator_id": 517450,
+ "creator_name": "EeveeCuddler69",
+ "height": 475,
+ "id": 321296,
+ "is_active": True,
+ "post_id": 3181052,
+ "updated_at": "2022-05-16T13:59:02.050-04:00",
+ "version": 3,
+ "width": 809,
+ "x": 83,
+ "y": 117,
+ },
+ ],
+ "pools": [
+ {
+ "category": "series",
+ "created_at": "2022-02-17T00:29:22.669-05:00",
+ "creator_id": 1077440,
+ "creator_name": "Yeetus90",
+ "description": "* \"Little Legends\":/pools/27971\r\n"
+ "* Little Legends 2\r\n"
+ "* \"Little Legends 3\":/pools/27481",
+ "id": 27492,
+ "is_active": False,
+ "name": "Little Legends 2",
+ "post_count": 39,
+ "post_ids": list,
+ "updated_at": "2022-03-27T06:30:03.382-04:00"
+ },
+ ],
+ },
+ }),
+ ("https://e621.net/post/show/535"),
+
+ ("https://e926.net/posts/535", {
+ "url": "17aec8ebd8fab098d321adcb62a2db59dab1f4bf",
+ "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+ }),
+ ("https://e926.net/post/show/535"),
+ )
+
+ def posts(self):
+ url = "{}/posts/{}.json".format(self.root, self.post_id)
+ return (self.request(url).json()["post"],)
+
+
+class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor):
+ """Extractor for popular images from e621"""
+ pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?([^#]*))?"
+ test = (
+ ("https://e621.net/explore/posts/popular"),
+ (("https://e621.net/explore/posts/popular"
+ "?date=2019-06-01&scale=month"), {
+ "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
+ "count": ">= 70",
+ }),
+
+ ("https://e926.net/explore/posts/popular"),
+ (("https://e926.net/explore/posts/popular"
+ "?date=2019-06-01&scale=month"), {
+ "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+",
+ "count": ">= 70",
+ }),
+ )
+
+ def posts(self):
+ if self.page_start is None:
+ self.page_start = 1
+ return self._pagination("/popular.json", self.params, True)
+
+
+class E621FavoriteExtractor(E621Extractor):
+ """Extractor for e621 favorites"""
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "Favorites", "{user_id}")
+ archive_fmt = "f_{user_id}_{id}"
+ pattern = BASE_PATTERN + r"/favorites(?:\?([^#]*))?"
+ test = (
+ ("https://e621.net/favorites"),
+ ("https://e621.net/favorites?page=2&user_id=53275", {
+ "pattern": r"https://static\d.e621.net/data/../../[0-9a-f]+",
+ "count": "> 260",
+ }),
+
+ ("https://e926.net/favorites"),
+ ("https://e926.net/favorites?page=2&user_id=53275", {
+ "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+",
+ "count": "> 260",
+ }),
+ )
+
+ def __init__(self, match):
+ E621Extractor.__init__(self, match)
+ self.query = text.parse_query(match.group(match.lastindex))
+
+ def metadata(self):
+ return {"user_id": self.query.get("user_id", "")}
+
+ def posts(self):
+ if self.page_start is None:
+ self.page_start = 1
+ return self._pagination("/favorites.json", self.query, True)
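Note the md5-based fallback in E621Extractor.items(): when the API withholds file["url"] (as it can for posts hidden from anonymous users), the static file address is reconstructed from the post's md5 hash. Standalone demo of that construction:

def static_url(root, md5, ext):
    # root[8:] strips "https://"; md5 prefix pairs form the directory levels
    return "https://static1.{}/data/{}/{}/{}.{}".format(
        root[8:], md5[0:2], md5[2:4], md5, ext)

print(static_url("https://e621.net",
                 "c68cca0643890b615f75fb2719589bff", "png"))
# https://static1.e621.net/data/c6/8c/c68cca0643890b615f75fb2719589bff.png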
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index ad3f16b..03307f8 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -12,7 +12,6 @@ from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
-import time
BASE_PATTERN = r"(?:https?://)?(?:www\.)?erome\.com"
@@ -75,7 +74,7 @@ class EromeExtractor(Extractor):
if response.content.find(
b"<title>Please wait a few moments</title>", 0, 600) < 0:
return response
- time.sleep(5)
+ self.sleep(5.0, "check")
def _pagination(self, url, params):
for params["page"] in itertools.count(1):
diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py
index 57587b6..0503dcf 100644
--- a/gallery_dl/extractor/fallenangels.py
+++ b/gallery_dl/extractor/fallenangels.py
@@ -6,11 +6,10 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract manga-chapters from https://www.fascans.com/"""
+"""Extractors for https://www.fascans.com/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
-import json
class FallenangelsChapterExtractor(ChapterExtractor):
@@ -56,7 +55,7 @@ class FallenangelsChapterExtractor(ChapterExtractor):
def images(page):
return [
(img["page_image"], None)
- for img in json.loads(
+ for img in util.json_loads(
text.extr(page, "var pages = ", ";")
)
]
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 41431dc..57c4333 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -89,6 +89,7 @@ class FanboxExtractor(Extractor):
content_body["imageMap"] = {
image_id: image_map[image_id]
for image_id in images
+ if image_id in image_map
}
post["content"] = "\n".join(content)
@@ -256,7 +257,6 @@ class FanboxCreatorExtractor(FanboxExtractor):
def posts(self):
url = "https://api.fanbox.cc/post.listCreator?creatorId={}&limit=10"
-
return self._pagination(url.format(self.creator_id))
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 476fdeb..13dfead 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -7,8 +7,7 @@
"""Extractors for https://fantia.jp/"""
from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
class FantiaExtractor(Extractor):
@@ -117,7 +116,7 @@ class FantiaExtractor(Extractor):
yield self.root+"/"+content["download_uri"], post
if content["category"] == "blog" and "comment" in content:
- comment_json = json.loads(content["comment"])
+ comment_json = util.json_loads(content["comment"])
ops = comment_json.get("ops", ())
# collect blogpost text first
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index 2290cc2..4a38fb4 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import BaseExtractor, Message
from .. import text, util
-import json
class FoolslideExtractor(BaseExtractor):
@@ -106,7 +105,7 @@ class FoolslideChapterExtractor(FoolslideExtractor):
})
def images(self, page):
- return json.loads(text.extr(page, "var pages = ", ";"))
+ return util.json_loads(text.extr(page, "var pages = ", ";"))
class FoolslideMangaExtractor(FoolslideExtractor):
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index 8d73949..80b0ae1 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -13,6 +13,8 @@ from . import gelbooru_v02
from .. import text, exception
import binascii
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?"
+
class GelbooruBase():
"""Base class for gelbooru extractors"""
@@ -53,6 +55,23 @@ class GelbooruBase():
del params["pid"]
params["tags"] = "{} id:<{}".format(self.tags, post["id"])
+ def _pagination_html(self, params):
+ url = self.root + "/index.php"
+ params["pid"] = self.page_start * self.per_page
+
+ data = {}
+ while True:
+ num_ids = 0
+ page = self.request(url, params=params).text
+
+ for data["id"] in text.extract_iter(page, '" id="p', '"'):
+ num_ids += 1
+ yield from self._api_request(data)
+
+ if num_ids < self.per_page:
+ return
+ params["pid"] += self.per_page
+
@staticmethod
def _file_url(post):
url = post["file_url"]
@@ -88,8 +107,7 @@ class GelbooruBase():
class GelbooruTagExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02TagExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
- pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
- r"\?page=post&s=list&tags=(?P<tags>[^&#]+)")
+ pattern = BASE_PATTERN + r"page=post&s=list&tags=([^&#]+)"
test = (
("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
"count": 5,
@@ -108,8 +126,7 @@ class GelbooruPoolExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02PoolExtractor):
"""Extractor for gelbooru pools"""
per_page = 45
- pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
- r"\?page=pool&s=show&id=(?P<pool>\d+)")
+ pattern = BASE_PATTERN + r"page=pool&s=show&id=(\d+)"
test = (
("https://gelbooru.com/index.php?page=pool&s=show&id=761", {
"count": 6,
@@ -124,9 +141,9 @@ class GelbooruPoolExtractor(GelbooruBase,
"id" : self.pool_id,
"pid" : self.page_start,
}
- self._page = self.request(url, params=self._params).text
+ page = self.request(url, params=self._params).text
- name, pos = text.extract(self._page, "<h3>Now Viewing: ", "</h3>")
+ name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
if not name:
raise exception.NotFoundError("pool")
@@ -136,29 +153,19 @@ class GelbooruPoolExtractor(GelbooruBase,
}
def posts(self):
- url = self.root + "/index.php"
- params = self._params
+ return self._pagination_html(self._params)
- page = self._page
- del self._page
- data = {}
-
- while True:
- num_ids = 0
- for data["id"] in text.extract_iter(page, '" id="p', '"'):
- num_ids += 1
- yield from self._api_request(data)
- if num_ids < self.per_page:
- return
- params["pid"] += self.per_page
- page = self.request(url, params=params).text
+class GelbooruFavoriteExtractor(GelbooruBase,
+ gelbooru_v02.GelbooruV02FavoriteExtractor):
+ pattern = BASE_PATTERN + r"page=favorites&s=view&id=(\d+)"
+ test = ("https://gelbooru.com/index.php?page=favorites&s=view&id=12345",)
class GelbooruPostExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02PostExtractor):
"""Extractor for single images from gelbooru.com"""
- pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?\?"
+ pattern = (BASE_PATTERN +
r"(?=(?:[^#]+&)?page=post(?:&|#|$))"
r"(?=(?:[^#]+&)?s=view(?:&|#|$))"
r"(?:[^#]+&)?id=(\d+)")
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 9292da3..9999283 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -32,6 +32,28 @@ class GenericExtractor(Extractor):
(?:\#(?P<fragment>.*))? # optional fragment
"""
+ test = (
+ ("generic:https://www.nongnu.org/lzip/", {
+ "count": 1,
+ "content": "40be5c77773d3e91db6e1c5df720ee30afb62368",
+ "keyword": {
+ "description": "Lossless data compressor",
+ "imageurl": "https://www.nongnu.org/lzip/lzip.png",
+ "keywords": "lzip, clzip, plzip, lzlib, LZMA, bzip2, "
+ "gzip, data compression, GNU, free software",
+ "pageurl": "https://www.nongnu.org/lzip/",
+ },
+ }),
+ # internationalized domain name
+ ("generic:https://räksmörgås.josefsson.org/", {
+ "count": 2,
+ "pattern": "^https://räksmörgås.josefsson.org/",
+ }),
+ ("generic:https://en.wikipedia.org/Main_Page"),
+ ("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
+ ("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
+ )
+
def __init__(self, match):
"""Init."""
Extractor.__init__(self, match)
@@ -56,7 +78,7 @@ class GenericExtractor(Extractor):
self.root = self.scheme + match.group('domain')
def items(self):
- """Get page, extract metadata & images, yield them in suitable messages.
+ """Get page, extract metadata & images, yield them in suitable messages
Adapted from common.GalleryExtractor.items()
diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py
index 43479c6..5b561ea 100644
--- a/gallery_dl/extractor/hbrowse.py
+++ b/gallery_dl/extractor/hbrowse.py
@@ -1,16 +1,15 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2019 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://www.hbrowse.com/"""
+"""Extractors for https://www.hbrowse.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, exception
-import json
+from .. import text, util, exception
class HbrowseBase():
@@ -68,7 +67,7 @@ class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
def images(self, page):
base = self.root + "/data" + self.path
json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
- return [(base + name, None) for name in json.loads(json_data)]
+ return [(base + name, None) for name in util.json_loads(json_data)]
class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py
index dc4e31d..e771a4f 100644
--- a/gallery_dl/extractor/hentai2read.py
+++ b/gallery_dl/extractor/hentai2read.py
@@ -9,8 +9,7 @@
"""Extractors for https://hentai2read.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text
-import json
+from .. import text, util
import re
@@ -78,7 +77,7 @@ class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
images = text.extract(page, "'images' : ", ",\n")[0]
return [
("https://hentaicdn.com/hentai" + part, None)
- for part in json.loads(images)
+ for part in util.json_loads(images)
]
diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py
index 0327f56..ed8576f 100644
--- a/gallery_dl/extractor/hentaifox.py
+++ b/gallery_dl/extractor/hentaifox.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://hentaifox.com/"""
from .common import GalleryExtractor, Extractor, Message
-from .. import text
-import json
+from .. import text, util
class HentaifoxBase():
@@ -90,7 +89,7 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor):
server1 = "https://i.hentaifox.com"
server2 = "https://i2.hentaifox.com"
- for num, image in json.loads(data).items():
+ for num, image in util.json_loads(data).items():
ext, width, height = image.split(",")
path = urlfmt(num, extmap[ext])
append((server1 + path, {
diff --git a/gallery_dl/extractor/hentaihand.py b/gallery_dl/extractor/hentaihand.py
index bf9e464..0617330 100644
--- a/gallery_dl/extractor/hentaihand.py
+++ b/gallery_dl/extractor/hentaihand.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2022 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import GalleryExtractor, Extractor, Message
from .. import text, util
-import json
class HentaihandGalleryExtractor(GalleryExtractor):
@@ -46,7 +45,7 @@ class HentaihandGalleryExtractor(GalleryExtractor):
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
- info = json.loads(page)
+ info = util.json_loads(page)
data = {
"gallery_id" : text.parse_int(info["id"]),
"title" : info["title"],
diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py
index 38ec77c..2297cc0 100644
--- a/gallery_dl/extractor/hentaihere.py
+++ b/gallery_dl/extractor/hentaihere.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2022 Mike Fährmann
+# Copyright 2016-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://hentaihere.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text
-import json
+from .. import text, util
import re
@@ -80,7 +79,7 @@ class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
images = text.extr(page, "var rff_imageList = ", ";")
return [
("https://hentaicdn.com/hentai" + part, None)
- for part in json.loads(images)
+ for part in util.json_loads(images)
]
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 44459ce..4e8d1ca 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -13,7 +13,6 @@ from .nozomi import decode_nozomi
from ..cache import memcache
from .. import text, util
import string
-import json
import re
@@ -75,7 +74,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
self.root, gid)
def metadata(self, page):
- self.info = info = json.loads(page.partition("=")[2])
+ self.info = info = util.json_loads(page.partition("=")[2])
iget = info.get
language = iget("language")
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index 1efbbf0..497f1ef 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -9,8 +9,7 @@
"""Extractors for https://www.imagefap.com/"""
from .common import Extractor, Message
-from .. import text, exception
-import json
+from .. import text, util, exception
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?imagefap\.com"
@@ -47,7 +46,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
pattern = BASE_PATTERN + r"/(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)"
test = (
- ("https://www.imagefap.com/pictures/7102714", {
+ ("https://www.imagefap.com/gallery/7102714", {
"pattern": r"https://cdnh?\.imagefap\.com"
r"/images/full/\d+/\d+/\d+\.jpg",
"keyword": "2ba96e84c2952c4750e9fa94a3f2b1f965cec2f3",
@@ -68,6 +67,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
},
"count": 44,
}),
+ ("https://www.imagefap.com/pictures/7102714"),
("https://www.imagefap.com/gallery.php?gid=7102714"),
("https://beta.imagefap.com/gallery.php?gid=7102714"),
)
@@ -78,7 +78,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
self.image_id = ""
def items(self):
- url = "{}/pictures/{}/".format(self.root, self.gid)
+ url = "{}/gallery/{}".format(self.root, self.gid)
page = self.request(url).text
data = self.get_job_metadata(page)
yield Message.Directory, data
@@ -88,22 +88,21 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
def get_job_metadata(self, page):
"""Collect metadata for extractor-job"""
- descr, pos = text.extract(
- page, '<meta name="description" content="Browse ', '"')
- count, pos = text.extract(page, ' 1 of ', ' pics"', pos)
- self.image_id = text.extract(page, 'id="img_ed_', '"', pos)[0]
-
- title, _, descr = descr.partition(" porn picture gallery by ")
- uploader, _, tags = descr.partition(" to see hottest ")
- self._count = text.parse_int(count)
- return {
+ extr = text.extract_from(page)
+
+ data = {
"gallery_id": text.parse_int(self.gid),
- "title": text.unescape(title),
- "uploader": uploader,
- "tags": tags[:-11].split(", "),
- "count": self._count,
+ "tags": extr('name="keywords" content="', '"').split(", "),
+ "uploader": extr("porn picture gallery by ", " to see hottest"),
+ "title": text.unescape(extr("<title>", "<")),
+ "count": text.parse_int(extr(' 1 of ', ' pics"')),
}
+ self.image_id = extr('id="img_ed_', '"')
+ self._count = data["count"]
+
+ return data
+
def get_images(self):
"""Collect image-urls and -metadata"""
url = "{}/photo/{}/".format(self.root, self.image_id)
@@ -128,7 +127,7 @@ class ImagefapGalleryExtractor(ImagefapExtractor):
data["image_id"] = text.parse_int(data["filename"])
yield image_url, data
- if cnt < 24 and num >= total:
+ if not cnt or cnt < 24 and num >= total:
return
params["idx"] += cnt
@@ -173,7 +172,7 @@ class ImagefapImageExtractor(ImagefapExtractor):
page, 'id="imageid_input" value="', '"', pos)
gallery_id, pos = text.extract(
page, 'id="galleryid_input" value="', '"', pos)
- info = json.loads(info)
+ info = util.json_loads(info)
url = info["contentUrl"]
return url, text.nameext_from_url(url, {
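
The rewritten get_job_metadata above switches from positional text.extract calls to text.extract_from, a stateful helper where each call scans forward from the previous match, so fields must be requested in the order they occur in the page. A toy illustration of those semantics as used in this diff (the page string is made up):

    from gallery_dl import text

    page = '<title>Demo</title> 1 of 44 pics" id="img_ed_123"'
    extr = text.extract_from(page)
    print(extr("<title>", "<"))      # Demo
    print(extr(" 1 of ", ' pics"'))  # 44
    print(extr('id="img_ed_', '"'))  # 123
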
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py
index 207562a..d57ec89 100644
--- a/gallery_dl/extractor/imagehosts.py
+++ b/gallery_dl/extractor/imagehosts.py
@@ -187,12 +187,19 @@ class ImagevenueImageExtractor(ImagehostImageExtractor):
class ImagetwistImageExtractor(ImagehostImageExtractor):
"""Extractor for single images from imagetwist.com"""
category = "imagetwist"
- pattern = r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))"
- test = ("https://imagetwist.com/f1i2s4vhvbrq/test.png", {
- "url": "8d5e168c0bee30211f821c6f3b2116e419d42671",
- "keyword": "d1060a4c2e3b73b83044e20681712c0ffdd6cfef",
- "content": "0c8768055e4e20e7c7259608b67799171b691140",
- })
+ pattern = (r"(?:https?://)?((?:www\.|phun\.)?"
+ r"image(?:twist|haha)\.com/([a-z0-9]{12}))")
+ test = (
+ ("https://imagetwist.com/f1i2s4vhvbrq/test.png", {
+ "url": "8d5e168c0bee30211f821c6f3b2116e419d42671",
+ "keyword": "d1060a4c2e3b73b83044e20681712c0ffdd6cfef",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ }),
+ ("https://www.imagetwist.com/f1i2s4vhvbrq/test.png"),
+ ("https://phun.imagetwist.com/f1i2s4vhvbrq/test.png"),
+ ("https://imagehaha.com/f1i2s4vhvbrq/test.png"),
+ ("https://www.imagehaha.com/f1i2s4vhvbrq/test.png"),
+ )
@property
@memcache(maxage=3*3600)
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index 49082d8..a221075 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -9,9 +9,8 @@
"""Extractors for https://imgbb.com/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import cache
-import json
class ImgbbExtractor(Extractor):
@@ -98,7 +97,7 @@ class ImgbbExtractor(Extractor):
while True:
for img in text.extract_iter(page, "data-object='", "'"):
- yield json.loads(text.unquote(img))
+ yield util.json_loads(text.unquote(img))
if data:
if params["seek"] == data["seekEnd"]:
return
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index deb31a0..4c1be0f 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -40,6 +40,7 @@ class InstagramExtractor(Extractor):
self._logged_in = True
self._find_tags = re.compile(r"#\w+").findall
self._cursor = None
+ self._user = None
def items(self):
self.login()
@@ -60,6 +61,8 @@ class InstagramExtractor(Extractor):
post = self._parse_post_graphql(post)
else:
post = self._parse_post_rest(post)
+ if self._user:
+ post["user"] = self._user
post.update(data)
files = post.pop("_files")
@@ -363,6 +366,22 @@ class InstagramExtractor(Extractor):
self._cursor = cursor
return cursor
+ def _assign_user(self, user):
+ self._user = user
+
+ for key, old in (
+ ("count_media" , "edge_owner_to_timeline_media"),
+ ("count_video" , "edge_felix_video_timeline"),
+ ("count_saved" , "edge_saved_media"),
+ ("count_mutual" , "edge_mutual_followed_by"),
+ ("count_follow" , "edge_follow"),
+ ("count_followed" , "edge_followed_by"),
+ ("count_collection", "edge_media_collections")):
+ try:
+ user[key] = user.pop(old)["count"]
+ except Exception:
+ user[key] = 0
+
class InstagramUserExtractor(InstagramExtractor):
"""Extractor for an Instagram user profile"""
@@ -796,6 +815,7 @@ class InstagramRestAPI():
name = user["username"]
s = "" if name.endswith("s") else "s"
raise exception.StopExtraction("%s'%s posts are private", name, s)
+ self.extractor._assign_user(user)
return user["id"]
def user_clips(self, user_id):
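
The new _assign_user helper flattens Instagram's GraphQL edge counters into plain count_* fields, which items() then copies into every post. The transformation on a toy user object (field values are made up):

    user = {
        "username": "example",
        "edge_followed_by": {"count": 1234},
        "edge_follow": {"count": 56},
    }
    for key, old in (("count_followed", "edge_followed_by"),
                     ("count_follow"  , "edge_follow")):
        try:
            user[key] = user.pop(old)["count"]
        except Exception:
            user[key] = 0  # missing or malformed edge counter becomes 0
    print(user)  # {'username': 'example', 'count_followed': 1234, 'count_follow': 56}
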
diff --git a/gallery_dl/extractor/issuu.py b/gallery_dl/extractor/issuu.py
index 8067f63..c0a1de1 100644
--- a/gallery_dl/extractor/issuu.py
+++ b/gallery_dl/extractor/issuu.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://issuu.com/"""
from .common import GalleryExtractor, Extractor, Message
-from .. import text
-import json
+from .. import text, util
class IssuuBase():
@@ -54,7 +53,7 @@ class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
})
def metadata(self, page):
- data = json.loads(text.extr(
+ data = util.json_loads(text.extr(
page, '<script data-json="', '"').replace("&quot;", '"'))
doc = data["initialDocumentData"]["document"]
diff --git a/gallery_dl/extractor/lightroom.py b/gallery_dl/extractor/lightroom.py
index d202e20..783473d 100644
--- a/gallery_dl/extractor/lightroom.py
+++ b/gallery_dl/extractor/lightroom.py
@@ -7,8 +7,7 @@
"""Extractors for https://lightroom.adobe.com/"""
from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
class LightroomGalleryExtractor(Extractor):
@@ -46,7 +45,7 @@ class LightroomGalleryExtractor(Extractor):
# Get config
url = "https://lightroom.adobe.com/shares/" + self.href
response = self.request(url)
- album = json.loads(
+ album = util.json_loads(
text.extr(response.text, "albumAttributes: ", "\n")
)
@@ -75,7 +74,7 @@ class LightroomGalleryExtractor(Extractor):
url = base_url + next_url
page = self.request(url).text
# skip 1st line as it's a JS loop
- data = json.loads(page[page.index("\n") + 1:])
+ data = util.json_loads(page[page.index("\n") + 1:])
base_url = data["base"]
for res in data["resources"]:
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index dae203e..409483b 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache, memcache
-from ..version import __version__
from collections import defaultdict
BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangadex\.(?:org|cc)"
@@ -28,10 +27,10 @@ class MangadexExtractor(Extractor):
archive_fmt = "{chapter_id}_{page}"
root = "https://mangadex.org"
_cache = {}
- _headers = {"User-Agent": "gallery-dl/" + __version__}
def __init__(self, match):
Extractor.__init__(self, match)
+ self.session.headers["User-Agent"] = util.USERAGENT
self.api = MangadexAPI(self)
self.uuid = match.group(1)
@@ -127,7 +126,6 @@ class MangadexChapterExtractor(MangadexExtractor):
data["chapter"], data["chapter_minor"], data["_external_url"])
yield Message.Directory, data
- data["_http_headers"] = self._headers
server = self.api.athome_server(self.uuid)
chapter = server["chapter"]
@@ -192,7 +190,7 @@ class MangadexAPI():
def __init__(self, extr):
self.extractor = extr
- self.headers = extr._headers.copy()
+ self.headers = {}
self.username, self.password = self.extractor._get_auth_info()
if not self.username:
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index 14a542b..5ba18a3 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -10,51 +10,33 @@ from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
-BASE_PATTERN = \
- r"(?:https?://)?((?:(?:chap|read)?manganato|(?:www\.)?manganelo)\.com)"
+BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)"
-class ManganeloChapterExtractor(ChapterExtractor):
- """Extractor for manga-chapters from manganelo.com"""
+class ManganeloBase():
category = "manganelo"
root = "https://chapmanganato.com"
- pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)"
- test = (
- ("https://chapmanganato.com/manga-gn983696/chapter-23", {
- "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/03/23"
- r"/39/gn983696/vol_3_chapter_23_24_yen/\d+-[no]\.jpg",
- "keyword": "2c5cd59342f149375df9bcb50aa416b4d04a43cf",
- "count": 25,
- }),
- ("https://readmanganato.com/manga-gn983696/chapter-23"),
- ("https://manganelo.com/chapter/gamers/chapter_15"),
- ("https://manganelo.com/chapter/gq921227/chapter_23"),
- )
def __init__(self, match):
domain, path = match.groups()
- ChapterExtractor.__init__(self, match, "https://" + domain + path)
+ super().__init__(match, "https://" + domain + path)
self.session.headers['Referer'] = self.root
- def metadata(self, page):
- _ , pos = text.extract(page, '<a class="a-h" ', '/a>')
- manga , pos = text.extract(page, '<a class="a-h" ', '/a>', pos)
- info , pos = text.extract(page, '<a class="a-h" ', '/a>', pos)
- author, pos = text.extract(page, '- Author(s) : ', '</p>', pos)
-
- manga, _ = text.extract(manga, '">', '<')
- info , _ = text.extract(info , '">', '<')
- match = re.match(
- r"(?:[Vv]ol\. *(\d+) )?"
- r"[Cc]hapter *([^:]*)"
- r"(?:: *(.+))?", info)
+ self._match_chapter = re.compile(
+ r"(?:[Vv]ol\.?\s*(\d+)\s?)?"
+ r"[Cc]hapter\s*([^:]+)"
+ r"(?::\s*(.+))?").match
+
+ def _parse_chapter(self, info, manga, author, date=None):
+ match = self._match_chapter(info)
volume, chapter, title = match.groups() if match else ("", "", info)
chapter, sep, minor = chapter.partition(".")
return {
- "manga" : text.unescape(manga),
+ "manga" : manga,
+ "author" : author,
+ "date" : date,
"title" : text.unescape(title) if title else "",
- "author" : text.unescape(author) if author else "",
"volume" : text.parse_int(volume),
"chapter" : text.parse_int(chapter),
"chapter_minor": sep + minor,
@@ -62,19 +44,53 @@ class ManganeloChapterExtractor(ChapterExtractor):
"language" : "English",
}
+
+class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
+ """Extractor for manga chapters from manganelo.com"""
+ pattern = BASE_PATTERN + r"(/(?:manga-\w+|chapter/\w+)/chapter[-_][^/?#]+)"
+ test = (
+ ("https://chapmanganato.com/manga-gn983696/chapter-23", {
+ "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/03/23"
+ r"/39/gn983696/vol_3_chapter_23_24_yen/\d+-[no]\.jpg",
+ "keyword": "17faaea7f0fb8c2675a327bf3aa0bcd7a6311d68",
+ "count": 25,
+ }),
+ ("https://chapmanganelo.com/manga-ti107776/chapter-4", {
+ "pattern": r"https://v\d+\.mkklcdnv6tempv5\.com/img/tab_17/01/92"
+ r"/08/ti970565/chapter_4_caster/\d+-o\.jpg",
+ "keyword": "06e01fa9b3fc9b5b954c0d4a98f0153b40922ded",
+ "count": 45,
+ }),
+ ("https://readmanganato.com/manga-gn983696/chapter-23"),
+ ("https://manganelo.com/chapter/gamers/chapter_15"),
+ ("https://manganelo.com/chapter/gq921227/chapter_23"),
+ )
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ extr('class="a-h"', ">")
+ manga = extr('title="', '"')
+ info = extr('title="', '"')
+ author = extr("- Author(s) : ", "</p>")
+
+ return self._parse_chapter(
+ info, text.unescape(manga), text.unescape(author))
+
def images(self, page):
page = text.extr(
page, 'class="container-chapter-reader', '\n<div')
return [
(url, None)
for url in text.extract_iter(page, '<img src="', '"')
+ ] or [
+ (url, None)
+ for url in text.extract_iter(
+ page, '<img class="reader-content" src="', '"')
]
-class ManganeloMangaExtractor(MangaExtractor):
+class ManganeloMangaExtractor(ManganeloBase, MangaExtractor):
"""Extractor for manga from manganelo.com"""
- category = "manganelo"
- root = "https://chapmanganato.com"
chapterclass = ManganeloChapterExtractor
pattern = BASE_PATTERN + r"(/(?:manga[-/]|read_)\w+)/?$"
test = (
@@ -82,40 +98,28 @@ class ManganeloMangaExtractor(MangaExtractor):
"pattern": ManganeloChapterExtractor.pattern,
"count": ">= 25",
}),
+ ("https://m.manganelo.com/manga-ti107776", {
+ "pattern": ManganeloChapterExtractor.pattern,
+ "count": ">= 12",
+ }),
("https://readmanganato.com/manga-gn983696"),
("https://manganelo.com/manga/read_otome_no_teikoku"),
("https://manganelo.com/manga/ol921234/"),
)
- def __init__(self, match):
- domain, path = match.groups()
- MangaExtractor.__init__(self, match, "https://" + domain + path)
- self.session.headers['Referer'] = self.root
-
def chapters(self, page):
results = []
- data = self.parse_page(page, {"lang": "en", "language": "English"})
+ append = results.append
+
+ extr = text.extract_from(page)
+ manga = text.unescape(extr("<h1>", "<"))
+ author = text.remove_html(extr("</i>Author(s) :</td>", "</tr>"))
- needle = 'class="chapter-name text-nowrap" href="'
- pos = page.index('<ul class="row-content-chapter">')
+ extr('class="row-content-chapter', '')
while True:
- url, pos = text.extract(page, needle, '"', pos)
+ url = extr('class="chapter-name text-nowrap" href="', '"')
if not url:
return results
- data["title"], pos = text.extract(page, '>', '</a>', pos)
- data["date"] , pos = text.extract(
- page, 'class="chapter-time text-nowrap" title="', '">', pos)
- chapter, sep, minor = url.rpartition("/chapter_")[2].partition(".")
- data["chapter"] = text.parse_int(chapter)
- data["chapter_minor"] = sep + minor
- results.append((url, data.copy()))
-
- @staticmethod
- def parse_page(page, data):
- """Parse metadata on 'page' and add it to 'data'"""
- text.extract_all(page, (
- ("manga" , '<h1>', '</h1>'),
- ('author' , '</i>Author(s) :</td>', '</tr>'),
- ), values=data)
- data["author"] = text.remove_html(data["author"])
- return data
+ info = extr(">", "<")
+ date = extr('class="chapter-time text-nowrap" title="', '"')
+ append((url, self._parse_chapter(info, manga, author, date)))
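
The refactor hinges on the compiled chapter regex in ManganeloBase, which both the chapter and the manga extractor now feed through _parse_chapter. Its grouping behavior, checked in isolation (sample strings are illustrative):

    import re

    match_chapter = re.compile(
        r"(?:[Vv]ol\.?\s*(\d+)\s?)?"
        r"[Cc]hapter\s*([^:]+)"
        r"(?::\s*(.+))?").match

    print(match_chapter("Vol.3 Chapter 24.5: Bath Time").groups())
    # ('3', '24.5', 'Bath Time')
    print(match_chapter("Chapter 15").groups())
    # (None, '15', None)

_parse_chapter then splits the chapter group on ".", so "24.5" becomes chapter 24 with chapter_minor ".5".
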
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py
index dcf1972..168fbe8 100644
--- a/gallery_dl/extractor/mangapark.py
+++ b/gallery_dl/extractor/mangapark.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://mangapark.net/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text, exception
-import json
+from .. import text, util, exception
import re
@@ -104,7 +103,7 @@ class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor):
return data
def images(self, page):
- data = json.loads(text.extr(page, "var _load_pages =", ";"))
+ data = util.json_loads(text.extr(page, "var _load_pages =", ";"))
return [
(text.urljoin(self.root, item["u"]), {
"width": text.parse_int(item["w"]),
diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py
index 5fa5631..b7070f2 100644
--- a/gallery_dl/extractor/mangasee.py
+++ b/gallery_dl/extractor/mangasee.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,7 +10,6 @@
from .common import ChapterExtractor, MangaExtractor
from .. import text, util
-import json
class MangaseeBase():
@@ -43,6 +42,7 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
"pattern": r"https://[^/]+/manga/Tokyo-Innocent/0004\.5-00\d\.png",
"count": 8,
"keyword": {
+ "author": ["NARUMI Naru"],
"chapter": 4,
"chapter_minor": ".5",
"chapter_string": "100045",
@@ -50,6 +50,8 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
"date": "dt:2020-01-20 21:52:53",
"extension": "png",
"filename": r"re:0004\.5-00\d",
+ "genre": ["Comedy", "Fantasy", "Harem", "Romance", "Shounen",
+ "Supernatural"],
"index": "1",
"lang": "en",
"language": "English",
@@ -63,6 +65,7 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
"pattern": r"https://[^/]+/manga/One-Piece/1063-0\d\d\.png",
"count": 13,
"keyword": {
+ "author": ["ODA Eiichiro"],
"chapter": 1063,
"chapter_minor": "",
"chapter_string": "110630",
@@ -70,6 +73,8 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
"date": "dt:2022-10-16 17:32:54",
"extension": "png",
"filename": r"re:1063-0\d\d",
+ "genre": ["Action", "Adventure", "Comedy", "Drama", "Fantasy",
+ "Shounen"],
"index": "1",
"lang": "en",
"language": "English",
@@ -94,12 +99,16 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
def metadata(self, page):
extr = text.extract_from(page)
- self.chapter = data = json.loads(extr("vm.CurChapter =", ";\r\n"))
+ author = util.json_loads(extr('"author":', '],') + "]")
+ genre = util.json_loads(extr('"genre":', '],') + "]")
+ self.chapter = data = util.json_loads(extr("vm.CurChapter =", ";\r\n"))
self.domain = extr('vm.CurPathName = "', '"')
self.slug = extr('vm.IndexName = "', '"')
data = self._transform_chapter(data)
data["manga"] = text.unescape(extr('vm.SeriesName = "', '"'))
+ data["author"] = author
+ data["genre"] = genre
return data
def images(self, page):
@@ -128,10 +137,38 @@ class MangaseeMangaExtractor(MangaseeBase, MangaExtractor):
"/Nakamura-Koedo-To-Daizu-Keisuke-Wa-Umaku-Ikanai"), {
"pattern": MangaseeChapterExtractor.pattern,
"count": ">= 17",
+ "keyword": {
+ "author": ["TAKASE Masaya"],
+ "chapter": int,
+ "chapter_minor": r"re:^|\.5$",
+ "chapter_string": r"re:100\d\d\d",
+ "date": "type:datetime",
+ "genre": ["Comedy", "Romance", "School Life", "Shounen",
+ "Slice of Life"],
+ "index": "1",
+ "lang": "en",
+ "language": "English",
+ "manga": "Nakamura-Koedo-To-Daizu-Keisuke-Wa-Umaku-Ikanai",
+ "title": "",
+ },
}),
("https://manga4life.com/manga/Ano-Musume-Ni-Kiss-To-Shirayuri-O", {
"pattern": MangaseeChapterExtractor.pattern,
"count": ">= 50",
+ "keyword": {
+ "author": ["Canno"],
+ "chapter": int,
+ "chapter_minor": r"re:^|\.5$",
+ "chapter_string": r"re:100\d\d\d",
+ "date": "type:datetime",
+ "genre": ["Comedy", "Romance", "School Life", "Seinen",
+ "Shoujo Ai"],
+ "index": "1",
+ "lang": "en",
+ "language": "English",
+ "manga": "Ano-Musume-Ni-Kiss-To-Shirayuri-O",
+ "title": ""
+ },
}),
)
@@ -142,9 +179,11 @@ class MangaseeMangaExtractor(MangaseeBase, MangaExtractor):
MangaExtractor.__init__(self, match, self.root + match.group(2))
def chapters(self, page):
- slug, pos = text.extract(page, 'vm.IndexName = "', '"')
- chapters = json.loads(text.extract(
- page, "vm.Chapters = ", ";\r\n", pos)[0])
+ extr = text.extract_from(page)
+ author = util.json_loads(extr('"author":', '],') + "]")
+ genre = util.json_loads(extr('"genre":', '],') + "]")
+ slug = extr('vm.IndexName = "', '"')
+ chapters = util.json_loads(extr("vm.Chapters = ", ";\r\n"))
result = []
for data in map(self._transform_chapter, chapters):
@@ -155,5 +194,7 @@ class MangaseeMangaExtractor(MangaseeBase, MangaExtractor):
url += "-page-1.html"
data["manga"] = slug
+ data["author"] = author
+ data["genre"] = genre
result.append((url, data))
return result
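
The author/genre extraction added above pulls a JSON array out of inline JavaScript by cutting at "]," and restoring the closing bracket, i.e. util.json_loads(extr('"author":', '],') + "]"). The same trick in plain stdlib form (the page string is a made-up stand-in):

    import json

    page = 'vm.Series = {"author":["ODA Eiichiro"],"genre":["Action","Adventure"]};'

    chunk = page[page.index('"author":') + len('"author":'):]
    author = json.loads(chunk[:chunk.index('],')] + "]")
    print(author)  # ['ODA Eiichiro']
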
diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py
new file mode 100644
index 0000000..03e9104
--- /dev/null
+++ b/gallery_dl/extractor/misskey.py
@@ -0,0 +1,202 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for Misskey instances"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class MisskeyExtractor(BaseExtractor):
+ """Base class for Misskey extractors"""
+ basecategory = "misskey"
+ directory_fmt = ("misskey", "{instance}", "{user[username]}")
+ filename_fmt = "{category}_{id}_{file[id]}.{extension}"
+ archive_fmt = "{id}_{file[id]}"
+
+ def __init__(self, match):
+ BaseExtractor.__init__(self, match)
+ self.api = MisskeyAPI(self)
+ self.instance = self.root.rpartition("://")[2]
+ self.item = match.group(match.lastindex)
+ self.renotes = self.config("renotes", False)
+ self.replies = self.config("replies", True)
+
+ def items(self):
+ for note in self.notes():
+ files = note.pop("files") or []
+ renote = note.get("renote")
+ if renote:
+ if not self.renotes:
+ self.log.debug("Skipping %s (renote)", note["id"])
+ continue
+ files.extend(renote.get("files") or ())
+
+ reply = note.get("reply")
+ if reply:
+ if not self.replies:
+ self.log.debug("Skipping %s (reply)", note["id"])
+ continue
+ files.extend(reply.get("files") or ())
+
+ note["instance"] = self.instance
+ note["instance_remote"] = note["user"]["host"]
+ note["count"] = len(files)
+ note["date"] = text.parse_datetime(
+ note["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z")
+
+ yield Message.Directory, note
+ for note["num"], file in enumerate(files, 1):
+ file["date"] = text.parse_datetime(
+ file["createdAt"], "%Y-%m-%dT%H:%M:%S.%f%z")
+ note["file"] = file
+ url = file["url"]
+ yield Message.Url, url, text.nameext_from_url(url, note)
+
+ def notes(self):
+ """Return an iterable containing all relevant Note objects"""
+ return ()
+
+
+BASE_PATTERN = MisskeyExtractor.update({
+ "misskey.io": {
+ "root": "https://misskey.io",
+ "pattern": r"misskey\.io",
+ },
+ "lesbian.energy": {
+ "root": "https://lesbian.energy",
+ "pattern": r"lesbian\.energy"
+ },
+ "sushi.ski": {
+ "root": "https://sushi.ski",
+ "pattern": r"sushi\.ski",
+ },
+})
+
+
+class MisskeyUserExtractor(MisskeyExtractor):
+ """Extractor for all images of a Misskey user"""
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/@([^/?#]+)/?$"
+ test = (
+ ("https://misskey.io/@lithla", {
+ "pattern": r"https://s\d+\.arkjp\.net/misskey/[\w-]+\.\w+",
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://misskey.io/@blooddj@pawoo.net", {
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://lesbian.energy/@rerorero", {
+ "pattern": r"https://lesbian.energy/files/\w+",
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://lesbian.energy/@nano@mk.yopo.work"),
+ ("https://sushi.ski/@ui@misskey.04.si"),
+ )
+
+ def notes(self):
+ return self.api.users_notes(self.api.user_id_by_username(self.item))
+
+
+class MisskeyFollowingExtractor(MisskeyExtractor):
+ """Extractor for followed Misskey users"""
+ subcategory = "following"
+ pattern = BASE_PATTERN + r"/@([^/?#]+)/following"
+ test = (
+ ("https://misskey.io/@blooddj@pawoo.net/following", {
+ "extractor": False,
+ "count": ">= 6",
+ }),
+ ("https://sushi.ski/@hatusimo_sigure/following"),
+ )
+
+ def items(self):
+ user_id = self.api.user_id_by_username(self.item)
+ for user in self.api.users_following(user_id):
+ user = user["followee"]
+ url = self.root + "/@" + user["username"]
+ host = user["host"]
+ if host is not None:
+ url += "@" + host
+ user["_extractor"] = MisskeyUserExtractor
+ yield Message.Queue, url, user
+
+
+class MisskeyNoteExtractor(MisskeyExtractor):
+ """Extractor for images from a Note"""
+ subcategory = "note"
+ pattern = BASE_PATTERN + r"/notes/(\w+)"
+ test = (
+ ("https://misskey.io/notes/9bhqfo835v", {
+ "pattern": r"https://s\d+\.arkjp\.net/misskey/[\w-]+\.\w+",
+ "count": 4,
+ }),
+ ("https://misskey.io/notes/9brq7z1re6"),
+ ("https://sushi.ski/notes/9bm3x4ksqw", {
+ "pattern": r"https://media\.sushi\.ski/files/[\w-]+\.png",
+ "count": 1,
+ }),
+ ("https://lesbian.energy/notes/995ig09wqy", {
+ "count": 1,
+ }),
+ ("https://lesbian.energy/notes/96ynd9w5kc"),
+ )
+
+ def notes(self):
+ return (self.api.notes_show(self.item),)
+
+
+class MisskeyAPI():
+ """Interface for Misskey API
+
+ https://github.com/misskey-dev/misskey
+ https://misskey-hub.net/en/docs/api/
+ https://misskey-hub.net/docs/api/endpoints.html
+ """
+
+ def __init__(self, extractor):
+ self.root = extractor.root
+ self.extractor = extractor
+ self.headers = {"Content-Type": "application/json"}
+
+ def user_id_by_username(self, username):
+ endpoint = "/users/show"
+ data = {"username": username}
+ if "@" in username:
+ data["username"], _, data["host"] = username.partition("@")
+ return self._call(endpoint, data)["id"]
+
+ def users_following(self, user_id):
+ endpoint = "/users/following"
+ data = {"userId": user_id}
+ return self._pagination(endpoint, data)
+
+ def users_notes(self, user_id):
+ endpoint = "/users/notes"
+ data = {"userId": user_id}
+ return self._pagination(endpoint, data)
+
+ def notes_show(self, note_id):
+ endpoint = "/notes/show"
+ data = {"noteId": note_id}
+ return self._call(endpoint, data)
+
+ def _call(self, endpoint, data):
+ url = self.root + "/api" + endpoint
+ return self.extractor.request(
+ url, method="POST", headers=self.headers, json=data).json()
+
+ def _pagination(self, endpoint, data):
+ data["limit"] = 100
+ while True:
+ notes = self._call(endpoint, data)
+ if not notes:
+ return
+ yield from notes
+ data["untilId"] = notes[-1]["id"]
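
The new MisskeyAPI drives every endpoint as a POST with a JSON body and pages by keyset: each follow-up request passes untilId, the id of the oldest note already seen. The same loop as a self-contained sketch (instance root and user id are placeholders):

    import requests

    def misskey_notes(root, user_id):
        """Yield all notes of a user, newest first (sketch)."""
        data = {"userId": user_id, "limit": 100}
        while True:
            notes = requests.post(root + "/api/users/notes", json=data).json()
            if not notes:
                return
            yield from notes
            # next request: only notes older than the last one received
            data["untilId"] = notes[-1]["id"]
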
diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py
index 1db83b0..0f79d7f 100644
--- a/gallery_dl/extractor/nana.py
+++ b/gallery_dl/extractor/nana.py
@@ -7,8 +7,7 @@
"""Extractors for https://nana.my.id/"""
from .common import GalleryExtractor, Extractor, Message
-from .. import text, exception
-import json
+from .. import text, util, exception
class NanaGalleryExtractor(GalleryExtractor):
@@ -59,7 +58,7 @@ class NanaGalleryExtractor(GalleryExtractor):
}
def images(self, page):
- data = json.loads(text.extr(page, "Reader.pages = ", ".pages"))
+ data = util.json_loads(text.extr(page, "Reader.pages = ", ".pages"))
return [
("https://nana.my.id" + image, None)
for image in data["pages"]
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 1f96879..2b759ec 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,10 +9,9 @@
"""Extractors for https://www.newgrounds.com/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import cache
import itertools
-import json
class NewgroundsExtractor(Extractor):
@@ -20,7 +19,7 @@ class NewgroundsExtractor(Extractor):
category = "newgrounds"
directory_fmt = ("{category}", "{artist[:10]:J, }")
filename_fmt = "{category}_{_index}_{title}.{extension}"
- archive_fmt = "{_index}"
+ archive_fmt = "{_type}{_index}"
root = "https://www.newgrounds.com"
cookiedomain = ".newgrounds.com"
cookienames = ("NG_GG_username", "vmk1du5I8m")
@@ -151,11 +150,13 @@ class NewgroundsExtractor(Extractor):
@staticmethod
def _extract_image_data(extr, url):
- full = text.extract_from(json.loads(extr('"full_image_text":', '});')))
+ full = text.extract_from(util.json_loads(extr(
+ '"full_image_text":', '});')))
data = {
"title" : text.unescape(extr('"og:title" content="', '"')),
"description": text.unescape(extr(':description" content="', '"')),
"type" : extr('og:type" content="', '"'),
+ "_type" : "i",
"date" : text.parse_datetime(extr(
'itemprop="datePublished" content="', '"')),
"rating" : extr('class="rated-', '"'),
@@ -175,6 +176,7 @@ class NewgroundsExtractor(Extractor):
"title" : text.unescape(extr('"og:title" content="', '"')),
"description": text.unescape(extr(':description" content="', '"')),
"type" : extr('og:type" content="', '"'),
+ "_type" : "a",
"date" : text.parse_datetime(extr(
'itemprop="datePublished" content="', '"')),
"url" : extr('{"url":"', '"').replace("\\/", "/"),
@@ -227,6 +229,7 @@ class NewgroundsExtractor(Extractor):
"url" : src,
"date" : date,
"type" : type,
+ "_type" : "",
"description": text.unescape(descr or extr(
'itemprop="description" content="', '"')),
"rating" : extr('class="rated-', '"'),
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
index 9df43e5..4270c84 100644
--- a/gallery_dl/extractor/nhentai.py
+++ b/gallery_dl/extractor/nhentai.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import GalleryExtractor, Extractor, Message
from .. import text, util
import collections
-import json
class NhentaiGalleryExtractor(GalleryExtractor):
@@ -48,7 +47,7 @@ class NhentaiGalleryExtractor(GalleryExtractor):
GalleryExtractor.__init__(self, match, url)
def metadata(self, page):
- self.data = data = json.loads(page)
+ self.data = data = util.json_loads(page)
title_en = data["title"].get("english", "")
title_ja = data["title"].get("japanese", "")
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index f9c6abf..9b69694 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -59,10 +59,7 @@ class NitterExtractor(BaseExtractor):
if url[0] == "/":
url = self.root + url
- file = {
- "url": url,
- "_http_retry_codes": (404,),
- }
+ file = {"url": url, "_http_retry": _retry_on_404}
file["filename"], _, file["extension"] = \
name.rpartition(".")
append(file)
@@ -220,10 +217,6 @@ BASE_PATTERN = NitterExtractor.update({
"root": "https://nitter.lacontrevoie.fr",
"pattern": r"nitter\.lacontrevoie\.fr",
},
- "nitter.pussthecat.org": {
- "root": "https://nitter.pussthecat.org",
- "pattern": r"nitter\.pussthecat\.org",
- },
"nitter.1d4.us": {
"root": "https://nitter.1d4.us",
"pattern": r"nitter\.1d4\.us",
@@ -283,13 +276,12 @@ class NitterTweetsExtractor(NitterExtractor):
},
},
}),
- ("https://nitter.pussthecat.org/i/user/2976459548", {
- "url": "c740a2683db2c8ed2f350afc0494475c4444025b",
- "pattern": r"https://nitter.pussthecat\.org/pic/orig"
+ ("https://nitter.lacontrevoie.fr/supernaturepics", {
+ "url": "54f4b55f2099dcc248f3fb7bfacf1349e08d8e2d",
+ "pattern": r"https://nitter\.lacontrevoie\.fr/pic/orig"
r"/media%2FCGMNYZvW0AIVoom\.jpg",
"range": "1",
}),
- ("https://nitter.lacontrevoie.fr/supernaturepics"),
("https://nitter.1d4.us/supernaturepics"),
("https://nitter.kavin.rocks/id:2976459548"),
("https://nitter.unixfox.eu/supernaturepics"),
@@ -309,7 +301,6 @@ class NitterRepliesExtractor(NitterExtractor):
"range": "1-20",
}),
("https://nitter.lacontrevoie.fr/supernaturepics/with_replies"),
- ("https://nitter.pussthecat.org/supernaturepics/with_replies"),
("https://nitter.1d4.us/supernaturepics/with_replies"),
("https://nitter.kavin.rocks/id:2976459548/with_replies"),
("https://nitter.unixfox.eu/i/user/2976459548/with_replies"),
@@ -334,7 +325,6 @@ class NitterMediaExtractor(NitterExtractor):
"range": "1-20",
}),
("https://nitter.lacontrevoie.fr/supernaturepics/media"),
- ("https://nitter.pussthecat.org/supernaturepics/media"),
("https://nitter.1d4.us/supernaturepics/media"),
("https://nitter.unixfox.eu/i/user/2976459548/media"),
)
@@ -353,7 +343,6 @@ class NitterSearchExtractor(NitterExtractor):
"range": "1-20",
}),
("https://nitter.lacontrevoie.fr/supernaturepics/search"),
- ("https://nitter.pussthecat.org/supernaturepics/search"),
("https://nitter.1d4.us/supernaturepics/search"),
("https://nitter.kavin.rocks/id:2976459548/search"),
("https://nitter.unixfox.eu/i/user/2976459548/search"),
@@ -375,7 +364,7 @@ class NitterTweetExtractor(NitterExtractor):
"url": "3f2b64e175bf284aa672c3bb53ed275e470b919a",
"content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab",
"keyword": {
- "comments": 16,
+ "comments": 19,
"content": "Big Wedeene River, Canada",
"count": 1,
"date": "dt:2015-05-29 17:40:00",
@@ -399,9 +388,9 @@ class NitterTweetExtractor(NitterExtractor):
"url": "9c51b3a4a1114535eb9b168bba97ad95db0d59ff",
}),
# video
- ("https://nitter.pussthecat.org/i/status/1065692031626829824", {
- "pattern": r"ytdl:https://nitter.pussthecat.org/video"
- r"/B875137EDC8FF/https%3A%2F%2Fvideo.twimg.com%2F"
+ ("https://nitter.lacontrevoie.fr/i/status/1065692031626829824", {
+ "pattern": r"ytdl:https://nitter\.lacontrevoie\.fr/video"
+ r"/[0-9A-F]{10,}/https%3A%2F%2Fvideo.twimg.com%2F"
r"ext_tw_video%2F1065691868439007232%2Fpu%2Fpl%2F"
r"nv8hUQC1R0SjhzcZ.m3u8%3Ftag%3D5",
"keyword": {
@@ -446,7 +435,7 @@ class NitterTweetExtractor(NitterExtractor):
"count": 0,
}),
# "Misleading" content
- ("https://nitter.pussthecat.org/i/status/1486373748911575046", {
+ ("https://nitter.lacontrevoie.fr/i/status/1486373748911575046", {
"count": 4,
}),
# age-restricted (#2354)
@@ -468,3 +457,7 @@ class NitterTweetExtractor(NitterExtractor):
quoted["user"] = tweet["user"]
return (tweet, quoted)
return (tweet,)
+
+
+def _retry_on_404(response):
+ return response.status_code == 404
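
The file-level "_http_retry_codes" tuple is replaced by "_http_retry", a predicate that receives the failed response object, as the new _retry_on_404 shows. Such predicates compose easily; a sketch of a generic builder (the helper below is illustrative, not part of this diff):

    def retry_on(*codes):
        """Build a _http_retry-style predicate for the given status codes."""
        def _retry(response):
            return response.status_code in codes
        return _retry

    file = {"url": "https://example.org/img.jpg",
            "_http_retry": retry_on(404, 503)}
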
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 9270f33..ec46ca3 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -9,13 +9,12 @@
"""Utility classes to setup OAuth and link accounts to gallery-dl"""
from .common import Extractor, Message
-from . import deviantart, flickr, mastodon, pixiv, reddit, smugmug, tumblr
from .. import text, oauth, util, config, exception
from ..output import stdout_write
from ..cache import cache
import urllib.parse
+import binascii
import hashlib
-import base64
REDIRECT_URI_LOCALHOST = "http://localhost:6414/"
REDIRECT_URI_HTTPS = "https://mikf.github.io/gallery-dl/oauth-redirect.html"
@@ -76,7 +75,8 @@ class OAuthBase(Extractor):
browser = webbrowser.get()
if browser and browser.open(url):
- self.log.info("Opening URL in %s:", browser.name.capitalize())
+ name = getattr(browser, "name", "Browser")
+ self.log.info("Opening URL in %s:", name.capitalize())
else:
self.log.info("Please open this URL in your browser:")
@@ -242,6 +242,7 @@ class OAuthFlickr(OAuthBase):
def items(self):
yield Message.Version, 1
+ from . import flickr
self._oauth1_authorization_flow(
flickr.FlickrAPI.API_KEY,
@@ -258,6 +259,7 @@ class OAuthSmugmug(OAuthBase):
def items(self):
yield Message.Version, 1
+ from . import smugmug
self._oauth1_authorization_flow(
smugmug.SmugmugAPI.API_KEY,
@@ -274,6 +276,7 @@ class OAuthTumblr(OAuthBase):
def items(self):
yield Message.Version, 1
+ from . import tumblr
self._oauth1_authorization_flow(
tumblr.TumblrAPI.API_KEY,
@@ -294,6 +297,7 @@ class OAuthDeviantart(OAuthBase):
def items(self):
yield Message.Version, 1
+ from . import deviantart
self._oauth2_authorization_code_grant(
self.oauth_config("client-id"),
@@ -313,6 +317,7 @@ class OAuthReddit(OAuthBase):
def items(self):
yield Message.Version, 1
+ from . import reddit
self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT
self._oauth2_authorization_code_grant(
@@ -337,6 +342,7 @@ class OAuthMastodon(OAuthBase):
def items(self):
yield Message.Version, 1
+ from . import mastodon
for application in mastodon.INSTANCES.values():
if self.instance == application["root"].partition("://")[2]:
@@ -389,11 +395,12 @@ class OAuthPixiv(OAuthBase):
def items(self):
yield Message.Version, 1
+ from . import pixiv
code_verifier = util.generate_token(32)
- digest = hashlib.sha256(code_verifier.encode("ascii")).digest()
- code_challenge = base64.urlsafe_b64encode(
- digest).rstrip(b"=").decode("ascii")
+ digest = hashlib.sha256(code_verifier.encode()).digest()
+ code_challenge = binascii.b2a_base64(
+ digest)[:-2].decode().replace("+", "-").replace("/", "_")
url = "https://app-api.pixiv.net/web/v1/login"
params = {
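
The binascii form above still yields a standard PKCE S256 code challenge: for a 32-byte SHA-256 digest, b2a_base64() produces 43 data characters followed by "=\n", so the [:-2] slice drops both padding and newline, and the two replace() calls switch to the URL-safe alphabet. A sketch verifying it against the stdlib form that was removed:

    import base64
    import binascii
    import hashlib

    verifier = "x" * 32  # placeholder; the real code uses util.generate_token(32)
    digest = hashlib.sha256(verifier.encode()).digest()

    new = binascii.b2a_base64(digest)[:-2].decode() \
        .replace("+", "-").replace("/", "_")
    old = base64.urlsafe_b64encode(digest).rstrip(b"=").decode()
    assert new == old  # identical unpadded URL-safe challenge
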
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 1f520c3..e4bfa2a 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,11 +9,10 @@
"""Extractors for https://www.patreon.com/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import memcache
import collections
import itertools
-import json
class PatreonExtractor(Extractor):
@@ -251,7 +250,7 @@ class PatreonExtractor(Extractor):
return [genmap[ft] for ft in filetypes]
def _extract_bootstrap(self, page):
- return json.loads(text.extr(
+ return util.json_loads(text.extr(
page, "window.patreon.bootstrap,", "\n});") + "}")
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
index 375b5e3..6234e6a 100644
--- a/gallery_dl/extractor/photobucket.py
+++ b/gallery_dl/extractor/photobucket.py
@@ -10,7 +10,7 @@
from .common import Extractor, Message
from .. import text, exception
-import base64
+import binascii
import json
@@ -168,7 +168,7 @@ class PhotobucketImageExtractor(Extractor):
image["titleOrFilename"] = image["title"] or name
image["tags"] = image.pop("clarifaiTagList", [])
- mtype, _, mid = base64.b64decode(image["id"]).partition(b":")
+ mtype, _, mid = binascii.a2b_base64(image["id"]).partition(b":")
image["pictureId"] = mid.decode() if mtype == b"mediaId" else ""
yield Message.Directory, image
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 63b16ce..31ddbcc 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -12,7 +12,6 @@ from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache
import itertools
-import json
BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.[\w.]+"
@@ -309,7 +308,7 @@ class PinterestSearchExtractor(PinterestExtractor):
def __init__(self, match):
PinterestExtractor.__init__(self, match)
- self.search = match.group(1)
+ self.search = text.unquote(match.group(1))
def metadata(self):
return {"search": self.search}
@@ -504,7 +503,10 @@ class PinterestAPI():
"username_or_email": username,
"password" : password,
}
- data = {"data": json.dumps({"options": options}), "source_url": ""}
+ data = {
+ "data" : util.json_dumps({"options": options}),
+ "source_url": "",
+ }
try:
response = self.extractor.request(
@@ -523,7 +525,10 @@ class PinterestAPI():
def _call(self, resource, options):
url = "{}/resource/{}Resource/get/".format(self.root, resource)
- params = {"data": json.dumps({"options": options}), "source_url": ""}
+ params = {
+ "data" : util.json_dumps({"options": options}),
+ "source_url": "",
+ }
response = self.extractor.request(
url, params=params, headers=self.headers,
diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py
index 535fae9..4135259 100644
--- a/gallery_dl/extractor/plurk.py
+++ b/gallery_dl/extractor/plurk.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,10 +9,8 @@
"""Extractors for https://www.plurk.com/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
import datetime
-import time
-import json
import re
@@ -20,6 +18,7 @@ class PlurkExtractor(Extractor):
"""Base class for plurk extractors"""
category = "plurk"
root = "https://www.plurk.com"
+ request_interval = 1.0
def items(self):
urls = self._urls_ex if self.config("comments", False) else self._urls
@@ -59,14 +58,13 @@ class PlurkExtractor(Extractor):
return
elif info["has_newer"] < 200:
del data["count"]
- time.sleep(1)
data["from_response_id"] = info["responses"][-1]["id"] + 1
@staticmethod
def _load(data):
if not data:
raise exception.NotFoundError("user")
- return json.loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data))
+ return util.json_loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data))
class PlurkTimelineExtractor(PlurkExtractor):
diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py
index c35ee74..49da9ce 100644
--- a/gallery_dl/extractor/poipiku.py
+++ b/gallery_dl/extractor/poipiku.py
@@ -79,7 +79,7 @@ class PoipikuExtractor(Extractor):
page = self.request(
url, method="POST", headers=headers, data=data).json()["html"]
- if page.startswith("You need to"):
+ if page.startswith(("You need to", "Password is incorrect")):
self.log.warning("'%s'", page)
for thumb in text.extract_iter(
diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py
new file mode 100644
index 0000000..783f3da
--- /dev/null
+++ b/gallery_dl/extractor/pornpics.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.pornpics.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?pornpics\.com(?:/\w\w)?"
+
+
+class PornpicsExtractor(Extractor):
+ """Base class for pornpics extractors"""
+ category = "pornpics"
+ root = "https://www.pornpics.com"
+ request_interval = (0.5, 1.5)
+
+ def __init__(self, match):
+ super().__init__(match)
+ self.item = match.group(1)
+ self.session.headers["Referer"] = self.root
+
+ def items(self):
+ for gallery in self.galleries():
+ gallery["_extractor"] = PornpicsGalleryExtractor
+ yield Message.Queue, gallery["g_url"], gallery
+
+ def _pagination(self, url, params=None):
+ if params is None:
+ # fetch first 20 galleries from HTML
+ # since '"offset": 0' does not return a JSON response
+ page = self.request(url).text
+ for path in text.extract_iter(
+ page, 'class="rel-link" href="', '"'):
+ yield {"g_url": self.root + path}
+ del page
+ params = {"offset": 20}
+
+ limit = params["limit"] = 20
+
+ headers = {
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ "Referer": url if params["offset"] else self.root + "/",
+ "X-Requested-With": "XMLHttpRequest",
+ }
+
+ while True:
+ galleries = self.request(
+ url, params=params, headers=headers).json()
+ yield from galleries
+
+ if len(galleries) < limit:
+ return
+ params["offset"] += limit
+
+
+class PornpicsGalleryExtractor(PornpicsExtractor, GalleryExtractor):
+ """Extractor for pornpics galleries"""
+ pattern = BASE_PATTERN + r"(/galleries/(?:[^/?#]+-)?(\d+))"
+ test = (
+ (("https://www.pornpics.com/galleries/british-beauty-danielle-flashes-"
+ "hot-breasts-ass-and-snatch-in-the-forest-62610699/"), {
+ "pattern": r"https://cdni\.pornpics\.com/1280/7/160/62610699"
+ r"/62610699_\d+_[0-9a-f]{4}\.jpg",
+ "keyword": {
+ "categories": ["MILF", "Amateur", "Sexy", "Outdoor"],
+ "channel": "FTV MILFs",
+ "count": 17,
+ "gallery_id": 62610699,
+ "models": ["Danielle"],
+ "num": int,
+ "slug": "british-beauty-danielle-flashes-"
+ "hot-breasts-ass-and-snatch-in-the-forest",
+ "tags": ["Amateur MILF", "Sexy MILF"],
+ "title": "British beauty Danielle flashes "
+ "hot breasts, ass and snatch in the forest",
+ "views": int,
+ },
+ }),
+ ("https://pornpics.com/es/galleries/62610699", {
+ "keyword": {
+ "slug": "british-beauty-danielle-flashes-"
+ "hot-breasts-ass-and-snatch-in-the-forest",
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ PornpicsExtractor.__init__(self, match)
+ self.gallery_id = match.group(2)
+
+ items = GalleryExtractor.items
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "slug" : extr("/galleries/", "/").rpartition("-")[0],
+ "title" : text.unescape(extr("<h1>", "<")),
+ "channel" : extr('>Channel:', '</a>').rpartition(">")[2],
+ "models" : text.split_html(extr(
+ ">Models:", '<span class="suggest')),
+ "categories": text.split_html(extr(
+ ">Categories:", '<span class="suggest')),
+ "tags" : text.split_html(extr(
+ ">Tags List:", ' </div>')),
+ "views" : text.parse_int(extr(">Views:", "<").replace(",", "")),
+ }
+
+ def images(self, page):
+ return [
+ (url, None)
+ for url in text.extract_iter(page, "class='rel-link' href='", "'")
+ ]
+
+
+class PornpicsTagExtractor(PornpicsExtractor):
+ """Extractor for galleries from pornpics tag searches"""
+ subcategory = "tag"
+ pattern = BASE_PATTERN + r"/tags/([^/?#]+)"
+ test = (
+ ("https://www.pornpics.com/tags/summer-dress/", {
+ "pattern": PornpicsGalleryExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://pornpics.com/fr/tags/summer-dress"),
+ )
+
+ def galleries(self):
+ url = "{}/tags/{}/".format(self.root, self.item)
+ return self._pagination(url)
+
+
+class PornpicsSearchExtractor(PornpicsExtractor):
+ """Extractor for galleries from pornpics search results"""
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/(?:\?q=|pornstars/|channels/)([^/&#]+)"
+ test = (
+ ("https://www.pornpics.com/?q=nature", {
+ "pattern": PornpicsGalleryExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://www.pornpics.com/channels/femjoy/", {
+ "pattern": PornpicsGalleryExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://www.pornpics.com/pornstars/emma-brown/", {
+ "pattern": PornpicsGalleryExtractor.pattern,
+ "range": "1-50",
+ "count": 50,
+ }),
+ ("https://pornpics.com/jp/?q=nature"),
+ ("https://pornpics.com/it/channels/femjoy"),
+ ("https://pornpics.com/pt/pornstars/emma-brown"),
+ )
+
+ def galleries(self):
+ url = self.root + "/search/srch.php"
+ params = {
+ "q" : self.item.replace("-", " "),
+ "lang" : "en",
+ "offset": 0,
+ }
+ return self._pagination(url, params)
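
_pagination above combines two mechanisms: the first 20 galleries are scraped from HTML, since the JSON endpoint returns nothing for offset 0, and every later page is requested as JSON in steps of 20 until a short page marks the end. The JSON half of that loop in isolation (a sketch; fetch_json stands in for self.request(...).json()):

    def paginate(fetch_json, url, offset=20, limit=20):
        """Yield gallery dicts until a page comes back short (sketch)."""
        while True:
            galleries = fetch_json(url, {"offset": offset, "limit": limit})
            yield from galleries
            if len(galleries) < limit:  # short page: no more results
                return
            offset += limit
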
diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py
index 7e266cc..32567f6 100644
--- a/gallery_dl/extractor/pururin.py
+++ b/gallery_dl/extractor/pururin.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,7 +11,6 @@
from .common import GalleryExtractor
from .. import text, util
import binascii
-import json
class PururinGalleryExtractor(GalleryExtractor):
@@ -73,7 +72,7 @@ class PururinGalleryExtractor(GalleryExtractor):
url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
page = self.request(url).text
- info = json.loads(binascii.a2b_base64(text.extr(
+ info = util.json_loads(binascii.a2b_base64(text.extr(
page, '<gallery-read encoded="', '"')).decode())
self._ext = info["image_extension"]
self._cnt = info["total_pages"]
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 8b5b6b6..1800b68 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,9 +9,8 @@
"""Generic extractors for *reactor sites"""
from .common import BaseExtractor, Message
-from .. import text
+from .. import text, util
import urllib.parse
-import json
class ReactorExtractor(BaseExtractor):
@@ -84,13 +83,13 @@ class ReactorExtractor(BaseExtractor):
script = script[:script.index("</")].strip()
try:
- data = json.loads(script)
+ data = util.json_loads(script)
except ValueError:
try:
# remove control characters and escape backslashes
mapping = dict.fromkeys(range(32))
script = script.translate(mapping).replace("\\", "\\\\")
- data = json.loads(script)
+ data = util.json_loads(script)
except ValueError as exc:
self.log.warning("Unable to parse JSON data: %s", exc)
return
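
The two-stage parse in this hunk is worth spelling out: try the embedded script text as-is, and only on failure strip ASCII control characters and escape lone backslashes before retrying. A self-contained sketch:

import json

def parse_embedded_json(script):
    try:
        return json.loads(script)
    except ValueError:
        mapping = dict.fromkeys(range(32))  # drop control chars 0-31
        sanitized = script.translate(mapping).replace("\\", "\\\\")
        return json.loads(sanitized)

# a raw newline inside a JSON string is rejected on the first attempt,
# then removed by the translate() fallback
print(parse_embedded_json('{"a": "line\nbreak"}'))  # {'a': 'linebreak'}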
diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py
index 204562e..305de2a 100644
--- a/gallery_dl/extractor/reddit.py
+++ b/gallery_dl/extractor/reddit.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2022 Mike Fährmann
+# Copyright 2017-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -29,7 +29,14 @@ class RedditExtractor(Extractor):
parentdir = self.config("parent-directory")
max_depth = self.config("recursion", 0)
+
videos = self.config("videos", True)
+ if videos:
+ if videos == "ytdl":
+ self._extract_video = self._extract_video_ytdl
+ elif videos == "dash":
+ self._extract_video = self._extract_video_dash
+ videos = True
submissions = self.submissions()
visited = set()
@@ -62,19 +69,8 @@ class RedditExtractor(Extractor):
elif submission["is_video"]:
if videos:
text.nameext_from_url(url, submission)
- if videos == "ytdl":
- url = "https://www.reddit.com" + \
- submission["permalink"]
- else:
- submission["_ytdl_extra"] = {
- "title": submission["title"],
- }
- try:
- url = (submission["secure_media"]
- ["reddit_video"]["dash_url"])
- except (KeyError, TypeError):
- pass
- yield Message.Url, "ytdl:" + url, submission
+ url = "ytdl:" + self._extract_video(submission)
+ yield Message.Url, url, submission
elif not submission["is_self"]:
urls.append((url, submission))
@@ -145,6 +141,21 @@ class RedditExtractor(Extractor):
submission["id"], item["media_id"])
self.log.debug(src)
+ def _extract_video_ytdl(self, submission):
+ return "https://www.reddit.com" + submission["permalink"]
+
+ def _extract_video_dash(self, submission):
+ submission["_ytdl_extra"] = {"title": submission["title"]}
+ try:
+ return (submission["secure_media"]["reddit_video"]["dash_url"] +
+ "#__youtubedl_smuggle=%7B%22to_generic%22%3A+1%7D")
+ except Exception:
+ return submission["url"]
+
+ def _extract_video(self, submission):
+ submission["_ytdl_extra"] = {"title": submission["title"]}
+ return submission["url"]
+
class RedditSubredditExtractor(RedditExtractor):
"""Extractor for URLs from subreddits on reddit.com"""
@@ -233,6 +244,25 @@ class RedditSubmissionExtractor(RedditExtractor):
"content": "1e7dde4ee7d5f4c4b45749abfd15b2dbfa27df3f",
"count": 3,
}),
+ # video
+ ("https://www.reddit.com/r/aww/comments/90bu6w/", {
+ "pattern": r"ytdl:https://v.redd.it/gyh95hiqc0b11",
+ "count": 1,
+ }),
+ # video (ytdl)
+ ("https://www.reddit.com/r/aww/comments/90bu6w/", {
+ "options": (("videos", "ytdl"),),
+ "pattern": r"ytdl:https://www.reddit.com/r/aww/comments/90bu6w"
+ r"/heat_index_was_110_degrees_so_we_offered_him_a/",
+ "count": 1,
+ }),
+ # video (dash)
+ ("https://www.reddit.com/r/aww/comments/90bu6w/", {
+ "options": (("videos", "dash"),),
+ "pattern": r"ytdl:https://v.redd.it/gyh95hiqc0b11"
+ r"/DASHPlaylist.mpd\?a=",
+ "count": 1,
+ }),
# deleted gallery (#953)
("https://www.reddit.com/gallery/icfgzv", {
"count": 0,
diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py
index ad4282c..eaaef7d 100644
--- a/gallery_dl/extractor/redgifs.py
+++ b/gallery_dl/extractor/redgifs.py
@@ -23,6 +23,7 @@ class RedgifsExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.key = match.group(1)
+ self.api = RedgifsAPI(self)
formats = self.config("format")
if formats is None:
@@ -69,30 +70,89 @@ class RedgifsUserExtractor(RedgifsExtractor):
"""Extractor for redgifs user profiles"""
subcategory = "user"
directory_fmt = ("{category}", "{userName}")
- pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/users/([^/?#]+)"
- test = ("https://www.redgifs.com/users/Natalifiction", {
- "pattern": r"https://\w+\.redgifs\.com/[A-Za-z]+\.mp4",
- "count": ">= 100",
- })
+ pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/users/([^/?#]+)/?$"
+ test = (
+ ("https://www.redgifs.com/users/Natalifiction", {
+ "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4",
+ "count": ">= 100",
+ }),
+ ("https://v3.redgifs.com/users/lamsinka89", {
+ "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.(mp4|jpg)",
+ "count": ">= 100",
+ }),
+ )
def metadata(self):
return {"userName": self.key}
def gifs(self):
- return RedgifsAPI(self).user(self.key)
+ return self.api.user(self.key)
+
+
+class RedgifsCollectionExtractor(RedgifsExtractor):
+ """Extractor for an individual user collection"""
+ subcategory = "collection"
+ directory_fmt = ("{category}", "{userName}", "{folderName}")
+ archive_fmt = "{folderId}_{id}"
+ pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/users"
+ r"/([^/?#]+)/collections/([^/?#]+)")
+ test = (
+ ("https://www.redgifs.com/users/boombah123/collections/2631326bbd", {
+ "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4",
+ "range": "1-20",
+ "count": 20,
+ }),
+ ("https://www.redgifs.com/users/boombah123/collections/9e6f7dd41f", {
+ "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4",
+ "range": "1-20",
+ "count": 20,
+ }),
+ )
+
+ def __init__(self, match):
+ RedgifsExtractor.__init__(self, match)
+ self.collection_id = match.group(2)
+
+ def metadata(self):
+ data = {"userName": self.key}
+ data.update(self.api.collection_info(self.key, self.collection_id))
+ return data
+
+ def gifs(self):
+ return self.api.collection(self.key, self.collection_id)
+
+
+class RedgifsCollectionsExtractor(RedgifsExtractor):
+ """Extractor for redgifs user collections"""
+ subcategory = "collections"
+ pattern = (r"(?:https?://)?(?:www\.)?redgifs\.com/users"
+ r"/([^/?#]+)/collections/?$")
+ test = ("https://www.redgifs.com/users/boombah123/collections", {
+ "pattern": (r"https://www\.redgifs\.com/users"
+ r"/boombah123/collections/\w+"),
+ "count": ">= 3",
+ })
+
+ def items(self):
+ for collection in self.api.collections(self.key):
+ url = "{}/users/{}/collections/{}".format(
+ self.root, self.key, collection["folderId"])
+ collection["_extractor"] = RedgifsCollectionExtractor
+ yield Message.Queue, url, collection
class RedgifsSearchExtractor(RedgifsExtractor):
"""Extractor for redgifs search results"""
subcategory = "search"
directory_fmt = ("{category}", "Search", "{search}")
- pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/browse/?\?([^#]+)"
+ pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/browse/?\?([^#]+)"
test = (
("https://www.redgifs.com/browse?tags=JAV", {
"pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)",
"range": "1-10",
"count": 10,
}),
+ ("https://v3.redgifs.com/browse?tags=JAV"),
("https://www.redgifs.com/browse?type=i&verified=y&order=top7"),
)
@@ -102,14 +162,14 @@ class RedgifsSearchExtractor(RedgifsExtractor):
return {"search": search}
def gifs(self):
- return RedgifsAPI(self).search(self.params)
+ return self.api.search(self.params)
class RedgifsImageExtractor(RedgifsExtractor):
"""Extractor for individual gifs from redgifs.com"""
subcategory = "image"
pattern = (r"(?:https?://)?(?:"
- r"(?:www\.)?redgifs\.com/(?:watch|ifr)|"
+ r"(?:\w+\.)?redgifs\.com/(?:watch|ifr)|"
r"(?:www\.)?gifdeliverynetwork\.com|"
r"i\.redgifs\.com/i)/([A-Za-z]+)")
test = (
@@ -121,13 +181,16 @@ class RedgifsImageExtractor(RedgifsExtractor):
("https://redgifs.com/ifr/FoolishForkedAbyssiniancat"),
("https://i.redgifs.com/i/FoolishForkedAbyssiniancat"),
("https://www.gifdeliverynetwork.com/foolishforkedabyssiniancat"),
+ ("https://v3.redgifs.com/watch/FoolishForkedAbyssiniancat"),
)
def gifs(self):
- return (RedgifsAPI(self).gif(self.key),)
+ return (self.api.gif(self.key),)
class RedgifsAPI():
+ """https://api.redgifs.com/docs/index.html"""
+
API_ROOT = "https://api.redgifs.com"
def __init__(self, extractor):
@@ -149,6 +212,19 @@ class RedgifsAPI():
params = {"order": order}
return self._pagination(endpoint, params)
+ def collection(self, user, collection_id):
+ endpoint = "/v2/users/{}/collections/{}/gifs".format(
+ user, collection_id)
+ return self._pagination(endpoint)
+
+ def collection_info(self, user, collection_id):
+ endpoint = "/v2/users/{}/collections/{}".format(user, collection_id)
+ return self._call(endpoint)
+
+ def collections(self, user):
+ endpoint = "/v2/users/{}/collections".format(user)
+ return self._pagination(endpoint, key="collections")
+
def search(self, params):
endpoint = "/v2/gifs/search"
params["search_text"] = params.pop("tags", None)
@@ -161,12 +237,14 @@ class RedgifsAPI():
return self.extractor.request(
url, params=params, headers=self.headers).json()
- def _pagination(self, endpoint, params):
+ def _pagination(self, endpoint, params=None, key="gifs"):
+ if params is None:
+ params = {}
params["page"] = 1
while True:
data = self._call(endpoint, params)
- yield from data["gifs"]
+ yield from data[key]
if params["page"] >= data["pages"]:
return
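
The _pagination() change generalizes the response key so the same loop serves both gif lists and the new collection listings. A sketch of that page-based loop with a stubbed request function:

def paginate(fetch, endpoint, params=None, key="gifs"):
    if params is None:
        params = {}
    params["page"] = 1
    while True:
        data = fetch(endpoint, params)
        yield from data[key]
        if params["page"] >= data["pages"]:
            return
        params["page"] += 1

pages = {1: {"gifs": ["a", "b"], "pages": 2}, 2: {"gifs": ["c"], "pages": 2}}
fake_fetch = lambda endpoint, params: pages[params["page"]]
print(list(paginate(fake_fetch, "/v2/gifs/search")))  # ['a', 'b', 'c']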
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index f2bf3cb..278ad14 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -63,6 +63,10 @@ BASE_PATTERN = ShopifyExtractor.update({
"root": "https://modcloth.com",
"pattern": r"modcloth\.com",
},
+ "ohpolly": {
+ "root": "https://www.ohpolly.com",
+ "pattern": r"(?:www\.)?ohpolly\.com",
+ },
"omgmiamiswimwear": {
"root": "https://www.omgmiamiswimwear.com",
"pattern": r"(?:www\.)?omgmiamiswimwear\.com",
@@ -102,6 +106,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
("https://loungeunderwear.com/collections/apparel"),
("https://michaels.com.au/collections/microphones"),
("https://modcloth.com/collections/shoes"),
+ ("https://www.ohpolly.com/collections/dresses-mini-dresses"),
("https://www.omgmiamiswimwear.com/collections/fajas"),
("https://pinupgirlclothing.com/collections/evening"),
("https://www.raidlondon.com/collections/flats"),
@@ -141,6 +146,8 @@ class ShopifyProductExtractor(ShopifyExtractor):
("https://michaels.com.au/collections/audio/products"
"/boya-by-wm4-pro-k5-2-4ghz-mic-android-1-1-101281"),
("https://modcloth.com/collections/shoes/products/heidii-brn"),
+ (("https://www.ohpolly.com/products/edonia-ruched-triangle-cup"
+ "-a-line-mini-dress-brown")),
("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", {
"pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/",
"count": 5,
diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py
index 506db26..bea457f 100644
--- a/gallery_dl/extractor/slideshare.py
+++ b/gallery_dl/extractor/slideshare.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2022 Mike Fährmann, Leonardo Taccari
+# Copyright 2016-2023 Mike Fährmann, Leonardo Taccari
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://www.slideshare.net/"""
from .common import GalleryExtractor
-from .. import text
-import json
+from .. import text, util
class SlidesharePresentationExtractor(GalleryExtractor):
@@ -97,7 +96,7 @@ class SlidesharePresentationExtractor(GalleryExtractor):
@staticmethod
def images(page):
- data = json.loads(text.extract(
+ data = util.json_loads(text.extract(
page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0])
# using 'stripped_title' here is technically wrong, but it works all
diff --git a/gallery_dl/extractor/soundgasm.py b/gallery_dl/extractor/soundgasm.py
index 1afb92c..236f94f 100644
--- a/gallery_dl/extractor/soundgasm.py
+++ b/gallery_dl/extractor/soundgasm.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2022 Mike Fährmann
+# Copyright 2022-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -11,17 +11,46 @@
from .common import Extractor, Message
from .. import text
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?soundgasm\.net/u(?:ser)?"
-class SoundgasmAudioExtractor(Extractor):
- """Extractor for audio clips from soundgasm.net"""
+
+class SoundgasmExtractor(Extractor):
+ """Base class for soundgasm extractors"""
category = "soundgasm"
- subcategory = "audio"
root = "https://soundgasm.net"
+ request_interval = (0.5, 1.5)
directory_fmt = ("{category}", "{user}")
filename_fmt = "{title}.{extension}"
archive_fmt = "{user}_{slug}"
- pattern = (r"(?:https?://)?(?:www\.)?soundgasm\.net"
- r"/u(?:ser)?/([^/?#]+)/([^/?#]+)")
+
+ def items(self):
+ for sound in map(self._extract_sound, self.sounds()):
+ url = sound["url"]
+ yield Message.Directory, sound
+ yield Message.Url, url, text.nameext_from_url(url, sound)
+
+ def _extract_sound(self, url):
+ extr = text.extract_from(self.request(url).text)
+
+ _, user, slug = url.rstrip("/").rsplit("/", 2)
+ data = {
+ "user" : user,
+ "slug" : slug,
+ "title": text.unescape(extr('aria-label="title">', "<")),
+ "description": text.unescape(text.remove_html(extr(
+ 'class="jp-description">', '</div>'))),
+ }
+
+ formats = extr('"setMedia", {', '}')
+ data["url"] = text.extr(formats, ': "', '"')
+
+ return data
+
+
+class SoundgasmAudioExtractor(SoundgasmExtractor):
+ """Extractor for audio clips from soundgasm.net"""
+ subcategory = "audio"
+ pattern = BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)"
test = (
(("https://soundgasm.net/u/ClassWarAndPuppies2"
"/687-Otto-von-Toontown-12822"), {
@@ -47,47 +76,39 @@ class SoundgasmAudioExtractor(Extractor):
)
def __init__(self, match):
- Extractor.__init__(self, match)
+ SoundgasmExtractor.__init__(self, match)
self.user, self.slug = match.groups()
- def items(self):
- url = "{}/u/{}/{}".format(self.root, self.user, self.slug)
- extr = text.extract_from(self.request(url).text)
+ def sounds(self):
+ return ("{}/u/{}/{}".format(self.root, self.user, self.slug),)
- data = {
- "user" : self.user,
- "slug" : self.slug,
- "title": text.unescape(extr('aria-label="title">', "<")),
- "description": text.unescape(text.remove_html(extr(
- 'class="jp-description">', '</div>'))),
- }
-
- formats = extr('"setMedia", {', '}')
- url = text.extr(formats, ': "', '"')
-
- yield Message.Directory, data
- yield Message.Url, url, text.nameext_from_url(url, data)
-
-class SoundgasmUserExtractor(Extractor):
+class SoundgasmUserExtractor(SoundgasmExtractor):
"""Extractor for all sounds from a soundgasm user"""
- category = "soundgasm"
subcategory = "user"
- root = "https://soundgasm.net"
- pattern = (r"(?:https?://)?(?:www\.)?soundgasm\.net"
- r"/u(?:ser)?/([^/?#]+)/?$")
+ pattern = BASE_PATTERN + r"/([^/?#]+)/?$"
test = ("https://soundgasm.net/u/fierce-aphrodite", {
- "pattern": SoundgasmAudioExtractor.pattern,
+ "pattern": r"https://media\.soundgasm\.net/sounds/[0-9a-f]{40}\.m4a",
"count" : ">= 15",
+ "keyword": {
+ "description": str,
+ "extension": "m4a",
+ "filename": "re:^[0-9a-f]{40}$",
+ "slug": str,
+ "title": str,
+ "url": str,
+ "user": "fierce-aphrodite"
+ },
})
def __init__(self, match):
- Extractor.__init__(self, match)
+ SoundgasmExtractor.__init__(self, match)
self.user = match.group(1)
- def items(self):
+ def sounds(self):
page = self.request(self.root + "/user/" + self.user).text
- data = {"_extractor": SoundgasmAudioExtractor}
- for sound in text.extract_iter(
- page, 'class="sound-details">', "</a>"):
- yield Message.Queue, text.extr(sound, '<a href="', '"'), data
+ return [
+ text.extr(sound, '<a href="', '"')
+ for sound in text.extract_iter(
+ page, 'class="sound-details">', "</a>")
+ ]
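
The refactor turns SoundgasmExtractor into a template-method base class: subclasses only supply sounds() (an iterable of clip URLs) and the shared items()/_extract_sound() pair does the rest. Schematically, stripped of the HTTP and metadata details above:

class Base:
    def items(self):
        for url in self.sounds():
            yield self._extract_sound(url)

    def _extract_sound(self, url):
        return {"url": url, "slug": url.rsplit("/", 1)[-1]}

class Audio(Base):
    def __init__(self, user, slug):
        self.user, self.slug = user, slug

    def sounds(self):
        return ["https://soundgasm.net/u/{}/{}".format(self.user, self.slug)]

print(list(Audio("alice", "demo").items()))
# [{'url': 'https://soundgasm.net/u/alice/demo', 'slug': 'demo'}]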
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index ea39c5e..4de7e9b 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2022 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,9 +9,8 @@
"""Extractors for https://www.subscribestar.com/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import cache
-import json
BASE_PATTERN = r"(?:https?://)?(?:www\.)?subscribestar\.(com|adult)"
@@ -92,7 +91,7 @@ class SubscribestarExtractor(Extractor):
gallery = text.extr(html, 'data-gallery="', '"')
if gallery:
media.extend(
- item for item in json.loads(text.unescape(gallery))
+ item for item in util.json_loads(text.unescape(gallery))
if "/previews/" not in item["url"]
)
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
new file mode 100644
index 0000000..4b15b14
--- /dev/null
+++ b/gallery_dl/extractor/szurubooru.py
@@ -0,0 +1,144 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for szurubooru instances"""
+
+from . import booru
+from .. import text
+
+import collections
+import binascii
+
+
+class SzurubooruExtractor(booru.BooruExtractor):
+ basecategory = "szurubooru"
+ filename_fmt = "{id}_{version}_{checksumMD5}.{extension}"
+ per_page = 100
+
+ def __init__(self, match):
+ booru.BooruExtractor.__init__(self, match)
+ self.headers = {
+ "Accept": "application/json",
+ "Content-Type": "application/json",
+ }
+
+ username = self.config("username")
+ if username:
+ token = self.config("token")
+ if token:
+ value = username + ":" + token
+ self.headers["Authorization"] = "Token " + \
+ binascii.b2a_base64(value.encode())[:-1].decode()
+
+ def _api_request(self, endpoint, params=None):
+ url = self.root + "/api" + endpoint
+ return self.request(url, headers=self.headers, params=params).json()
+
+ def _pagination(self, endpoint, params):
+ params["offset"] = 0
+ params["limit"] = self.per_page
+
+ while True:
+ data = self._api_request(endpoint, params)
+ results = data["results"]
+
+ yield from results
+
+ if len(results) < self.per_page:
+ return
+ params["offset"] += len(results)
+
+ def _file_url(self, post):
+ url = post["contentUrl"]
+ if not url.startswith("http"):
+ url = self.root + "/" + url
+ return url
+
+ @staticmethod
+ def _prepare(post):
+ post["date"] = text.parse_datetime(
+ post["creationTime"], "%Y-%m-%dT%H:%M:%S.%fZ")
+
+ tags = []
+ append = tags.append
+ tags_categories = collections.defaultdict(list)
+
+ for tag in post["tags"]:
+ tag_type = tag["category"].rpartition("_")[2]
+ tag_name = tag["names"][0]
+ tags_categories[tag_type].append(tag_name)
+ append(tag_name)
+
+ post["tags"] = tags
+ for category, tags in tags_categories.items():
+ post["tags_" + category] = tags
+
+
+BASE_PATTERN = SzurubooruExtractor.update({
+ "foalcon": {
+ "root": "https://booru.foalcon.com",
+ "pattern": r"booru\.foalcon\.com",
+ },
+ "bcbnsfw": {
+ "root": "https://booru.bcbnsfw.space",
+ "pattern": r"booru\.bcbnsfw\.space",
+ },
+})
+
+
+class SzurubooruTagExtractor(SzurubooruExtractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}_{version}"
+ pattern = BASE_PATTERN + r"/posts/query=([^/?#]+)"
+ test = (
+ ("https://booru.foalcon.com/posts/query=simple_background", {
+ "pattern": r"https://booru\.foalcon\.com/data/posts"
+ r"/\d+_[0-9a-f]{16}\.\w+",
+ "range": "1-150",
+ "count": 150,
+ }),
+ ("https://booru.bcbnsfw.space/posts/query=simple_background"),
+ )
+
+ def __init__(self, match):
+ SzurubooruExtractor.__init__(self, match)
+ query = match.group(match.lastindex)
+ self.query = text.unquote(query.replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.query}
+
+ def posts(self):
+ return self._pagination("/posts/", {"query": self.query})
+
+
+class SzurubooruPostExtractor(SzurubooruExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}_{version}"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
+ test = (
+ ("https://booru.foalcon.com/post/30092", {
+ "pattern": r"https://booru\.foalcon\.com/data/posts"
+ r"/30092_b7d56e941888b624\.png",
+ "url": "dad4d4c67d87cd9a4ac429b3414747c27a95d5cb",
+ "content": "86d1514c0ca8197950cc4b74e7a59b2dc76ebf9c",
+ }),
+ ("https://booru.bcbnsfw.space/post/1599", {
+ "pattern": r"https://booru\.bcbnsfw\.space/data/posts"
+ r"/1599_53784518e92086bd\.png",
+ "content": "0c38fc612ba1f03950fad31c4f80a1fccdab1096",
+ }),
+ )
+
+ def __init__(self, match):
+ SzurubooruExtractor.__init__(self, match)
+ self.post_id = match.group(match.lastindex)
+
+ def posts(self):
+ return (self._api_request("/post/" + self.post_id),)
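
The new extractor's optional authentication builds a szurubooru "Token" header from base64("username:token"); binascii.b2a_base64 appends a trailing newline, hence the [:-1] slice in __init__ above:

import binascii

def token_header(username, token):
    value = (username + ":" + token).encode()
    return "Token " + binascii.b2a_base64(value)[:-1].decode()

print(token_header("alice", "secret"))  # Token YWxpY2U6c2VjcmV0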
diff --git a/gallery_dl/extractor/telegraph.py b/gallery_dl/extractor/telegraph.py
index 5996268..116f3af 100644
--- a/gallery_dl/extractor/telegraph.py
+++ b/gallery_dl/extractor/telegraph.py
@@ -68,6 +68,21 @@ class TelegraphGalleryExtractor(GalleryExtractor):
"title": "Всё о друзьях моей сестрицы",
},
}),
+ ("https://telegra.ph/Disharmonica---Saber-Nero-02-21", {
+ "pattern": r"https://telegra\.ph/file/[0-9a-f]+\.(jpg|png)",
+ "keyword": {
+ "author": "cosmos",
+ "caption": "",
+ "count": 89,
+ "date": "dt:2022-02-21 05:57:39",
+ "description": "",
+ "num_formatted": r"re:^\d{2}$",
+ "post_url": "https://telegra.ph"
+ "/Disharmonica---Saber-Nero-02-21",
+ "slug": "Disharmonica---Saber-Nero-02-21",
+ "title": "Disharmonica - Saber Nero",
+ },
+ }),
)
def metadata(self, page):
@@ -89,7 +104,8 @@ class TelegraphGalleryExtractor(GalleryExtractor):
return data
def images(self, page):
- figures = tuple(text.extract_iter(page, "<figure>", "</figure>"))
+ figures = (tuple(text.extract_iter(page, "<figure>", "</figure>")) or
+ tuple(text.extract_iter(page, "<img", ">")))
num_zeroes = len(str(len(figures)))
num = 0
@@ -105,7 +121,7 @@ class TelegraphGalleryExtractor(GalleryExtractor):
result.append((url, {
"url" : url,
- "caption" : text.unescape(caption),
+ "caption" : text.unescape(caption) if caption else "",
"num" : num,
"num_formatted": str(num).zfill(num_zeroes),
}))
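
The images() fallback relies on an empty tuple being falsy: pages without <figure> blocks are rescanned for bare <img> tags. A standalone approximation, with re standing in for text.extract_iter:

import re

def extract_figures(page):
    return (tuple(re.findall(r"<figure>(.*?)</figure>", page, re.S)) or
            tuple(re.findall(r"<img[^>]*>", page)))

print(extract_figures('<img src="/file/a.jpg">'))  # ('<img src="/file/a.jpg">',)
print(extract_figures("<figure>x</figure>"))       # ('x',)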
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index c75952a..155db1e 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -286,7 +286,11 @@ class TumblrUserExtractor(TumblrExtractor):
"count": 3,
"options": (("posts", "all"), ("external", True))
}),
- ("https://mikf123-hidden.tumblr.com/", { # dashbord-only
+ ("https://mikf123-hidden.tumblr.com/", { # dashboard-only
+ "options": (("access-token", None),),
+ "exception": exception.AuthorizationError,
+ }),
+ ("https://mikf123-hidden.tumblr.com/", { # dashboard-only
"count": 2,
"keyword": {"tags": ["test", "hidden"]},
}),
@@ -498,12 +502,24 @@ class TumblrAPI(oauth.OAuth1API):
if 200 <= status < 400:
return data["response"]
+ self.log.debug(data)
if status == 403:
raise exception.AuthorizationError()
+
elif status == 404:
+ try:
+ error = data["errors"][0]["detail"]
+ board = ("only viewable within the Tumblr dashboard" in error)
+ except Exception:
+ board = False
+
+ if board:
+ self.log.info("Run 'gallery-dl oauth:tumblr' "
+ "to access dashboard-only blogs")
+ raise exception.AuthorizationError(error)
raise exception.NotFoundError("user or post")
- elif status == 429:
+ elif status == 429:
# daily rate limit
if response.headers.get("x-ratelimit-perday-remaining") == "0":
self.log.info("Daily API rate limit exceeded")
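
The new 404 branch inspects the API error detail so dashboard-only blogs get an actionable hint instead of a generic "not found". A sketch of just that classification, with the response reduced to the fields it reads:

def classify_404(data):
    try:
        error = data["errors"][0]["detail"]
        board = "only viewable within the Tumblr dashboard" in error
    except Exception:
        board = False
    if board:
        return "authorization required; run 'gallery-dl oauth:tumblr'"
    return "user or post not found"

print(classify_404({"errors": [{"detail":
    "This Tumblr is only viewable within the Tumblr dashboard"}]}))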
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 17a2202..29b4ac3 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -41,6 +41,10 @@ class TwitterExtractor(Extractor):
self.cards = self.config("cards", False)
self.cards_blacklist = self.config("cards-blacklist")
self.syndication = self.config("syndication")
+
+ if not self.config("transform", True):
+ self._transform_user = util.identity
+ self._transform_tweet = util.identity
self._user = self._user_obj = None
self._user_cache = {}
self._init_sizes()
@@ -212,7 +216,7 @@ class TwitterExtractor(Extractor):
files.append(value)
return
elif name == "unified_card":
- data = json.loads(bvals["unified_card"]["string_value"])
+ data = util.json_loads(bvals["unified_card"]["string_value"])
self._extract_media(tweet, data["media_entities"].values(), files)
return
@@ -1436,6 +1440,8 @@ class TwitterAPI():
if "retweeted_status_result" in legacy:
retweet = legacy["retweeted_status_result"]["result"]
+ if "tweet" in retweet:
+ retweet = retweet["tweet"]
if original_retweets:
try:
retweet["legacy"]["retweeted_status_id_str"] = \
diff --git a/gallery_dl/extractor/vsco.py b/gallery_dl/extractor/vsco.py
index 00389fa..053a799 100644
--- a/gallery_dl/extractor/vsco.py
+++ b/gallery_dl/extractor/vsco.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,8 +9,7 @@
"""Extractors for https://vsco.co/"""
from .common import Extractor, Message
-from .. import text
-import json
+from .. import text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.)?vsco\.co/([^/]+)"
@@ -69,7 +68,7 @@ class VscoExtractor(Extractor):
def _extract_preload_state(self, url):
page = self.request(url, notfound=self.subcategory).text
- return json.loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
+ return util.json_loads(text.extr(page, "__PRELOADED_STATE__ = ", "<"))
def _pagination(self, url, params, token, key, extra=None):
headers = {
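
vsco's __PRELOADED_STATE__ parse is one instance of a pattern this commit touches repeatedly (also in the slideshare, weibo, wikifeet, xhamster, and xvideos hunks): slice JSON out of an inline <script> and parse it, now via util.json_loads. Reduced to the stdlib:

import json

def extract_preload_state(page):
    marker = "__PRELOADED_STATE__ = "
    start = page.index(marker) + len(marker)
    end = page.index("<", start)  # stops at the closing </script>
    return json.loads(page[start:end])

page = '<script>window.__PRELOADED_STATE__ = {"user": {"id": 1}}</script>'
print(extract_preload_state(page))  # {'user': {'id': 1}}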
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index ab05c48..68bd136 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -9,10 +9,9 @@
"""Extractors for https://www.weibo.com/"""
from .common import Extractor, Message
-from .. import text, exception
+from .. import text, util, exception
from ..cache import cache
import random
-import json
BASE_PATTERN = r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)"
USER_PATTERN = BASE_PATTERN + r"/(?:(u|n|p(?:rofile)?)/)?([^/?#]+)(?:/home)?"
@@ -179,7 +178,7 @@ class WeiboExtractor(Extractor):
page = Extractor.request(
self, passport_url, method="POST", headers=headers, data=data).text
- data = json.loads(text.extr(page, "(", ");"))["data"]
+ data = util.json_loads(text.extr(page, "(", ");"))["data"]
passport_url = "https://passport.weibo.com/visitor/visitor"
params = {
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
index 70e9646..662e08b 100644
--- a/gallery_dl/extractor/wikifeet.py
+++ b/gallery_dl/extractor/wikifeet.py
@@ -7,8 +7,7 @@
"""Extractors for https://www.wikifeet.com/"""
from .common import GalleryExtractor
-from .. import text
-import json
+from .. import text, util
class WikifeetGalleryExtractor(GalleryExtractor):
@@ -114,5 +113,5 @@ class WikifeetGalleryExtractor(GalleryExtractor):
"height": data["ph"],
"tags" : [tagmap[tag] for tag in data["tags"]],
})
- for data in json.loads(text.extr(page, "['gdata'] = ", ";"))
+ for data in util.json_loads(text.extr(page, "['gdata'] = ", ";"))
]
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index 0125739..b308e74 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -9,9 +9,7 @@
"""Extractors for https://xhamster.com/"""
from .common import Extractor, Message
-from .. import text
-import json
-
+from .. import text, util
BASE_PATTERN = (r"(?:https?://)?((?:[\w-]+\.)?xhamster"
r"(?:\d?\.(?:com|one|desi)|\.porncache\.net))")
@@ -144,7 +142,7 @@ class XhamsterGalleryExtractor(XhamsterExtractor):
def _data(self, url):
page = self.request(url).text
- return json.loads(text.extr(
+ return util.json_loads(text.extr(
page, "window.initials=", "</script>").rstrip("\n\r;"))
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 10de439..46ea074 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -9,8 +9,7 @@
"""Extractors for https://www.xvideos.com/"""
from .common import GalleryExtractor, Extractor, Message
-from .. import text
-import json
+from .. import text, util
class XvideosBase():
@@ -113,7 +112,7 @@ class XvideosUserExtractor(XvideosBase, Extractor):
def items(self):
url = "{}/profiles/{}".format(self.root, self.user)
page = self.request(url, notfound=self.subcategory).text
- data = json.loads(text.extr(
+ data = util.json_loads(text.extr(
page, "xv.conf=", ";</script>"))["data"]
if not isinstance(data["galleries"], dict):