path: root/gallery_dl/extractor
author     Unit 193 <unit193@unit193.net>  2024-09-07 18:33:19 -0400
committer  Unit 193 <unit193@unit193.net>  2024-09-07 18:33:19 -0400
commit     1f3ffe32342852fd9ea9e7704022488f3a1222bd (patch)
tree       cb255a091b73e96840de0f6f44b36dff1acab4b9 /gallery_dl/extractor
parent     b5e56c51e491b41f9eb6a895459c185788a377e5 (diff)

New upstream version 1.27.4 (upstream/1.27.4)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/batoto.py        27
-rw-r--r--  gallery_dl/extractor/bunkr.py         50
-rw-r--r--  gallery_dl/extractor/cyberdrop.py     14
-rw-r--r--  gallery_dl/extractor/deviantart.py    11
-rw-r--r--  gallery_dl/extractor/e621.py          24
-rw-r--r--  gallery_dl/extractor/exhentai.py       2
-rw-r--r--  gallery_dl/extractor/flickr.py        38
-rw-r--r--  gallery_dl/extractor/furaffinity.py    5
-rw-r--r--  gallery_dl/extractor/generic.py        8
-rw-r--r--  gallery_dl/extractor/gofile.py         3
-rw-r--r--  gallery_dl/extractor/hitomi.py         1
-rw-r--r--  gallery_dl/extractor/instagram.py     29
-rw-r--r--  gallery_dl/extractor/koharu.py        25
-rw-r--r--  gallery_dl/extractor/lolisafe.py       2
-rw-r--r--  gallery_dl/extractor/newgrounds.py    10
-rw-r--r--  gallery_dl/extractor/pixiv.py         90
-rw-r--r--  gallery_dl/extractor/sankaku.py        5
-rw-r--r--  gallery_dl/extractor/sexcom.py        19
-rw-r--r--  gallery_dl/extractor/szurubooru.py     8
-rw-r--r--  gallery_dl/extractor/toyhouse.py       3
-rw-r--r--  gallery_dl/extractor/tumblr.py         3
-rw-r--r--  gallery_dl/extractor/twitter.py       29
-rw-r--r--  gallery_dl/extractor/wikimedia.py    124
-rw-r--r--  gallery_dl/extractor/ytdl.py          17
24 files changed, 376 insertions, 171 deletions
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
index 2adb142..786acd9 100644
--- a/gallery_dl/extractor/batoto.py
+++ b/gallery_dl/extractor/batoto.py
@@ -51,28 +51,29 @@ class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
if not manga:
manga = extr('link-hover">', "<")
info = text.remove_html(extr('link-hover">', "</"))
+ info = text.unescape(info)
match = re.match(
- r"(?:Volume\s+(\d+) )?"
- r"\w+\s+(\d+)(.*)", info)
+ r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
+ r"(?:Chapter|Episode)\s*(\d+)([\w.]*)", info)
if match:
volume, chapter, minor = match.groups()
- title = text.remove_html(extr(
- "selected>", "</option")).partition(" : ")[2]
else:
volume = chapter = 0
minor = ""
- title = info
return {
- "manga" : text.unescape(manga),
- "manga_id" : text.parse_int(manga_id),
- "title" : text.unescape(title),
- "volume" : text.parse_int(volume),
- "chapter" : text.parse_int(chapter),
- "chapter_minor": minor,
- "chapter_id" : text.parse_int(self.chapter_id),
- "date" : text.parse_timestamp(extr(' time="', '"')[:-3]),
+ "manga" : text.unescape(manga),
+ "manga_id" : text.parse_int(manga_id),
+ "chapter_url" : extr(self.chapter_id + "-ch_", '"'),
+ "title" : text.unescape(text.remove_html(extr(
+ "selected>", "</option")).partition(" : ")[2]),
+ "volume" : text.parse_int(volume),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor" : minor,
+ "chapter_string": info,
+ "chapter_id" : text.parse_int(self.chapter_id),
+ "date" : text.parse_timestamp(extr(' time="', '"')[:-3]),
}
def images(self, page):
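Note: the revised info regex is worth seeing in isolation. It now matches "Volume"/"Season"/"S" prefixes and "Chapter"/"Episode" keywords case-insensitively, and captures a minor suffix such as ".5". A minimal sketch using only the regex from this diff; the sample info strings are assumed:

import re

pattern = (r"(?i)(?:(?:Volume|S(?:eason)?)\s*(\d+)\s+)?"
           r"(?:Chapter|Episode)\s*(\d+)([\w.]*)")

for info in ("Volume 2 Chapter 15.5", "Season 1 Episode 3", "Chapter 7"):
    print(re.match(pattern, info).groups())
# ('2', '15', '.5')
# ('1', '3', '')
# (None, '7', '')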
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 240bbd3..780bdf1 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -6,15 +6,24 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://bunkr.sk/"""
+"""Extractors for https://bunkr.si/"""
from .lolisafe import LolisafeAlbumExtractor
-from .. import text
-
-BASE_PATTERN = (
- r"(?:https?://)?(?:app\.)?(bunkr+"
- r"\.(?:s[kiu]|[cf]i|ru|la|is|to|ac|black|cat|media|red|site|ws|org))"
-)
+from .. import text, config
+
+
+if config.get(("extractor", "bunkr"), "tlds"):
+ BASE_PATTERN = (
+ r"(?:bunkr:(?:https?://)?([^/?#]+)|"
+ r"(?:https?://)?(?:app\.)?(bunkr+\.\w+))"
+ )
+else:
+ BASE_PATTERN = (
+ r"(?:bunkr:(?:https?://)?([^/?#]+)|"
+ r"(?:https?://)?(?:app\.)?(bunkr+"
+ r"\.(?:s[kiu]|[cf]i|ru|la|is|to|a[cx]"
+ r"|black|cat|media|red|site|ws|org)))"
+ )
LEGACY_DOMAINS = {
"bunkr.ru",
@@ -28,15 +37,15 @@ LEGACY_DOMAINS = {
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
- """Extractor for bunkr.sk albums"""
+ """Extractor for bunkr.si albums"""
category = "bunkr"
- root = "https://bunkr.sk"
+ root = "https://bunkr.si"
pattern = BASE_PATTERN + r"/a/([^/?#]+)"
- example = "https://bunkr.sk/a/ID"
+ example = "https://bunkr.si/a/ID"
def __init__(self, match):
LolisafeAlbumExtractor.__init__(self, match)
- domain = match.group(match.lastindex-1)
+ domain = self.groups[0] or self.groups[1]
if domain not in LEGACY_DOMAINS:
self.root = "https://" + domain
@@ -69,11 +78,16 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
def _extract_file(self, url):
page = self.request(url).text
- return (
- text.extr(page, '<source src="', '"') or
- text.extr(page, '<img src="', '"') or
- text.rextract(page, ' href="', '"', page.rindex("Download"))[0]
- )
+ url = (text.extr(page, '<source src="', '"') or
+ text.extr(page, '<img src="', '"'))
+
+ if not url:
+ url_download = text.rextract(
+ page, ' href="', '"', page.rindex("Download"))[0]
+ page = self.request(text.unescape(url_download)).text
+ url = text.unescape(text.rextract(page, ' href="', '"')[0])
+
+ return url
def _validate(self, response):
if response.history and response.url.endswith("/maintenance-vid.mp4"):
@@ -83,11 +97,11 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
class BunkrMediaExtractor(BunkrAlbumExtractor):
- """Extractor for bunkr.sk media links"""
+ """Extractor for bunkr.si media links"""
subcategory = "media"
directory_fmt = ("{category}",)
pattern = BASE_PATTERN + r"(/[vid]/[^/?#]+)"
- example = "https://bunkr.sk/v/FILENAME"
+ example = "https://bunkr.si/v/FILENAME"
def fetch_album(self, album_id):
try:
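Note: BASE_PATTERN is now chosen at import time from the extractor.bunkr.tlds option. With it set, any bunkr.* domain matches; otherwise only the TLD allowlist (now including .ax) does, and the new "bunkr:" prefix forces a match for arbitrary hosts. A sketch using only the two regexes from this diff; the example URLs are assumed:

import re

PATTERN_TLDS = (r"(?:bunkr:(?:https?://)?([^/?#]+)|"
                r"(?:https?://)?(?:app\.)?(bunkr+\.\w+))")
PATTERN_DEFAULT = (r"(?:bunkr:(?:https?://)?([^/?#]+)|"
                   r"(?:https?://)?(?:app\.)?(bunkr+"
                   r"\.(?:s[kiu]|[cf]i|ru|la|is|to|a[cx]"
                   r"|black|cat|media|red|site|ws|org)))")

url = "https://bunkr.example/a/ID"           # hypothetical new TLD
print(bool(re.match(PATTERN_TLDS, url)))     # True  - any bunkr.* domain
print(bool(re.match(PATTERN_DEFAULT, url)))  # False - TLD not allowlisted
print(bool(re.match(PATTERN_DEFAULT,
                    "bunkr:https://other.host/a/ID")))  # True - prefix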
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index d864960..a514696 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -14,6 +14,7 @@ from .. import text
class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
category = "cyberdrop"
root = "https://cyberdrop.me"
+ root_api = "https://api.cyberdrop.me"
pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
example = "https://cyberdrop.me/a/ID"
@@ -55,5 +56,14 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
def _extract_files(self, file_ids):
for file_id in file_ids:
- url = "{}/api/f/{}".format(self.root, file_id)
- yield self.request(url).json()
+ try:
+ url = "{}/api/file/info/{}".format(self.root_api, file_id)
+ file = self.request(url).json()
+ auth = self.request(file["auth_url"]).json()
+ file["url"] = auth["url"]
+ except Exception as exc:
+ self.log.warning("%s (%s: %s)",
+ file_id, exc.__class__.__name__, exc)
+ continue
+
+ yield file
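Note: file resolution now goes through two API calls: file info from api.cyberdrop.me, then the "auth_url" it returns, whose response carries the final download URL. A minimal sketch of that flow with plain requests; the field names come from this diff, everything else (headers, error handling) is assumed:

import requests

def resolve_cyberdrop_file(file_id):
    info = requests.get(
        "https://api.cyberdrop.me/api/file/info/" + file_id).json()
    auth = requests.get(info["auth_url"]).json()
    info["url"] = auth["url"]   # final, authenticated download URL
    return info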
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index f3ea4e7..ea70b58 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -69,11 +69,12 @@ class DeviantartExtractor(Extractor):
self.quality = ",q_{}".format(self.quality)
self.quality_sub = re.compile(r",q_\d+").sub
- if self.original != "image":
- self._update_content = self._update_content_default
- else:
- self._update_content = self._update_content_image
+ if isinstance(self.original, str) and \
+ self.original.lower().startswith("image"):
self.original = True
+ self._update_content = self._update_content_image
+ else:
+ self._update_content = self._update_content_default
journals = self.config("journals", "html")
if journals == "html":
@@ -1462,6 +1463,8 @@ class DeviantartOAuthAPI():
return
if "next_cursor" in data:
+ if not data["next_cursor"]:
+ return
params["offset"] = None
params["cursor"] = data["next_cursor"]
elif data["next_offset"] is not None:
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
index af963bc..553ec22 100644
--- a/gallery_dl/extractor/e621.py
+++ b/gallery_dl/extractor/e621.py
@@ -10,6 +10,7 @@
from .common import Message
from . import danbooru
+from ..cache import memcache
from .. import text, util
@@ -44,16 +45,11 @@ class E621Extractor(danbooru.DanbooruExtractor):
self.root[8:], md5[0:2], md5[2:4], md5, file["ext"])
if notes and post.get("has_notes"):
- url = "{}/notes.json?search[post_id]={}".format(
- self.root, post["id"])
- post["notes"] = self.request(url).json()
+ post["notes"] = self._get_notes(post["id"])
if pools and post["pools"]:
- url = "{}/pools.json?search[id]={}".format(
- self.root, ",".join(map(str, post["pools"])))
- post["pools"] = _pools = self.request(url).json()
- for pool in _pools:
- pool["name"] = pool["name"].replace("_", " ")
+ post["pools"] = self._get_pools(
+ ",".join(map(str, post["pools"])))
post["filename"] = file["md5"]
post["extension"] = file["ext"]
@@ -64,6 +60,18 @@ class E621Extractor(danbooru.DanbooruExtractor):
yield Message.Directory, post
yield Message.Url, file["url"], post
+ def _get_notes(self, id):
+ return self.request(
+ "{}/notes.json?search[post_id]={}".format(self.root, id)).json()
+
+ @memcache(keyarg=1)
+ def _get_pools(self, ids):
+ pools = self.request(
+ "{}/pools.json?search[id]={}".format(self.root, ids)).json()
+ for pool in pools:
+ pool["name"] = pool["name"].replace("_", " ")
+ return pools
+
BASE_PATTERN = E621Extractor.update({
"e621": {
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 1b4f995..01af7a4 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -430,7 +430,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
}
page = self.request(url, cookies=cookies).text
- current = text.extr(page, "<strong>", "</strong>")
+ current = text.extr(page, "<strong>", "</strong>").replace(",", "")
self.log.debug("Image Limits: %s/%s", current, self.limits)
self._remaining = self.limits - text.parse_int(current)
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index c94a110..1b4971c 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -75,11 +75,8 @@ class FlickrImageExtractor(FlickrExtractor):
def items(self):
photo = self.api.photos_getInfo(self.item_id)
- if self.api.exif:
- photo.update(self.api.photos_getExif(self.item_id))
- if self.api.contexts:
- photo.update(self.api.photos_getAllContexts(self.item_id))
+ self.api._extract_metadata(photo)
if photo["media"] == "video" and self.api.videos:
self.api._extract_video(photo)
else:
@@ -135,8 +132,13 @@ class FlickrAlbumExtractor(FlickrExtractor):
def metadata(self):
data = FlickrExtractor.metadata(self)
- data["album"] = self.api.photosets_getInfo(
- self.album_id, self.user["nsid"])
+ try:
+ data["album"] = self.api.photosets_getInfo(
+ self.album_id, self.user["nsid"])
+ except Exception:
+ data["album"] = {}
+ self.log.warning("%s: Unable to retrieve album metadata",
+ self.album_id)
return data
def photos(self):
@@ -407,6 +409,8 @@ class FlickrAPI(oauth.OAuth1API):
self.log.debug("Server response: %s", data)
if data["code"] == 1:
raise exception.NotFoundError(self.extractor.subcategory)
+ elif data["code"] == 2:
+ raise exception.AuthorizationError(msg)
elif data["code"] == 98:
raise exception.AuthenticationError(msg)
elif data["code"] == 99:
@@ -453,10 +457,7 @@ class FlickrAPI(oauth.OAuth1API):
photo["date"] = text.parse_timestamp(photo["dateupload"])
photo["tags"] = photo["tags"].split()
- if self.exif:
- photo.update(self.photos_getExif(photo["id"]))
- if self.contexts:
- photo.update(self.photos_getAllContexts(photo["id"]))
+ self._extract_metadata(photo)
photo["id"] = text.parse_int(photo["id"])
if "owner" in photo:
@@ -512,6 +513,23 @@ class FlickrAPI(oauth.OAuth1API):
photo["width"] = photo["height"] = 0
return photo
+ def _extract_metadata(self, photo):
+ if self.exif:
+ try:
+ photo.update(self.photos_getExif(photo["id"]))
+ except Exception as exc:
+ self.log.warning(
+ "Unable to retrieve 'exif' data for %s (%s: %s)",
+ photo["id"], exc.__class__.__name__, exc)
+
+ if self.contexts:
+ try:
+ photo.update(self.photos_getAllContexts(photo["id"]))
+ except Exception as exc:
+ self.log.warning(
+ "Unable to retrieve 'contexts' data for %s (%s: %s)",
+ photo["id"], exc.__class__.__name__, exc)
+
@staticmethod
def _clean_info(info):
info["title"] = info["title"]["_content"]
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index 3055426..d253582 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -179,6 +179,11 @@ class FuraffinityExtractor(Extractor):
break
self._favorite_id = text.parse_int(extr('data-fav-id="', '"'))
yield post_id
+
+ pos = page.find('type="submit">Next</button>')
+ if pos >= 0:
+ path = text.rextract(page, '<form action="', '"', pos)[0]
+ continue
path = text.extr(page, 'right" href="', '"')
def _pagination_search(self, query):
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 16d4340..a6c1d5a 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -15,7 +15,7 @@ import re
class GenericExtractor(Extractor):
"""Extractor for images in a generic web page."""
category = "generic"
- directory_fmt = ("{category}", "{pageurl}")
+ directory_fmt = ("{category}", "{subcategory}", "{path}")
archive_fmt = "{imageurl}"
# By default, the generic extractor is disabled
@@ -52,7 +52,10 @@ class GenericExtractor(Extractor):
self.scheme = match.group('scheme')
else:
self.scheme = 'https://'
- self.url = self.scheme + self.url
+ self.url = text.ensure_http_scheme(self.url, self.scheme)
+
+ self.subcategory = match.group('domain')
+ self.path = match.group('path')
# Used to resolve relative image urls
self.root = self.scheme + match.group('domain')
@@ -87,6 +90,7 @@ class GenericExtractor(Extractor):
def metadata(self, page):
"""Extract generic webpage metadata, return them in a dict."""
data = {}
+ data['path'] = self.path.replace("/", "")
data['pageurl'] = self.url
data['title'] = text.extr(page, '<title>', "</title>")
data['description'] = text.extr(
diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py
index f0eb4e9..52b4ae6 100644
--- a/gallery_dl/extractor/gofile.py
+++ b/gallery_dl/extractor/gofile.py
@@ -47,8 +47,7 @@ class GofileFolderExtractor(Extractor):
raise exception.AuthorizationError("Password required")
num = 0
- for content_id in folder["childrenIds"]:
- content = contents[content_id]
+ for content in contents.values():
content["folder"] = folder
if content["type"] == "file":
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 9b74700..18df9df 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -89,6 +89,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
path = ext = "webp"
ihash = image["hash"]
idata = text.nameext_from_url(image["name"])
+ idata["extension_original"] = idata["extension"]
if ext:
idata["extension"] = ext
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index c05fe72..422c865 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -12,6 +12,7 @@
from .common import Extractor, Message
from .. import text, util, exception
from ..cache import cache, memcache
+import itertools
import binascii
import json
import re
@@ -57,12 +58,17 @@ class InstagramExtractor(Extractor):
data = self.metadata()
videos = self.config("videos", True)
previews = self.config("previews", False)
+ max_posts = self.config("max-posts")
video_headers = {"User-Agent": "Mozilla/5.0"}
order = self.config("order-files")
reverse = order[0] in ("r", "d") if order else False
- for post in self.posts():
+ posts = self.posts()
+ if max_posts:
+ posts = itertools.islice(posts, max_posts)
+
+ for post in posts:
if "__typename" in post:
post = self._parse_post_graphql(post)
@@ -159,15 +165,19 @@ class InstagramExtractor(Extractor):
if "title" in post:
data["highlight_title"] = post["title"]
if "created_at" in post:
- data["date"] = text.parse_timestamp(post.get("created_at"))
+ data["post_date"] = data["date"] = text.parse_timestamp(
+ post.get("created_at"))
else: # regular image/video post
+ date = text.parse_timestamp(post.get("taken_at"))
data = {
"post_id" : post["pk"],
"post_shortcode": post["code"],
+ "post_url": "{}/p/{}/".format(self.root, post["code"]),
+ "post_date": date,
+ "date": date,
"likes": post.get("like_count", 0),
"pinned": post.get("timeline_pinned_user_ids", ()),
- "date": text.parse_timestamp(post.get("taken_at")),
"liked": post.get("has_liked", False),
}
@@ -206,7 +216,6 @@ class InstagramExtractor(Extractor):
data["owner_id"] = owner["pk"]
data["username"] = owner.get("username")
data["fullname"] = owner.get("full_name")
- data["post_url"] = "{}/p/{}/".format(self.root, data["post_shortcode"])
data["_files"] = files = []
for num, item in enumerate(items, 1):
@@ -269,7 +278,6 @@ class InstagramExtractor(Extractor):
owner = post["owner"]
data = {
"typename" : typename,
- "date" : text.parse_timestamp(post["taken_at_timestamp"]),
"likes" : post["edge_media_preview_like"]["count"],
"liked" : post.get("viewer_has_liked", False),
"pinned" : pinned,
@@ -279,11 +287,13 @@ class InstagramExtractor(Extractor):
"post_id" : post["id"],
"post_shortcode": post["shortcode"],
"post_url" : "{}/p/{}/".format(self.root, post["shortcode"]),
+ "post_date" : text.parse_timestamp(post["taken_at_timestamp"]),
"description": text.parse_unicode_escapes("\n".join(
edge["node"]["text"]
for edge in post["edge_media_to_caption"]["edges"]
)),
}
+ data["date"] = data["post_date"]
tags = self._find_tags(data["description"])
if tags:
@@ -313,6 +323,7 @@ class InstagramExtractor(Extractor):
media = {
"num": num,
"media_id" : node["id"],
+ "date" : data["date"],
"shortcode" : (node.get("shortcode") or
shortcode_from_id(node["id"])),
"display_url": node["display_url"],
@@ -328,6 +339,7 @@ class InstagramExtractor(Extractor):
dimensions = post["dimensions"]
media = {
"media_id" : post["id"],
+ "date" : data["date"],
"shortcode" : post["shortcode"],
"display_url": post["display_url"],
"video_url" : post.get("video_url"),
@@ -378,7 +390,11 @@ class InstagramExtractor(Extractor):
"full_name": user["full_name"]})
def _init_cursor(self):
- return self.config("cursor") or None
+ cursor = self.config("cursor", True)
+ if not cursor:
+ self._update_cursor = util.identity
+ elif isinstance(cursor, str):
+ return cursor
def _update_cursor(self, cursor):
self.log.debug("Cursor: %s", cursor)
@@ -418,6 +434,7 @@ class InstagramUserExtractor(InstagramExtractor):
base = "{}/{}/".format(self.root, self.item)
stories = "{}/stories/{}/".format(self.root, self.item)
return self._dispatch_extractors((
+ (InstagramInfoExtractor , base + "info/"),
(InstagramAvatarExtractor , base + "avatar/"),
(InstagramStoriesExtractor , stories),
(InstagramHighlightsExtractor, base + "highlights/"),
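Note: the new "max-posts" option is a thin wrapper around itertools.islice: the post generator is truncated lazily, so no further API pages are requested once the limit is reached. Minimal demonstration, with a counter standing in for endless pagination:

import itertools

def posts():
    num = 0
    while True:                       # stands in for endless API paging
        num += 1
        print("requesting post", num)
        yield num

for post in itertools.islice(posts(), 3):
    pass                              # only the first 3 posts are pulled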
diff --git a/gallery_dl/extractor/koharu.py b/gallery_dl/extractor/koharu.py
index 979b1a2..cacf504 100644
--- a/gallery_dl/extractor/koharu.py
+++ b/gallery_dl/extractor/koharu.py
@@ -161,16 +161,29 @@ class KoharuGalleryExtractor(KoharuExtractor, GalleryExtractor):
return results
def _select_format(self, formats):
- if not self.fmt or self.fmt == "original":
- fmtid = "0"
+ fmt = self.fmt
+
+ if not fmt or fmt == "best":
+ fmtids = ("0", "1600", "1280", "980", "780")
+ elif isinstance(fmt, str):
+ fmtids = fmt.split(",")
+ elif isinstance(fmt, list):
+ fmtids = fmt
else:
- fmtid = str(self.fmt)
+ fmtids = (str(self.fmt),)
- try:
- fmt = formats[fmtid]
- except KeyError:
+ for fmtid in fmtids:
+ try:
+ fmt = formats[fmtid]
+ if fmt["id"]:
+ break
+ except KeyError:
+ self.log.debug("%s: Format %s is not available",
+ self.groups[0], fmtid)
+ else:
raise exception.NotFoundError("format")
+ self.log.debug("%s: Selected format %s", self.groups[0], fmtid)
fmt["w"] = fmtid
return fmt
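Note: the fallback logic above leans on Python's for/else: candidate format IDs are tried in order, and the else branch (raising NotFoundError) runs only if no candidate breaks out of the loop. The same shape in isolation, with a plain dict standing in for the API's format mapping:

formats = {"1280": {"id": 42}, "780": {"id": 7}}   # assumed API response

for fmtid in ("0", "1600", "1280", "980", "780"):
    try:
        fmt = formats[fmtid]
        if fmt["id"]:
            break                # usable format found
    except KeyError:
        pass                     # format not offered; try the next one
else:
    raise LookupError("format")  # stand-in for exception.NotFoundError

print("selected", fmtid)         # -> selected 1280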
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index 3d7d685..117b88b 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -34,7 +34,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
def __init__(self, match):
LolisafeExtractor.__init__(self, match)
- self.album_id = match.group(match.lastindex)
+ self.album_id = self.groups[-1]
def _init(self):
domain = self.config("domain")
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index ecd6619..5fc0ce5 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -171,15 +171,17 @@ class NewgroundsExtractor(Extractor):
if self.flash:
url += "/format/flash"
- with self.request(url, fatal=False) as response:
- if response.status_code >= 400:
- return {}
- page = response.text
+ response = self.request(url, fatal=False)
+ page = response.text
pos = page.find('id="adults_only"')
if pos >= 0:
msg = text.extract(page, 'class="highlight">', '<', pos)[0]
self.log.warning('"%s"', msg)
+ return {}
+
+ if response.status_code >= 400:
+ return {}
extr = text.extract_from(page)
data = extract_data(extr, post_url)
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index d732894..3479b88 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -94,12 +94,39 @@ class PixivExtractor(Extractor):
work.get("id"), exc.message)
continue
- url = ugoira["zip_urls"]["medium"].replace(
- "_ugoira600x600", "_ugoira1920x1080")
- work["frames"] = ugoira["frames"]
+ url = ugoira["zip_urls"]["medium"]
+ work["frames"] = frames = ugoira["frames"]
work["date_url"] = self._date_from_url(url)
work["_http_adjust_extension"] = False
- yield Message.Url, url, text.nameext_from_url(url, work)
+
+ if self.load_ugoira == "original":
+ base, sep, _ = url.rpartition("_ugoira")
+ base = base.replace(
+ "/img-zip-ugoira/", "/img-original/", 1) + sep
+
+ for ext in ("jpg", "png", "gif"):
+ try:
+ url = ("{}0.{}".format(base, ext))
+ self.request(url, method="HEAD")
+ break
+ except exception.HttpError:
+ pass
+ else:
+ self.log.warning(
+ "Unable to find Ugoira frame URLs (%s)",
+ work.get("id"))
+ continue
+
+ for num, frame in enumerate(frames):
+ url = ("{}{}.{}".format(base, num, ext))
+ work["num"] = work["_ugoira_frame_index"] = num
+ work["suffix"] = "_p{:02}".format(num)
+ text.nameext_from_url(url, work)
+ yield Message.Url, url, work
+
+ else:
+ url = url.replace("_ugoira600x600", "_ugoira1920x1080")
+ yield Message.Url, url, text.nameext_from_url(url, work)
elif work["page_count"] == 1:
url = meta_single_page["original_image_url"]
@@ -551,9 +578,6 @@ class PixivSeriesExtractor(PixivExtractor):
directory_fmt = ("{category}", "{user[id]} {user[account]}",
"{series[id]} {series[title]}")
filename_fmt = "{num_series:>03}_{id}_p{num}.{extension}"
- cookies_domain = ".pixiv.net"
- browser = "firefox"
- tls12 = False
pattern = BASE_PATTERN + r"/user/(\d+)/series/(\d+)"
example = "https://www.pixiv.net/user/12345/series/12345"
@@ -562,34 +586,18 @@ class PixivSeriesExtractor(PixivExtractor):
self.user_id, self.series_id = match.groups()
def works(self):
- url = self.root + "/ajax/series/" + self.series_id
- params = {"p": 1}
- headers = {
- "Accept": "application/json",
- "Referer": "{}/user/{}/series/{}".format(
- self.root, self.user_id, self.series_id),
- "Alt-Used": "www.pixiv.net",
- }
+ series = None
- while True:
- data = self.request(url, params=params, headers=headers).json()
- body = data["body"]
- page = body["page"]
-
- series = body["extraData"]["meta"]
- series["id"] = self.series_id
- series["total"] = page["total"]
- series["title"] = text.extr(series["title"], '"', '"')
-
- for info in page["series"]:
- work = self.api.illust_detail(info["workId"])
- work["num_series"] = info["order"]
- work["series"] = series
- yield work
-
- if len(page["series"]) < 10:
- return
- params["p"] += 1
+ for work in self.api.illust_series(self.series_id):
+ if series is None:
+ series = self.api.data
+ series["total"] = num_series = series.pop("series_work_count")
+ else:
+ num_series -= 1
+
+ work["num_series"] = num_series
+ work["series"] = series
+ yield work
class PixivNovelExtractor(PixivExtractor):
@@ -916,6 +924,11 @@ class PixivAppAPI():
params = {"illust_id": illust_id}
return self._pagination("/v2/illust/related", params)
+ def illust_series(self, series_id, offset=0):
+ params = {"illust_series_id": series_id, "offset": offset}
+ return self._pagination("/v1/illust/series", params,
+ key_data="illust_series_detail")
+
def novel_bookmark_detail(self, novel_id):
params = {"novel_id": novel_id}
return self._call(
@@ -1013,10 +1026,15 @@ class PixivAppAPI():
raise exception.StopExtraction("API request failed: %s", error)
- def _pagination(self, endpoint, params, key="illusts"):
+ def _pagination(self, endpoint, params,
+ key_items="illusts", key_data=None):
while True:
data = self._call(endpoint, params)
- yield from data[key]
+
+ if key_data:
+ self.data = data.get(key_data)
+ key_data = None
+ yield from data[key_items]
if not data["next_url"]:
return
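Note: the "original" ugoira path rewrites the zip URL to the frame-image base, probes the extension of frame 0 with HEAD requests, and then emits one URL per frame. A sketch of that logic with requests standing in for the extractor's session; the URL layout is taken from this diff:

import requests

def ugoira_frame_urls(zip_url, frames):
    base, sep, _ = zip_url.rpartition("_ugoira")
    base = base.replace("/img-zip-ugoira/", "/img-original/", 1) + sep

    for ext in ("jpg", "png", "gif"):            # probe frame 0
        if requests.head("{}0.{}".format(base, ext)).ok:
            break
    else:
        raise RuntimeError("no frame URLs found")

    return ["{}{}.{}".format(base, num, ext)
            for num in range(len(frames))]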
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index ad3efa7..7db8172 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -66,7 +66,8 @@ class SankakuExtractor(BooruExtractor):
def _prepare(self, post):
post["created_at"] = post["created_at"]["s"]
post["date"] = text.parse_timestamp(post["created_at"])
- post["tags"] = [tag["name"] for tag in post["tags"] if tag["name"]]
+ post["tags"] = [tag["name"].lower().replace(" ", "_")
+ for tag in post["tags"] if tag["name"]]
post["tag_string"] = " ".join(post["tags"])
post["_http_validate"] = self._check_expired
@@ -79,7 +80,7 @@ class SankakuExtractor(BooruExtractor):
for tag in post["tags"]:
name = tag["name"]
if name:
- tags[types[tag["type"]]].append(name)
+ tags[types[tag["type"]]].append(name.lower().replace(" ", "_"))
for key, value in tags.items():
post["tags_" + key] = value
post["tag_string_" + key] = " ".join(value)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index 80f2aea..7708b5c 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -152,6 +152,25 @@ class SexcomPinsExtractor(SexcomExtractor):
return self._pagination(url)
+class SexcomLikesExtractor(SexcomExtractor):
+ """Extractor for a user's liked pins on www.sex.com"""
+ subcategory = "likes"
+ directory_fmt = ("{category}", "{user}", "Likes")
+ pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/"
+ example = "https://www.sex.com/user/USER/likes/"
+
+ def __init__(self, match):
+ SexcomExtractor.__init__(self, match)
+ self.user = match.group(1)
+
+ def metadata(self):
+ return {"user": text.unquote(self.user)}
+
+ def pins(self):
+ url = "{}/user/{}/likes/".format(self.root, self.user)
+ return self._pagination(url)
+
+
class SexcomBoardExtractor(SexcomExtractor):
"""Extractor for pins from a board on www.sex.com"""
subcategory = "board"
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index bba1ece..b6917cc 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -86,6 +86,7 @@ BASE_PATTERN = SzurubooruExtractor.update({
"bcbnsfw": {
"root": "https://booru.bcbnsfw.space",
"pattern": r"booru\.bcbnsfw\.space",
+ "query-all": "*",
},
"snootbooru": {
"root": "https://snootbooru.com",
@@ -110,7 +111,12 @@ class SzurubooruTagExtractor(SzurubooruExtractor):
return {"search_tags": self.query}
def posts(self):
- return self._pagination("/posts/", {"query": self.query})
+ if self.query.strip():
+ query = self.query
+ else:
+ query = self.config_instance("query-all")
+
+ return self._pagination("/posts/", {"query": query})
class SzurubooruPostExtractor(SzurubooruExtractor):
diff --git a/gallery_dl/extractor/toyhouse.py b/gallery_dl/extractor/toyhouse.py
index 64fa951..44d87ee 100644
--- a/gallery_dl/extractor/toyhouse.py
+++ b/gallery_dl/extractor/toyhouse.py
@@ -123,4 +123,5 @@ class ToyhouseImageExtractor(ToyhouseExtractor):
def posts(self):
url = "{}/~images/{}".format(self.root, self.user)
- return (self._parse_post(self.request(url).text, '<img src="'),)
+ return (self._parse_post(
+ self.request(url).text, '<img class="mw-100" src="'),)
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index ff29c04..73455d2 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -400,6 +400,9 @@ class TumblrAPI(oauth.OAuth1API):
"""Retrieve liked posts"""
endpoint = "/v2/blog/{}/likes".format(blog)
params = {"limit": "50", "before": self.before}
+ if self.api_key:
+ params["api_key"] = self.api_key
+
while True:
posts = self._call(endpoint, params)["liked_posts"]
if not posts:
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index ea57d76..d4ec343 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -155,6 +155,7 @@ class TwitterExtractor(Extractor):
if not self.unavailable:
continue
+ mtype = media.get("type")
descr = media.get("ext_alt_text")
width = media["original_info"].get("width", 0)
height = media["original_info"].get("height", 0)
@@ -164,6 +165,7 @@ class TwitterExtractor(Extractor):
files.append({
"url": "ytdl:{}/i/web/status/{}".format(
self.root, tweet["id_str"]),
+ "type" : mtype,
"width" : width,
"height" : height,
"extension" : None,
@@ -177,6 +179,7 @@ class TwitterExtractor(Extractor):
)
files.append({
"url" : variant["url"],
+ "type" : mtype,
"width" : width,
"height" : height,
"bitrate" : variant.get("bitrate", 0),
@@ -193,6 +196,7 @@ class TwitterExtractor(Extractor):
base = url.rpartition("=")[0] + "="
files.append(text.nameext_from_url(url, {
"url" : base + self._size_image,
+ "type" : mtype,
"width" : width,
"height" : height,
"_fallback" : self._image_fallback(base),
@@ -504,7 +508,11 @@ class TwitterExtractor(Extractor):
}
def _init_cursor(self):
- return self.config("cursor") or None
+ cursor = self.config("cursor", True)
+ if not cursor:
+ self._update_cursor = util.identity
+ elif isinstance(cursor, str):
+ return cursor
def _update_cursor(self, cursor):
self.log.debug("Cursor: %s", cursor)
@@ -560,6 +568,7 @@ class TwitterUserExtractor(TwitterExtractor):
def items(self):
base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors((
+ (TwitterInfoExtractor , base + "info"),
(TwitterAvatarExtractor , base + "photo"),
(TwitterBackgroundExtractor, base + "header_photo"),
(TwitterTimelineExtractor , base + "timeline"),
@@ -590,9 +599,16 @@ class TwitterTimelineExtractor(TwitterExtractor):
return cursor
def tweets(self):
- self._cursor = cursor = self.config("cursor") or None
reset = False
+ cursor = self.config("cursor", True)
+ if not cursor:
+ self._update_cursor = util.identity
+ elif isinstance(cursor, str):
+ self._cursor = cursor
+ else:
+ cursor = None
+
if cursor:
state = cursor.partition("/")[0]
state, _, tweet_id = state.partition("_")
@@ -1612,6 +1628,9 @@ class TwitterAPI():
entries = instr["entries"]
elif instr_type == "TimelineAddToModule":
entries = instr["moduleItems"]
+ elif instr_type == "TimelinePinEntry":
+ if pinned_tweet:
+ pinned_tweet = instr["entry"]
elif instr_type == "TimelineReplaceEntry":
entry = instr["entry"]
if entry["entryId"].startswith("cursor-bottom-"):
@@ -1650,9 +1669,11 @@ class TwitterAPI():
tweet = None
if pinned_tweet:
- pinned_tweet = False
- if instructions[-1]["type"] == "TimelinePinEntry":
+ if isinstance(pinned_tweet, dict):
+ tweets.append(pinned_tweet)
+ elif instructions[-1]["type"] == "TimelinePinEntry":
tweets.append(instructions[-1]["entry"])
+ pinned_tweet = False
for entry in entries:
esw = entry["entryId"].startswith
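Note: the reworked _init_cursor() supports three config states: false (don't record cursors), a string (resume from it), and the default (record normally). Disabling works by swapping the update method for an identity function on the instance. A sketch of the pattern; identity() stands in for gallery_dl.util.identity:

def identity(x):
    return x                               # no-op replacement

class Paginator:
    def __init__(self, cursor_option=True):
        if not cursor_option:              # "cursor": false
            self._update_cursor = identity # recording becomes a no-op
        elif isinstance(cursor_option, str):
            self.cursor = cursor_option    # resume from saved cursor

    def _update_cursor(self, cursor):
        print("Cursor:", cursor)           # normally logged and stored
        return cursor

Paginator(cursor_option=False)._update_cursor("abc")  # silent
Paginator()._update_cursor("abc")                     # -> Cursor: abc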
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index 9370cfb..7a62e01 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -17,13 +17,11 @@ class WikimediaExtractor(BaseExtractor):
"""Base class for wikimedia extractors"""
basecategory = "wikimedia"
filename_fmt = "{filename} ({sha1[:8]}).{extension}"
- directory_fmt = ("{category}", "{page}")
archive_fmt = "{sha1}"
request_interval = (1.0, 2.0)
def __init__(self, match):
BaseExtractor.__init__(self, match)
- path = match.group(match.lastindex)
if self.category == "wikimedia":
self.category = self.root.split(".")[-2]
@@ -31,31 +29,7 @@ class WikimediaExtractor(BaseExtractor):
self.category = "{}-{}".format(
self.category, self.root.partition(".")[0].rpartition("/")[2])
- if path.startswith("wiki/"):
- path = path[5:]
-
- pre, sep, _ = path.partition(":")
- prefix = pre.lower() if sep else None
-
- self.title = path = text.unquote(path)
- if prefix:
- self.subcategory = prefix
-
- if prefix == "category":
- self.params = {
- "generator": "categorymembers",
- "gcmtitle" : path,
- "gcmtype" : "file",
- }
- elif prefix == "file":
- self.params = {
- "titles" : path,
- }
- else:
- self.params = {
- "generator": "images",
- "titles" : path,
- }
+ self.per_page = self.config("limit", 50)
def _init(self):
api_path = self.config_instance("api-path")
@@ -67,6 +41,22 @@ class WikimediaExtractor(BaseExtractor):
else:
self.api_url = self.root + "/api.php"
+ @staticmethod
+ def prepare(image):
+ """Adjust the content of a image object"""
+ image["metadata"] = {
+ m["name"]: m["value"]
+ for m in image["metadata"] or ()}
+ image["commonmetadata"] = {
+ m["name"]: m["value"]
+ for m in image["commonmetadata"] or ()}
+
+ filename = image["canonicaltitle"]
+ image["filename"], _, image["extension"] = \
+ filename.partition(":")[2].rpartition(".")
+ image["date"] = text.parse_datetime(
+ image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+
def items(self):
for info in self._pagination(self.params):
try:
@@ -75,20 +65,7 @@ class WikimediaExtractor(BaseExtractor):
self.log.debug("Missing 'imageinfo' for %s", info)
continue
- image["metadata"] = {
- m["name"]: m["value"]
- for m in image["metadata"] or ()}
- image["commonmetadata"] = {
- m["name"]: m["value"]
- for m in image["commonmetadata"] or ()}
-
- filename = image["canonicaltitle"]
- image["filename"], _, image["extension"] = \
- filename.partition(":")[2].rpartition(".")
- image["date"] = text.parse_datetime(
- image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
- image["page"] = self.title
-
+ self.prepare(image)
yield Message.Directory, image
yield Message.Url, image["url"], image
@@ -110,6 +87,17 @@ class WikimediaExtractor(BaseExtractor):
while True:
data = self.request(url, params=params).json()
+ # ref: https://www.mediawiki.org/wiki/API:Errors_and_warnings
+ error = data.get("error")
+ if error:
+ self.log.error("%s: %s", error["code"], error["info"])
+ return
+ # MediaWiki will emit warnings for non-fatal mistakes such as
+ # invalid parameter instead of raising an error
+ warnings = data.get("warnings")
+ if warnings:
+ self.log.debug("MediaWiki returned warnings: %s", warnings)
+
try:
pages = data["query"]["pages"]
except KeyError:
@@ -181,5 +169,59 @@ BASE_PATTERN = WikimediaExtractor.update({
class WikimediaArticleExtractor(WikimediaExtractor):
"""Extractor for wikimedia articles"""
subcategory = "article"
+ directory_fmt = ("{category}", "{page}")
pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
example = "https://en.wikipedia.org/wiki/TITLE"
+
+ def __init__(self, match):
+ WikimediaExtractor.__init__(self, match)
+
+ path = match.group(match.lastindex)
+ if path.startswith("wiki/"):
+ path = path[5:]
+
+ pre, sep, _ = path.partition(":")
+ prefix = pre.lower() if sep else None
+
+ self.title = path = text.unquote(path)
+ if prefix:
+ self.subcategory = prefix
+
+ if prefix == "category":
+ self.params = {
+ "generator": "categorymembers",
+ "gcmtitle" : path,
+ "gcmtype" : "file",
+ "gcmlimit" : self.per_page,
+ }
+ elif prefix == "file":
+ self.params = {
+ "titles" : path,
+ }
+ else:
+ self.params = {
+ "generator": "images",
+ "gimlimit" : self.per_page,
+ "titles" : path,
+ }
+
+ def prepare(self, image):
+ WikimediaExtractor.prepare(image)
+ image["page"] = self.title
+
+
+class WikimediaWikiExtractor(WikimediaExtractor):
+ """Extractor for all files on a MediaWiki instance"""
+ subcategory = "wiki"
+ pattern = BASE_PATTERN + r"/?$"
+ example = "https://en.wikipedia.org/"
+
+ def __init__(self, match):
+ WikimediaExtractor.__init__(self, match)
+
+ # ref: https://www.mediawiki.org/wiki/API:Allpages
+ self.params = {
+ "generator" : "allpages",
+ "gapnamespace": 6, # "File" namespace
+ "gaplimit" : self.per_page,
+ }
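Note: the new WikimediaWikiExtractor enumerates every file on a wiki via the MediaWiki "allpages" generator restricted to namespace 6 (File:). A sketch of the equivalent raw API query with requests; the generator parameters follow this diff and the MediaWiki docs, the remaining parameters are assumed and continuation handling is omitted:

import requests

params = {
    "action"      : "query",
    "format"      : "json",
    "generator"   : "allpages",
    "gapnamespace": 6,          # the "File" namespace
    "gaplimit"    : 50,
    "prop"        : "imageinfo",
    "iiprop"      : "url|sha1|timestamp",
}
data = requests.get("https://en.wikipedia.org/w/api.php",
                    params=params).json()
for page in data["query"]["pages"].values():
    print(page["title"])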
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index cb3c74c..168845e 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -116,21 +116,20 @@ class YoutubeDLExtractor(Extractor):
for entry in entries:
if not entry:
continue
- elif entry.get("_type") in ("url", "url_transparent"):
+
+ if entry.get("_type") in ("url", "url_transparent"):
try:
- info_dict = ytdl_instance.extract_info(
+ entry = ytdl_instance.extract_info(
entry["url"], False,
ie_key=entry.get("ie_key"))
except ytdl_module.utils.YoutubeDLError:
continue
-
- if not info_dict:
+ if not entry:
continue
- elif "entries" in info_dict:
- yield from self._process_entries(
- ytdl_module, ytdl_instance, info_dict["entries"])
- else:
- yield info_dict
+
+ if "entries" in entry:
+ yield from self._process_entries(
+ ytdl_module, ytdl_instance, entry["entries"])
else:
yield entry
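Note: the restructured loop flattens nested playlists by recursing whenever an entry itself contains "entries", and now reuses the entry variable instead of a separate info_dict. The same shape in isolation, with plain dicts standing in for yt-dlp info dicts:

def process_entries(entries):
    for entry in entries:
        if not entry:
            continue
        if "entries" in entry:             # nested playlist: recurse
            yield from process_entries(entry["entries"])
        else:
            yield entry                    # a single downloadable item

nested = [{"id": 1}, {"entries": [{"id": 2}, {"id": 3}]}]
print([e["id"] for e in process_entries(nested)])   # -> [1, 2, 3]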