Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/artstation.py         3
-rw-r--r--  gallery_dl/extractor/common.py            16
-rw-r--r--  gallery_dl/extractor/foolfuuka.py         14
-rw-r--r--  gallery_dl/extractor/foolslide.py          3
-rw-r--r--  gallery_dl/extractor/gelbooru_v01.py      91
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py      42
-rw-r--r--  gallery_dl/extractor/imgur.py             13
-rw-r--r--  gallery_dl/extractor/instagram.py         94
-rw-r--r--  gallery_dl/extractor/khinsider.py         19
-rw-r--r--  gallery_dl/extractor/lolisafe.py           6
-rw-r--r--  gallery_dl/extractor/mangafox.py           3
-rw-r--r--  gallery_dl/extractor/mangahere.py          5
-rw-r--r--  gallery_dl/extractor/mangasee.py          11
-rw-r--r--  gallery_dl/extractor/mastodon.py           5
-rw-r--r--  gallery_dl/extractor/moebooru.py           5
-rw-r--r--  gallery_dl/extractor/naverwebtoon.py      53
-rw-r--r--  gallery_dl/extractor/nijie.py             48
-rw-r--r--  gallery_dl/extractor/oauth.py             27
-rw-r--r--  gallery_dl/extractor/philomena.py         25
-rw-r--r--  gallery_dl/extractor/pixiv.py            246
-rw-r--r--  gallery_dl/extractor/reactor.py            3
-rw-r--r--  gallery_dl/extractor/readcomiconline.py    8
-rw-r--r--  gallery_dl/extractor/shopify.py           58
-rw-r--r--  gallery_dl/extractor/twitter.py           89
-rw-r--r--  gallery_dl/extractor/vk.py                31
-rw-r--r--  gallery_dl/extractor/weasyl.py             4
-rw-r--r--  gallery_dl/extractor/webtoons.py          26
27 files changed, 695 insertions, 253 deletions
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index 5675081..e686c70 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2018-2021 Mike Fährmann
+# Copyright 2018-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -20,6 +20,7 @@ class ArtstationExtractor(Extractor):
filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}"
directory_fmt = ("{category}", "{userinfo[username]}")
archive_fmt = "{asset[id]}"
+ browser = "firefox"
root = "https://www.artstation.com"
def __init__(self, match):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index abb352c..cac8c2d 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -306,23 +306,29 @@ class Extractor():
cookiefile = util.expand_path(cookies)
try:
with open(cookiefile) as fp:
- cookies = util.load_cookiestxt(fp)
+ util.cookiestxt_load(fp, self._cookiejar)
except Exception as exc:
self.log.warning("cookies: %s", exc)
else:
- self._update_cookies(cookies)
self._cookiefile = cookiefile
+ elif isinstance(cookies, (list, tuple)):
+ from ..cookies import load_cookies
+ try:
+ load_cookies(self._cookiejar, cookies)
+ except Exception as exc:
+ self.log.warning("cookies: %s", exc)
else:
self.log.warning(
- "expected 'dict' or 'str' value for 'cookies' option, "
- "got '%s' (%s)", cookies.__class__.__name__, cookies)
+ "Expected 'dict', 'list', or 'str' value for 'cookies' "
+ "option, got '%s' (%s)",
+ cookies.__class__.__name__, cookies)
def _store_cookies(self):
"""Store the session's cookiejar in a cookies.txt file"""
if self._cookiefile and self.config("cookies-update", True):
try:
with open(self._cookiefile, "w") as fp:
- util.save_cookiestxt(fp, self._cookiejar)
+ util.cookiestxt_store(fp, self._cookiejar)
except OSError as exc:
self.log.warning("cookies: %s", exc)
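
The reworked option handling accepts three value types for 'cookies': a dict of name/value pairs, a string path to a cookies.txt file (loaded via util.cookiestxt_load), or a list/tuple describing a browser to import from (delegated to cookies.load_cookies). A minimal sketch of the three forms, with placeholder values:

    # Sketch: the three accepted value types for the "cookies" option
    # (values are placeholders, not working credentials)
    cookies_as_dict = {"PHPSESSID": "abc123"}     # name/value pairs, set directly
    cookies_as_path = "~/gallery-dl/cookies.txt"  # Netscape cookies.txt file
    cookies_as_list = ["firefox"]                 # import from a browser profile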
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index 04e5926..093113d 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for 4chan archives based on FoolFuuka"""
+"""Extractors for FoolFuuka 4chan archives"""
from .common import BaseExtractor, Message
from .. import text
@@ -16,6 +16,7 @@ import itertools
class FoolfuukaExtractor(BaseExtractor):
"""Base extractor for FoolFuuka based boards/archives"""
basecategory = "foolfuuka"
+ filename_fmt = "{timestamp_ms} {filename_media}.{extension}"
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
external = "default"
@@ -40,6 +41,9 @@ class FoolfuukaExtractor(BaseExtractor):
post["filename"], _, post["extension"] = \
media["media"].rpartition(".")
+ post["filename_media"] = media["media_filename"].rpartition(".")[0]
+ post["timestamp_ms"] = text.parse_int(
+ media["media_orig"].rpartition(".")[0])
yield Message.Url, url, post
def metadata(self):
@@ -66,6 +70,7 @@ BASE_PATTERN = FoolfuukaExtractor.update({
},
"archivedmoe": {
"root": "https://archived.moe",
+ "pattern": r"archived\.moe",
},
"archiveofsins": {
"root": "https://archiveofsins.com",
@@ -73,12 +78,15 @@ BASE_PATTERN = FoolfuukaExtractor.update({
},
"b4k": {
"root": "https://arch.b4k.co",
+ "pattern": r"arch\.b4k\.co",
},
"desuarchive": {
"root": "https://desuarchive.org",
+ "pattern": r"desuarchive\.org",
},
"fireden": {
"root": "https://boards.fireden.net",
+ "pattern": r"boards\.fireden\.net",
},
"nyafuu": {
"root": "https://archive.nyafuu.org",
@@ -90,9 +98,11 @@ BASE_PATTERN = FoolfuukaExtractor.update({
},
"thebarchive": {
"root": "https://thebarchive.com",
+ "pattern": r"thebarchive\.com",
},
"wakarimasen": {
"root": "https://archive.wakarimasen.moe",
+ "pattern": r"archive\.wakarimasen\.moe",
},
})
@@ -101,7 +111,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
"""Base extractor for threads on FoolFuuka based boards/archives"""
subcategory = "thread"
directory_fmt = ("{category}", "{board[shortname]}",
- "{thread_num}{title:? - //}")
+ "{thread_num} {title|comment[:50]}")
pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)"
test = (
("https://archive.4plebs.org/tg/thread/54059290", {
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index c09eb96..382cc25 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2021 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -41,6 +41,7 @@ class FoolslideExtractor(BaseExtractor):
BASE_PATTERN = FoolslideExtractor.update({
"kireicake": {
"root": "https://reader.kireicake.com",
+ "pattern": r"reader\.kireicake\.com",
},
"powermanga": {
"root": "https://read.powermanga.org",
diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py
index 541f454..9c19664 100644
--- a/gallery_dl/extractor/gelbooru_v01.py
+++ b/gallery_dl/extractor/gelbooru_v01.py
@@ -1,12 +1,12 @@
# -*- coding: utf-8 -*-
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for Gelbooru v0.1 sites"""
+"""Extractors for Gelbooru Beta 0.1.11 sites"""
from . import booru
from .. import text
@@ -42,14 +42,43 @@ class GelbooruV01Extractor(booru.BooruExtractor):
return post
+ def _pagination(self, url, begin, end):
+ pid = self.page_start
+
+ while True:
+ page = self.request(url + str(pid)).text
+
+ cnt = 0
+ for post_id in text.extract_iter(page, begin, end):
+ yield self._parse_post(post_id)
+ cnt += 1
+
+ if cnt < self.per_page:
+ return
+ pid += self.per_page
+
BASE_PATTERN = GelbooruV01Extractor.update({
- "thecollection" : {"root": "https://the-collection.booru.org"},
- "illusioncardsbooru": {"root": "https://illusioncards.booru.org"},
- "allgirlbooru" : {"root": "https://allgirl.booru.org"},
- "drawfriends" : {"root": "https://drawfriends.booru.org"},
- "vidyart" : {"root": "https://vidyart.booru.org"},
- "theloudbooru" : {"root": "https://tlb.booru.org"},
+ "thecollection": {
+ "root": "https://the-collection.booru.org",
+ "pattern": r"the-collection\.booru\.org",
+ },
+ "illusioncardsbooru": {
+ "root": "https://illusioncards.booru.org",
+ "pattern": r"illusioncards\.booru\.org",
+ },
+ "allgirlbooru": {
+ "root": "https://allgirl.booru.org",
+ "pattern": r"allgirl\.booru\.org",
+ },
+ "drawfriends": {
+ "root": "https://drawfriends.booru.org",
+ "pattern": r"drawfriends\.booru\.org",
+ },
+ "vidyart": {
+ "root": "https://vidyart.booru.org",
+ "pattern": r"vidyart\.booru\.org",
+ },
})
@@ -75,7 +104,6 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
}),
("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"),
("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"),
- ("https://tlb.booru.org/index.php?page=post&s=list&tags=all"),
)
def __init__(self, match):
@@ -88,20 +116,42 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor):
def posts(self):
url = "{}/index.php?page=post&s=list&tags={}&pid=".format(
self.root, self.tags)
- pid = self.page_start
+ return self._pagination(url, 'class="thumb"><a id="p', '"')
- while True:
- page = self.request(url + str(pid)).text
- cnt = 0
- for post_id in text.extract_iter(
- page, 'class="thumb"><a id="p', '"'):
- yield self._parse_post(post_id)
- cnt += 1
+class GelbooruV01FavoriteExtractor(GelbooruV01Extractor):
+ subcategory = "favorite"
+ directory_fmt = ("{category}", "favorites", "{favorite_id}")
+ archive_fmt = "f_{favorite_id}_{id}"
+ per_page = 50
+ pattern = BASE_PATTERN + r"/index\.php\?page=favorites&s=view&id=(\d+)"
+ test = (
+ (("https://the-collection.booru.org"
+ "/index.php?page=favorites&s=view&id=1166"), {
+ "count": 2,
+ }),
+ (("https://illusioncards.booru.org"
+ "/index.php?page=favorites&s=view&id=84887"), {
+ "count": 2,
+ }),
+ ("https://allgirl.booru.org/index.php?page=favorites&s=view&id=380", {
+ "count": 4,
+ }),
+ ("https://drawfriends.booru.org/index.php?page=favorites&s=view&id=1"),
+ ("https://vidyart.booru.org/index.php?page=favorites&s=view&id=1"),
+ )
- if cnt < self.per_page:
- return
- pid += self.per_page
+ def __init__(self, match):
+ GelbooruV01Extractor.__init__(self, match)
+ self.favorite_id = match.group(match.lastindex)
+
+ def metadata(self):
+ return {"favorite_id": text.parse_int(self.favorite_id)}
+
+ def posts(self):
+ url = "{}/index.php?page=favorites&s=view&id={}&pid=".format(
+ self.root, self.favorite_id)
+ return self._pagination(url, "posts[", "]")
class GelbooruV01PostExtractor(GelbooruV01Extractor):
@@ -141,7 +191,6 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor):
}),
("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"),
("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"),
- ("https://tlb.booru.org/index.php?page=post&s=view&id=127223"),
)
def __init__(self, match):
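
The new _pagination() helper pages by offset: it scrapes post IDs between the given begin/end markers and stops once a page yields fewer than per_page hits. A self-contained approximation, with re.findall standing in for text.extract_iter:

    # Self-contained approximation of the _pagination() logic above
    import re

    def paginate(fetch_page, parse, begin, end, per_page=20, page_start=0):
        pattern = re.compile(re.escape(begin) + r"(.*?)" + re.escape(end))
        pid = page_start
        while True:
            ids = pattern.findall(fetch_page(pid))
            for post_id in ids:
                yield parse(post_id)
            if len(ids) < per_page:  # short page -> nothing left
                return
            pid += per_page          # 'pid' is a post offset, not a page number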
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 7e16a51..2dd0c0c 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for Gelbooru v0.2 sites"""
+"""Extractors for Gelbooru Beta 0.2 sites"""
from . import booru
from .. import text, util, exception
@@ -26,6 +26,9 @@ class GelbooruV02Extractor(booru.BooruExtractor):
except KeyError:
self.api_root = self.root
+ if self.category == "realbooru":
+ self._file_url = self._file_url_realbooru
+
def _api_request(self, params):
url = self.api_root + "/index.php?page=dapi&s=post&q=index"
return ElementTree.fromstring(self.request(url, params=params).text)
@@ -61,6 +64,14 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
+ def _file_url_realbooru(self, post):
+ url = post["file_url"]
+ if url.count("/") == 5:
+ md5 = post["md5"]
+ url = "{}/images/{}/{}/{}.{}".format(
+ self.root, md5[0:2], md5[2:4], md5, url.rpartition(".")[2])
+ return url
+
def _extended_tags(self, post, page=None):
if not page:
url = "{}/index.php?page=post&s=view&id={}".format(
@@ -105,11 +116,23 @@ class GelbooruV02Extractor(booru.BooruExtractor):
INSTANCES = {
- "realbooru": {"root": "https://realbooru.com"},
- "rule34" : {"root": "https://rule34.xxx",
- "api_root": " https://api.rule34.xxx"},
- "safebooru": {"root": "https://safebooru.org"},
- "tbib" : {"root": "https://tbib.org"},
+ "realbooru": {
+ "root": "https://realbooru.com",
+ "pattern": r"realbooru\.com",
+ },
+ "rule34": {
+ "root": "https://rule34.xxx",
+ "pattern": r"rule34\.xxx",
+ "api_root": "https://api.rule34.xxx",
+ },
+ "safebooru": {
+ "root": "https://safebooru.org",
+ "pattern": r"safebooru\.org",
+ },
+ "tbib": {
+ "root": "https://tbib.org",
+ "pattern": r"tbib\.org",
+ },
}
BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
@@ -147,7 +170,7 @@ class GelbooruV02TagExtractor(GelbooruV02Extractor):
return {"search_tags": self.tags}
def posts(self):
- return self._pagination({"tags" : self.tags})
+ return self._pagination({"tags": self.tags})
class GelbooruV02PoolExtractor(GelbooruV02Extractor):
@@ -213,7 +236,7 @@ class GelbooruV02FavoriteExtractor(GelbooruV02Extractor):
"count": 2,
}),
("https://realbooru.com/index.php?page=favorites&s=view&id=274", {
- "count": 4,
+ "count": 2,
}),
("https://tbib.org/index.php?page=favorites&s=view&id=7881", {
"count": 3,
@@ -279,7 +302,8 @@ class GelbooruV02PostExtractor(GelbooruV02Extractor):
},
}),
("https://realbooru.com/index.php?page=post&s=view&id=668483", {
- "url": "2421b5b0e15d5e20f9067090a8b0fd4114d3e7d9",
+ "pattern": r"https://realbooru\.com/images/dc/b5"
+ r"/dcb5c0ce9ec0bf74a6930608985f4719\.jpeg",
"content": "7f5873ce3b6cd295ea2e81fcb49583098ea9c8da",
}),
("https://tbib.org/index.php?page=post&s=view&id=9233957", {
diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py
index 2035655..fd78ce2 100644
--- a/gallery_dl/extractor/imgur.py
+++ b/gallery_dl/extractor/imgur.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2021 Mike Fährmann
+# Copyright 2015-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -201,17 +201,24 @@ class ImgurAlbumExtractor(ImgurExtractor):
("https://imgur.com/a/TcBmQ", {
"exception": exception.HttpError,
}),
+ ("https://imgur.com/a/pjOnJA0", { # empty, no 'media' (#2557)
+ "count": 0,
+ }),
("https://www.imgur.com/a/TcBmP"), # www
("https://m.imgur.com/a/TcBmP"), # mobile
)
def items(self):
album = self.api.album(self.key)
- album["date"] = text.parse_datetime(album["created_at"])
- images = album["media"]
+ try:
+ images = album["media"]
+ except KeyError:
+ return
+
del album["media"]
count = len(images)
+ album["date"] = text.parse_datetime(album["created_at"])
try:
del album["ad_url"]
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index e07b64e..82c9858 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -80,12 +80,22 @@ class InstagramExtractor(Extractor):
def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
- if response.history and "/accounts/login/" in response.request.url:
- if self._cursor:
- self.log.info("Use '-o cursor=%s' to continue downloading "
- "from the current position", self._cursor)
- raise exception.StopExtraction(
- "HTTP redirect to login page (%s)", response.request.url)
+ if response.history:
+
+ url = response.request.url
+ if "/accounts/login/" in url:
+ page = "login"
+ elif "/challenge/" in url:
+ page = "challenge"
+ else:
+ page = None
+
+ if page:
+ if self._cursor:
+ self.log.info("Use '-o cursor=%s' to continue downloading "
+ "from the current position", self._cursor)
+ raise exception.StopExtraction("HTTP redirect to %s page (%s)",
+ page, url.partition("?")[0])
www_claim = response.headers.get("x-ig-set-www-claim")
if www_claim is not None:
@@ -298,7 +308,7 @@ class InstagramExtractor(Extractor):
video = None
media = image
- files.append({
+ media = {
"num" : num,
"date" : text.parse_timestamp(item.get("taken_at") or
media.get("taken_at")),
@@ -309,7 +319,9 @@ class InstagramExtractor(Extractor):
"video_url" : video["url"] if video else None,
"width" : media["width"],
"height" : media["height"],
- })
+ }
+ self._extract_tagged_users(item, media)
+ files.append(media)
return data
@@ -321,22 +333,45 @@ class InstagramExtractor(Extractor):
"abcdefghijklmnopqrstuvwxyz"
"0123456789-_")
- def _extract_tagged_users(self, src, dest):
- if "edge_media_to_tagged_user" not in src:
- return
- edges = src["edge_media_to_tagged_user"]["edges"]
+ @staticmethod
+ def _extract_tagged_users(src, dest):
+ dest["tagged_users"] = tagged_users = []
+
+ edges = src.get("edge_media_to_tagged_user")
if edges:
- dest["tagged_users"] = tagged_users = []
- for edge in edges:
+ for edge in edges["edges"]:
user = edge["node"]["user"]
- tagged_users.append({
- "id" : user["id"],
- "username" : user["username"],
- "full_name": user["full_name"],
- })
-
- def _extract_shared_data(self, url):
- page = self.request(url).text
+ tagged_users.append({"id" : user["id"],
+ "username" : user["username"],
+ "full_name": user["full_name"]})
+
+ usertags = src.get("usertags")
+ if usertags:
+ for tag in usertags["in"]:
+ user = tag["user"]
+ tagged_users.append({"id" : user["pk"],
+ "username" : user["username"],
+ "full_name": user["full_name"]})
+
+ mentions = src.get("reel_mentions")
+ if mentions:
+ for mention in mentions:
+ user = mention["user"]
+ tagged_users.append({"id" : user.get("pk"),
+ "username" : user["username"],
+ "full_name": user["full_name"]})
+
+ stickers = src.get("story_bloks_stickers")
+ if stickers:
+ for sticker in stickers:
+ sticker = sticker["bloks_sticker"]
+ if sticker["bloks_sticker_type"] == "mention":
+ user = sticker["sticker_data"]["ig_mention"]
+ tagged_users.append({"id" : user["account_id"],
+ "username" : user["username"],
+ "full_name": user["full_name"]})
+
+ def _extract_shared_data(self, page):
shared_data, pos = text.extract(
page, "window._sharedData =", ";</script>")
additional_data, pos = text.extract(
@@ -349,13 +384,15 @@ class InstagramExtractor(Extractor):
return data
def _extract_profile_page(self, url):
- data = self._extract_shared_data(url)["entry_data"]
+ page = self.request(url).text
+ data = self._extract_shared_data(page)["entry_data"]
if "HttpErrorPage" in data:
raise exception.NotFoundError("user")
return data["ProfilePage"][0]["graphql"]["user"]
def _extract_post_page(self, url):
- data = self._extract_shared_data(url)["entry_data"]
+ page = self.request(url).text
+ data = self._extract_shared_data(page)["entry_data"]
if "HttpErrorPage" in data:
raise exception.NotFoundError("post")
return data["PostPage"][0]
@@ -524,7 +561,8 @@ class InstagramTagExtractor(InstagramExtractor):
def posts(self):
url = "{}/explore/tags/{}/".format(self.root, self.item)
- page = self._extract_shared_data(url)["entry_data"]["TagPage"][0]
+ page = self._extract_shared_data(
+ self.request(url).text)["entry_data"]["TagPage"][0]
if "data" in page:
return self._pagination_sections(page["data"]["recent"])
@@ -718,8 +756,12 @@ class InstagramStoriesExtractor(InstagramExtractor):
reel_id = "highlight:" + self.highlight_id
else:
url = "{}/stories/{}/".format(self.root, self.user)
+ with self.request(url, allow_redirects=False) as response:
+ if 300 <= response.status_code < 400:
+ return ()
+ page = response.text
try:
- data = self._extract_shared_data(url)["entry_data"]
+ data = self._extract_shared_data(page)["entry_data"]
user = data["StoriesPage"][0]["user"]
except KeyError:
return ()
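
_extract_tagged_users() now always sets 'tagged_users' (possibly empty) and normalizes four sources, GraphQL tag edges, REST 'usertags', story 'reel_mentions', and bloks mention stickers, into one shape. A sketch of the merged result for a hypothetical post:

    # Hypothetical merged result: every source reduces to the same dict shape
    tagged_users = [
        {"id": "101", "username": "alice", "full_name": "Alice"},  # GraphQL edge
        {"id": "102", "username": "bob",   "full_name": "Bob"},    # REST usertag
        {"id": "103", "username": "carol", "full_name": "Carol"},  # reel mention
    ]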
diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py
index 67a1a95..e7827b1 100644
--- a/gallery_dl/extractor/khinsider.py
+++ b/gallery_dl/extractor/khinsider.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2016-2020 Mike Fährmann
+# Copyright 2016-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -26,7 +26,18 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
"pattern": r"https?://vgm(site|downloads).com"
r"/soundtracks/horizon-riders-wii/[^/]+"
r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack.mp3",
- "keyword": "12ca70e0709ea15250e577ea388cf2b5b0c65630",
+ "keyword": {
+ "album": {
+ "count": 1,
+ "date": "Sep 18th, 2016",
+ "name": "Horizon Riders (Wii)",
+ "size": 26214400,
+ "type": "Gamerip",
+ },
+ "extension": "mp3",
+ "filename": "Horizon Riders Wii - Full Soundtrack",
+ },
+ "count": 1,
})
def __init__(self, match):
@@ -48,10 +59,10 @@ class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
def metadata(self, page):
extr = text.extract_from(page)
return {"album": {
- "name" : text.unescape(extr("Album name: <b>", "<")),
+ "name" : text.unescape(extr("<h2>", "<")),
"count": text.parse_int(extr("Number of Files: <b>", "<")),
"size" : text.parse_bytes(extr("Total Filesize: <b>", "<")[:-1]),
- "date" : extr("Date added: <b>", "<"),
+ "date" : extr("Date Added: <b>", "<"),
"type" : extr("Album type: <b>", "<"),
}}
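
The updated markers rely on text.extract_from(), which scans the page strictly left to right, so they must be listed in page order. A minimal approximation of that helper (simplified semantics, an assumption about the real implementation):

    # Minimal approximation of text.extract_from(): each call resumes
    # where the previous one stopped
    def extract_from(page):
        pos = 0
        def extr(begin, end):
            nonlocal pos
            try:
                start = page.index(begin, pos) + len(begin)
                stop  = page.index(end, start)
            except ValueError:
                return ""
            pos = stop + len(end)
            return page[start:stop]
        return extr

    page = "<h2>Horizon Riders (Wii)</h2> ... Date Added: <b>Sep 18th, 2016</b>"
    extr = extract_from(page)
    extr("<h2>", "<")             # 'Horizon Riders (Wii)'
    extr("Date Added: <b>", "<")  # 'Sep 18th, 2016'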
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
index ad7cd1d..b6a508d 100644
--- a/gallery_dl/extractor/lolisafe.py
+++ b/gallery_dl/extractor/lolisafe.py
@@ -63,6 +63,12 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
LolisafeExtractor.__init__(self, match)
self.album_id = match.group(match.lastindex)
+ domain = self.config("domain")
+ if domain is None or domain == "auto":
+ self.root = text.root_from_url(match.group(0))
+ else:
+ self.root = text.ensure_http_scheme(domain)
+
def items(self):
files, data = self.fetch_album(self.album_id)
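
With the new 'domain' option, the album extractor keeps the root of the URL it matched ("auto") or swaps in a configured mirror host. Stand-ins for the two text helpers illustrate the effect (URLs are placeholders):

    # Stand-ins for text.root_from_url and text.ensure_http_scheme
    from urllib.parse import urlsplit

    def root_from_url(url):
        s = urlsplit(url)
        return "{}://{}".format(s.scheme, s.netloc)

    def ensure_http_scheme(url, scheme="https://"):
        return url if url.startswith(("https://", "http://")) else scheme + url

    root_from_url("https://files.example.org/a/AbCd1234")  # https://files.example.org
    ensure_http_scheme("mirror.example.net")               # https://mirror.example.net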
diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py
index f6514ca..4808105 100644
--- a/gallery_dl/extractor/mangafox.py
+++ b/gallery_dl/extractor/mangafox.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2021 Mike Fährmann
+# Copyright 2017-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -34,6 +34,7 @@ class MangafoxChapterExtractor(ChapterExtractor):
base, self.cstr, self.volume, self.chapter, self.minor = match.groups()
self.urlbase = self.root + base
ChapterExtractor.__init__(self, match, self.urlbase + "/1.html")
+ self.session.headers["Referer"] = self.root + "/"
def metadata(self, page):
manga, pos = text.extract(page, "<title>", "</title>")
diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py
index f655f94..461c92d 100644
--- a/gallery_dl/extractor/mangahere.py
+++ b/gallery_dl/extractor/mangahere.py
@@ -17,8 +17,8 @@ class MangahereBase():
"""Base class for mangahere extractors"""
category = "mangahere"
root = "https://www.mangahere.cc"
- mobile_root = "https://m.mangahere.cc"
- url_fmt = mobile_root + "/manga/{}/{}.html"
+ root_mobile = "https://m.mangahere.cc"
+ url_fmt = root_mobile + "/manga/{}/{}.html"
class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
@@ -42,6 +42,7 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor):
self.part, self.volume, self.chapter = match.groups()
url = self.url_fmt.format(self.part, 1)
ChapterExtractor.__init__(self, match, url)
+ self.session.headers["Referer"] = self.root_mobile + "/"
def metadata(self, page):
pos = page.index("</select>")
diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py
index 0b0da65..2bd11ef 100644
--- a/gallery_dl/extractor/mangasee.py
+++ b/gallery_dl/extractor/mangasee.py
@@ -9,7 +9,7 @@
"""Extractors for https://mangasee123.com/"""
from .common import ChapterExtractor, MangaExtractor
-from .. import text
+from .. import text, util
import json
@@ -57,6 +57,15 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor):
},
})
+ def __init__(self, match):
+ ChapterExtractor.__init__(self, match)
+ self.session.headers["Referer"] = self.gallery_url
+
+ domain = "mangasee123.com"
+ cookies = self.session.cookies
+ if not cookies.get("PHPSESSID", domain=domain):
+ cookies.set("PHPSESSID", util.generate_token(13), domain=domain)
+
def metadata(self, page):
extr = text.extract_from(page)
self.chapter = data = json.loads(extr("vm.CurChapter =", ";\r\n"))
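
The new constructor seeds a PHPSESSID cookie when none exists, so chapter pages load without a prior visit. gallery-dl's util.generate_token presumably hexlifies random bytes (an assumption), making a 13-byte token 26 hex characters long:

    # Stand-in for util.generate_token (assumption: hexlified os.urandom)
    import binascii, os

    def generate_token(size=16):
        return binascii.hexlify(os.urandom(size)).decode()

    session_cookies = {}
    if not session_cookies.get("PHPSESSID"):
        session_cookies["PHPSESSID"] = generate_token(13)  # 26 hex characters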
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index cd7cabb..6e780e8 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -60,12 +60,14 @@ class MastodonExtractor(BaseExtractor):
INSTANCES = {
"mastodon.social": {
"root" : "https://mastodon.social",
+ "pattern" : r"mastodon\.social",
"access-token" : "Y06R36SMvuXXN5_wiPKFAEFiQaMSQg0o_hGgc86Jj48",
"client-id" : "dBSHdpsnOUZgxOnjKSQrWEPakO3ctM7HmsyoOd4FcRo",
"client-secret": "DdrODTHs_XoeOsNVXnILTMabtdpWrWOAtrmw91wU1zI",
},
"pawoo": {
"root" : "https://pawoo.net",
+ "pattern" : r"pawoo\.net",
"access-token" : "c12c9d275050bce0dc92169a28db09d7"
"0d62d0a75a8525953098c167eacd3668",
"client-id" : "978a25f843ec01e53d09be2c290cd75c"
@@ -75,6 +77,7 @@ INSTANCES = {
},
"baraag": {
"root" : "https://baraag.net",
+ "pattern" : r"baraag\.net",
"access-token" : "53P1Mdigf4EJMH-RmeFOOSM9gdSDztmrAYFgabOKKE0",
"client-id" : "czxx2qilLElYHQ_sm-lO8yXuGwOHxLX9RYYaD0-nq1o",
"client-secret": "haMaFdMBgK_-BIxufakmI2gFgkYjqmgXGEO2tB-R2xY",
diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py
index 604d65c..65b9a83 100644
--- a/gallery_dl/extractor/moebooru.py
+++ b/gallery_dl/extractor/moebooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -54,6 +54,7 @@ class MoebooruExtractor(BooruExtractor):
BASE_PATTERN = MoebooruExtractor.update({
"yandere": {
"root": "https://yande.re",
+ "pattern": r"yande\.re",
},
"konachan": {
"root": "https://konachan.com",
@@ -61,6 +62,7 @@ BASE_PATTERN = MoebooruExtractor.update({
},
"hypnohub": {
"root": "https://hypnohub.net",
+ "pattern": r"hypnohub\.net",
},
"sakugabooru": {
"root": "https://www.sakugabooru.com",
@@ -68,6 +70,7 @@ BASE_PATTERN = MoebooruExtractor.update({
},
"lolibooru": {
"root": "https://lolibooru.moe",
+ "pattern": r"lolibooru\.moe",
},
})
diff --git a/gallery_dl/extractor/naverwebtoon.py b/gallery_dl/extractor/naverwebtoon.py
index 348f6a1..eadd460 100644
--- a/gallery_dl/extractor/naverwebtoon.py
+++ b/gallery_dl/extractor/naverwebtoon.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2021 Seonghyeon Cho
+# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,8 +11,10 @@
from .common import GalleryExtractor, Extractor, Message
from .. import text
+import re
-BASE_PATTERN = r"(?:https?://)?comic\.naver\.com/webtoon"
+BASE_PATTERN = (r"(?:https?://)?comic\.naver\.com"
+ r"/(webtoon|challenge|bestChallenge)")
class NaverwebtoonBase():
@@ -25,19 +28,33 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
directory_fmt = ("{category}", "{comic}")
filename_fmt = "{episode:>03}-{num:>02}.{extension}"
archive_fmt = "{title_id}_{episode}_{num}"
- pattern = BASE_PATTERN + r"/detail\.nhn\?([^#]+)"
+ pattern = BASE_PATTERN + r"/detail(?:\.nhn)?\?([^#]+)"
test = (
- (("https://comic.naver.com/webtoon/detail.nhn?"
- "titleId=26458&no=1&weekday=tue"), {
+ (("https://comic.naver.com/webtoon/detail"
+ "?titleId=26458&no=1&weekday=tue"), {
"url": "47a956ba8c7a837213d5985f50c569fcff986f75",
"content": "3806b6e8befbb1920048de9888dfce6220f69a60",
"count": 14
}),
+ (("https://comic.naver.com/challenge/detail"
+ "?titleId=765124&no=1"), {
+ "pattern": r"https://image-comic\.pstatic\.net/nas"
+ r"/user_contents_data/challenge_comic/2021/01/19"
+ r"/342586/upload_7149856273586337846\.jpeg",
+ "count": 1,
+ }),
+ (("https://comic.naver.com/bestChallenge/detail.nhn"
+ "?titleId=771467&no=3"), {
+ "pattern": r"https://image-comic\.pstatic\.net/nas"
+ r"/user_contents_data/challenge_comic/2021/04/28"
+ r"/345534/upload_3617293622396203109\.jpeg",
+ "count": 1,
+ }),
)
def __init__(self, match):
- query = match.group(1)
- url = "{}/webtoon/detail.nhn?{}".format(self.root, query)
+ path, query = match.groups()
+ url = "{}/{}/detail?{}".format(self.root, path, query)
GalleryExtractor.__init__(self, match, url)
query = text.parse_query(query)
@@ -70,22 +87,31 @@ class NaverwebtoonEpisodeExtractor(NaverwebtoonBase, GalleryExtractor):
class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor):
subcategory = "comic"
categorytransfer = True
- pattern = (BASE_PATTERN + r"/list\.nhn\?([^#]+)")
+ pattern = (BASE_PATTERN + r"/list(?:\.nhn)?\?([^#]+)")
test = (
- ("https://comic.naver.com/webtoon/list.nhn?titleId=22073", {
+ ("https://comic.naver.com/webtoon/list?titleId=22073", {
"pattern": NaverwebtoonEpisodeExtractor.pattern,
"count": 32,
}),
+ ("https://comic.naver.com/challenge/list?titleId=765124", {
+ "pattern": NaverwebtoonEpisodeExtractor.pattern,
+ "count": 25,
+ }),
+ ("https://comic.naver.com/bestChallenge/list.nhn?titleId=789786", {
+ "pattern": NaverwebtoonEpisodeExtractor.pattern,
+ "count": ">= 12",
+ }),
)
def __init__(self, match):
Extractor.__init__(self, match)
- query = text.parse_query(match.group(1))
+ self.path, query = match.groups()
+ query = text.parse_query(query)
self.title_id = query.get("titleId")
self.page_no = text.parse_int(query.get("page"), 1)
def items(self):
- url = self.root + "/webtoon/list.nhn"
+ url = "{}/{}/list".format(self.root, self.path)
params = {"titleId": self.title_id, "page": self.page_no}
data = {"_extractor": NaverwebtoonEpisodeExtractor}
@@ -103,7 +129,8 @@ class NaverwebtoonComicExtractor(NaverwebtoonBase, Extractor):
def get_episode_urls(self, page):
"""Extract and return all episode urls in page"""
return [
- self.root + "/webtoon/detail.nhn?" + query
- for query in text.extract_iter(
- page, '<a href="/webtoon/detail?', '"')
+ self.root + path
+ for path in re.findall(
+ r'<a href="(/(?:webtoon|challenge|bestChallenge)'
+ r'/detail\?[^"]+)', page)
][::2]
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 90ca01d..832831f 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -91,6 +91,10 @@ class NijieExtractor(AsynchronousMixin, Extractor):
"url": url,
})
+ @staticmethod
+ def _extract_user_name(page):
+ return text.unescape(text.extract(page, "<br />", "<")[0] or "")
+
def login(self):
"""Login and obtain session cookies"""
if not self._check_cookies(self.cookienames):
@@ -119,9 +123,8 @@ class NijieExtractor(AsynchronousMixin, Extractor):
while True:
page = self.request(url, params=params, notfound="artist").text
- if not self.user_name:
- self.user_name = text.unescape(text.extract(
- page, '<br />', '<')[0] or "")
+ if self.user_name is None:
+ self.user_name = self._extract_user_name(page)
yield from text.extract_iter(page, 'illust_id="', '"')
if '<a rel="next"' not in page:
@@ -137,11 +140,12 @@ class NijieUserExtractor(NijieExtractor):
test = ("https://nijie.info/members.php?id=44",)
def items(self):
- base = "{}/{{}}.php?id={}".format(self.root, self.user_id)
+ fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format
return self._dispatch_extractors((
- (NijieIllustrationExtractor, base.format("members_illust")),
- (NijieDoujinExtractor , base.format("members_dojin")),
- (NijieFavoriteExtractor , base.format("user_like_illust_view")),
+ (NijieIllustrationExtractor, fmt("members_illust")),
+ (NijieDoujinExtractor , fmt("members_dojin")),
+ (NijieFavoriteExtractor , fmt("user_like_illust_view")),
+ (NijieNuitaExtractor , fmt("history_nuita")),
), ("illustration", "doujin"))
@@ -217,6 +221,36 @@ class NijieFavoriteExtractor(NijieExtractor):
return data
+class NijieNuitaExtractor(NijieExtractor):
+ """Extractor for a nijie user's 抜いた list"""
+ subcategory = "nuita"
+ directory_fmt = ("{category}", "nuita", "{user_id}")
+ archive_fmt = "n_{user_id}_{image_id}_{num}"
+ pattern = BASE_PATTERN + r"/history_nuita\.php\?id=(\d+)"
+ test = ("https://nijie.info/history_nuita.php?id=728995", {
+ "range": "1-10",
+ "count": 10,
+ "keyword": {
+ "user_id" : 728995,
+ "user_name": "莚",
+ },
+ })
+
+ def image_ids(self):
+ return self._pagination("history_nuita")
+
+ def _extract_data(self, page):
+ data = NijieExtractor._extract_data(page)
+ data["user_id"] = self.user_id
+ data["user_name"] = self.user_name
+ return data
+
+ @staticmethod
+ def _extract_user_name(page):
+ return text.unescape(text.extract(
+ page, "<title>", "さんの抜いた")[0] or "")
+
+
class NijieImageExtractor(NijieExtractor):
"""Extractor for a work/image from nijie.info"""
subcategory = "image"
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 428f772..653822f 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -11,6 +11,7 @@
from .common import Extractor, Message
from . import deviantart, flickr, mastodon, pixiv, reddit, smugmug, tumblr
from .. import text, oauth, util, config, exception
+from ..output import stdout_write
from ..cache import cache
import urllib.parse
import hashlib
@@ -37,7 +38,7 @@ class OAuthBase(Extractor):
def recv(self):
"""Open local HTTP server and recv callback parameters"""
import socket
- print("Waiting for response. (Cancel with Ctrl+c)")
+ stdout_write("Waiting for response. (Cancel with Ctrl+c)\n")
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.bind(("localhost", self.config("port", 6414)))
@@ -60,7 +61,7 @@ class OAuthBase(Extractor):
def send(self, msg):
"""Send 'msg' to the socket opened in 'recv()'"""
- print(msg)
+ stdout_write(msg)
self.client.send(b"HTTP/1.1 200 OK\r\n\r\n" + msg.encode())
self.client.close()
@@ -69,12 +70,13 @@ class OAuthBase(Extractor):
import webbrowser
url += "?" + urllib.parse.urlencode(params)
if not self.config("browser", True) or not webbrowser.open(url):
- print("Please open this URL in your browser:")
- print(url, end="\n\n", flush=True)
+ stdout_write(
+ "Please open this URL in your browser:\n\n" + url + "\n\n")
return (recv or self.recv)()
def error(self, msg):
- return self.send("Remote server reported an error:\n\n" + str(msg))
+ return self.send(
+ "Remote server reported an error:\n\n{}\n".format(msg))
def _oauth1_authorization_flow(
self, request_token_url, authorize_url, access_token_url):
@@ -133,7 +135,7 @@ class OAuthBase(Extractor):
# check authorization response
if state != params.get("state"):
- self.send("'state' mismatch: expected {}, got {}.".format(
+ self.send("'state' mismatch: expected {}, got {}.\n".format(
state, params.get("state")
))
return
@@ -188,7 +190,7 @@ class OAuthBase(Extractor):
opt = self.oauth_config(names[0])
if self.cache and (opt is None or opt == "cache"):
- msg += _vh + " been cached and will automatically be used."
+ msg += _vh + " been cached and will automatically be used.\n"
else:
msg += "Put " + _va + " into your configuration file as \n"
msg += " and\n".join(
@@ -200,7 +202,7 @@ class OAuthBase(Extractor):
"\nor set\n'extractor.{}.{}' to \"cache\""
.format(self.subcategory, names[0])
)
- msg += "\nto use {}.".format(_it)
+ msg += "\nto use {}.\n".format(_it)
return msg
@@ -398,9 +400,9 @@ class OAuthPixiv(OAuthBase):
data = self.session.post(url, headers=headers, data=data).json()
if "error" in data:
- print(data)
+ stdout_write("\n{}\n".format(data))
if data["error"] in ("invalid_request", "invalid_grant"):
- print("'code' expired, try again")
+ stdout_write("'code' expired, try again\n\n")
return
token = data["refresh_token"]
@@ -409,10 +411,10 @@ class OAuthPixiv(OAuthBase):
pixiv._refresh_token_cache.update(username, token)
self.log.info("Writing 'refresh-token' to cache")
- print(self._generate_message(("refresh-token",), (token,)))
+ stdout_write(self._generate_message(("refresh-token",), (token,)))
def _input(self):
- print("""
+ stdout_write("""\
1) Open your browser's Developer Tools (F12) and switch to the Network tab
2) Login
3) Select the last network monitor entry ('callback?state=...')
@@ -421,6 +423,7 @@ class OAuthPixiv(OAuthBase):
- This 'code' will expire 30 seconds after logging in.
- Copy-pasting more than just the 'code' value will work as well,
like the entire URL or several query parameters.
+
""")
code = input("code: ")
return code.rpartition("=")[2].strip()
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 92b8113..951b34d 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021 Mike Fährmann
+# Copyright 2021-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -58,12 +58,21 @@ class PhilomenaExtractor(BooruExtractor):
INSTANCES = {
- "derpibooru": {"root": "https://derpibooru.org",
- "filter_id": "56027"},
- "ponybooru" : {"root": "https://ponybooru.org",
- "filter_id": "2"},
- "furbooru" : {"root": "https://furbooru.org",
- "filter_id": "2"},
+ "derpibooru": {
+ "root": "https://derpibooru.org",
+ "pattern": r"derpibooru\.org",
+ "filter_id": "56027",
+ },
+ "ponybooru": {
+ "root": "https://ponybooru.org",
+ "pattern": r"ponybooru\.org",
+ "filter_id": "2",
+ },
+ "furbooru": {
+ "root": "https://furbooru.org",
+ "pattern": r"furbooru\.org",
+ "filter_id": "2",
+ },
}
BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
@@ -239,5 +248,5 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor):
def posts(self):
gallery_id = "gallery_id:" + self.gallery_id
url = self.root + "/api/v1/json/search/images"
- params = {"sd": "desc", "sf": gallery_id, "q" : gallery_id}
+ params = {"sd": "desc", "sf": gallery_id, "q": gallery_id}
return self._pagination(url, params)
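
Instance tables throughout this commit gain explicit 'pattern' keys. Presumably the extractors' update() classmethod folds these per-instance domain patterns into the alternation that each subclass pattern builds on (a sketch under that assumption; the real method also registers category and root lookups):

    # Sketch (assumption): folding per-instance patterns into one alternation
    INSTANCES = {
        "derpibooru": {"root": "https://derpibooru.org", "pattern": r"derpibooru\.org"},
        "ponybooru" : {"root": "https://ponybooru.org",  "pattern": r"ponybooru\.org"},
        "furbooru"  : {"root": "https://furbooru.org",   "pattern": r"furbooru\.org"},
    }
    BASE_PATTERN = r"(?:https?://)?(?:www\.)?(?:" + "|".join(
        i["pattern"] for i in INSTANCES.values()) + r")"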
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index a33df42..9b35e42 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2021 Mike Fährmann
+# Copyright 2014-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -10,16 +10,16 @@
from .common import Extractor, Message
from .. import text, util, exception
-from ..cache import cache
+from ..cache import cache, memcache
from datetime import datetime, timedelta
import itertools
import hashlib
-import time
class PixivExtractor(Extractor):
"""Base class for pixiv extractors"""
category = "pixiv"
+ root = "https://www.pixiv.net"
directory_fmt = ("{category}", "{user[id]} {user[account]}")
filename_fmt = "{id}_p{num}.{extension}"
archive_fmt = "{id}{suffix}.{extension}"
@@ -73,7 +73,14 @@ class PixivExtractor(Extractor):
if work["type"] == "ugoira":
if not self.load_ugoira:
continue
- ugoira = self.api.ugoira_metadata(work["id"])
+
+ try:
+ ugoira = self.api.ugoira_metadata(work["id"])
+ except exception.StopExtraction as exc:
+ self.log.warning(
+                    "Unable to retrieve Ugoira metadata (%s - %s)",
+ work.get("id"), exc.message)
+ continue
url = ugoira["zip_urls"]["medium"].replace(
"_ugoira600x600", "_ugoira1920x1080")
@@ -91,22 +98,70 @@ class PixivExtractor(Extractor):
work["suffix"] = "_p{:02}".format(work["num"])
yield Message.Url, url, text.nameext_from_url(url, work)
+ @staticmethod
+ def _make_work(kind, url, user):
+ p = url.split("/")
+ return {
+ "create_date" : "{}-{}-{}T{}:{}:{}+09:00".format(
+ p[5], p[6], p[7], p[8], p[9], p[10]) if len(p) > 9 else None,
+ "height" : 0,
+ "id" : kind,
+ "image_urls" : None,
+ "meta_pages" : (),
+ "meta_single_page": {"original_image_url": url},
+ "page_count" : 1,
+ "sanity_level" : 0,
+ "tags" : (),
+ "title" : kind,
+ "type" : kind,
+ "user" : user,
+ "width" : 0,
+ "x_restrict" : 0,
+ }
+
def works(self):
- """Return an iterable containing all relevant 'work'-objects"""
+ """Return an iterable containing all relevant 'work' objects"""
def metadata(self):
- """Collect metadata for extractor-job"""
+ """Collect metadata for extractor job"""
return {}
class PixivUserExtractor(PixivExtractor):
- """Extractor for works of a pixiv user"""
+ """Extractor for a pixiv user profile"""
subcategory = "user"
pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:"
- r"(?:en/)?users/(\d+)(?:/(?:artworks|illustrations|manga)"
- r"(?:/([^/?#]+))?)?/?(?:$|[?#])"
- r"|member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?"
- r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))")
+ r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id="
+ r")(\d+)(?:$|[?#])")
+ test = (
+ ("https://www.pixiv.net/en/users/173530"),
+ ("https://www.pixiv.net/u/173530"),
+ ("https://www.pixiv.net/member.php?id=173530"),
+ ("https://www.pixiv.net/mypage.php#id=173530"),
+ ("https://www.pixiv.net/#id=173530"),
+ )
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def items(self):
+ base = "{}/users/{}/".format(self.root, self.user_id)
+ return self._dispatch_extractors((
+ (PixivAvatarExtractor , base + "avatar"),
+ (PixivBackgroundExtractor, base + "background"),
+ (PixivArtworksExtractor , base + "artworks"),
+ (PixivFavoriteExtractor , base + "bookmarks/artworks"),
+ ), ("artworks",))
+
+
+class PixivArtworksExtractor(PixivExtractor):
+ """Extractor for artworks of a pixiv user"""
+ subcategory = "artworks"
+ pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:"
+ r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)"
+ r"(?:/([^/?#]+))?/?(?:$|[?#])"
+ r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)")
test = (
("https://www.pixiv.net/en/users/173530/artworks", {
"url": "852c31ad83b6840bacbce824d85f2a997889efb7",
@@ -120,47 +175,30 @@ class PixivUserExtractor(PixivExtractor):
"&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), {
"url": "25b1cd81153a8ff82eec440dd9f20a4a22079658",
}),
- # avatar (#595, #623, #1124)
- ("https://www.pixiv.net/en/users/173530", {
- "options": (("avatar", True),),
- "content": "4e57544480cc2036ea9608103e8f024fa737fe66",
- "range": "1",
- }),
- # background (#623, #1124, #2495)
- ("https://www.pixiv.net/en/users/194921", {
- "options": (("background", True),),
- "content": "aeda3536003ea3002f70657cb93c5053f26f5843",
- "range": "1",
- }),
# deleted account
("http://www.pixiv.net/member_illust.php?id=173531", {
"options": (("metadata", True),),
"exception": exception.NotFoundError,
}),
- ("https://www.pixiv.net/en/users/173530"),
("https://www.pixiv.net/en/users/173530/manga"),
("https://www.pixiv.net/en/users/173530/illustrations"),
("https://www.pixiv.net/member_illust.php?id=173530"),
- ("https://www.pixiv.net/u/173530"),
- ("https://www.pixiv.net/user/173530"),
- ("https://www.pixiv.net/mypage.php#id=173530"),
- ("https://www.pixiv.net/#id=173530"),
("https://touch.pixiv.net/member_illust.php?id=173530"),
)
def __init__(self, match):
PixivExtractor.__init__(self, match)
- u1, t1, u2, t2, u3 = match.groups()
+ u1, t1, u2, t2 = match.groups()
if t1:
t1 = text.unquote(t1)
elif t2:
t2 = text.parse_query(t2).get("tag")
- self.user_id = u1 or u2 or u3
+ self.user_id = u1 or u2
self.tag = t1 or t2
def metadata(self):
if self.config("metadata"):
- return {"user": self.api.user_detail(self.user_id)["user"]}
+ return self.api.user_detail(self.user_id)
return {}
def works(self):
@@ -173,54 +211,60 @@ class PixivUserExtractor(PixivExtractor):
if tag in [t["name"].lower() for t in work["tags"]]
)
- avatar = self.config("avatar")
- background = self.config("background")
- if avatar or background:
- work_list = []
- detail = self.api.user_detail(self.user_id)
- user = detail["user"]
-
- if avatar:
- url = user["profile_image_urls"]["medium"]
- work_list.append((self._make_work(
- "avatar", url.replace("_170.", "."), user),))
-
- if background:
- url = detail["profile"]["background_image_url"]
- if url:
- if "/c/" in url:
- parts = url.split("/")
- del parts[3:5]
- url = "/".join(parts)
- url = url.replace("_master1200.", ".")
- work = self._make_work("background", url, user)
- if url.endswith(".jpg"):
- work["_fallback"] = (url[:-4] + ".png",)
- work_list.append((work,))
-
- work_list.append(works)
- works = itertools.chain.from_iterable(work_list)
-
return works
- @staticmethod
- def _make_work(kind, url, user):
- return {
- "create_date" : None,
- "height" : 0,
- "id" : kind,
- "image_urls" : None,
- "meta_pages" : (),
- "meta_single_page": {"original_image_url": url},
- "page_count" : 1,
- "sanity_level" : 0,
- "tags" : (),
- "title" : kind,
- "type" : kind,
- "user" : user,
- "width" : 0,
- "x_restrict" : 0,
- }
+
+class PixivAvatarExtractor(PixivExtractor):
+ """Extractor for pixiv avatars"""
+ subcategory = "avatar"
+ filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}"
+ archive_fmt = "avatar_{user[id]}_{date}"
+ pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
+ r"/(?:en/)?users/(\d+)/avatar")
+ test = ("https://www.pixiv.net/en/users/173530/avatar", {
+ "content": "4e57544480cc2036ea9608103e8f024fa737fe66",
+ })
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def works(self):
+ user = self.api.user_detail(self.user_id)["user"]
+ url = user["profile_image_urls"]["medium"].replace("_170.", ".")
+ return (self._make_work("avatar", url, user),)
+
+
+class PixivBackgroundExtractor(PixivExtractor):
+ """Extractor for pixiv background banners"""
+ subcategory = "background"
+    filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}"
+ archive_fmt = "background_{user[id]}_{date}"
+ pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net"
+ r"/(?:en/)?users/(\d+)/background")
+ test = ("https://www.pixiv.net/en/users/194921/background", {
+ "pattern": r"https://i\.pximg\.net/background/img/2021/01/30/16/12/02"
+ r"/194921_af1f71e557a42f499213d4b9eaccc0f8\.jpg",
+ })
+
+ def __init__(self, match):
+ PixivExtractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def works(self):
+ detail = self.api.user_detail(self.user_id)
+ url = detail["profile"]["background_image_url"]
+ if not url:
+ return ()
+ if "/c/" in url:
+ parts = url.split("/")
+ del parts[3:5]
+ url = "/".join(parts)
+ url = url.replace("_master1200.", ".")
+ work = self._make_work("background", url, detail["user"])
+ if url.endswith(".jpg"):
+ work["_fallback"] = (url[:-4] + ".png",)
+ return (work,)
class PixivMeExtractor(PixivExtractor):
@@ -312,10 +356,10 @@ class PixivFavoriteExtractor(PixivExtractor):
r"|bookmark\.php)(?:\?([^#]*))?")
test = (
("https://www.pixiv.net/en/users/173530/bookmarks/artworks", {
- "url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
+ "url": "85a3104eaaaf003c7b3947117ca2f1f0b1cfc949",
}),
("https://www.pixiv.net/bookmark.php?id=173530", {
- "url": "e717eb511500f2fa3497aaee796a468ecf685cc4",
+ "url": "85a3104eaaaf003c7b3947117ca2f1f0b1cfc949",
}),
# bookmarks with specific tag
(("https://www.pixiv.net/en/users/3137110"
@@ -735,66 +779,70 @@ class PixivAppAPI():
def illust_detail(self, illust_id):
params = {"illust_id": illust_id}
- return self._call("v1/illust/detail", params)["illust"]
+ return self._call("/v1/illust/detail", params)["illust"]
def illust_follow(self, restrict="all"):
params = {"restrict": restrict}
- return self._pagination("v2/illust/follow", params)
+ return self._pagination("/v2/illust/follow", params)
def illust_ranking(self, mode="day", date=None):
params = {"mode": mode, "date": date}
- return self._pagination("v1/illust/ranking", params)
+ return self._pagination("/v1/illust/ranking", params)
def illust_related(self, illust_id):
params = {"illust_id": illust_id}
- return self._pagination("v2/illust/related", params)
+ return self._pagination("/v2/illust/related", params)
def search_illust(self, word, sort=None, target=None, duration=None,
date_start=None, date_end=None):
params = {"word": word, "search_target": target,
"sort": sort, "duration": duration,
"start_date": date_start, "end_date": date_end}
- return self._pagination("v1/search/illust", params)
+ return self._pagination("/v1/search/illust", params)
def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
params = {"user_id": user_id, "tag": tag, "restrict": restrict}
- return self._pagination("v1/user/bookmarks/illust", params)
+ return self._pagination("/v1/user/bookmarks/illust", params)
+ @memcache(keyarg=1)
def user_detail(self, user_id):
params = {"user_id": user_id}
- return self._call("v1/user/detail", params)
+ return self._call("/v1/user/detail", params)
def user_following(self, user_id, restrict="public"):
params = {"user_id": user_id, "restrict": restrict}
- return self._pagination("v1/user/following", params, "user_previews")
+ return self._pagination("/v1/user/following", params, "user_previews")
def user_illusts(self, user_id):
params = {"user_id": user_id}
- return self._pagination("v1/user/illusts", params)
+ return self._pagination("/v1/user/illusts", params)
def ugoira_metadata(self, illust_id):
params = {"illust_id": illust_id}
- return self._call("v1/ugoira/metadata", params)["ugoira_metadata"]
+ return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"]
def _call(self, endpoint, params=None):
- url = "https://app-api.pixiv.net/" + endpoint
+ url = "https://app-api.pixiv.net" + endpoint
+
+ while True:
+ self.login()
+ response = self.extractor.request(url, params=params, fatal=False)
+ data = response.json()
- self.login()
- response = self.extractor.request(url, params=params, fatal=False)
- data = response.json()
+ if "error" not in data:
+ return data
+
+ self.log.debug(data)
- if "error" in data:
if response.status_code == 404:
raise exception.NotFoundError()
error = data["error"]
if "rate limit" in (error.get("message") or "").lower():
- self.log.info("Waiting two minutes for API rate limit reset.")
- time.sleep(120)
- return self._call(endpoint, params)
- raise exception.StopExtraction("API request failed: %s", error)
+ self.extractor.wait(seconds=300)
+ continue
- return data
+ raise exception.StopExtraction("API request failed: %s", error)
def _pagination(self, endpoint, params, key="illusts"):
while True:
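
The relocated _make_work() now also derives 'create_date' for avatar and background files: pixiv original-image URLs embed the upload timestamp as six consecutive path segments. A worked example with a made-up URL:

    # Recovering create_date from a pixiv image URL (URL is made up)
    url = "https://i.pximg.net/img-original/img/2021/01/30/16/12/02/12345678_p0.png"
    p = url.split("/")
    create_date = ("{}-{}-{}T{}:{}:{}+09:00".format(
        p[5], p[6], p[7], p[8], p[9], p[10]) if len(p) > 9 else None)
    # '2021-01-30T16:12:02+09:00'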
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index b3a620a..db8d700 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -160,6 +160,7 @@ BASE_PATTERN = ReactorExtractor.update({
},
"thatpervert": {
"root": "http://thatpervert.com",
+ "pattern": r"thatpervert\.com",
},
})
diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py
index c8b8c9a..16b9191 100644
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@@ -24,6 +24,7 @@ class ReadcomiconlineBase():
archive_fmt = "{issue_id}_{page}"
root = "https://readcomiconline.li"
browser = "firefox"
+ request_interval = (1, 9)
def request(self, url, **kwargs):
"""Detect and handle redirects to CAPTCHA pages"""
@@ -85,7 +86,7 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
return [
(beau(url), None)
for url in text.extract_iter(
- page, 'lstImages.push("', '"'
+ page, "lstImages.push('", "'",
)
]
@@ -129,10 +130,13 @@ class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
def beau(url):
- """https://readcomiconline.li/Scripts/rguard.min.js?v=1.1"""
+ """https://readcomiconline.li/Scripts/rguard.min.js"""
if url.startswith("https"):
return url
+ url = url.replace("_x236", "d")
+ url = url.replace("_x945", "g")
+
containsS0 = "=s0" in url
url = url[:-3 if containsS0 else -6]
url = url[4:22] + url[25:]
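
beau() gains two extra token substitutions ahead of the existing trim-and-splice (URLs already starting with "https" pass through untouched). A sketch of the sequence on a made-up scrambled path:

    # Sketch of the descrambling steps on a made-up input
    url = "abcd_x236efgh_x945ijklmnopqrstuvwxyz=s0"
    url = url.replace("_x236", "d").replace("_x945", "g")
    contains_s0 = "=s0" in url
    url = url[:-3 if contains_s0 else -6]  # drop "=s0" or a 6-char tail
    url = url[4:22] + url[25:]             # splice out scrambling characters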
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index f276e84..f2bf3cb 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2021 Mike Fährmann
+# Copyright 2019-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -43,19 +43,45 @@ class ShopifyExtractor(BaseExtractor):
BASE_PATTERN = ShopifyExtractor.update({
+ "chelseacrew": {
+ "root": "https://chelseacrew.com",
+ "pattern": r"(?:www\.)?chelseacrew\.com",
+ },
"fashionnova": {
"root": "https://www.fashionnova.com",
"pattern": r"(?:www\.)?fashionnova\.com",
},
+ "loungeunderwear": {
+ "root": "https://loungeunderwear.com",
+ "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com",
+ },
+ "michaelscameras": {
+ "root": "https://michaels.com.au",
+ "pattern": r"michaels\.com\.au",
+ },
+ "modcloth": {
+ "root": "https://modcloth.com",
+ "pattern": r"modcloth\.com",
+ },
"omgmiamiswimwear": {
"root": "https://www.omgmiamiswimwear.com",
+ "pattern": r"(?:www\.)?omgmiamiswimwear\.com",
+ },
+ "pinupgirlclothing": {
+ "root": "https://pinupgirlclothing.com",
+ "pattern": r"pinupgirlclothing\.com",
+ },
+ "raidlondon": {
+ "root": "https://www.raidlondon.com",
+ "pattern": r"(?:www\.)?raidlondon\.com",
+ },
+ "unique-vintage": {
+ "root": "https://www.unique-vintage.com",
+ "pattern": r"(?:www\.)?unique\-vintage\.com",
},
"windsorstore": {
"root": "https://www.windsorstore.com",
- },
- "loungeunderwear": {
- "root": "https://loungeunderwear.com",
- "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com",
+ "pattern": r"(?:www\.)?windsorstore\.com",
},
})
@@ -66,15 +92,21 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
directory_fmt = ("{category}", "{collection[title]}")
pattern = BASE_PATTERN + r"(/collections/[\w-]+)/?(?:$|[?#])"
test = (
+ ("https://chelseacrew.com/collections/flats"),
("https://www.fashionnova.com/collections/mini-dresses", {
"range": "1-20",
"count": 20,
}),
("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
("https://www.fashionnova.com/collections/mini-dresses#1"),
+ ("https://loungeunderwear.com/collections/apparel"),
+ ("https://michaels.com.au/collections/microphones"),
+ ("https://modcloth.com/collections/shoes"),
("https://www.omgmiamiswimwear.com/collections/fajas"),
+ ("https://pinupgirlclothing.com/collections/evening"),
+ ("https://www.raidlondon.com/collections/flats"),
+ ("https://www.unique-vintage.com/collections/flapper-1920s"),
("https://www.windsorstore.com/collections/dresses-ball-gowns"),
- ("https://loungeunderwear.com/collections/apparel"),
)
def metadata(self):
@@ -99,18 +131,28 @@ class ShopifyProductExtractor(ShopifyExtractor):
directory_fmt = ("{category}", "Products")
pattern = BASE_PATTERN + r"((?:/collections/[\w-]+)?/products/[\w-]+)"
test = (
+ ("https://chelseacrew.com/collections/flats/products/dora"),
("https://www.fashionnova.com/products/essential-slide-red", {
"pattern": r"https?://cdn\d*\.shopify.com/",
"count": 3,
}),
+ ("https://www.fashionnova.com/collections/flats/products/name"),
+ ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"),
+ ("https://michaels.com.au/collections/audio/products"
+ "/boya-by-wm4-pro-k5-2-4ghz-mic-android-1-1-101281"),
+ ("https://modcloth.com/collections/shoes/products/heidii-brn"),
("https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", {
"pattern": r"https://cdn\.shopify\.com/s/files/1/1819/6171/",
"count": 5,
}),
- ("https://www.fashionnova.com/collections/flats/products/name"),
+ ("https://pinupgirlclothing.com/collections/evening/products"
+ "/clarice-coat-dress-in-olive-green-poly-crepe-laura-byrnes-design"),
+ ("https://www.raidlondon.com/collections/flats/products"
+ "/raid-addyson-chunky-flat-shoe-in-white"),
+ ("https://www.unique-vintage.com/collections/flapper-1920s/products"
+ "/unique-vintage-plus-size-black-silver-beaded-troyes-flapper-dress"),
("https://www.windsorstore.com/collections/accessories-belts/products"
"/rhine-buckle-dbl-o-ring-pu-strap-belt-073010158001"),
- ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"),
)
def products(self):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 4c947e7..2737d34 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -15,7 +15,7 @@ import json
BASE_PATTERN = (
r"(?:https?://)?(?:www\.|mobile\.)?"
- r"(?:(?:fx)?twitter\.com|nitter\.net)"
+ r"(?:(?:[fv]x)?twitter\.com|nitter\.net)"
)
@@ -39,7 +39,7 @@ class TwitterExtractor(Extractor):
self.pinned = self.config("pinned", False)
self.quoted = self.config("quoted", False)
self.videos = self.config("videos", True)
- self.cards = self.config("cards", True)
+ self.cards = self.config("cards", False)
self._user_cache = {}
self._init_sizes()
@@ -104,6 +104,7 @@ class TwitterExtractor(Extractor):
def _extract_media(self, tweet, entities, files):
for media in entities:
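+            # alt text of this media item, exposed as "description" (#2617)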
+ descr = media.get("ext_alt_text")
width = media["original_info"].get("width", 0)
height = media["original_info"].get("height", 0)
@@ -112,9 +113,10 @@ class TwitterExtractor(Extractor):
files.append({
"url": "ytdl:{}/i/web/status/{}".format(
self.root, tweet["id_str"]),
- "width" : width,
- "height" : height,
- "extension": None,
+ "width" : width,
+ "height" : height,
+ "extension" : None,
+ "description": descr,
})
elif self.videos:
video_info = media["video_info"]
@@ -123,22 +125,24 @@ class TwitterExtractor(Extractor):
key=lambda v: v.get("bitrate", 0),
)
files.append({
- "url" : variant["url"],
- "width" : width,
- "height" : height,
- "bitrate" : variant.get("bitrate", 0),
- "duration": video_info.get(
+ "url" : variant["url"],
+ "width" : width,
+ "height" : height,
+ "bitrate" : variant.get("bitrate", 0),
+ "duration" : video_info.get(
"duration_millis", 0) / 1000,
+ "description": descr,
})
elif "media_url_https" in media:
url = media["media_url_https"]
base, _, fmt = url.rpartition(".")
base += "?format=" + fmt + "&name="
files.append(text.nameext_from_url(url, {
- "url" : base + self._size_image,
- "width" : width,
- "height" : height,
- "_fallback": self._image_fallback(base),
+ "url" : base + self._size_image,
+ "width" : width,
+ "height" : height,
+ "_fallback" : self._image_fallback(base),
+ "description": descr,
}))
else:
files.append({"url": media["media_url"]})
@@ -323,6 +327,9 @@ class TwitterExtractor(Extractor):
elif userfmt == "media":
cls = TwitterMediaExtractor
fmt = (self.root + "/id:{rest_id}/media").format_map
+ elif userfmt == "tweets":
+ cls = TwitterTweetsExtractor
+ fmt = (self.root + "/id:{rest_id}/tweets").format_map
else:
cls = None
fmt = userfmt.format_map
@@ -383,7 +390,7 @@ class TwitterExtractor(Extractor):
class TwitterTimelineExtractor(TwitterExtractor):
- """Extractor for Tweets from a user's timeline"""
+ """Extractor for a Twitter user timeline"""
subcategory = "timeline"
pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])"
r"|i(?:/user/|ntent/user\?user_id=)(\d+))")
@@ -400,6 +407,8 @@ class TwitterTimelineExtractor(TwitterExtractor):
("https://www.twitter.com/id:2976459548"),
("https://twitter.com/i/user/2976459548"),
("https://twitter.com/intent/user?user_id=2976459548"),
+ ("https://fxtwitter.com/supernaturepics"),
+ ("https://vxtwitter.com/supernaturepics"),
)
def __init__(self, match):
@@ -409,6 +418,52 @@ class TwitterTimelineExtractor(TwitterExtractor):
self.user = "id:" + user_id
def tweets(self):
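+        # the Media timeline omits retweets, so use the full Tweets
+        # timeline only when retweets are requested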
+ tweets = (self.api.user_tweets(self.user) if self.retweets else
+ self.api.user_media(self.user))
+
+ # yield initial batch of (media) tweets
+ tweet = None
+ for tweet in tweets:
+ yield tweet
+
+ if tweet is None:
+ return
+
+ # get username
+ if not self.user.startswith("id:"):
+ username = self.user
+ elif "core" in tweet:
+ username = (tweet["core"]["user_results"]["result"]
+ ["legacy"]["screen_name"])
+ else:
+ username = tweet["user"]["screen_name"]
+
+ # get tweet data
+ if "legacy" in tweet:
+ tweet = tweet["legacy"]
+
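+        # search results are not limited to the ~3200 most recent Tweets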
+        # yield search results starting from last tweet id
+        yield from self.api.search_adaptive(
+            "from:{} include:retweets include:nativeretweets max_id:{} "
+            "filter:images OR card_name:animated_gif OR filter:native_video"
+            .format(username, tweet["id_str"])
+        )
+
+
+class TwitterTweetsExtractor(TwitterExtractor):
+ """Extractor for Tweets from a user's Tweets timeline"""
+ subcategory = "tweets"
+ pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/tweets(?!\w)"
+ test = (
+ ("https://twitter.com/supernaturepics/tweets", {
+ "range": "1-40",
+ "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40",
+ }),
+ ("https://mobile.twitter.com/supernaturepics/tweets#t"),
+ ("https://www.twitter.com/id:2976459548/tweets"),
+ )
+
+ def tweets(self):
return self.api.user_tweets(self.user)
@@ -662,6 +717,10 @@ class TwitterTweetExtractor(TwitterExtractor):
"options": (("syndication", True),),
"count": 1,
}),
+ # media alt texts / descriptions (#2617)
+ ("https://twitter.com/my0nruri/status/1528379296041299968", {
+ "keyword": {"description": "oc"}
+ }),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/vk.py b/gallery_dl/extractor/vk.py
index 8fb9bbf..23f6ea2 100644
--- a/gallery_dl/extractor/vk.py
+++ b/gallery_dl/extractor/vk.py
@@ -9,7 +9,7 @@
"""Extractors for https://vk.com/"""
from .common import Extractor, Message
-from .. import text
+from .. import text, exception
BASE_PATTERN = r"(?:https://)?(?:www\.|m\.)?vk\.com"
@@ -39,9 +39,15 @@ class VkExtractor(Extractor):
self.log.warning("no photo URL found (%s)", photo.get("id"))
continue
- photo.update(data)
- photo["url"], photo["width"], photo["height"] = photo[size]
+ try:
+ photo["url"], photo["width"], photo["height"] = photo[size]
+ except ValueError:
+ # photo without width/height entries (#2535)
+ photo["url"] = photo[size + "src"]
+ photo["width"] = photo["height"] = 0
+
photo["id"] = photo["id"].rpartition("_")[2]
+ photo.update(data)
text.nameext_from_url(photo["url"], photo)
yield Message.Url, photo["url"], photo
@@ -66,6 +72,10 @@ class VkExtractor(Extractor):
url, method="POST", headers=headers, data=data,
).json()["payload"][1]
+ if len(payload) < 4:
+ self.log.debug(payload)
+ raise exception.AuthorizationError(payload[0])
+
total = payload[1]
photos = payload[3]
@@ -105,7 +115,7 @@ class VkPhotosExtractor(VkExtractor):
},
}),
("https://vk.com/cosplayinrussia", {
- "range": "25-35",
+ "range": "15-25",
"keywords": {
"id": r"re:\d+",
"user": {
@@ -117,6 +127,12 @@ class VkPhotosExtractor(VkExtractor):
},
},
}),
+ # photos without width/height (#2535)
+ ("https://vk.com/id76957806", {
+ "pattern": r"https://sun\d+-\d+\.userapi\.com/",
+ "range": "1-9",
+ "count": 9,
+ }),
("https://m.vk.com/albums398982326"),
("https://www.vk.com/id398982326?profile=1"),
("https://vk.com/albums-165740836"),
@@ -150,7 +166,8 @@ class VkPhotosExtractor(VkExtractor):
'<h1 class="page_name">', "<")).replace(" ", " "),
"info": text.unescape(text.remove_html(extr(
'<span class="current_text">', '</span'))),
- "id" : extr('<a href="/albums', '"'),
+ "id" : (extr('<a href="/albums', '"') or
+ extr('data-from-id="', '"')),
}}
@@ -166,6 +183,10 @@ class VkAlbumExtractor(VkExtractor):
("https://vk.com/album-165740836_281339889", {
"count": 12,
}),
+ # "Access denied" (#2556)
+ ("https://vk.com/album-53775183_00", {
+ "exception": exception.AuthorizationError,
+ }),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/weasyl.py b/gallery_dl/extractor/weasyl.py
index 75b78c5..599a175 100644
--- a/gallery_dl/extractor/weasyl.py
+++ b/gallery_dl/extractor/weasyl.py
@@ -47,6 +47,7 @@ class WeasylExtractor(Extractor):
return data
def submissions(self, owner_login, folderid=None):
+ metadata = self.config("metadata")
url = "{}/api/users/{}/gallery".format(self.root, owner_login)
params = {
"nextid" : None,
@@ -56,6 +57,9 @@ class WeasylExtractor(Extractor):
while True:
data = self.request(url, params=params).json()
for submission in data["submissions"]:
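+                # fetch extended metadata with an extra API request per submission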
+ if metadata:
+ submission = self.request_submission(
+ submission["submitid"])
if self.populate_submission(submission):
submission["folderid"] = folderid
# Do any submissions have more than one url? If so
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index cf5b192..59f46f0 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2020 Leonardo Taccari
+# Copyright 2021-2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -41,8 +42,8 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
"""Extractor for an episode on webtoons.com"""
subcategory = "episode"
directory_fmt = ("{category}", "{comic}")
- filename_fmt = "{episode}-{num:>02}.{extension}"
- archive_fmt = "{title_no}_{episode}_{num}"
+ filename_fmt = "{episode_no}-{num:>02}.{extension}"
+ archive_fmt = "{title_no}_{episode_no}_{num}"
    pattern = (BASE_PATTERN + r"/([^/?#]+)/([^/?#]+)/(?:[^/?#]+)"
r"/viewer(?:\?([^#'\"]+))")
test = (
@@ -54,6 +55,18 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
"49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9"),
"count": 5,
}),
+ (("https://www.webtoons.com/en/challenge/punderworld"
+ "/happy-earth-day-/viewer?title_no=312584&episode_no=40"), {
+ "keyword": {
+ "comic": "punderworld",
+ "description": str,
+ "episode": "36",
+ "episode_no": "40",
+ "genre": "challenge",
+ "title": r"re:^Punderworld - .+",
+ "title_no": "312584",
+ },
+ }),
)
def __init__(self, match):
@@ -65,11 +78,13 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
query = text.parse_query(query)
self.title_no = query.get("title_no")
- self.episode = query.get("episode_no")
+ self.episode_no = query.get("episode_no")
def metadata(self, page):
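+        # the second entry of the "keywords" meta tag supplies the {episode} value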
+ keywords, pos = text.extract(
+ page, '<meta name="keywords" content="', '"')
title, pos = text.extract(
- page, '<meta property="og:title" content="', '"')
+ page, '<meta property="og:title" content="', '"', pos)
descr, pos = text.extract(
page, '<meta property="og:description" content="', '"', pos)
@@ -77,8 +92,9 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
"genre" : self.genre,
"comic" : self.comic,
"title_no" : self.title_no,
- "episode" : self.episode,
+ "episode_no" : self.episode_no,
"title" : text.unescape(title),
+ "episode" : keywords.split(", ")[1],
"description": text.unescape(descr),
"lang" : self.lang,
"language" : util.code_to_language(self.lang),