author    Unit 193 <unit193@unit193.net>    2021-12-01 14:44:00 -0500
committer Unit 193 <unit193@unit193.net>    2021-12-01 14:44:00 -0500
commit    a5aecc343fd2886e7ae09bb3e2afeec38f175755 (patch)
tree      06a284b3d73700bd38116423e2480afa516255c2 /gallery_dl/extractor
parent    fc8c5e642017e2b4e5299e2093e72b316479690d (diff)
New upstream version 1.19.3 (upstream/1.19.3)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py       |   1
-rw-r--r--  gallery_dl/extractor/dynastyscans.py   |  25
-rw-r--r--  gallery_dl/extractor/exhentai.py       |   8
-rw-r--r--  gallery_dl/extractor/foolfuuka.py      |   6
-rw-r--r--  gallery_dl/extractor/gelbooru_v02.py   |  15
-rw-r--r--  gallery_dl/extractor/instagram.py      |  20
-rw-r--r--  gallery_dl/extractor/kemonoparty.py    | 125
-rw-r--r--  gallery_dl/extractor/mangadex.py       |  42
-rw-r--r--  gallery_dl/extractor/mangoxo.py        |  12
-rw-r--r--  gallery_dl/extractor/philomena.py      |  12
-rw-r--r--  gallery_dl/extractor/reactor.py        | 228
-rw-r--r--  gallery_dl/extractor/seisoparty.py     | 201
-rw-r--r--  gallery_dl/extractor/shopify.py        |   6
-rw-r--r--  gallery_dl/extractor/skeb.py           |   3
-rw-r--r--  gallery_dl/extractor/subscribestar.py  |  14
-rw-r--r--  gallery_dl/extractor/twitter.py        |  37
-rw-r--r--  gallery_dl/extractor/webtoons.py       |   5
-rw-r--r--  gallery_dl/extractor/xvideos.py        |   4
-rw-r--r--  gallery_dl/extractor/ytdl.py           |  79
19 files changed, 369 insertions, 474 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 79fe971..dd9da01 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -111,7 +111,6 @@ modules = [
"sankaku",
"sankakucomplex",
"seiga",
- "seisoparty",
"senmanga",
"sexcom",
"simplyhentai",
diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py
index 4541d25..ab1044f 100644
--- a/gallery_dl/extractor/dynastyscans.py
+++ b/gallery_dl/extractor/dynastyscans.py
@@ -8,7 +8,7 @@
"""Extractors for https://dynasty-scans.com/"""
-from .common import ChapterExtractor, Extractor, Message
+from .common import ChapterExtractor, MangaExtractor, Extractor, Message
from .. import text
import json
import re
@@ -48,12 +48,12 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
(("http://dynasty-scans.com/chapters/"
"hitoribocchi_no_oo_seikatsu_ch33"), {
"url": "dce64e8c504118f1ab4135c00245ea12413896cb",
- "keyword": "1564965671ac69bb7fbc340538397f6bd0aa269b",
+ "keyword": "b67599703c27316a2fe4f11c3232130a1904e032",
}),
(("http://dynasty-scans.com/chapters/"
"new_game_the_spinoff_special_13"), {
"url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538",
- "keyword": "22b35029bc65d6d95db2e2c147b0a37f2d290f29",
+ "keyword": "6b674eb3a274999153f6be044973b195008ced2f",
}),
)
@@ -76,7 +76,8 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
"author" : text.remove_html(author),
"group" : (text.remove_html(group) or
text.extract(group, ' alt="', '"')[0] or ""),
- "date" : extr('"icon-calendar"></i> ', '<'),
+ "date" : text.parse_datetime(extr(
+ '"icon-calendar"></i> ', '<'), "%b %d, %Y"),
"lang" : "en",
"language": "English",
}
@@ -89,6 +90,22 @@ class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor):
]
+class DynastyscansMangaExtractor(DynastyscansBase, MangaExtractor):
+ chapterclass = DynastyscansChapterExtractor
+ reverse = False
+ pattern = BASE_PATTERN + r"(/series/[^/?#]+)"
+ test = ("https://dynasty-scans.com/series/hitoribocchi_no_oo_seikatsu", {
+ "pattern": DynastyscansChapterExtractor.pattern,
+ "count": ">= 100",
+ })
+
+ def chapters(self, page):
+ return [
+ (self.root + path, {})
+ for path in text.extract_iter(page, '<dd>\n<a href="', '"')
+ ]
+
+
class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
"""Extrator for image search results on dynasty-scans.com"""
subcategory = "search"
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index aabfe6b..7ffb214 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -122,7 +122,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"date": "dt:2018-03-18 20:15:00",
"eh_category": "Non-H",
"expunged": False,
- "favorites": "18",
+ "favorites": "19",
"filecount": "4",
"filesize": 1488978,
"gid": 1200119,
@@ -239,7 +239,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"title_jpn" : text.unescape(extr('<h1 id="gj">', '</h1>')),
"_" : extr('<div id="gdc"><div class="cs ct', '"'),
"eh_category" : extr('>', '<'),
- "uploader" : text.unquote(extr('/uploader/', '"')),
+ "uploader" : extr('<div id="gdn">', '</div>'),
"date" : text.parse_datetime(extr(
'>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"),
"parent" : extr(
@@ -255,6 +255,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
"torrentcount" : extr('>Torrent Download (', ')'),
}
+ if data["uploader"].startswith("<"):
+ data["uploader"] = text.unescape(text.extract(
+ data["uploader"], ">", "<")[0])
+
f = data["favorites"][0]
if f == "N":
data["favorites"] = "0"
diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py
index d2c5e8f..6ddd689 100644
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@@ -122,7 +122,7 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
"url": "d309713d2f838797096b3e9cb44fe514a9c9d07a",
}),
("https://desuarchive.org/a/thread/159542679/", {
- "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
+ "url": "2bddbe03b01b4630337f6916f6df36d1d443b7b8",
}),
("https://boards.fireden.net/sci/thread/11264294/", {
"url": "61cab625c95584a12a30049d054931d64f8d20aa",
@@ -131,10 +131,10 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
}),
("https://rbt.asia/g/thread/61487650/", {
- "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ "url": "b4692707cddb4ad1c9ba1cde77c4703025cb86e5",
}),
("https://archive.rebeccablacktech.com/g/thread/61487650/", {
- "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+ "url": "b4692707cddb4ad1c9ba1cde77c4703025cb86e5",
}),
("https://thebarchive.com/b/thread/739772332/", {
"url": "e8b18001307d130d67db31740ce57c8561b5d80c",
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index e09e190..a42a202 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -27,8 +27,21 @@ class GelbooruV02Extractor(booru.BooruExtractor):
params["pid"] = self.page_start
params["limit"] = self.per_page
+ post = None
while True:
- root = self._api_request(params)
+ try:
+ root = self._api_request(params)
+ except ElementTree.ParseError:
+ if "tags" not in params or post is None:
+ raise
+ taglist = [tag for tag in params["tags"].split()
+ if not tag.startswith("id:<")]
+ taglist.append("id:<" + str(post.attrib["id"]))
+ params["tags"] = " ".join(taglist)
+ params["pid"] = 0
+ continue
+
+ post = None
for post in root:
yield post.attrib
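
Note: the try/except above works around an API pagination depth limit: when the XML response no longer parses, the search restarts at pid 0 with an added id:< filter so results continue below the last post seen. A standalone sketch of that tag rewrite (rewrite_tags is a hypothetical helper name, not part of gallery-dl):

# Sketch of the id:< pagination workaround from the hunk above.
def rewrite_tags(tags, last_post_id):
    # Drop any previous id:< filter and add one below the last seen post,
    # so restarting at pid=0 resumes the search where it broke off.
    taglist = [tag for tag in tags.split() if not tag.startswith("id:<")]
    taglist.append("id:<" + str(last_post_id))
    return " ".join(taglist)

print(rewrite_tags("rating:safe id:<9000", 4200))  # rating:safe id:<4200
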
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index bf479ab..a1dd465 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -439,15 +439,27 @@ class InstagramTaggedExtractor(InstagramExtractor):
test = ("https://www.instagram.com/instagram/tagged/", {
"range": "1-16",
"count": ">= 16",
+ "keyword": {
+ "tagged_owner_id" : "25025320",
+ "tagged_username" : "instagram",
+ "tagged_full_name": "Instagram",
+ },
})
- def posts(self):
+ def metadata(self):
url = "{}/{}/".format(self.root, self.item)
- user = self._extract_profile_page(url)
+ self.user = user = self._extract_profile_page(url)
+
+ return {
+ "tagged_owner_id" : user["id"],
+ "tagged_username" : user["username"],
+ "tagged_full_name": user["full_name"],
+ }
+ def posts(self):
query_hash = "be13233562af2d229b008d2976b998b5"
- variables = {"id": user["id"], "first": 50}
- edge = self._get_edge_data(user, None)
+ variables = {"id": self.user["id"], "first": 50}
+ edge = self._get_edge_data(self.user, None)
return self._pagination_graphql(query_hash, variables, edge)
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 2e1d0b2..6483278 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -14,7 +14,7 @@ from ..cache import cache
import itertools
import re
-BASE_PATTERN = r"(?:https?://)?kemono\.party"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?kemono\.party"
USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
@@ -30,19 +30,20 @@ class KemonopartyExtractor(Extractor):
def items(self):
self._prepare_ddosguard_cookies()
- find_inline = re.compile(
+ self._find_inline = re.compile(
r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
- skip_service = \
- "patreon" if self.config("patreon-skip-file", True) else None
+ find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
+ generators = self._build_file_generators(self.config("files"))
comments = self.config("comments")
+ username = dms = None
if self.config("metadata"):
username = text.unescape(text.extract(
self.request(self.user_url).text,
'<meta name="artist_name" content="', '"')[0])
- else:
- username = None
+ if self.config("dms"):
+ dms = True
posts = self.posts()
max_posts = self.config("max-posts")
@@ -51,31 +52,38 @@ class KemonopartyExtractor(Extractor):
for post in posts:
- files = []
- append = files.append
- file = post["file"]
-
- if file:
- file["type"] = "file"
- if post["service"] != skip_service or not post["attachments"]:
- append(file)
- for attachment in post["attachments"]:
- attachment["type"] = "attachment"
- append(attachment)
- for path in find_inline(post["content"] or ""):
- append({"path": path, "name": path, "type": "inline"})
-
post["date"] = text.parse_datetime(
- post["published"], "%a, %d %b %Y %H:%M:%S %Z")
+ post["published"] or post["added"],
+ "%a, %d %b %Y %H:%M:%S %Z")
if username:
post["username"] = username
if comments:
post["comments"] = self._extract_comments(post)
+ if dms is not None:
+ if dms is True:
+ dms = self._extract_dms(post)
+ post["dms"] = dms
yield Message.Directory, post
- for post["num"], file in enumerate(files, 1):
- post["type"] = file["type"]
+ hashes = set()
+ post["num"] = 0
+ for file in itertools.chain.from_iterable(
+ g(post) for g in generators):
url = file["path"]
+
+ match = find_hash(url)
+ if match:
+ post["hash"] = hash = match.group(1)
+ if hash in hashes:
+ self.log.debug("Skipping %s (duplicate)", url)
+ continue
+ hashes.add(hash)
+ else:
+ post["hash"] = ""
+
+ post["type"] = file["type"]
+ post["num"] += 1
+
if url[0] == "/":
url = self.root + "/data" + url
elif url.startswith("https://kemono.party"):
@@ -103,6 +111,34 @@ class KemonopartyExtractor(Extractor):
return {c.name: c.value for c in response.history[0].cookies}
+ def _file(self, post):
+ file = post["file"]
+ if not file:
+ return ()
+ file["type"] = "file"
+ return (file,)
+
+ def _attachments(self, post):
+ for attachment in post["attachments"]:
+ attachment["type"] = "attachment"
+ return post["attachments"]
+
+ def _inline(self, post):
+ for path in self._find_inline(post["content"] or ""):
+ yield {"path": path, "name": path, "type": "inline"}
+
+ def _build_file_generators(self, filetypes):
+ if filetypes is None:
+ return (self._file, self._attachments, self._inline)
+ genmap = {
+ "file" : self._file,
+ "attachments": self._attachments,
+ "inline" : self._inline,
+ }
+ if isinstance(filetypes, str):
+ filetypes = filetypes.split(",")
+ return [genmap[ft] for ft in filetypes]
+
def _extract_comments(self, post):
url = "{}/{}/user/{}/post/{}".format(
self.root, post["service"], post["user"], post["id"])
@@ -121,6 +157,21 @@ class KemonopartyExtractor(Extractor):
})
return comments
+ def _extract_dms(self, post):
+ url = "{}/{}/user/{}/dms".format(
+ self.root, post["service"], post["user"])
+ page = self.request(url).text
+
+ dms = []
+ for dm in text.extract_iter(page, "<article", "</article>"):
+ dms.append({
+ "body": text.unescape(text.extract(
+ dm, '<div class="dm-card__content">', '</div>',
+ )[0].strip()),
+ "date": text.extract(dm, 'datetime="', '"')[0],
+ })
+ return dms
+
class KemonopartyUserExtractor(KemonopartyExtractor):
"""Extractor for all posts from a kemono.party user listing"""
@@ -175,6 +226,8 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
"embed": dict,
"extension": "jpeg",
"filename": "P058kDFYus7DbqAkGlfWTlOr",
+ "hash": "210f35388e28bbcf756db18dd516e2d8"
+ "2ce758e0d32881eeee76d43e1716d382",
"id": "506575",
"num": 1,
"published": "Sun, 11 Aug 2019 02:09:04 GMT",
@@ -188,25 +241,39 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
}),
# inline image (#1286)
("https://kemono.party/fanbox/user/7356311/post/802343", {
- "pattern": r"https://kemono\.party/data/inline/fanbox"
- r"/uaozO4Yga6ydkGIJFAQDixfE\.jpeg",
+ "pattern": r"https://kemono\.party/data/47/b5/47b5c014ecdcfabdf2c8"
+ r"5eec53f1133a76336997ae8596f332e97d956a460ad2\.jpg",
+ "keyword": {"hash": "47b5c014ecdcfabdf2c85eec53f1133a"
+ "76336997ae8596f332e97d956a460ad2"},
}),
# kemono.party -> data.kemono.party
("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
- "pattern": r"https://kemono\.party/data/(file|attachment)s"
- r"/gumroad/trylsc/IURjT/",
+ "pattern": r"https://kemono\.party/data/("
+ r"files/gumroad/trylsc/IURjT/reward8\.jpg|"
+ r"c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)",
}),
# username (#1548, #1652)
("https://kemono.party/gumroad/user/3252870377455/post/aJnAH", {
"options": (("metadata", True),),
"keyword": {"username": "Kudalyn's Creations"},
}),
- # skip patreon main file (#1667, #1689)
+ # skip patreon duplicates
("https://kemono.party/patreon/user/4158582/post/32099982", {
"count": 2,
- "keyword": {"type": "attachment"},
+ }),
+ # DMs (#2008)
+ ("https://kemono.party/patreon/user/34134344/post/38129255", {
+ "options": (("dms", True),),
+ "keyword": {"dms": [{
+ "body": r"re:Hi! Thank you very much for supporting the work I"
+ r" did in May. Here's your reward pack! I hope you fin"
+ r"d something you enjoy in it. :\)\n\nhttps://www.medi"
+ r"afire.com/file/\w+/Set13_tier_2.zip/file",
+ "date": "2021-07-31 02:47:51.327865",
+ }]},
}),
("https://kemono.party/subscribestar/user/alcorart/post/184330"),
+ ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"),
)
def __init__(self, match):
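
Note: the items() rewrite above replaces the flat file list with per-type generators selected by the "files" option, plus deduplication keyed on the SHA256 hash embedded in the CDN path. A reduced sketch of the same dispatch-and-dedup pattern (the sample post dict is hypothetical):

# Sketch of the generator dispatch + hash dedup pattern from the hunk above.
import re

find_hash = re.compile(r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match

def iter_files(post, filetypes=None):
    def _file(p):
        return (p["file"],) if p["file"] else ()
    def _attachments(p):
        return p["attachments"]
    genmap = {"file": _file, "attachments": _attachments}
    types = filetypes.split(",") if filetypes else list(genmap)
    seen = set()
    for ft in types:
        for f in genmap[ft](post):
            match = find_hash(f["path"])
            h = match.group(1) if match else ""
            if h and h in seen:
                continue  # same content served under two entries
            seen.add(h)
            yield f

post = {"file": {"path": "/aa/bb/" + "0" * 64 + ".jpg"},
        "attachments": [{"path": "/aa/bb/" + "0" * 64 + ".jpg"}]}
print(len(list(iter_files(post))))  # 1 -- duplicate hash skipped
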
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index ff1d7c3..393f4e2 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -46,10 +46,10 @@ class MangadexExtractor(Extractor):
def _transform(self, chapter):
relationships = defaultdict(list)
for item in chapter["relationships"]:
- relationships[item["type"]].append(item["id"])
- manga = self.api.manga(relationships["manga"][0])
+ relationships[item["type"]].append(item)
+ manga = self.api.manga(relationships["manga"][0]["id"])
for item in manga["relationships"]:
- relationships[item["type"]].append(item["id"])
+ relationships[item["type"]].append(item)
cattributes = chapter["attributes"]
mattributes = manga["attributes"]
@@ -75,16 +75,12 @@ class MangadexExtractor(Extractor):
"count" : len(cattributes["data"]),
}
- if self.config("metadata"):
- data["artist"] = [
- self.api.author(uuid)["attributes"]["name"]
- for uuid in relationships["artist"]]
- data["author"] = [
- self.api.author(uuid)["attributes"]["name"]
- for uuid in relationships["author"]]
- data["group"] = [
- self.api.group(uuid)["attributes"]["name"]
- for uuid in relationships["scanlation_group"]]
+ data["artist"] = [artist["attributes"]["name"]
+ for artist in relationships["artist"]]
+ data["author"] = [author["attributes"]["name"]
+ for author in relationships["author"]]
+ data["group"] = [group["attributes"]["name"]
+ for group in relationships["scanlation_group"]]
return data
@@ -95,12 +91,11 @@ class MangadexChapterExtractor(MangadexExtractor):
pattern = BASE_PATTERN + r"/chapter/([0-9a-f-]+)"
test = (
("https://mangadex.org/chapter/f946ac53-0b71-4b5d-aeb2-7931b13c4aaa", {
- "keyword": "f6c2b908df06eb834d56193dfe1fa1f7c2c4dccd",
+ "keyword": "86fb262cf767dac6d965cd904ad499adba466404",
# "content": "50383a4c15124682057b197d40261641a98db514",
}),
# oneshot
("https://mangadex.org/chapter/61a88817-9c29-4281-bdf1-77b3c1be9831", {
- "options": (("metadata", True),),
"count": 64,
"keyword": "6abcbe1e24eeb1049dc931958853cd767ee483fb",
}),
@@ -147,6 +142,8 @@ class MangadexMangaExtractor(MangadexExtractor):
"date" : "type:datetime",
"lang" : str,
"language": str,
+ "artist" : ["Arakawa Hiromu"],
+ "author" : ["Arakawa Hiromu"],
},
}),
("https://mangadex.cc/manga/d0c88e3b-ea64-4e07-9841-c1d2ac982f4a/", {
@@ -193,20 +190,14 @@ class MangadexAPI():
def athome_server(self, uuid):
return self._call("/at-home/server/" + uuid)
- @memcache(keyarg=1)
- def author(self, uuid):
- return self._call("/author/" + uuid)["data"]
-
def chapter(self, uuid):
- return self._call("/chapter/" + uuid)["data"]
-
- @memcache(keyarg=1)
- def group(self, uuid):
- return self._call("/group/" + uuid)["data"]
+ params = {"includes[]": ("scanlation_group",)}
+ return self._call("/chapter/" + uuid, params)["data"]
@memcache(keyarg=1)
def manga(self, uuid):
- return self._call("/manga/" + uuid)["data"]
+ params = {"includes[]": ("artist", "author")}
+ return self._call("/manga/" + uuid, params)["data"]
def manga_feed(self, uuid):
order = "desc" if self.extractor.config("chapter-reverse") else "asc"
@@ -275,6 +266,7 @@ class MangadexAPI():
ratings = ("safe", "suggestive", "erotica", "pornographic")
params["contentRating[]"] = ratings
+ params["includes[]"] = ("scanlation_group",)
params["translatedLanguage[]"] = config("lang")
params["offset"] = 0
diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py
index d45fbc9..1486057 100644
--- a/gallery_dl/extractor/mangoxo.py
+++ b/gallery_dl/extractor/mangoxo.py
@@ -122,18 +122,18 @@ class MangoxoAlbumExtractor(MangoxoExtractor):
def metadata(self, page):
"""Return general metadata"""
extr = text.extract_from(page)
- title = extr('<title>', '</title>')
- count = extr('id="pic-count">', '<')
- cid = extr('<img alt="', '"')
+ title = extr('<img id="cover-img" alt="', '"')
+ cid = extr('href="https://www.mangoxo.com/user/', '"')
+ cname = extr('<img alt="', '"')
cover = extr(' src="', '"')
- cname = extr('target="_blank">', '<')
- date = extr('</i>', '<')
+ count = extr('id="pic-count">', '<')
+ date = extr('class="fa fa-calendar"></i>', '<')
descr = extr('<pre>', '</pre>')
return {
"channel": {
"id": cid,
- "name": text.unescape(cname.strip()),
+ "name": text.unescape(cname),
"cover": cover,
},
"album": {
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index d3b3bb1..51a0d38 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -62,6 +62,8 @@ INSTANCES = {
"filter_id": "56027"},
"ponybooru" : {"root": "https://ponybooru.org",
"filter_id": "2"},
+ "furbooru" : {"root": "https://furbooru.org",
+ "filter_id": "2"},
}
BASE_PATTERN = PhilomenaExtractor.update(INSTANCES)
@@ -124,6 +126,9 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
("https://ponybooru.org/images/1", {
"content": "bca26f58fafd791fe07adcd2a28efd7751824605",
}),
+ ("https://furbooru.org/images/1", {
+ "content": "9eaa1e1b32fa0f16520912257dbefaff238d5fd2",
+ }),
)
def __init__(self, match):
@@ -157,6 +162,10 @@ class PhilomenaSearchExtractor(PhilomenaExtractor):
"range": "40-60",
"count": 21,
}),
+ ("https://furbooru.org/search?q=cute", {
+ "range": "40-60",
+ "count": 21,
+ }),
)
def __init__(self, match):
@@ -210,6 +219,9 @@ class PhilomenaGalleryExtractor(PhilomenaExtractor):
("https://ponybooru.org/galleries/27", {
"count": ">= 24",
}),
+ ("https://furbooru.org/galleries/27", {
+ "count": ">= 13",
+ }),
)
def __init__(self, match):
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py
index 04fe581..b3a620a 100644
--- a/gallery_dl/extractor/reactor.py
+++ b/gallery_dl/extractor/reactor.py
@@ -8,29 +8,29 @@
"""Generic extractors for *reactor sites"""
-from .common import Extractor, Message
+from .common import BaseExtractor, Message
from .. import text
import urllib.parse
import json
-BASE_PATTERN = r"(?:https?://)?((?:[^/.]+\.)?reactor\.cc)"
-
-class ReactorExtractor(Extractor):
+class ReactorExtractor(BaseExtractor):
"""Base class for *reactor.cc extractors"""
basecategory = "reactor"
filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}"
archive_fmt = "{post_id}_{num}"
- instances = ()
request_interval = 5.0
def __init__(self, match):
- Extractor.__init__(self, match)
- self.root = "http://" + match.group(1)
+ BaseExtractor.__init__(self, match)
+ url = text.ensure_http_scheme(match.group(0), "http://")
+ pos = url.index("/", 10)
+
+ self.root, self.path = url[:pos], url[pos:]
self.session.headers["Referer"] = self.root
self.gif = self.config("gif", False)
- if not self.category:
+ if self.category == "reactor":
# set category based on domain name
netloc = urllib.parse.urlsplit(self.root).netloc
self.category = netloc.rpartition(".")[0]
@@ -50,7 +50,7 @@ class ReactorExtractor(Extractor):
def posts(self):
"""Return all relevant post-objects"""
- return self._pagination(self.url)
+ return self._pagination(self.root + self.path)
def _pagination(self, url):
while True:
@@ -145,91 +145,63 @@ class ReactorExtractor(Extractor):
}
+BASE_PATTERN = ReactorExtractor.update({
+ "reactor" : {
+ "root": "http://reactor.cc",
+ "pattern": r"(?:[^/.]+\.)?reactor\.cc",
+ },
+ "joyreactor" : {
+ "root": "http://joyreactor.cc",
+ "pattern": r"(?:www\.)?joyreactor\.c(?:c|om)",
+ },
+ "pornreactor": {
+ "root": "http://pornreactor.cc",
+ "pattern": r"(?:www\.)?(?:pornreactor\.cc|fapreactor.com)",
+ },
+ "thatpervert": {
+ "root": "http://thatpervert.com",
+ },
+})
+
+
class ReactorTagExtractor(ReactorExtractor):
"""Extractor for tag searches on *reactor.cc sites"""
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "{search_tags}_{post_id}_{num}"
pattern = BASE_PATTERN + r"/tag/([^/?#]+)"
- test = ("http://anime.reactor.cc/tag/Anime+Art",)
+ test = (
+ ("http://reactor.cc/tag/gif"),
+ ("http://anime.reactor.cc/tag/Anime+Art"),
+ ("http://joyreactor.cc/tag/Advent+Cirno", {
+ "count": ">= 15",
+ }),
+ ("http://joyreactor.com/tag/Cirno", {
+ "url": "aa59090590b26f4654881301fe8fe748a51625a8",
+ }),
+ ("http://pornreactor.cc/tag/RiceGnat", {
+ "range": "1-25",
+ "count": ">= 25",
+ }),
+ ("http://fapreactor.com/tag/RiceGnat"),
+ )
def __init__(self, match):
ReactorExtractor.__init__(self, match)
- self.tag = match.group(2)
+ self.tag = match.group(match.lastindex)
def metadata(self):
return {"search_tags": text.unescape(self.tag).replace("+", " ")}
-class ReactorSearchExtractor(ReactorTagExtractor):
+class ReactorSearchExtractor(ReactorExtractor):
"""Extractor for search results on *reactor.cc sites"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search_tags}")
archive_fmt = "s_{search_tags}_{post_id}_{num}"
pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
- test = ("http://anime.reactor.cc/search?q=Art",)
-
-
-class ReactorUserExtractor(ReactorExtractor):
- """Extractor for all posts of a user on *reactor.cc sites"""
- subcategory = "user"
- directory_fmt = ("{category}", "user", "{user}")
- pattern = BASE_PATTERN + r"/user/([^/?#]+)"
- test = ("http://anime.reactor.cc/user/Shuster",)
-
- def __init__(self, match):
- ReactorExtractor.__init__(self, match)
- self.user = match.group(2)
-
- def metadata(self):
- return {"user": text.unescape(self.user).replace("+", " ")}
-
-
-class ReactorPostExtractor(ReactorExtractor):
- """Extractor for single posts on *reactor.cc sites"""
- subcategory = "post"
- pattern = BASE_PATTERN + r"/post/(\d+)"
- test = ("http://anime.reactor.cc/post/3576250",)
-
- def __init__(self, match):
- ReactorExtractor.__init__(self, match)
- self.post_id = match.group(2)
-
- def items(self):
- post = self.request(self.url).text
- pos = post.find('class="uhead">')
- for image in self._parse_post(post[pos:]):
- if image["num"] == 1:
- yield Message.Directory, image
- url = image["url"]
- yield Message.Url, url, text.nameext_from_url(url, image)
-
-
-# --------------------------------------------------------------------
-# JoyReactor
-
-JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))"
-
-
-class JoyreactorTagExtractor(ReactorTagExtractor):
- """Extractor for tag searches on joyreactor.cc"""
- category = "joyreactor"
- pattern = JR_BASE_PATTERN + r"/tag/([^/?#]+)"
- test = (
- ("http://joyreactor.cc/tag/Advent+Cirno", {
- "count": ">= 15",
- }),
- ("http://joyreactor.com/tag/Cirno", {
- "url": "aa59090590b26f4654881301fe8fe748a51625a8",
- }),
- )
-
-
-class JoyreactorSearchExtractor(ReactorSearchExtractor):
- """Extractor for search results on joyreactor.cc"""
- category = "joyreactor"
- pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
test = (
+ ("http://reactor.cc/search?q=Art"),
("http://joyreactor.cc/search/Nature", {
"range": "1-25",
"count": ">= 20",
@@ -238,26 +210,54 @@ class JoyreactorSearchExtractor(ReactorSearchExtractor):
"range": "1-25",
"count": ">= 20",
}),
+ ("http://pornreactor.cc/search?q=ecchi+hentai"),
+ ("http://fapreactor.com/search/ecchi+hentai"),
)
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.tag = match.group(match.lastindex)
+
+ def metadata(self):
+ return {"search_tags": text.unescape(self.tag).replace("+", " ")}
+
-class JoyreactorUserExtractor(ReactorUserExtractor):
- """Extractor for all posts of a user on joyreactor.cc"""
- category = "joyreactor"
- pattern = JR_BASE_PATTERN + r"/user/([^/?#]+)"
+class ReactorUserExtractor(ReactorExtractor):
+ """Extractor for all posts of a user on *reactor.cc sites"""
+ subcategory = "user"
+ directory_fmt = ("{category}", "user", "{user}")
+ pattern = BASE_PATTERN + r"/user/([^/?#]+)"
test = (
+ ("http://reactor.cc/user/Dioklet"),
+ ("http://anime.reactor.cc/user/Shuster"),
("http://joyreactor.cc/user/hemantic"),
("http://joyreactor.com/user/Tacoman123", {
"url": "60ce9a3e3db791a0899f7fb7643b5b87d09ae3b5",
}),
+ ("http://pornreactor.cc/user/Disillusion", {
+ "range": "1-25",
+ "count": ">= 20",
+ }),
+ ("http://fapreactor.com/user/Disillusion"),
)
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.user = match.group(match.lastindex)
+
+ def metadata(self):
+ return {"user": text.unescape(self.user).replace("+", " ")}
+
-class JoyreactorPostExtractor(ReactorPostExtractor):
- """Extractor for single posts on joyreactor.cc"""
- category = "joyreactor"
- pattern = JR_BASE_PATTERN + r"/post/(\d+)"
+class ReactorPostExtractor(ReactorExtractor):
+ """Extractor for single posts on *reactor.cc sites"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/post/(\d+)"
test = (
+ ("http://reactor.cc/post/4999736", {
+ "url": "dfc74d150d7267384d8c229c4b82aa210755daa0",
+ }),
+ ("http://anime.reactor.cc/post/3576250"),
("http://joyreactor.com/post/3721876", { # single image
"pattern": r"http://img\d\.joyreactor\.com/pics/post/full"
r"/cartoon-painting-monster-lake-4841316.jpeg",
@@ -281,57 +281,6 @@ class JoyreactorPostExtractor(ReactorPostExtractor):
("http://joyreactor.cc/post/1299", { # "malformed" JSON
"url": "ab02c6eb7b4035ad961b29ee0770ee41be2fcc39",
}),
- )
-
-
-# --------------------------------------------------------------------
-# PornReactor
-
-PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)"
-
-
-class PornreactorTagExtractor(ReactorTagExtractor):
- """Extractor for tag searches on pornreactor.cc"""
- category = "pornreactor"
- pattern = PR_BASE_PATTERN + r"/tag/([^/?#]+)"
- test = (
- ("http://pornreactor.cc/tag/RiceGnat", {
- "range": "1-25",
- "count": ">= 25",
- }),
- ("http://fapreactor.com/tag/RiceGnat"),
- )
-
-
-class PornreactorSearchExtractor(ReactorSearchExtractor):
- """Extractor for search results on pornreactor.cc"""
- category = "pornreactor"
- pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?#]+)"
- test = (
- ("http://pornreactor.cc/search?q=ecchi+hentai"),
- ("http://fapreactor.com/search/ecchi+hentai"),
- )
-
-
-class PornreactorUserExtractor(ReactorUserExtractor):
- """Extractor for all posts of a user on pornreactor.cc"""
- category = "pornreactor"
- pattern = PR_BASE_PATTERN + r"/user/([^/?#]+)"
- test = (
- ("http://pornreactor.cc/user/Disillusion", {
- "range": "1-25",
- "count": ">= 20",
- }),
- ("http://fapreactor.com/user/Disillusion"),
- )
-
-
-class PornreactorPostExtractor(ReactorPostExtractor):
- """Extractor for single posts on pornreactor.cc"""
- category = "pornreactor"
- subcategory = "post"
- pattern = PR_BASE_PATTERN + r"/post/(\d+)"
- test = (
("http://pornreactor.cc/post/863166", {
"url": "a09fb0577489e1f9564c25d0ad576f81b19c2ef3",
"content": "ec6b0568bfb1803648744077da082d14de844340",
@@ -340,3 +289,16 @@ class PornreactorPostExtractor(ReactorPostExtractor):
"url": "2a956ce0c90e8bc47b4392db4fa25ad1342f3e54",
}),
)
+
+ def __init__(self, match):
+ ReactorExtractor.__init__(self, match)
+ self.post_id = match.group(match.lastindex)
+
+ def items(self):
+ post = self.request(self.root + self.path).text
+ pos = post.find('class="uhead">')
+ for image in self._parse_post(post[pos:]):
+ if image["num"] == 1:
+ yield Message.Directory, image
+ url = image["url"]
+ yield Message.Url, url, text.nameext_from_url(url, image)
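
Note: with BaseExtractor.update(), each instance's domain pattern contributes its own capture group ahead of the extractor's, which is why the code above reads match.group(match.lastindex) instead of a fixed group index. A small illustration of why, with a hand-built two-instance pattern as an assumption about the generated regex shape:

# Why match.lastindex is used above: per-instance domain alternatives each
# add a capture group before the extractor's own group.
import re

BASE = (r"(?:https?://)?(?:((?:[^/.]+\.)?reactor\.cc)"
        r"|((?:www\.)?joyreactor\.c(?:c|om)))")
pattern = re.compile(BASE + r"/tag/([^/?#]+)")

m = pattern.match("http://joyreactor.cc/tag/Advent+Cirno")
print(m.group(2))            # the domain alternative that matched
print(m.group(m.lastindex))  # 'Advent+Cirno' -- always the tag
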
diff --git a/gallery_dl/extractor/seisoparty.py b/gallery_dl/extractor/seisoparty.py
deleted file mode 100644
index a2a24e0..0000000
--- a/gallery_dl/extractor/seisoparty.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://seiso.party/"""
-
-from .common import Extractor, Message
-from .. import text, exception
-from ..cache import cache
-import re
-
-
-class SeisopartyExtractor(Extractor):
- """Base class for seisoparty extractors"""
- category = "seisoparty"
- root = "https://seiso.party"
- directory_fmt = ("{category}", "{service}", "{username}")
- filename_fmt = "{id}_{title}_{num:>02}_{filename}.{extension}"
- archive_fmt = "{service}_{user}_{id}_{num}"
- cookiedomain = ".seiso.party"
-
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.user_name = None
- self._find_files = re.compile(
- r'href="(https://cdn(?:-\d)?\.seiso\.party/files/[^"]+)').findall
-
- def items(self):
- self._prepare_ddosguard_cookies()
-
- for post in self.posts():
- files = post.pop("files")
- yield Message.Directory, post
- for post["num"], url in enumerate(files, 1):
- yield Message.Url, url, text.nameext_from_url(url, post)
-
- def _parse_post(self, page, post_id):
- extr = text.extract_from(page)
- return {
- "service" : self.service,
- "user" : self.user_id,
- "username": self.user_name,
- "id" : post_id,
- "date" : text.parse_datetime(extr(
- '<div class="margin-bottom-15 minor-text">', '<'),
- "%Y-%m-%d %H:%M:%S %Z"),
- "title" : text.unescape(extr('class="post-title">', '<')),
- "content" : text.unescape(extr("\n<p>\n", "\n</p>\n").strip()),
- "files" : self._find_files(page),
- }
-
- def login(self):
- username, password = self._get_auth_info()
- if username:
- self._update_cookies(self._login_impl(username, password))
-
- @cache(maxage=28*24*3600, keyarg=1)
- def _login_impl(self, username, password):
- self.log.info("Logging in as %s", username)
-
- url = self.root + "/account/login"
- data = {"username": username, "password": password}
-
- response = self.request(url, method="POST", data=data)
- if response.url.endswith("/account/login") and \
- "Username or password is incorrect" in response.text:
- raise exception.AuthenticationError()
-
- return {c.name: c.value for c in response.history[0].cookies}
-
-
-class SeisopartyUserExtractor(SeisopartyExtractor):
- """Extractor for all posts from a seiso.party user listing"""
- subcategory = "user"
- pattern = r"(?:https?://)?seiso\.party/artists/([^/?#]+)/([^/?#]+)"
- test = (
- ("https://seiso.party/artists/fanbox/21", {
- "pattern": r"https://cdn\.seiso\.party/files/fanbox/\d+/",
- "count": ">=15",
- "keyword": {
- "content": str,
- "date": "type:datetime",
- "id": r"re:\d+",
- "num": int,
- "service": "fanbox",
- "title": str,
- "user": "21",
- "username": "雨",
- },
- }),
- )
-
- def __init__(self, match):
- SeisopartyExtractor.__init__(self, match)
- self.service, self.user_id = match.groups()
-
- def posts(self):
- url = "{}/artists/{}/{}".format(self.root, self.service, self.user_id)
- page = self.request(url).text
- self.user_name, pos = text.extract(page, '<span class="title">', '<')
-
- url = self.root + text.extract(
- page, 'href="', '"', page.index('id="content"', pos))[0]
- response = self.request(url)
- headers = {"Referer": url}
-
- while True:
- yield self._parse_post(response.text, url.rpartition("/")[2])
- response = self.request(url + "/next", headers=headers)
- if url == response.url:
- return
- url = headers["Referer"] = response.url
-
-
-class SeisopartyPostExtractor(SeisopartyExtractor):
- """Extractor for a single seiso.party post"""
- subcategory = "post"
- pattern = r"(?:https?://)?seiso\.party/post/([^/?#]+)/([^/?#]+)/([^/?#]+)"
- test = (
- ("https://seiso.party/post/fanbox/21/371", {
- "url": "75f13b92de0ce399b6163c3de18f1f36011c2366",
- "count": 2,
- "keyword": {
- "content": "この前描いためぐるちゃんのPSDファイルです。<br/>"
- "どうぞよろしくお願いします。",
- "date": "dt:2021-05-06 12:38:31",
- "extension": "re:psd|jpg",
- "filename": "re:backcourt|ffb2ccb7a3586d05f9a4620329dd131e",
- "id": "371",
- "num": int,
- "service": "fanbox",
- "title": "MEGURU.PSD",
- "user": "21",
- "username": "雨",
- },
- }),
- ("https://seiso.party/post/patreon/429/95949", {
- "pattern": r"https://cdn-2\.seiso\.party/files/patreon/95949/",
- "count": 2,
- }),
- )
-
- def __init__(self, match):
- SeisopartyExtractor.__init__(self, match)
- self.service, self.user_id, self.post_id = match.groups()
-
- def posts(self):
- url = "{}/artists/{}/{}".format(self.root, self.service, self.user_id)
- page = self.request(url).text
- self.user_name, pos = text.extract(page, '<span class="title">', '<')
-
- url = "{}/post/{}/{}/{}".format(
- self.root, self.service, self.user_id, self.post_id)
- return (self._parse_post(self.request(url).text, self.post_id),)
-
-
-class SeisopartyFavoriteExtractor(SeisopartyExtractor):
- """Extractor for seiso.party favorites"""
- subcategory = "favorite"
- pattern = r"(?:https?://)?seiso\.party/favorites/artists/?(?:\?([^#]+))?"
- test = (
- ("https://seiso.party/favorites/artists", {
- "pattern": SeisopartyUserExtractor.pattern,
- "url": "0c862434bc3bbbe84cbf41c3a6152473a8cde683",
- "count": 3,
- }),
- ("https://seiso.party/favorites/artists?sort=id&sort_direction=asc", {
- "url": "629a8b9c6d3a8a64f521908bdb3d7426ac03f8d3",
- }),
- )
-
- def __init__(self, match):
- SeisopartyExtractor.__init__(self, match)
- self.query = match.group(1)
-
- def items(self):
- self._prepare_ddosguard_cookies()
- self.login()
-
- url = self.root + "/favorites/artists"
- data = {"_extractor": SeisopartyUserExtractor}
- params = text.parse_query(self.query)
- params["page"] = text.parse_int(params.get("page"), 1)
-
- while True:
- page = self.request(url, params=params).text
-
- cnt = 0
- for card in text.extract_iter(
- page, '<div class="artist-card', '</a>'):
- path = text.extract(card, '<a href="', '"')[0]
- yield Message.Queue, self.root + path, data
- cnt += 1
-
- if cnt < 25:
- return
- params["page"] += 1
diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py
index 6d924de..f276e84 100644
--- a/gallery_dl/extractor/shopify.py
+++ b/gallery_dl/extractor/shopify.py
@@ -53,6 +53,10 @@ BASE_PATTERN = ShopifyExtractor.update({
"windsorstore": {
"root": "https://www.windsorstore.com",
},
+ "loungeunderwear": {
+ "root": "https://loungeunderwear.com",
+ "pattern": r"(?:[a-z]+\.)?loungeunderwear\.com",
+ },
})
@@ -70,6 +74,7 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
("https://www.fashionnova.com/collections/mini-dresses#1"),
("https://www.omgmiamiswimwear.com/collections/fajas"),
("https://www.windsorstore.com/collections/dresses-ball-gowns"),
+ ("https://loungeunderwear.com/collections/apparel"),
)
def metadata(self):
@@ -105,6 +110,7 @@ class ShopifyProductExtractor(ShopifyExtractor):
("https://www.fashionnova.com/collections/flats/products/name"),
("https://www.windsorstore.com/collections/accessories-belts/products"
"/rhine-buckle-dbl-o-ring-pu-strap-belt-073010158001"),
+ ("https://de.loungeunderwear.com/products/ribbed-crop-top-black"),
)
def products(self):
diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py
index c1a8878..2c806ad 100644
--- a/gallery_dl/extractor/skeb.py
+++ b/gallery_dl/extractor/skeb.py
@@ -21,6 +21,7 @@ class SkebExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
self.user_name = match.group(1)
+ self.thumbnails = self.config("thumbnails", False)
def items(self):
for post_num in self.posts():
@@ -94,7 +95,7 @@ class SkebExtractor(Extractor):
return resp, post
def _get_urls_from_post(self, resp, post):
- if "og_image_url" in resp:
+ if self.thumbnails and "og_image_url" in resp:
post["content_category"] = "thumb"
post["file_id"] = "thumb"
post["file_url"] = resp["og_image_url"]
diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py
index ae8b58d..69e3854 100644
--- a/gallery_dl/extractor/subscribestar.py
+++ b/gallery_dl/extractor/subscribestar.py
@@ -38,12 +38,11 @@ class SubscribestarExtractor(Extractor):
self.login()
for post_html in self.posts():
media = self._media_from_post(post_html)
- if not media:
- continue
data = self._data_from_post(post_html)
yield Message.Directory, data
- for item in media:
+ for num, item in enumerate(media, 1):
item.update(data)
+ item["num"] = num
text.nameext_from_url(item.get("name") or item["url"], item)
yield Message.Url, item["url"], item
@@ -140,8 +139,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
test = (
("https://www.subscribestar.com/subscribestar", {
"count": ">= 20",
- "pattern": r"https://(star-uploads|ss-uploads-prod)\.s\d+-us-west-"
- r"\d+\.amazonaws\.com/uploads(_v2)?/users/11/",
+ "pattern": r"https://\w+\.cloudfront\.net/uploads(_v2)?/users/11/",
"keyword": {
"author_id": 11,
"author_name": "subscribestar",
@@ -149,6 +147,7 @@ class SubscribestarUserExtractor(SubscribestarExtractor):
"content": str,
"date" : "type:datetime",
"id" : int,
+ "num" : int,
"post_id": int,
"type" : "re:image|video|attachment",
"url" : str,
@@ -190,7 +189,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
pattern = BASE_PATTERN + r"/posts/(\d+)"
test = (
("https://www.subscribestar.com/posts/102468", {
- "url": "612da5a98af056dd78dc846fbcfa705e721f6675",
+ "count": 1,
"keyword": {
"author_id": 11,
"author_name": "subscribestar",
@@ -202,6 +201,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
"group": "imgs_and_videos",
"height": 291,
"id": 203885,
+ "num": 1,
"pinned": False,
"post_id": 102468,
"type": "image",
@@ -209,7 +209,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
},
}),
("https://subscribestar.adult/posts/22950", {
- "url": "440d745a368e6b3e218415f593a5045f384afa0d",
+ "count": 1,
"keyword": {"date": "dt:2019-04-28 07:32:00"},
}),
)
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 00f3b04..f1c392d 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -41,7 +41,9 @@ class TwitterExtractor(Extractor):
self.videos = self.config("videos", True)
self.cards = self.config("cards", False)
self._user_cache = {}
+ self._init_sizes()
+ def _init_sizes(self):
size = self.config("size")
if size is None:
self._size_image = "orig"
@@ -580,13 +582,17 @@ class TwitterImageExtractor(Extractor):
subcategory = "image"
pattern = r"https?://pbs\.twimg\.com/media/([\w-]+)(?:\?format=|\.)(\w+)"
test = (
- ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg%name=orig"),
+ ("https://pbs.twimg.com/media/EqcpviCVoAAG-QG?format=jpg&name=orig", {
+ "options": (("size", "4096x4096,orig"),),
+ "url": "cb3042a6f6826923da98f0d2b66c427e9385114c",
+ }),
("https://pbs.twimg.com/media/EqcpviCVoAAG-QG.jpg:orig"),
)
def __init__(self, match):
Extractor.__init__(self, match)
self.id, self.fmt = match.groups()
+ TwitterExtractor._init_sizes(self)
def items(self):
base = "https://pbs.twimg.com/media/{}?format={}&name=".format(
@@ -595,11 +601,11 @@ class TwitterImageExtractor(Extractor):
data = {
"filename": self.id,
"extension": self.fmt,
- "_fallback": TwitterExtractor._image_fallback(base),
+ "_fallback": TwitterExtractor._image_fallback(self, base),
}
yield Message.Directory, data
- yield Message.Url, base + "orig", data
+ yield Message.Url, base + self._size_image, data
class TwitterAPI():
@@ -793,16 +799,21 @@ class TwitterAPI():
data = response.json()
if "errors" in data:
try:
- msg = ", ".join(
- '"' + error["message"] + '"'
- for error in data["errors"]
- )
+ errors, warnings = [], []
+ for error in data["errors"]:
+ if error.get("kind") == "NonFatal":
+ warnings.append(error["message"])
+ else:
+ errors.append(error["message"])
+ errors = ", ".join(errors)
except Exception:
- msg = data["errors"]
- if msg and response.status_code < 400:
- raise exception.StopExtraction(msg)
+ errors = data["errors"]
+ if warnings:
+ self.extractor.log.warning(", ".join(warnings))
+ if errors and response.status_code < 400:
+ raise exception.StopExtraction(errors)
else:
- msg = ""
+ errors = ""
if response.status_code < 400:
# success
@@ -816,7 +827,7 @@ class TwitterAPI():
continue
if response.status_code == 401 and \
- "have been blocked from viewing" in msg:
+ "have been blocked from viewing" in errors:
# account blocked
extr = self.extractor
if self.headers["x-twitter-auth-type"] and \
@@ -833,7 +844,7 @@ class TwitterAPI():
# error
raise exception.StopExtraction(
- "%s %s (%s)", response.status_code, response.reason, msg)
+ "%s %s (%s)", response.status_code, response.reason, errors)
def _pagination(self, endpoint, params=None):
if params is None:
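
Note: the error-handling rewrite above separates "NonFatal" API errors, which are only logged as warnings, from fatal ones that stop extraction. A reduced sketch of that triage (the sample error payload is hypothetical):

# Sketch of the NonFatal/fatal error triage added above.
def triage(data):
    errors, warnings = [], []
    for error in data.get("errors", ()):
        if error.get("kind") == "NonFatal":
            warnings.append(error["message"])
        else:
            errors.append(error["message"])
    return ", ".join(errors), warnings

errors, warnings = triage({"errors": [
    {"message": "Timeout fetching media", "kind": "NonFatal"},
    {"message": "Account suspended"},
]})
print(warnings)  # logged; extraction continues
print(errors)    # raised via StopExtraction when non-empty
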
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index e2474c9..cf5b192 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -48,7 +48,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
test = (
(("https://www.webtoons.com/en/comedy/safely-endangered"
"/ep-572-earth/viewer?title_no=352&episode_no=572"), {
- "url": "11041d71a3f92728305c11a228e77cf0f7aa02ef",
+ "url": "55bec5d7c42aba19e3d0d56db25fdf0b0b13be38",
"content": ("1748c7e82b6db910fa179f6dc7c4281b0f680fa7",
"42055e44659f6ffc410b3fb6557346dfbb993df3",
"49e1f2def04c6f7a6a3dacf245a1cd9abe77a6a9"),
@@ -62,7 +62,6 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
url = "{}/{}/viewer?{}".format(self.root, self.path, query)
GalleryExtractor.__init__(self, match, url)
self.setup_agegate_cookies()
- self.session.headers["Referer"] = url
query = text.parse_query(query)
self.title_no = query.get("title_no")
@@ -88,7 +87,7 @@ class WebtoonsEpisodeExtractor(WebtoonsBase, GalleryExtractor):
@staticmethod
def images(page):
return [
- (url, None)
+ (url.replace("://webtoon-phinf.", "://swebtoon-phinf."), None)
for url in text.extract_iter(
page, 'class="_images" data-url="', '"')
]
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index 0922c7c..0a55532 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -32,8 +32,8 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor):
test = (
("https://www.xvideos.com/profiles/pervertedcouple/photos/751031", {
"count": 8,
- "pattern": r"https://profile-pics-l3\.xvideos-cdn\.com"
- r"/[0-9a-f]{40}-\d+/videos/profiles/galleries/84/ca/37"
+ "pattern": r"https://profile-pics-cdn\d+\.xvideos-cdn\.com"
+ r"/[^/]+\,\d+/videos/profiles/galleries/84/ca/37"
r"/pervertedcouple/gal751031/pic_\d+_big\.jpg",
"keyword": {
"gallery": {
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index d380dab..8eb0c83 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -9,7 +9,7 @@
"""Extractors for sites supported by youtube-dl"""
from .common import Extractor, Message
-from .. import text, config, exception
+from .. import ytdl, config, exception
class YoutubeDLExtractor(Extractor):
@@ -54,52 +54,45 @@ class YoutubeDLExtractor(Extractor):
self.log.debug("Using %s", ytdl_module)
# construct YoutubeDL object
- options = {
- "format" : self.config("format"),
+ extr_opts = {
+ "extract_flat" : "in_playlist",
+ "force_generic_extractor": self.force_generic_extractor,
+ }
+ user_opts = {
"retries" : self._retries,
"socket_timeout" : self._timeout,
"nocheckcertificate" : not self._verify,
- "proxy" : self.session.proxies.get("http"),
- "force_generic_extractor": self.force_generic_extractor,
- "nopart" : not self.config("part", True),
- "updatetime" : self.config("mtime", True),
- "ratelimit" : text.parse_bytes(
- self.config("rate"), None),
- "min_filesize" : text.parse_bytes(
- self.config("filesize-min"), None),
- "max_filesize" : text.parse_bytes(
- self.config("filesize-max"), None),
}
- raw_options = self.config("raw-options")
- if raw_options:
- options.update(raw_options)
- if self.config("logging", True):
- options["logger"] = self.log
- options["extract_flat"] = "in_playlist"
-
username, password = self._get_auth_info()
if username:
- options["username"], options["password"] = username, password
+ user_opts["username"], user_opts["password"] = username, password
del username, password
- ytdl = ytdl_module.YoutubeDL(options)
+ ytdl_instance = ytdl.construct_YoutubeDL(
+ ytdl_module, self, user_opts, extr_opts)
# transfer cookies to ytdl
cookies = self.session.cookies
if cookies:
- set_cookie = self.ytdl.cookiejar.set_cookie
- for cookie in self.session.cookies:
+ set_cookie = ytdl_instance.cookiejar.set_cookie
+ for cookie in cookies:
set_cookie(cookie)
# extract youtube_dl info_dict
- info_dict = ytdl._YoutubeDL__extract_info(
- self.ytdl_url,
- ytdl.get_info_extractor(self.ytdl_ie_key),
- False, {}, True)
-
- if "entries" in info_dict:
- results = self._process_entries(ytdl, info_dict["entries"])
+ try:
+ info_dict = ytdl_instance._YoutubeDL__extract_info(
+ self.ytdl_url,
+ ytdl_instance.get_info_extractor(self.ytdl_ie_key),
+ False, {}, True)
+ except ytdl_module.utils.YoutubeDLError:
+ raise exception.StopExtraction("Failed to extract video data")
+
+ if not info_dict:
+ return
+ elif "entries" in info_dict:
+ results = self._process_entries(
+ ytdl_module, ytdl_instance, info_dict["entries"])
else:
results = (info_dict,)
@@ -107,7 +100,7 @@ class YoutubeDLExtractor(Extractor):
for info_dict in results:
info_dict["extension"] = None
info_dict["_ytdl_info_dict"] = info_dict
- info_dict["_ytdl_instance"] = ytdl
+ info_dict["_ytdl_instance"] = ytdl_instance
url = "ytdl:" + (info_dict.get("url") or
info_dict.get("webpage_url") or
@@ -116,15 +109,23 @@ class YoutubeDLExtractor(Extractor):
yield Message.Directory, info_dict
yield Message.Url, url, info_dict
- def _process_entries(self, ytdl, entries):
+ def _process_entries(self, ytdl_module, ytdl_instance, entries):
for entry in entries:
- if entry.get("_type") in ("url", "url_transparent"):
- info_dict = ytdl.extract_info(
- entry["url"], False,
- ie_key=entry.get("ie_key"))
- if "entries" in info_dict:
+ if not entry:
+ continue
+ elif entry.get("_type") in ("url", "url_transparent"):
+ try:
+ info_dict = ytdl_instance.extract_info(
+ entry["url"], False,
+ ie_key=entry.get("ie_key"))
+ except ytdl_module.utils.YoutubeDLError:
+ continue
+
+ if not info_dict:
+ continue
+ elif "entries" in info_dict:
yield from self._process_entries(
- ytdl, info_dict["entries"])
+ ytdl_module, ytdl_instance, info_dict["entries"])
else:
yield info_dict
else: