summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor
diff options
context:
space:
mode:
author: Unit 193 <unit193@unit193.net> 2024-01-08 03:22:24 -0500
committer: Unit 193 <unit193@unit193.net> 2024-01-08 03:22:24 -0500
commit: e949aaf6f6ac93896947d5b736e48e7911926efb (patch)
tree: b73090d78cd83dee0f85b385a25dcf623ac12f2d /gallery_dl/extractor
parent: 4d7a4f1ecef2c96269f3590335d2834ebcdd50bf (diff)
New upstream version 1.26.6. (ref: upstream/1.26.6)
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r-- gallery_dl/extractor/__init__.py | 4
-rw-r--r-- gallery_dl/extractor/batoto.py | 118
-rw-r--r-- gallery_dl/extractor/chevereto.py | 2
-rw-r--r-- gallery_dl/extractor/common.py | 7
-rw-r--r-- gallery_dl/extractor/deviantart.py | 56
-rw-r--r-- gallery_dl/extractor/fanbox.py | 59
-rw-r--r-- gallery_dl/extractor/gelbooru.py | 20
-rw-r--r-- gallery_dl/extractor/gelbooru_v02.py | 2
-rw-r--r-- gallery_dl/extractor/idolcomplex.py | 17
-rw-r--r-- gallery_dl/extractor/imagechest.py | 2
-rw-r--r-- gallery_dl/extractor/komikcast.py | 20
-rw-r--r-- gallery_dl/extractor/lynxchan.py | 8
-rw-r--r-- gallery_dl/extractor/manganelo.py | 9
-rw-r--r-- gallery_dl/extractor/nijie.py | 7
-rw-r--r-- gallery_dl/extractor/nudecollect.py | 87
-rw-r--r-- gallery_dl/extractor/paheal.py | 13
-rw-r--r-- gallery_dl/extractor/pinterest.py | 3
-rw-r--r-- gallery_dl/extractor/poringa.py | 138
-rw-r--r-- gallery_dl/extractor/rule34us.py | 6
-rw-r--r-- gallery_dl/extractor/szurubooru.py | 4
-rw-r--r-- gallery_dl/extractor/twitter.py | 4
-rw-r--r-- gallery_dl/extractor/weibo.py | 3
-rw-r--r-- gallery_dl/extractor/zzup.py | 40
23 files changed, 482 insertions, 147 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 695b8b2..9e33f2c 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -24,6 +24,7 @@ modules = [
"architizer",
"artstation",
"aryion",
+ "batoto",
"bbc",
"behance",
"blogger",
@@ -107,7 +108,6 @@ modules = [
"nitter",
"nozomi",
"nsfwalbum",
- "nudecollect",
"paheal",
"patreon",
"philomena",
@@ -122,6 +122,7 @@ modules = [
"pixnet",
"plurk",
"poipiku",
+ "poringa",
"pornhub",
"pornpics",
"postmill",
@@ -177,6 +178,7 @@ modules = [
"xhamster",
"xvideos",
"zerochan",
+ "zzup",
"booru",
"moebooru",
"foolfuuka",
diff --git a/gallery_dl/extractor/batoto.py b/gallery_dl/extractor/batoto.py
new file mode 100644
index 0000000..cd6302e
--- /dev/null
+++ b/gallery_dl/extractor/batoto.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://bato.to/"""
+
+from .common import Extractor, ChapterExtractor, MangaExtractor
+from .. import text, exception
+import re
+
+BASE_PATTERN = (r"(?:https?://)?"
+ r"(?:(?:ba|d|w)to\.to|\.to|(?:batotoo|mangatoto)\.com)")
+
+
+class BatotoBase():
+ """Base class for batoto extractors"""
+ category = "batoto"
+ root = "https://bato.to"
+
+ def request(self, url, **kwargs):
+ kwargs["encoding"] = "utf-8"
+ return Extractor.request(self, url, **kwargs)
+
+
+class BatotoChapterExtractor(BatotoBase, ChapterExtractor):
+ """Extractor for bato.to manga chapters"""
+ pattern = BASE_PATTERN + r"/(?:title/[^/?#]+|chapter)/(\d+)"
+ example = "https://bato.to/title/12345-MANGA/54321"
+
+ def __init__(self, match):
+ self.root = text.root_from_url(match.group(0))
+ self.chapter_id = match.group(1)
+ url = "{}/title/0/{}".format(self.root, self.chapter_id)
+ ChapterExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ manga, info, _ = extr("<title>", "<").rsplit(" - ", 3)
+ manga_id = extr("/title/", "/")
+
+ match = re.match(
+ r"(?:Volume\s+(\d+) )?"
+ r"\w+\s+(\d+)(.*)", info)
+ if match:
+ volume, chapter, minor = match.groups()
+ title = text.remove_html(extr(
+ "selected>", "</option")).partition(" : ")[2]
+ else:
+ volume = chapter = 0
+ minor = ""
+ title = info
+
+ return {
+ "manga" : text.unescape(manga),
+ "manga_id" : text.parse_int(manga_id),
+ "title" : text.unescape(title),
+ "volume" : text.parse_int(volume),
+ "chapter" : text.parse_int(chapter),
+ "chapter_minor": minor,
+ "chapter_id" : text.parse_int(self.chapter_id),
+ "date" : text.parse_timestamp(extr(' time="', '"')[:-3]),
+ }
+
+ def images(self, page):
+ images_container = text.extr(page, 'pageOpts', ':[0,0]}"')
+ images_container = text.unescape(images_container)
+ return [
+ (url, None)
+ for url in text.extract_iter(images_container, r"\"", r"\"")
+ ]
+
+
+class BatotoMangaExtractor(BatotoBase, MangaExtractor):
+ """Extractor for bato.to manga"""
+ reverse = False
+ chapterclass = BatotoChapterExtractor
+ pattern = BASE_PATTERN + r"/(?:title|series)/(\d+)[^/?#]*/?$"
+ example = "https://bato.to/title/12345-MANGA/"
+
+ def __init__(self, match):
+ self.root = text.root_from_url(match.group(0))
+ self.manga_id = match.group(1)
+ url = "{}/title/{}".format(self.root, self.manga_id)
+ MangaExtractor.__init__(self, match, url)
+
+ def chapters(self, page):
+ extr = text.extract_from(page)
+
+ warning = extr(' class="alert alert-warning">', "</div><")
+ if warning:
+ raise exception.StopExtraction("'%s'", text.remove_html(warning))
+
+ data = {
+ "manga_id": text.parse_int(self.manga_id),
+ "manga" : text.unescape(extr(
+ "<title>", "<").rpartition(" - ")[0]),
+ }
+
+ extr('<div data-hk="0-0-0-0"', "")
+ results = []
+ while True:
+ href = extr('<a href="/title/', '"')
+ if not href:
+ break
+
+ chapter = href.rpartition("-ch_")[2]
+ chapter, sep, minor = chapter.partition(".")
+
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = sep + minor
+ data["date"] = text.parse_datetime(
+ extr('time="', '"'), "%Y-%m-%dT%H:%M:%S.%fZ")
+
+ url = "{}/title/{}".format(self.root, href)
+ results.append((url, data.copy()))
+ return results
diff --git a/gallery_dl/extractor/chevereto.py b/gallery_dl/extractor/chevereto.py
index 21166bd..2bf200b 100644
--- a/gallery_dl/extractor/chevereto.py
+++ b/gallery_dl/extractor/chevereto.py
@@ -35,7 +35,7 @@ class CheveretoExtractor(BaseExtractor):
BASE_PATTERN = CheveretoExtractor.update({
"jpgfish": {
- "root": "https://jpg2.su",
+ "root": "https://jpg4.su",
"pattern": r"jpe?g\d?\.(?:su|pet|fish(?:ing)?|church)",
},
"pixl": {
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 9b010c5..0dd05ef 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -526,12 +526,15 @@ class Extractor():
if include == "all":
include = extractors
elif isinstance(include, str):
- include = include.split(",")
+ include = include.replace(" ", "").split(",")
result = [(Message.Version, 1)]
for category in include:
- if category in extractors:
+ try:
extr, url = extractors[category]
+ except KeyError:
+ self.log.warning("Invalid include '%s'", category)
+ else:
result.append((Message.Queue, url, {"_extractor": extr}))
return iter(result)
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 2ba47e1..4b5f1d7 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -38,7 +38,7 @@ class DeviantartExtractor(Extractor):
def __init__(self, match):
Extractor.__init__(self, match)
- self.user = match.group(1) or match.group(2)
+ self.user = (match.group(1) or match.group(2)).lower()
self.offset = 0
def _init(self):
@@ -104,7 +104,6 @@ class DeviantartExtractor(Extractor):
raise exception.StopExtraction()
else:
self.subcategory = "group-" + self.subcategory
- self.user = self.user.lower()
self.group = True
for deviation in self.deviations():
@@ -513,11 +512,13 @@ class DeviantartUserExtractor(DeviantartExtractor):
def items(self):
base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors((
- (DeviantartGalleryExtractor , base + "gallery"),
- (DeviantartScrapsExtractor , base + "gallery/scraps"),
- (DeviantartJournalExtractor , base + "posts"),
- (DeviantartStatusExtractor , base + "posts/statuses"),
- (DeviantartFavoriteExtractor, base + "favourites"),
+ (DeviantartAvatarExtractor , base + "avatar"),
+ (DeviantartBackgroundExtractor, base + "banner"),
+ (DeviantartGalleryExtractor , base + "gallery"),
+ (DeviantartScrapsExtractor , base + "gallery/scraps"),
+ (DeviantartJournalExtractor , base + "posts"),
+ (DeviantartStatusExtractor , base + "posts/statuses"),
+ (DeviantartFavoriteExtractor , base + "favourites"),
), ("gallery",))
@@ -538,6 +539,47 @@ class DeviantartGalleryExtractor(DeviantartExtractor):
return self._folder_urls(folders, "gallery", DeviantartFolderExtractor)
+class DeviantartAvatarExtractor(DeviantartExtractor):
+ """Extractor for an artist's avatar"""
+ subcategory = "avatar"
+ archive_fmt = "a_{_username}_{index}"
+ pattern = BASE_PATTERN + r"/avatar"
+ example = "https://www.deviantart.com/USER/avatar/"
+
+ def deviations(self):
+ profile = self.api.user_profile(self.user.lower())
+ if profile:
+ url = profile["user"]["usericon"]
+ return ({
+ "author" : profile["user"],
+ "category" : "avatar",
+ "index" : text.parse_int(url.rpartition("?")[2]),
+ "is_deleted" : False,
+ "is_downloadable": False,
+ "published_time" : 0,
+ "title" : "avatar",
+ "content" : {
+ "src": url.replace("/avatars/", "/avatars-big/", 1),
+ },
+ },)
+ return ()
+
+
+class DeviantartBackgroundExtractor(DeviantartExtractor):
+ """Extractor for an artist's banner"""
+ subcategory = "background"
+ archive_fmt = "b_{index}"
+ pattern = BASE_PATTERN + r"/ba(?:nner|ckground)"
+ example = "https://www.deviantart.com/USER/banner/"
+
+ def deviations(self):
+ try:
+ return (self.api.user_profile(self.user.lower())
+ ["cover_deviation"]["cover_deviation"],)
+ except Exception:
+ return ()
+
+
class DeviantartFolderExtractor(DeviantartExtractor):
"""Extractor for deviations inside an artist's gallery folder"""
subcategory = "folder"
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index 4572bea..61a3928 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -8,6 +8,7 @@
from .common import Extractor, Message
from .. import text
+from ..cache import memcache
import re
BASE_PATTERN = (
@@ -27,8 +28,20 @@ class FanboxExtractor(Extractor):
_warning = True
def _init(self):
+ self.headers = {"Origin": self.root}
self.embeds = self.config("embeds", True)
+ includes = self.config("metadata")
+ if includes:
+ if isinstance(includes, str):
+ includes = includes.split(",")
+ elif not isinstance(includes, (list, tuple)):
+ includes = ("user", "plan")
+ self._meta_user = ("user" in includes)
+ self._meta_plan = ("plan" in includes)
+ else:
+ self._meta_user = self._meta_plan = False
+
if self._warning:
if not self.cookies_check(("FANBOXSESSID",)):
self.log.warning("no 'FANBOXSESSID' cookie set")
@@ -43,11 +56,9 @@ class FanboxExtractor(Extractor):
"""Return all relevant post objects"""
def _pagination(self, url):
- headers = {"Origin": self.root}
-
while url:
url = text.ensure_http_scheme(url)
- body = self.request(url, headers=headers).json()["body"]
+ body = self.request(url, headers=self.headers).json()["body"]
for item in body["items"]:
try:
yield self._get_post_data(item["id"])
@@ -58,9 +69,8 @@ class FanboxExtractor(Extractor):
def _get_post_data(self, post_id):
"""Fetch and process post data"""
- headers = {"Origin": self.root}
url = "https://api.fanbox.cc/post.info?postId="+post_id
- post = self.request(url, headers=headers).json()["body"]
+ post = self.request(url, headers=self.headers).json()["body"]
content_body = post.pop("body", None)
if content_body:
@@ -98,8 +108,47 @@ class FanboxExtractor(Extractor):
post["text"] = content_body.get("text") if content_body else None
post["isCoverImage"] = False
+ if self._meta_user:
+ post["user"] = self._get_user_data(post["creatorId"])
+ if self._meta_plan:
+ plans = self._get_plan_data(post["creatorId"])
+ post["plan"] = plans[post["feeRequired"]]
+
return content_body, post
+ @memcache(keyarg=1)
+ def _get_user_data(self, creator_id):
+ url = "https://api.fanbox.cc/creator.get"
+ params = {"creatorId": creator_id}
+ data = self.request(url, params=params, headers=self.headers).json()
+
+ user = data["body"]
+ user.update(user.pop("user"))
+
+ return user
+
+ @memcache(keyarg=1)
+ def _get_plan_data(self, creator_id):
+ url = "https://api.fanbox.cc/plan.listCreator"
+ params = {"creatorId": creator_id}
+ data = self.request(url, params=params, headers=self.headers).json()
+
+ plans = {0: {
+ "id" : "",
+ "title" : "",
+ "fee" : 0,
+ "description" : "",
+ "coverImageUrl" : "",
+ "creatorId" : creator_id,
+ "hasAdultContent": None,
+ "paymentMethod" : None,
+ }}
+ for plan in data["body"]:
+ del plan["user"]
+ plans[plan["fee"]] = plan
+
+ return plans
+
def _get_urls_from_post(self, content_body, post):
num = 0
cover_image = post.get("coverImageUrl")
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index b62ff78..eba1539 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -32,10 +32,13 @@ class GelbooruBase():
url = self.root + "/index.php?page=dapi&q=index&json=1"
data = self.request(url, params=params).json()
- if key not in data:
- return ()
+ try:
+ posts = data[key]
+ except KeyError:
+ self.log.error("Incomplete API response (missing '%s')", key)
+ self.log.debug("%s", data)
+ return []
- posts = data[key]
if not isinstance(posts, list):
return (posts,)
return posts
@@ -165,15 +168,16 @@ class GelbooruFavoriteExtractor(GelbooruBase,
"id" : self.favorite_id,
"limit": "1",
}
- count = self._api_request(params, "@attributes")[0]["count"]
+ count = self._api_request(params, "@attributes")[0]["count"]
if count <= self.offset:
return
- pnum, last = divmod(count + 1, self.per_page)
- if self.offset >= last:
+ pnum, last = divmod(count-1, self.per_page)
+ if self.offset > last:
+ # page number change
self.offset -= last
- diff, self.offset = divmod(self.offset, self.per_page)
+ diff, self.offset = divmod(self.offset-1, self.per_page)
pnum -= diff + 1
skip = self.offset
@@ -183,8 +187,8 @@ class GelbooruFavoriteExtractor(GelbooruBase,
while True:
favs = self._api_request(params, "favorite")
-
favs.reverse()
+
if skip:
favs = favs[skip:]
skip = 0
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index 0864b9f..0c8af3d 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -168,7 +168,7 @@ INSTANCES = {
},
"rule34": {
"root": "https://rule34.xxx",
- "pattern": r"rule34\.xxx",
+ "pattern": r"(?:www\.)?rule34\.xxx",
"api_root": "https://api.rule34.xxx",
},
"safebooru": {
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index b9e2c3d..f70a948 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -34,8 +34,11 @@ class IdolcomplexExtractor(SankakuExtractor):
self.start_post = 0
def _init(self):
+ self.find_pids = re.compile(
+ r" href=[\"#]/\w\w/posts/([0-9a-f]+)"
+ ).findall
self.find_tags = re.compile(
- r'tag-type-([^"]+)">\s*<div [^>]+>\s*<a href="/\?tags=([^"]+)'
+ r'tag-type-([^"]+)">\s*<a [^>]*?href="/[^?]*\?tags=([^"]+)'
).findall
def items(self):
@@ -149,8 +152,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
- example = "https://idol.sankakucomplex.com/?tags=TAGS"
+ pattern = BASE_PATTERN + r"/(?:posts/?)?\?([^#]*)"
+ example = "https://idol.sankakucomplex.com/en/posts?tags=TAGS"
per_page = 20
def __init__(self, match):
@@ -196,7 +199,8 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
page = self.request(self.root, params=params, retries=10).text
pos = ((page.find('id="more-popular-posts-link"') + 1) or
(page.find('<span class="thumb') + 1))
- yield from text.extract_iter(page, ' href="/posts/', '"', pos)
+
+ yield from self.find_pids(page, pos)
next_url = text.extract(page, 'next-page-url="', '"', pos)[0]
if not next_url:
@@ -218,7 +222,7 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}"
- pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pools?/show/(\d+)"
+ pattern = BASE_PATTERN + r"/pools?/show/(\d+)"
example = "https://idol.sankakucomplex.com/pools/show/12345"
per_page = 24
@@ -242,8 +246,7 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
while True:
page = self.request(url, params=params, retries=10).text
pos = page.find('id="pool-show"') + 1
- post_ids = list(text.extract_iter(
- page, ' href="/posts/', '"', pos))
+ post_ids = self.find_pids(page, pos)
yield from post_ids
if len(post_ids) < self.per_page:
diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py
index 9aa0332..9199d12 100644
--- a/gallery_dl/extractor/imagechest.py
+++ b/gallery_dl/extractor/imagechest.py
@@ -44,7 +44,7 @@ class ImagechestGalleryExtractor(GalleryExtractor):
}
def images(self, page):
- if " More Files</button>" in page:
+ if ' load-all">' in page:
url = "{}/p/{}/loadAll".format(self.root, self.gallery_id)
headers = {
"X-Requested-With": "XMLHttpRequest",
diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py
index a3e0130..7a19be5 100644
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@@ -6,19 +6,19 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://komikcast.site/"""
+"""Extractors for https://komikcast.lol/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:site|me|com)"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?komikcast\.(?:lol|site|me|com)"
class KomikcastBase():
"""Base class for komikcast extractors"""
category = "komikcast"
- root = "https://komikcast.site"
+ root = "https://komikcast.lol"
@staticmethod
def parse_chapter_string(chapter_string, data=None):
@@ -46,9 +46,9 @@ class KomikcastBase():
class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
- """Extractor for manga-chapters from komikcast.site"""
+ """Extractor for manga-chapters from komikcast.lol"""
pattern = BASE_PATTERN + r"(/chapter/[^/?#]+/)"
- example = "https://komikcast.site/chapter/TITLE/"
+ example = "https://komikcast.lol/chapter/TITLE/"
def metadata(self, page):
info = text.extr(page, "<title>", " - Komikcast<")
@@ -65,10 +65,10 @@ class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
- """Extractor for manga from komikcast.site"""
+ """Extractor for manga from komikcast.lol"""
chapterclass = KomikcastChapterExtractor
pattern = BASE_PATTERN + r"(/(?:komik/)?[^/?#]+)/?$"
- example = "https://komikcast.site/komik/TITLE"
+ example = "https://komikcast.lol/komik/TITLE"
def chapters(self, page):
results = []
@@ -76,8 +76,10 @@ class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
for item in text.extract_iter(
page, '<a class="chapter-link-item" href="', '</a'):
- url, _, chapter_string = item.rpartition('">Chapter ')
- self.parse_chapter_string(chapter_string, data)
+ url, _, chapter = item.rpartition('">Chapter')
+ chapter, sep, minor = chapter.strip().partition(".")
+ data["chapter"] = text.parse_int(chapter)
+ data["chapter_minor"] = sep + minor
results.append((url, data.copy()))
return results
diff --git a/gallery_dl/extractor/lynxchan.py b/gallery_dl/extractor/lynxchan.py
index 0edd5c1..85b3fef 100644
--- a/gallery_dl/extractor/lynxchan.py
+++ b/gallery_dl/extractor/lynxchan.py
@@ -18,8 +18,8 @@ class LynxchanExtractor(BaseExtractor):
BASE_PATTERN = LynxchanExtractor.update({
"bbw-chan": {
- "root": "https://bbw-chan.nl",
- "pattern": r"bbw-chan\.nl",
+ "root": "https://bbw-chan.link",
+ "pattern": r"bbw-chan\.(?:link|nl)",
},
"kohlchan": {
"root": "https://kohlchan.net",
@@ -40,7 +40,7 @@ class LynxchanThreadExtractor(LynxchanExtractor):
filename_fmt = "{postId}{num:?-//} {filename}.{extension}"
archive_fmt = "{boardUri}_{postId}_{num}"
pattern = BASE_PATTERN + r"/([^/?#]+)/res/(\d+)"
- example = "https://bbw-chan.nl/a/res/12345.html"
+ example = "https://endchan.org/a/res/12345.html"
def __init__(self, match):
LynxchanExtractor.__init__(self, match)
@@ -71,7 +71,7 @@ class LynxchanBoardExtractor(LynxchanExtractor):
"""Extractor for LynxChan boards"""
subcategory = "board"
pattern = BASE_PATTERN + r"/([^/?#]+)(?:/index|/catalog|/\d+|/?$)"
- example = "https://bbw-chan.nl/a/"
+ example = "https://endchan.org/a/"
def __init__(self, match):
LynxchanExtractor.__init__(self, match)
diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py
index 46019ad..232b98d 100644
--- a/gallery_dl/extractor/manganelo.py
+++ b/gallery_dl/extractor/manganelo.py
@@ -10,7 +10,11 @@ from .common import ChapterExtractor, MangaExtractor
from .. import text
import re
-BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)"
+BASE_PATTERN = (
+ r"(?:https?://)?"
+ r"((?:chap|read|www\.|m\.)?mangan(?:at|el)o"
+ r"\.(?:to|com))"
+)
class ManganeloBase():
@@ -67,10 +71,11 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor):
def images(self, page):
page = text.extr(
- page, 'class="container-chapter-reader', '\n<div')
+ page, 'class="container-chapter-reader', 'class="container')
return [
(url, None)
for url in text.extract_iter(page, '<img src="', '"')
+ if not url.endswith("/gohome.png")
] or [
(url, None)
for url in text.extract_iter(
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 57c3118..b991705 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -55,9 +55,12 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor):
else:
data["user_id"] = data["artist_id"]
data["user_name"] = data["artist_name"]
- yield Message.Directory, data
- for num, url in enumerate(self._extract_images(image_id, page)):
+ urls = list(self._extract_images(image_id, page))
+ data["count"] = len(urls)
+
+ yield Message.Directory, data
+ for num, url in enumerate(urls):
image = text.nameext_from_url(url, {
"num": num,
"url": "https:" + url,
diff --git a/gallery_dl/extractor/nudecollect.py b/gallery_dl/extractor/nudecollect.py
deleted file mode 100644
index bda5d77..0000000
--- a/gallery_dl/extractor/nudecollect.py
+++ /dev/null
@@ -1,87 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://nudecollect.com/"""
-
-from .common import GalleryExtractor
-from .. import text
-
-
-class NudecollectExtractor(GalleryExtractor):
- """Base class for Nudecollect extractors"""
- category = "nudecollect"
- directory_fmt = ("{category}", "{title}")
- filename_fmt = "{slug}_{num:>03}.{extension}"
- archive_fmt = "{slug}_{num}"
- root = "https://www.nudecollect.com"
-
- def request(self, url, **kwargs):
- kwargs["allow_redirects"] = False
- return GalleryExtractor.request(self, url, **kwargs)
-
- @staticmethod
- def get_title(page):
- return text.unescape(text.extr(page, "<title>", "</title>"))[31:]
-
- @staticmethod
- def get_image(page):
- return text.extr(page, '<img src="', '"')
-
-
-class NudecollectImageExtractor(NudecollectExtractor):
- """Extractor for individual images from nudecollect.com"""
- subcategory = "image"
- pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
- r"(/content/([^/?#]+)/image-(\d+)-pics-(\d+)"
- r"-mirror-(\d+)\.html)")
- example = ("https://www.nudecollect.com/content/12345_TITLE"
- "/image-1-pics-108-mirror-1.html")
-
- def __init__(self, match):
- NudecollectExtractor.__init__(self, match)
- _, self.slug, self.num, self.count, self.mirror = match.groups()
-
- def metadata(self, page):
- return {
- "slug" : self.slug,
- "title" : self.get_title(page),
- "count" : text.parse_int(self.count),
- "mirror": text.parse_int(self.mirror),
- }
-
- def images(self, page):
- return ((self.get_image(page), {"num": text.parse_int(self.num)}),)
-
-
-class NudecollectAlbumExtractor(NudecollectExtractor):
- """Extractor for image albums on nudecollect.com"""
- subcategory = "album"
- pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
- r"/content/([^/?#]+)/(?:index-mirror-(\d+)-(\d+)"
- r"|page-\d+-pics-(\d+)-mirror-(\d+))\.html")
- example = ("https://www.nudecollect.com/content/12345_TITLE"
- "/index-mirror-01-123.html")
-
- def __init__(self, match):
- self.slug = match.group(1)
- self.mirror = match.group(2) or match.group(5)
- self.count = text.parse_int(match.group(3) or match.group(4))
- url = "{}/content/{}/image-1-pics-{}-mirror-{}.html".format(
- self.root, self.slug, self.count, self.mirror)
- NudecollectExtractor.__init__(self, match, url)
-
- def metadata(self, page):
- return {
- "slug" : self.slug,
- "title" : self.get_title(page),
- "mirror": text.parse_int(self.mirror),
- }
-
- def images(self, page):
- url = self.get_image(page)
- p1, _, p2 = url.partition("/image0")
- ufmt = p1 + "/image{:>05}" + p2[4:]
- return [(ufmt.format(num), None) for num in range(1, self.count + 1)]
diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py
index 0389ead..89c0d2f 100644
--- a/gallery_dl/extractor/paheal.py
+++ b/gallery_dl/extractor/paheal.py
@@ -32,7 +32,7 @@ class PahealExtractor(Extractor):
post["tags"] = text.unquote(post["tags"])
post.update(data)
yield Message.Directory, post
- yield Message.Url, url, text.nameext_from_url(url, post)
+ yield Message.Url, url, post
def get_metadata(self):
"""Return general metadata"""
@@ -59,11 +59,13 @@ class PahealExtractor(Extractor):
extr(">Source&nbsp;Link<", "</td>"), "href='", "'")),
}
- dimensions, size, ext = extr("Info</th><td>", ">").split(" // ")
- post["width"], _, height = dimensions.partition("x")
+ dimensions, size, ext = extr("Info</th><td>", "<").split(" // ")
post["size"] = text.parse_bytes(size[:-1])
+ post["width"], _, height = dimensions.partition("x")
post["height"], _, duration = height.partition(", ")
post["duration"] = text.parse_float(duration[:-1])
+ post["filename"] = "{} - {}".format(post_id, post["tags"])
+ post["extension"] = ext
return post
@@ -112,6 +114,7 @@ class PahealTagExtractor(PahealExtractor):
tags, data, date = data.split("\n")
dimensions, size, ext = data.split(" // ")
+ tags = text.unescape(tags)
width, _, height = dimensions.partition("x")
height, _, duration = height.partition(", ")
@@ -119,9 +122,11 @@ class PahealTagExtractor(PahealExtractor):
"id": pid, "md5": md5, "file_url": url,
"width": width, "height": height,
"duration": text.parse_float(duration[:-1]),
- "tags": text.unescape(tags),
+ "tags": tags,
"size": text.parse_bytes(size[:-1]),
"date": text.parse_datetime(date, "%B %d, %Y; %H:%M"),
+ "filename" : "{} - {}".format(pid, tags),
+ "extension": ext,
}
def _extract_data_ex(self, post):
diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py
index 4b26393..c46a587 100644
--- a/gallery_dl/extractor/pinterest.py
+++ b/gallery_dl/extractor/pinterest.py
@@ -47,6 +47,7 @@ class PinterestExtractor(Extractor):
carousel_data = pin.get("carousel_data")
if carousel_data:
+ pin["count"] = len(carousel_data["carousel_slots"])
for num, slot in enumerate(carousel_data["carousel_slots"], 1):
slot["media_id"] = slot.pop("id")
pin.update(slot)
@@ -65,7 +66,7 @@ class PinterestExtractor(Extractor):
if videos or media.get("duration") is None:
pin.update(media)
- pin["num"] = 0
+ pin["num"] = pin["count"] = 1
pin["media_id"] = ""
url = media["url"]
diff --git a/gallery_dl/extractor/poringa.py b/gallery_dl/extractor/poringa.py
new file mode 100644
index 0000000..0149d06
--- /dev/null
+++ b/gallery_dl/extractor/poringa.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for http://www.poringa.net/"""
+
+from .common import Extractor, Message
+from .. import text, exception
+from ..cache import cache
+import itertools
+
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?poringa\.net"
+
+
+class PoringaExtractor(Extractor):
+ category = "poringa"
+ directory_fmt = ("{category}", "{user}", "{post_id}")
+ filename_fmt = "{post_id}_{title}_{num:>03}_{filename}.{extension}"
+ archive_fmt = "{post_id}_{num}"
+ root = "http://www.poringa.net"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.item = match.group(1)
+ self.__cookies = True
+
+ def items(self):
+ for post_id in self.posts():
+ url = "{}/posts/imagenes/{}".format(self.root, post_id)
+
+ try:
+ response = self.request(url)
+ except exception.HttpError as exc:
+ self.log.warning(
+ "Unable to fetch posts for '%s' (%s)", post_id, exc)
+ continue
+
+ if "/registro-login?" in response.url:
+ self.log.warning("Private post '%s'", post_id)
+ continue
+
+ page = response.text
+ title, pos = text.extract(
+ page, 'property="og:title" content="', '"')
+
+ try:
+ pos = page.index('<div class="main-info', pos)
+ user, pos = text.extract(
+ page, 'href="http://www.poringa.net/', '"', pos)
+ except ValueError:
+ user = None
+
+ if not user:
+ user = "poringa"
+
+ data = {
+ "post_id" : post_id,
+ "title" : text.unescape(title),
+ "user" : text.unquote(user),
+ "_http_headers": {"Referer": url},
+ }
+
+ main_post = text.extr(
+ page, 'property="dc:content" role="main">', '</div>')
+ urls = list(text.extract_iter(
+ main_post, '<img class="imagen" border="0" src="', '"'))
+ data["count"] = len(urls)
+
+ yield Message.Directory, data
+ for data["num"], url in enumerate(urls, 1):
+ yield Message.Url, url, text.nameext_from_url(url, data)
+
+ def posts(self):
+ return ()
+
+ def request(self, url, **kwargs):
+ if self.__cookies:
+ self.__cookies = False
+ self.cookies_update(_cookie_cache())
+
+ for _ in range(5):
+ response = Extractor.request(self, url, **kwargs)
+ if response.cookies:
+ _cookie_cache.update("", response.cookies)
+ if response.content.find(
+ b"<title>Please wait a few moments</title>", 0, 600) < 0:
+ return response
+ self.sleep(5.0, "check")
+
+ def _pagination(self, url, params):
+ for params["p"] in itertools.count(1):
+ page = self.request(url, params=params).text
+
+ posts_ids = PoringaPostExtractor.pattern.findall(page)
+ posts_ids = list(dict.fromkeys(posts_ids))
+ yield from posts_ids
+
+ if len(posts_ids) < 19:
+ return
+
+
+class PoringaPostExtractor(PoringaExtractor):
+ """Extractor for posts on poringa.net"""
+ subcategory = "post"
+ pattern = BASE_PATTERN + r"/posts/imagenes/(\d+)"
+ example = "http://www.poringa.net/posts/imagenes/12345/TITLE.html"
+
+ def posts(self):
+ return (self.item,)
+
+
+class PoringaUserExtractor(PoringaExtractor):
+ subcategory = "user"
+ pattern = BASE_PATTERN + r"/(\w+)$"
+ example = "http://www.poringa.net/USER"
+
+ def posts(self):
+ url = self.root + "/buscar/"
+ params = {"q": self.item}
+ return self._pagination(url, params)
+
+
+class PoringaSearchExtractor(PoringaExtractor):
+ subcategory = "search"
+ pattern = BASE_PATTERN + r"/buscar/\?&?q=([^&#]+)"
+ example = "http://www.poringa.net/buscar/?q=QUERY"
+
+ def posts(self):
+ url = self.root + "/buscar/"
+ params = {"q": self.item}
+ return self._pagination(url, params)
+
+
+@cache()
+def _cookie_cache():
+ return ()
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
index 6439a22..cf70ccc 100644
--- a/gallery_dl/extractor/rule34us.py
+++ b/gallery_dl/extractor/rule34us.py
@@ -38,7 +38,11 @@ class Rule34usExtractor(BooruExtractor):
"height" : extr(' x ', 'h'),
"file_url": extr(' src="', '"'),
}
- post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+
+ url = post["file_url"]
+ if "//video-cdn1." in url:
+ post["_fallback"] = (url.replace("//video-cdn1.", "//video."),)
+ post["md5"] = url.rpartition("/")[2].partition(".")[0]
tags = collections.defaultdict(list)
for tag_type, tag_name in self._find_tags(page):
diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py
index 5415bf3..08cccab 100644
--- a/gallery_dl/extractor/szurubooru.py
+++ b/gallery_dl/extractor/szurubooru.py
@@ -87,6 +87,10 @@ BASE_PATTERN = SzurubooruExtractor.update({
"root": "https://booru.bcbnsfw.space",
"pattern": r"booru\.bcbnsfw\.space",
},
+ "snootbooru": {
+ "root": "https://snootbooru.com",
+ "pattern": r"snootbooru\.com",
+ },
})
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index fdcefdd..aa9ab9f 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -552,9 +552,11 @@ class TwitterTimelineExtractor(TwitterExtractor):
return self.api.user_media
if strategy == "tweets":
return self.api.user_tweets
+ if strategy == "media":
+ return self.api.user_media
if strategy == "with_replies":
return self.api.user_tweets_and_replies
- return self.api.user_media
+ raise exception.StopExtraction("Invalid strategy '%s'", strategy)
class TwitterTweetsExtractor(TwitterExtractor):
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index 7413b5a..3bd0648 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -225,9 +225,6 @@ class WeiboUserExtractor(WeiboExtractor):
pattern = USER_PATTERN + r"(?:$|#)"
example = "https://weibo.com/USER"
- def initialize(self):
- pass
-
def items(self):
base = "{}/u/{}?tabtype=".format(self.root, self._user_id())
return self._dispatch_extractors((
diff --git a/gallery_dl/extractor/zzup.py b/gallery_dl/extractor/zzup.py
new file mode 100644
index 0000000..45b0cd8
--- /dev/null
+++ b/gallery_dl/extractor/zzup.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class ZzupGalleryExtractor(GalleryExtractor):
+ category = "zzup"
+ directory_fmt = ("{category}", "{title}")
+ filename_fmt = "{slug}_{num:>03}.{extension}"
+ archive_fmt = "{slug}_{num}"
+ root = "https://zzup.com"
+ pattern = (r"(?:https?://)?(?:www\.)?zzup\.com(/content"
+ r"/[\w=]+/([^/?#]+)/[\w=]+)/(?:index|page-\d+)\.html")
+ example = "https://zzup.com/content/xyz=/12345_TITLE/123=/index.html"
+
+ def __init__(self, match):
+ url = "{}/{}/index.html".format(self.root, match.group(1))
+ GalleryExtractor.__init__(self, match, url)
+ self.slug = match.group(2)
+
+ def metadata(self, page):
+ return {
+ "slug" : self.slug,
+ "title": text.unescape(text.extr(
+ page, "<title>", "</title>"))[:-11],
+ }
+
+ def images(self, page):
+ path = text.extr(page, 'class="picbox"><a target="_blank" href="', '"')
+ count = text.parse_int(text.extr(path, "-pics-", "-mirror"))
+ page = self.request(self.root + path).text
+ url = self.root + text.extr(page, '\n<a href="', '"')
+ p1, _, p2 = url.partition("/image0")
+ ufmt = p1 + "/image{:>05}" + p2[4:]
+ return [(ufmt.format(num), None) for num in range(1, count + 1)]