summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--gallery_dl/extractor/common.py3
-rw-r--r--gallery_dl/extractor/gelbooru.py43
-rw-r--r--gallery_dl/extractor/gelbooru_v02.py18
-rw-r--r--gallery_dl/extractor/hitomi.py35
-rw-r--r--gallery_dl/extractor/mangadex.py17
-rw-r--r--gallery_dl/extractor/newgrounds.py64
-rw-r--r--gallery_dl/extractor/patreon.py1
-rw-r--r--gallery_dl/extractor/wordpress.py41
8 files changed, 156 insertions, 66 deletions
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index c440aee..afe4a16 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -336,7 +336,8 @@ class Extractor():
now = time.time()
for cookie in self._cookiejar:
- if cookie.name in names and cookie.domain == domain:
+ if cookie.name in names and (
+ not domain or cookie.domain == domain):
if cookie.expires and cookie.expires < now:
self.log.warning("Cookie '%s' has expired", cookie.name)
else:
diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py
index df45d0d..a6bda52 100644
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@@ -10,7 +10,7 @@
from .common import Extractor, Message
from . import gelbooru_v02
-from .. import text, exception
+from .. import text, util, exception
import binascii
@@ -20,6 +20,42 @@ class GelbooruBase():
basecategory = "booru"
root = "https://gelbooru.com"
+ def _api_request(self, params):
+ url = self.root + "/index.php?page=dapi&s=post&q=index&json=1"
+ data = self.request(url, params=params).json()
+ if "post" not in data:
+ return ()
+ posts = data["post"]
+ if not isinstance(posts, list):
+ return (posts,)
+ return posts
+
+ def _pagination(self, params):
+ params["pid"] = self.page_start
+ params["limit"] = self.per_page
+
+ post = None
+ while True:
+ try:
+ posts = self._api_request(params)
+ except ValueError:
+ if "tags" not in params or post is None:
+ raise
+ taglist = [tag for tag in params["tags"].split()
+ if not tag.startswith("id:<")]
+ taglist.append("id:<" + str(post.attrib["id"]))
+ params["tags"] = " ".join(taglist)
+ params["pid"] = 0
+ continue
+
+ post = None
+ for post in posts:
+ yield post
+
+ if len(posts) < self.per_page:
+ return
+ params["pid"] += 1
+
@staticmethod
def _file_url(post):
url = post["file_url"]
@@ -82,6 +118,11 @@ class GelbooruPoolExtractor(GelbooruBase,
"pool_name": text.unescape(name),
}
+ def posts(self):
+ params = {}
+ for params["id"] in util.advance(self.post_ids, self.page_start):
+ yield from self._api_request(params)
+
class GelbooruPostExtractor(GelbooruBase,
gelbooru_v02.GelbooruV02PostExtractor):
diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py
index a42a202..8da0bde 100644
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@@ -19,8 +19,15 @@ import re
class GelbooruV02Extractor(booru.BooruExtractor):
basecategory = "gelbooru_v02"
+ def __init__(self, match):
+ booru.BooruExtractor.__init__(self, match)
+ try:
+ self.api_root = INSTANCES[self.category]["api_root"]
+ except KeyError:
+ self.api_root = self.root
+
def _api_request(self, params):
- url = self.root + "/index.php?page=dapi&s=post&q=index"
+ url = self.api_root + "/index.php?page=dapi&s=post&q=index"
return ElementTree.fromstring(self.request(url, params=params).text)
def _pagination(self, params):
@@ -97,12 +104,15 @@ class GelbooruV02Extractor(booru.BooruExtractor):
post["notes"] = notes
-BASE_PATTERN = GelbooruV02Extractor.update({
+INSTANCES = {
"realbooru": {"root": "https://realbooru.com"},
- "rule34" : {"root": "https://rule34.xxx"},
+ "rule34" : {"root": "https://rule34.xxx",
+ "api_root": " https://api.rule34.xxx"},
"safebooru": {"root": "https://safebooru.org"},
"tbib" : {"root": "https://tbib.org"},
-})
+}
+
+BASE_PATTERN = GelbooruV02Extractor.update(INSTANCES)
class GelbooruV02TagExtractor(GelbooruV02Extractor):
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index 88cf98c..ce6c7ce 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -26,7 +26,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
r"/(?:[^/?#]+-)?(\d+)")
test = (
("https://hitomi.la/galleries/867789.html", {
- "pattern": r"https://[a-c]b.hitomi.la/images/1639745412/\d+"
+ "pattern": r"https://[a-c]b.hitomi.la/images/1641140516/\d+"
r"/[0-9a-f]{64}\.jpg",
"keyword": "4873ef9a523621fc857b114e0b2820ba4066e9ae",
"options": (("metadata", True),),
@@ -39,12 +39,12 @@ class HitomiGalleryExtractor(GalleryExtractor):
}),
# Game CG with scenes (#321)
("https://hitomi.la/galleries/733697.html", {
- "url": "479d16fe92117a6a2ce81b4e702e6347922c81e3",
+ "url": "d4854175da2b5fa4ae62749266c7be0bf237dc99",
"count": 210,
}),
# fallback for galleries only available through /reader/ URLs
("https://hitomi.la/galleries/1045954.html", {
- "url": "ebc1415c5d7f634166ef7e2635b77735de1ea7a2",
+ "url": "eea99c3745719a7a392150335e6ae3f73faa0b85",
"count": 1413,
}),
# gallery with "broken" redirect
@@ -138,7 +138,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
def images(self, _):
# see https://ltn.hitomi.la/gg.js
- gg_m, gg_b = _parse_gg(self)
+ gg_m, gg_b, gg_default = _parse_gg(self)
result = []
for image in self.info["files"]:
@@ -148,7 +148,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
# see https://ltn.hitomi.la/common.js
inum = int(ihash[-1] + ihash[-3:-1], 16)
url = "https://{}b.hitomi.la/images/{}/{}/{}.{}".format(
- chr(97 + gg_m.get(inum, 0)),
+ chr(97 + gg_m.get(inum, gg_default)),
gg_b, inum, ihash, idata["extension"],
)
result.append((url, idata))
@@ -195,10 +195,25 @@ class HitomiTagExtractor(Extractor):
def _parse_gg(extr):
page = extr.request("https://ltn.hitomi.la/gg.js").text
- m = {
- int(match.group(1)): int(match.group(2))
- for match in re.finditer(r"case (\d+): o = (\d+); break;", page)
- }
+ m = {}
+
+ keys = []
+ for match in re.finditer(
+ r"case\s+(\d+):(?:\s*o\s*=\s*(\d+))?", page):
+ key, value = match.groups()
+ keys.append(int(key))
+
+ if value:
+ value = int(value)
+ for key in keys:
+ m[key] = value
+ keys.clear()
+
+ for match in re.finditer(
+ r"if\s+\(g\s*===?\s*(\d+)\)[\s{]*o\s*=\s*(\d+)", page):
+ m[int(match.group(1))] = int(match.group(2))
+
+ d = re.search(r"(?:var\s|default:)\s*o\s*=\s*(\d+)", page)
b = re.search(r"b:\s*[\"'](.+)[\"']", page)
- return m, b.group(1).strip("/")
+ return m, b.group(1).strip("/"), int(d.group(1)) if d else 1
diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py
index 393f4e2..ea5d4a8 100644
--- a/gallery_dl/extractor/mangadex.py
+++ b/gallery_dl/extractor/mangadex.py
@@ -40,7 +40,7 @@ class MangadexExtractor(Extractor):
uuid = chapter["id"]
data = self._transform(chapter)
data["_extractor"] = MangadexChapterExtractor
- self._cache[uuid] = (chapter, data)
+ self._cache[uuid] = data
yield Message.Queue, self.root + "/chapter/" + uuid, data
def _transform(self, chapter):
@@ -72,7 +72,7 @@ class MangadexExtractor(Extractor):
"date" : text.parse_datetime(cattributes["publishAt"]),
"lang" : lang,
"language": util.code_to_language(lang),
- "count" : len(cattributes["data"]),
+ "count" : cattributes["pages"],
}
data["artist"] = [artist["attributes"]["name"]
@@ -107,20 +107,21 @@ class MangadexChapterExtractor(MangadexExtractor):
def items(self):
try:
- chapter, data = self._cache.pop(self.uuid)
+ data = self._cache.pop(self.uuid)
except KeyError:
chapter = self.api.chapter(self.uuid)
data = self._transform(chapter)
- yield Message.Directory, data
- cattributes = chapter["attributes"]
+ yield Message.Directory, data
data["_http_headers"] = self._headers
- base = "{}/data/{}/".format(
- self.api.athome_server(self.uuid)["baseUrl"], cattributes["hash"])
+
+ server = self.api.athome_server(self.uuid)
+ chapter = server["chapter"]
+ base = "{}/data/{}/".format(server["baseUrl"], chapter["hash"])
enum = util.enumerate_reversed if self.config(
"page-reverse") else enumerate
- for data["page"], page in enum(cattributes["data"], 1):
+ for data["page"], page in enum(chapter["data"], 1):
text.nameext_from_url(page, data)
yield Message.Url, base + page, data
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index 4351b3e..8bcbc20 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -38,6 +38,7 @@ class NewgroundsExtractor(Extractor):
def items(self):
self.login()
+ metadata = self.metadata()
for post_url in self.posts():
try:
@@ -48,6 +49,8 @@ class NewgroundsExtractor(Extractor):
url = None
if url:
+ if metadata:
+ post.update(metadata)
yield Message.Directory, post
yield Message.Url, url, text.nameext_from_url(url, post)
@@ -62,9 +65,12 @@ class NewgroundsExtractor(Extractor):
"Unable to get download URL for '%s'", post_url)
def posts(self):
- """Return urls of all relevant image pages"""
+ """Return URLs of all relevant post pages"""
return self._pagination(self._path)
+ def metadata(self):
+ """Return general metadata"""
+
def login(self):
username, password = self._get_auth_info()
if username:
@@ -493,3 +499,59 @@ class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
text.ensure_http_scheme(user.rpartition('"')[2])
for user in text.extract_iter(page, 'class="item-user', '"><img')
]
+
+
+class NewgroundsSearchExtractor(NewgroundsExtractor):
+ """Extractor for newgrounds.com search reesults"""
+ subcategory = "search"
+ directory_fmt = ("{category}", "search", "{search_tags}")
+ pattern = (r"(?:https?://)?(?:www\.)?newgrounds\.com"
+ r"/search/conduct/([^/?#]+)/?\?([^#]+)")
+ test = (
+ ("https://www.newgrounds.com/search/conduct/art?terms=tree", {
+ "pattern": NewgroundsImageExtractor.pattern,
+ "keyword": {"search_tags": "tree"},
+ "range": "1-10",
+ "count": 10,
+ }),
+ ("https://www.newgrounds.com/search/conduct/movies?terms=tree", {
+ "pattern": r"https://uploads.ungrounded.net(/alternate)?/\d+/\d+",
+ "range": "1-10",
+ "count": 10,
+ }),
+ ("https://www.newgrounds.com/search/conduct/audio?advanced=1"
+ "&terms=tree+green+nature&match=tdtu&genre=5&suitabilities=e%2Cm"),
+ )
+
+ def __init__(self, match):
+ NewgroundsExtractor.__init__(self, match)
+ self._path, query = match.groups()
+ self.query = text.parse_query(query)
+
+ def posts(self):
+ return self._pagination("/search/conduct/" + self._path, self.query)
+
+ def metadata(self):
+ return {"search_tags": self.query.get("terms", "")}
+
+ def _pagination(self, path, params):
+ url = self.root + path
+ headers = {
+ "Accept": "application/json, text/javascript, */*; q=0.01",
+ "X-Requested-With": "XMLHttpRequest",
+ "Referer": self.root,
+ }
+ params["inner"] = "1"
+ params["page"] = 1
+
+ while True:
+ data = self.request(url, params=params, headers=headers).json()
+
+ post_url = None
+ for post_url in text.extract_iter(data["content"], 'href="', '"'):
+ if not post_url.startswith("/search/"):
+ yield post_url
+
+ if post_url is None:
+ return
+ params["page"] += 1
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index f8c80ef..a7e0ff1 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -20,6 +20,7 @@ class PatreonExtractor(Extractor):
"""Base class for patreon extractors"""
category = "patreon"
root = "https://www.patreon.com"
+ cookiedomain = ".patreon.com"
directory_fmt = ("{category}", "{creator[full_name]}")
filename_fmt = "{id}_{title}_{num:>02}.{extension}"
archive_fmt = "{id}_{num}"
diff --git a/gallery_dl/extractor/wordpress.py b/gallery_dl/extractor/wordpress.py
deleted file mode 100644
index dd7d28a..0000000
--- a/gallery_dl/extractor/wordpress.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2021 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for WordPress blogs"""
-
-from .common import BaseExtractor, Message
-from .. import text
-
-
-class WordpressExtractor(BaseExtractor):
- """Base class for wordpress extractors"""
- basecategory = "wordpress"
-
- def items(self):
- for post in self.posts():
-            yield Message.Directory, post
-
-
-
-BASE_PATTERN = WordpressExtractor.update({})
-
-
-class WordpressBlogExtractor(WordpressExtractor):
- """Extractor for WordPress blogs"""
- subcategory = "blog"
- directory_fmt = ("{category}", "{blog}")
- pattern = BASE_PATTERN + r"/?$"
-
- def posts(self):
- url = self.root + "/wp-json/wp/v2/posts"
- params = {"page": 1, "per_page": "100"}
-
- while True:
- data = self.request(url, params=params).json()
-            yield from data
-            params["page"] += 1