Diffstat (limited to 'gallery_dl/extractor')
-rw-r--r--  gallery_dl/extractor/__init__.py     |   3
-rw-r--r--  gallery_dl/extractor/booru.py        |   6
-rw-r--r--  gallery_dl/extractor/bunkr.py        |   3
-rw-r--r--  gallery_dl/extractor/common.py       |  23
-rw-r--r--  gallery_dl/extractor/danbooru.py     |  25
-rw-r--r--  gallery_dl/extractor/deviantart.py   | 185
-rw-r--r--  gallery_dl/extractor/fantia.py       |  17
-rw-r--r--  gallery_dl/extractor/generic.py      |   4
-rw-r--r--  gallery_dl/extractor/hiperdex.py     |  27
-rw-r--r--  gallery_dl/extractor/hotleak.py      |  13
-rw-r--r--  gallery_dl/extractor/instagram.py    |  47
-rw-r--r--  gallery_dl/extractor/kemonoparty.py  |  23
-rw-r--r--  gallery_dl/extractor/lexica.py       | 104
-rw-r--r--  gallery_dl/extractor/mastodon.py     |  19
-rw-r--r--  gallery_dl/extractor/nudecollect.py  | 142
-rw-r--r--  gallery_dl/extractor/oauth.py        | 163
-rw-r--r--  gallery_dl/extractor/philomena.py    |  11
-rw-r--r--  gallery_dl/extractor/sankaku.py      |  10
-rw-r--r--  gallery_dl/extractor/twitter.py      | 127
-rw-r--r--  gallery_dl/extractor/wikifeet.py     | 118
20 files changed, 844 insertions, 226 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index f26f6a9..6140c2c 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -77,6 +77,7 @@ modules = [
"kemonoparty",
"khinsider",
"komikcast",
+ "lexica",
"lightroom",
"lineblog",
"livedoor",
@@ -102,6 +103,7 @@ modules = [
"nitter",
"nozomi",
"nsfwalbum",
+ "nudecollect",
"paheal",
"patreon",
"philomena",
@@ -158,6 +160,7 @@ modules = [
"webtoons",
"weibo",
"wikiart",
+ "wikifeet",
"xhamster",
"xvideos",
"zerochan",
diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py
index 0d7d13d..cbd0e07 100644
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2015-2022 Mike Fährmann
+# Copyright 2015-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -27,6 +27,10 @@ class BooruExtractor(BaseExtractor):
notes = self.config("notes", False)
fetch_html = tags or notes
+ url_key = self.config("url")
+ if url_key:
+ self._file_url = operator.itemgetter(url_key)
+
for post in self.posts():
try:
url = self._file_url(post)
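
The new "url" option above lets a user pick which post field supplies the download URL; operator.itemgetter turns the configured key into a lookup function. A minimal sketch, with placeholder post fields (real field names vary per booru site):

    import operator

    # placeholder post object as a booru API might return it
    post = {
        "file_url"  : "https://example.org/full/abc.png",
        "sample_url": "https://example.org/sample/abc.jpg",
    }

    # with '"url": "sample_url"' configured, _file_url becomes
    # a plain key lookup on each post
    _file_url = operator.itemgetter("sample_url")
    print(_file_url(post))  # https://example.org/sample/abc.jpg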
diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py
index 8283fbc..1c339a9 100644
--- a/gallery_dl/extractor/bunkr.py
+++ b/gallery_dl/extractor/bunkr.py
@@ -75,7 +75,8 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
headers = {"Referer": root.replace("://", "://stream.", 1) + "/"}
for file in files:
if file["file"].endswith(
- (".mp4", ".m4v", ".mov", ".webm", ".zip", ".rar", ".7z")):
+ (".mp4", ".m4v", ".mov", ".webm", ".mkv", ".ts",
+ ".zip", ".rar", ".7z")):
file["_http_headers"] = headers
file["file"] = file["file"].replace(
"://cdn", "://media-files", 1)
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index ad766da..4cefa1c 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2022 Mike Fährmann
+# Copyright 2014-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -36,6 +36,7 @@ class Extractor():
browser = None
root = ""
test = None
+ finalize = None
request_interval = 0.0
request_interval_min = 0.0
request_timestamp = 0.0
@@ -44,7 +45,6 @@ class Extractor():
def __init__(self, match):
self.log = logging.getLogger(self.category)
self.url = match.string
- self.finalize = None
if self.basecategory:
self.config = self._config_shared
@@ -53,6 +53,7 @@ class Extractor():
self._parentdir = ""
self._write_pages = self.config("write-pages", False)
+ self._retry_codes = self.config("retry-codes")
self._retries = self.config("retries", 4)
self._timeout = self.config("timeout", 30)
self._verify = self.config("verify", True)
@@ -64,6 +65,8 @@ class Extractor():
if self._retries < 0:
self._retries = float("inf")
+ if not self._retry_codes:
+ self._retry_codes = ()
self._init_session()
self._init_cookies()
@@ -103,12 +106,15 @@ class Extractor():
values[:0] = config.accumulate((self.subcategory,), key, conf=conf)
return values
- def request(self, url, *, method="GET", session=None, retries=None,
- encoding=None, fatal=True, notfound=None, **kwargs):
+ def request(self, url, *, method="GET", session=None,
+ retries=None, retry_codes=None, encoding=None,
+ fatal=True, notfound=None, **kwargs):
if session is None:
session = self.session
if retries is None:
retries = self._retries
+ if retry_codes is None:
+ retry_codes = self._retry_codes
if "proxies" not in kwargs:
kwargs["proxies"] = self._proxies
if "timeout" not in kwargs:
@@ -153,12 +159,12 @@ class Extractor():
code in (403, 503):
content = response.content
if b"_cf_chl_opt" in content or b"jschl-answer" in content:
- self.log.warning("Cloudflare IUAM challenge")
+ self.log.warning("Cloudflare challenge")
break
if b'name="captcha-bypass"' in content:
self.log.warning("Cloudflare CAPTCHA")
break
- if code < 500 and code != 429 and code != 430:
+ if code not in retry_codes and code < 500:
break
finally:
@@ -501,7 +507,10 @@ class Extractor():
try:
with open(path + ".txt", 'wb') as fp:
util.dump_response(
- response, fp, headers=(self._write_pages == "all"))
+ response, fp,
+ headers=(self._write_pages in ("all", "ALL")),
+ hide_auth=(self._write_pages != "ALL")
+ )
except Exception as e:
self.log.warning("Failed to dump HTTP request (%s: %s)",
e.__class__.__name__, e)
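
The new 'retry-codes' option replaces the hardcoded 429/430 checks: a response is now retried when its status code is 5xx or appears in retry_codes. A sketch of the decision, assuming (429, 430) as the configured value:

    retry_codes = (429, 430)  # assumed configuration value

    def should_retry(code):
        # mirrors 'if code not in retry_codes and code < 500: break'
        return code >= 500 or code in retry_codes

    assert should_retry(503)      # server errors always retry
    assert should_retry(429)      # listed codes retry
    assert not should_retry(404)  # everything else fails fast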
diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py
index 4c93604..7b0e572 100644
--- a/gallery_dl/extractor/danbooru.py
+++ b/gallery_dl/extractor/danbooru.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2022 Mike Fährmann
+# Copyright 2014-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -40,7 +40,17 @@ class DanbooruExtractor(BaseExtractor):
self.ugoira = self.config("ugoira", False)
self.external = self.config("external", False)
- self.extended_metadata = self.config("metadata", False)
+
+ metadata = self.config("metadata", False)
+ if metadata:
+ if isinstance(metadata, (list, tuple)):
+ metadata = ",".join(metadata)
+ elif not isinstance(metadata, str):
+ metadata = "artist_commentary,children,notes,parent,uploader"
+ self.metadata_includes = metadata
+ else:
+ self.metadata_includes = None
+
threshold = self.config("threshold")
if isinstance(threshold, int):
self.threshold = 1 if threshold < 1 else threshold
@@ -99,13 +109,10 @@ class DanbooruExtractor(BaseExtractor):
url = post["large_file_url"]
post["extension"] = "webm"
- if self.extended_metadata:
- template = (
- "{}/posts/{}.json?only=artist_commentary,children,notes,"
- "parent,uploader"
- )
- resp = self.request(template.format(self.root, post["id"]))
- post.update(resp.json())
+ if self.metadata_includes:
+ meta_url = "{}/posts/{}.json?only={}".format(
+ self.root, post["id"], self.metadata_includes)
+ post.update(self.request(meta_url).json())
if url[0] == "/":
url = self.root + url
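
The reworked 'metadata' option accepts a bool, a comma-separated string, or a list, and the normalized value is folded into the 'only=' query parameter. A standalone sketch of the normalization logic above:

    def normalize_metadata(metadata):
        if not metadata:
            return None
        if isinstance(metadata, (list, tuple)):
            return ",".join(metadata)
        if isinstance(metadata, str):
            return metadata
        # e.g. 'metadata': true selects the full default set
        return "artist_commentary,children,notes,parent,uploader"

    print(normalize_metadata(["notes", "parent"]))  # notes,parent
    print(normalize_metadata(True))                 # the full default set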
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index aeb2d0a..a3187fa 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -118,11 +118,18 @@ class DeviantartExtractor(Extractor):
if "flash" in deviation:
yield self.commit(deviation, deviation["flash"])
- if "excerpt" in deviation and self.commit_journal:
- journal = self.api.deviation_content(deviation["deviationid"])
- if self.extra:
- deviation["_journal"] = journal["html"]
- yield self.commit_journal(deviation, journal)
+ if self.commit_journal:
+ if "excerpt" in deviation:
+ journal = self.api.deviation_content(
+ deviation["deviationid"])
+ elif "body" in deviation:
+ journal = {"html": deviation.pop("body")}
+ else:
+ journal = None
+ if journal:
+ if self.extra:
+ deviation["_journal"] = journal["html"]
+ yield self.commit_journal(deviation, journal)
if not self.extra:
continue
@@ -150,10 +157,19 @@ class DeviantartExtractor(Extractor):
"""Adjust the contents of a Deviation-object"""
if "index" not in deviation:
try:
- deviation["index"] = text.parse_int(
- deviation["url"].rpartition("-")[2])
+ if deviation["url"].startswith("https://sta.sh"):
+ filename = deviation["content"]["src"].split("/")[5]
+ deviation["index_base36"] = filename.partition("-")[0][1:]
+ deviation["index"] = id_from_base36(
+ deviation["index_base36"])
+ else:
+ deviation["index"] = text.parse_int(
+ deviation["url"].rpartition("-")[2])
except KeyError:
deviation["index"] = 0
+ deviation["index_base36"] = "0"
+ if "index_base36" not in deviation:
+ deviation["index_base36"] = base36_from_id(deviation["index"])
if self.user:
deviation["username"] = self.user
@@ -170,13 +186,11 @@ class DeviantartExtractor(Extractor):
if self.comments:
deviation["comments"] = (
- self.api.comments_deviation(deviation["deviationid"])
+ self.api.comments(deviation["deviationid"], target="deviation")
if deviation["stats"]["comments"] else ()
)
# filename metadata
- alphabet = "0123456789abcdefghijklmnopqrstuvwxyz"
- deviation["index_base36"] = util.bencode(deviation["index"], alphabet)
sub = re.compile(r"\W").sub
deviation["filename"] = "".join((
sub("_", deviation["title"].lower()), "_by_",
@@ -253,9 +267,10 @@ class DeviantartExtractor(Extractor):
html = journal["html"]
if html.startswith("<style"):
html = html.partition("</style>")[2]
+ head, _, tail = html.rpartition("<script")
content = "\n".join(
text.unescape(text.remove_html(txt))
- for txt in html.rpartition("<script")[0].split("<br />")
+ for txt in (head or tail).split("<br />")
)
txt = JOURNAL_TEMPLATE_TEXT.format(
title=deviation["title"],
@@ -402,8 +417,9 @@ class DeviantartUserExtractor(DeviantartExtractor):
}),
("https://www.deviantart.com/shimoda7", {
"options": (("include", "all"),),
- "pattern": r"/shimoda7/(gallery(/scraps)?|posts|favourites)$",
- "count": 4,
+ "pattern": r"/shimoda7/"
+ r"(gallery(/scraps)?|posts(/statuses)?|favourites)$",
+ "count": 5,
}),
("https://shimoda7.deviantart.com/"),
)
@@ -414,6 +430,7 @@ class DeviantartUserExtractor(DeviantartExtractor):
(DeviantartGalleryExtractor , base + "gallery"),
(DeviantartScrapsExtractor , base + "gallery/scraps"),
(DeviantartJournalExtractor , base + "posts"),
+ (DeviantartStatusExtractor , base + "posts/statuses"),
(DeviantartFavoriteExtractor, base + "favourites"),
), ("gallery",))
@@ -746,6 +763,97 @@ class DeviantartJournalExtractor(DeviantartExtractor):
return self.api.browse_user_journals(self.user, self.offset)
+class DeviantartStatusExtractor(DeviantartExtractor):
+ """Extractor for an artist's status updates"""
+ subcategory = "status"
+ directory_fmt = ("{category}", "{username}", "Status")
+ filename_fmt = "{category}_{index}_{title}_{date}.{extension}"
+ archive_fmt = "S_{_username}_{index}.{extension}"
+ pattern = BASE_PATTERN + r"/posts/statuses"
+ test = (
+ ("https://www.deviantart.com/t1na/posts/statuses", {
+ "count": 0,
+ }),
+ ("https://www.deviantart.com/justgalym/posts/statuses", {
+ "count": 4,
+ "url": "bf4c44c0c60ff2648a880f4c3723464ad3e7d074",
+ }),
+ # shared deviation
+ ("https://www.deviantart.com/justgalym/posts/statuses", {
+ "options": (("journals", "none"),),
+ "count": 1,
+ "pattern": r"https://images-wixmp-\w+\.wixmp\.com/f"
+ r"/[^/]+/[^.]+\.jpg\?token=",
+ }),
+ # shared sta.sh item
+ ("https://www.deviantart.com/vanillaghosties/posts/statuses", {
+ "options": (("journals", "none"), ("original", False)),
+ "range": "5-",
+ "count": 1,
+ "keyword": {
+ "index" : int,
+ "index_base36": "re:^[0-9a-z]+$",
+ "url" : "re:^https://sta.sh",
+ },
+ }),
+ ("https://www.deviantart.com/justgalym/posts/statuses", {
+ "options": (("journals", "text"),),
+ "url": "c8744f7f733a3029116607b826321233c5ca452d",
+ }),
+ )
+
+ def deviations(self):
+ for status in self.api.user_statuses(self.user, self.offset):
+ yield from self.status(status)
+
+ def status(self, status):
+ for item in status.get("items") or (): # do not trust is_share
+ # shared deviations/statuses
+ if "deviation" in item:
+ yield item["deviation"].copy()
+ if "status" in item:
+ yield from self.status(item["status"].copy())
+ # assume is_deleted == true means necessary fields are missing
+ if status["is_deleted"]:
+ self.log.warning(
+ "Skipping status %s (deleted)", status.get("statusid"))
+ return
+ yield status
+
+ def prepare(self, deviation):
+ if "deviationid" in deviation:
+ return DeviantartExtractor.prepare(self, deviation)
+
+ try:
+ path = deviation["url"].split("/")
+ deviation["index"] = text.parse_int(path[-1] or path[-2])
+ except KeyError:
+ deviation["index"] = 0
+
+ if self.user:
+ deviation["username"] = self.user
+ deviation["_username"] = self.user.lower()
+ else:
+ deviation["username"] = deviation["author"]["username"]
+ deviation["_username"] = deviation["username"].lower()
+
+ deviation["date"] = dt = text.parse_datetime(deviation["ts"])
+ deviation["published_time"] = int(util.datetime_to_timestamp(dt))
+
+ deviation["da_category"] = "Status"
+ deviation["category_path"] = "status"
+ deviation["is_downloadable"] = False
+ deviation["title"] = "Status Update"
+
+ comments_count = deviation.pop("comments_count", 0)
+ deviation["stats"] = {"comments": comments_count}
+ if self.comments:
+ deviation["comments"] = (
+ self.api.comments(deviation["statusid"], target="status")
+ if comments_count else ()
+ )
+
+
class DeviantartPopularExtractor(DeviantartExtractor):
"""Extractor for popular deviations"""
subcategory = "popular"
@@ -867,7 +975,9 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
archive_fmt = "g_{_username}_{index}.{extension}"
pattern = (BASE_PATTERN + r"/(art|journal)/(?:[^/?#]+-)?(\d+)"
r"|(?:https?://)?(?:www\.)?deviantart\.com/"
- r"(?:view/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)(\d+)")
+ r"(?:view/|deviation/|view(?:-full)?\.php/*\?(?:[^#]+&)?id=)"
+ r"(\d+)" # bare deviation ID without slug
+ r"|(?:https?://)?fav\.me/d([0-9a-z]+)") # base36
test = (
(("https://www.deviantart.com/shimoda7/art/For-the-sake-10073852"), {
"options": (("original", 0),),
@@ -940,6 +1050,15 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
("https://www.deviantart.com/view/1", {
"exception": exception.NotFoundError,
}),
+ # /deviation/ (#3558)
+ ("https://www.deviantart.com/deviation/817215762"),
+ # fav.me (#3558)
+ ("https://fav.me/ddijrpu", {
+ "count": 1,
+ }),
+ ("https://fav.me/dddd", {
+ "exception": exception.NotFoundError,
+ }),
# old-style URLs
("https://shimoda7.deviantart.com"
"/art/For-the-sake-of-a-memory-10073852"),
@@ -956,7 +1075,8 @@ class DeviantartDeviationExtractor(DeviantartExtractor):
def __init__(self, match):
DeviantartExtractor.__init__(self, match)
self.type = match.group(3)
- self.deviation_id = match.group(4) or match.group(5)
+ self.deviation_id = \
+ match.group(4) or match.group(5) or id_from_base36(match.group(6))
def deviations(self):
url = "{}/{}/{}/{}".format(
@@ -1149,9 +1269,9 @@ class DeviantartOAuthAPI():
"mature_content": self.mature}
return self._pagination_list(endpoint, params)
- def comments_deviation(self, deviation_id, offset=0):
- """Fetch comments posted on a deviation"""
- endpoint = "/comments/deviation/" + deviation_id
+ def comments(self, id, target, offset=0):
+ """Fetch comments posted on a target"""
+ endpoint = "/comments/{}/{}".format(target, id)
params = {"maxdepth": "5", "offset": offset, "limit": 50,
"mature_content": self.mature}
return self._pagination_list(endpoint, params=params, key="thread")
@@ -1187,8 +1307,6 @@ class DeviantartOAuthAPI():
def deviation_metadata(self, deviations):
""" Fetch deviation metadata for a set of deviations"""
- if not deviations:
- return []
endpoint = "/deviation/metadata?" + "&".join(
"deviationids[{}]={}".format(num, deviation["deviationid"])
for num, deviation in enumerate(deviations)
@@ -1224,6 +1342,12 @@ class DeviantartOAuthAPI():
endpoint = "/user/profile/" + username
return self._call(endpoint, fatal=False)
+ def user_statuses(self, username, offset=0):
+ """Yield status updates of a specific user"""
+ endpoint = "/user/statuses/"
+ params = {"username": username, "offset": offset, "limit": 50}
+ return self._pagination(endpoint, params)
+
def user_friends_watch(self, username):
"""Watch a user"""
endpoint = "/user/friends/watch/" + username
@@ -1350,10 +1474,12 @@ class DeviantartOAuthAPI():
"Private deviations detected! Run 'gallery-dl "
"oauth:deviantart' and follow the instructions to "
"be able to access them.")
- if self.metadata:
- self._metadata(results)
- if self.folders:
- self._folders(results)
+ # "statusid" cannot be used instead
+ if results and "deviationid" in results[0]:
+ if self.metadata:
+ self._metadata(results)
+ if self.folders:
+ self._folders(results)
yield from results
if not data["has_more"] and (
@@ -1561,6 +1687,17 @@ def _login_impl(extr, username, password):
}
+def id_from_base36(base36):
+ return util.bdecode(base36, _ALPHABET)
+
+
+def base36_from_id(deviation_id):
+ return util.bencode(int(deviation_id), _ALPHABET)
+
+
+_ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyz"
+
+
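
These helpers convert between numeric deviation IDs and the base36 codes used in sta.sh and fav.me URLs. A self-contained equivalent (util.bencode/util.bdecode are gallery-dl internals):

    _ALPHABET = "0123456789abcdefghijklmnopqrstuvwxyz"

    def id_from_base36(base36):
        num = 0
        for char in base36:
            num = num * 36 + _ALPHABET.index(char)
        return num

    def base36_from_id(deviation_id):
        deviation_id = int(deviation_id)
        result = ""
        while deviation_id:
            deviation_id, digit = divmod(deviation_id, 36)
            result = _ALPHABET[digit] + result
        return result or "0"

    # fav.me codes carry a leading "d" prefix that the URL
    # pattern strips before decoding
    assert id_from_base36("dijrpu") == 817215762
    assert base36_from_id(817215762) == "dijrpu"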
###############################################################################
# Journal Formats #############################################################
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index c05ec39..476fdeb 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -21,6 +21,10 @@ class FantiaExtractor(Extractor):
_warning = True
def items(self):
+ self.headers = {
+ "Accept" : "application/json, text/plain, */*",
+ "Referer": self.root,
+ }
if self._warning:
if not self._check_cookies(("_session_id",)):
@@ -43,10 +47,11 @@ class FantiaExtractor(Extractor):
def _pagination(self, url):
params = {"page": 1}
- headers = {"Referer": self.root}
+ headers = self.headers
while True:
page = self.request(url, params=params, headers=headers).text
+ self._csrf_token(page)
post_id = None
for post_id in text.extract_iter(
@@ -57,11 +62,16 @@ class FantiaExtractor(Extractor):
return
params["page"] += 1
+ def _csrf_token(self, page=None):
+ if not page:
+ page = self.request(self.root + "/").text
+ self.headers["X-CSRF-Token"] = text.extr(
+ page, 'name="csrf-token" content="', '"')
+
def _get_post_data(self, post_id):
"""Fetch and process post data"""
- headers = {"Referer": self.root}
url = self.root+"/api/v1/posts/"+post_id
- resp = self.request(url, headers=headers).json()["post"]
+ resp = self.request(url, headers=self.headers).json()["post"]
post = {
"post_id": resp["id"],
"post_url": self.root + "/posts/" + str(resp["id"]),
@@ -173,4 +183,5 @@ class FantiaPostExtractor(FantiaExtractor):
self.post_id = match.group(1)
def posts(self):
+ self._csrf_token()
return (self.post_id,)
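
Fantia's API now requires an X-CSRF-Token header scraped from a page's meta tag. A sketch of that extraction, using a partition-based stand-in for gallery-dl's text.extr() and a placeholder page:

    page = '<meta name="csrf-token" content="abc123" />'  # placeholder

    def extr(txt, begin, end):
        # first substring between 'begin' and 'end'
        return txt.partition(begin)[2].partition(end)[0]

    headers = {"X-CSRF-Token": extr(
        page, 'name="csrf-token" content="', '"')}
    print(headers)  # {'X-CSRF-Token': 'abc123'}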
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 10c7295..9292da3 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -150,7 +150,7 @@ class GenericExtractor(Extractor):
https://en.wikipedia.org/wiki/List_of_file_formats
Compared to the "pattern" class variable, here we must exclude also
- other special characters (space, ", ', >), since we are looking for
+ other special characters (space, ", ', <, >), since we are looking for
urls in html tags.
"""
@@ -158,7 +158,7 @@ class GenericExtractor(Extractor):
(?:[^?&#"'>\s]+) # anything until dot+extension
\.(?:jpe?g|jpe|png|gif
|web[mp]|mp4|mkv|og[gmv]|opus) # dot + image/video extensions
- (?:[^"'>\s]*)? # optional query and fragment
+ (?:[^"'<>\s]*)? # optional query and fragment
"""
imageurls_src = re.findall(imageurl_pattern_src, page)
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index adee94a..d61c139 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -1,25 +1,26 @@
# -*- coding: utf-8 -*-
-# Copyright 2020-2021 Mike Fährmann
+# Copyright 2020-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extractors for https://hiperdex.com/"""
+"""Extractors for https://1sthiperdex.com/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text
from ..cache import memcache
import re
-BASE_PATTERN = r"((?:https?://)?(?:www\.)?hiperdex\d?\.(?:com|net|info))"
+BASE_PATTERN = (r"((?:https?://)?(?:www\.)?"
+ r"(?:1st)?hiperdex\d?\.(?:com|net|info))")
class HiperdexBase():
"""Base class for hiperdex extractors"""
category = "hiperdex"
- root = "https://hiperdex.com"
+ root = "https://1sthiperdex.com"
@memcache(keyarg=1)
def manga_data(self, manga, page=None):
@@ -52,6 +53,8 @@ class HiperdexBase():
}
def chapter_data(self, chapter):
+ if chapter.startswith("chapter-"):
+ chapter = chapter[8:]
chapter, _, minor = chapter.partition("-")
data = {
"chapter" : text.parse_int(chapter),
@@ -62,12 +65,13 @@ class HiperdexBase():
class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
- """Extractor for manga chapters from hiperdex.com"""
+ """Extractor for manga chapters from 1sthiperdex.com"""
pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))"
test = (
- ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/", {
- "pattern": r"https://hiperdex\d?.(com|net|info)/wp-content/uploads"
- r"/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp",
+ ("https://1sthiperdex.com/manga/domestic-na-kanojo/154-5/", {
+ "pattern": r"https://(1st)?hiperdex\d?.(com|net|info)"
+ r"/wp-content/uploads/WP-manga/data"
+ r"/manga_\w+/[0-9a-f]{32}/\d+\.webp",
"count": 9,
"keyword": {
"artist" : "Sasuga Kei",
@@ -82,6 +86,7 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
"type" : "Manga",
},
}),
+ ("https://hiperdex.com/manga/domestic-na-kanojo/154-5/"),
("https://hiperdex2.com/manga/domestic-na-kanojo/154-5/"),
("https://hiperdex.net/manga/domestic-na-kanojo/154-5/"),
("https://hiperdex.info/manga/domestic-na-kanojo/154-5/"),
@@ -104,11 +109,11 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
- """Extractor for manga from hiperdex.com"""
+ """Extractor for manga from 1sthiperdex.com"""
chapterclass = HiperdexChapterExtractor
pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$"
test = (
- ("https://hiperdex.com/manga/youre-not-that-special/", {
+ ("https://1sthiperdex.com/manga/youre-not-that-special/", {
"count": 51,
"pattern": HiperdexChapterExtractor.pattern,
"keyword": {
@@ -125,6 +130,7 @@ class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
"type" : "Manhwa",
},
}),
+ ("https://hiperdex.com/manga/youre-not-that-special/"),
("https://hiperdex2.com/manga/youre-not-that-special/"),
("https://hiperdex.net/manga/youre-not-that-special/"),
("https://hiperdex.info/manga/youre-not-that-special/"),
@@ -166,6 +172,7 @@ class HiperdexArtistExtractor(HiperdexBase, MangaExtractor):
reverse = False
pattern = BASE_PATTERN + r"(/manga-a(?:rtist|uthor)/(?:[^/?#]+))"
test = (
+ ("https://1sthiperdex.com/manga-artist/beck-ho-an/"),
("https://hiperdex.net/manga-artist/beck-ho-an/"),
("https://hiperdex2.com/manga-artist/beck-ho-an/"),
("https://hiperdex.info/manga-artist/beck-ho-an/"),
diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py
index eb64db0..7c656be 100644
--- a/gallery_dl/extractor/hotleak.py
+++ b/gallery_dl/extractor/hotleak.py
@@ -8,6 +8,7 @@
from .common import Extractor, Message
from .. import text, exception
+import binascii
BASE_PATTERN = r"(?:https?://)?(?:www\.)?hotleak\.vip"
@@ -49,6 +50,11 @@ class HotleakExtractor(Extractor):
params["page"] += 1
+def decode_video_url(url):
+ # cut first and last 16 characters, reverse, base64 decode
+ return binascii.a2b_base64(url[-17:15:-1]).decode()
+
+
class HotleakPostExtractor(HotleakExtractor):
"""Extractor for individual posts on hotleak"""
subcategory = "post"
@@ -100,8 +106,8 @@ class HotleakPostExtractor(HotleakExtractor):
text.nameext_from_url(data["url"], data)
elif self.type == "video":
- data["url"] = "ytdl:" + text.extr(
- text.unescape(page), '"src":"', '"')
+ data["url"] = "ytdl:" + decode_video_url(text.extr(
+ text.unescape(page), '"src":"', '"'))
text.nameext_from_url(data["url"], data)
data["extension"] = "mp4"
@@ -163,7 +169,8 @@ class HotleakCreatorExtractor(HotleakExtractor):
elif post["type"] == 1:
data["type"] = "video"
- data["url"] = "ytdl:" + post["stream_url_play"]
+ data["url"] = "ytdl:" + decode_video_url(
+ post["stream_url_play"])
text.nameext_from_url(data["url"], data)
data["extension"] = "mp4"
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index db9f3fb..deb31a0 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
# Copyright 2018-2020 Leonardo Taccari
-# Copyright 2018-2022 Mike Fährmann
+# Copyright 2018-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -90,6 +90,11 @@ class InstagramExtractor(Extractor):
def posts(self):
return ()
+ def finalize(self):
+ if self._cursor:
+ self.log.info("Use '-o cursor=%s' to continue downloading "
+ "from the current position", self._cursor)
+
def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs)
@@ -104,9 +109,6 @@ class InstagramExtractor(Extractor):
page = None
if page:
- if self._cursor:
- self.log.info("Use '-o cursor=%s' to continue downloading "
- "from the current position", self._cursor)
raise exception.StopExtraction("HTTP redirect to %s page (%s)",
page, url.partition("?")[0])
@@ -114,6 +116,10 @@ class InstagramExtractor(Extractor):
if www_claim is not None:
self.www_claim = www_claim
+ csrf_token = response.cookies.get("csrftoken")
+ if csrf_token:
+ self.csrf_token = csrf_token
+
return response
def login(self):
@@ -794,7 +800,12 @@ class InstagramRestAPI():
def user_clips(self, user_id):
endpoint = "/v1/clips/user/"
- data = {"target_user_id": user_id, "page_size": "50"}
+ data = {
+ "target_user_id": user_id,
+ "page_size": "50",
+ "max_id": None,
+ "include_feed_video": "true",
+ }
return self._pagination_post(endpoint, data)
def user_collection(self, collection_id):
@@ -820,19 +831,18 @@ class InstagramRestAPI():
def _call(self, endpoint, **kwargs):
extr = self.extractor
- url = "https://i.instagram.com/api" + endpoint
+ url = "https://www.instagram.com/api" + endpoint
kwargs["headers"] = {
+ "Accept" : "*/*",
"X-CSRFToken" : extr.csrf_token,
"X-Instagram-AJAX": "1006242110",
"X-IG-App-ID" : "936619743392459",
"X-ASBD-ID" : "198387",
"X-IG-WWW-Claim" : extr.www_claim,
- "Origin" : extr.root,
+ "X-Requested-With": "XMLHttpRequest",
+ "Alt-Used" : "www.instagram.com",
"Referer" : extr.root + "/",
}
- kwargs["cookies"] = {
- "csrftoken": extr.csrf_token,
- }
return extr.request(url, **kwargs).json()
def _pagination(self, endpoint, params=None, media=False):
@@ -851,7 +861,7 @@ class InstagramRestAPI():
yield from data["items"]
if not data.get("more_available"):
- return
+ return extr._update_cursor(None)
params["max_id"] = extr._update_cursor(data["next_max_id"])
def _pagination_post(self, endpoint, params):
@@ -866,7 +876,7 @@ class InstagramRestAPI():
info = data["paging_info"]
if not info.get("more_available"):
- return
+ return extr._update_cursor(None)
params["max_id"] = extr._update_cursor(info["max_id"])
def _pagination_sections(self, endpoint, params):
@@ -879,7 +889,7 @@ class InstagramRestAPI():
yield from info["sections"]
if not info.get("more_available"):
- return
+ return extr._update_cursor(None)
params["page"] = info["next_page"]
params["max_id"] = extr._update_cursor(info["next_max_id"])
@@ -894,7 +904,7 @@ class InstagramRestAPI():
yield from item["media_items"]
if "next_max_id" not in data:
- return
+ return extr._update_cursor(None)
params["max_id"] = extr._update_cursor(data["next_max_id"])
@@ -982,12 +992,7 @@ class InstagramGraphqlAPI():
"X-Requested-With": "XMLHttpRequest",
"Referer" : extr.root + "/",
}
- cookies = {
- "csrftoken": extr.csrf_token,
- }
- return extr.request(
- url, params=params, headers=headers, cookies=cookies,
- ).json()["data"]
+ return extr.request(url, params=params, headers=headers).json()["data"]
def _pagination(self, query_hash, variables,
key_data="user", key_edge=None):
@@ -1003,7 +1008,7 @@ class InstagramGraphqlAPI():
info = data["page_info"]
if not info["has_next_page"]:
- return
+ return extr._update_cursor(None)
elif not data["edges"]:
s = "" if self.item.endswith("s") else "s"
raise exception.StopExtraction(
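
Moving the cursor hint into a finalize() hook means it gets printed whenever a job ends early, while returning _update_cursor(None) on normal completion keeps it silent. A sketch of the bookkeeping pattern, with fetch() standing in for the actual API call:

    class Paginator:
        def __init__(self):
            self._cursor = None

        def _update_cursor(self, cursor):
            self._cursor = cursor
            return cursor

        def items(self, fetch):
            max_id = None
            while True:
                data = fetch(max_id)
                yield from data["items"]
                if not data.get("more_available"):
                    # clear the cursor so finalize() stays silent
                    return self._update_cursor(None)
                max_id = self._update_cursor(data["next_max_id"])

        def finalize(self):
            # runs when the job ends; only fires if pagination was cut short
            if self._cursor:
                print("Use '-o cursor=%s' to continue" % self._cursor)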
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 63e3084..33e8370 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -16,6 +16,7 @@ import re
BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.party"
USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
+HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})"
class KemonopartyExtractor(Extractor):
@@ -41,7 +42,7 @@ class KemonopartyExtractor(Extractor):
self._find_inline = re.compile(
r'src="(?:https?://(?:kemono|coomer)\.party)?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
- find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
+ find_hash = re.compile(HASH_PATTERN).match
generators = self._build_file_generators(self.config("files"))
duplicates = self.config("duplicates")
comments = self.config("comments")
@@ -89,10 +90,11 @@ class KemonopartyExtractor(Extractor):
match = find_hash(url)
if match:
file["hash"] = hash = match.group(1)
- if hash in hashes and not duplicates:
- self.log.debug("Skipping %s (duplicate)", url)
- continue
- hashes.add(hash)
+ if not duplicates:
+ if hash in hashes:
+ self.log.debug("Skipping %s (duplicate)", url)
+ continue
+ hashes.add(hash)
else:
file["hash"] = ""
@@ -362,14 +364,17 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
"pattern": r"https://kemono\.party/data/("
r"e3/77/e377e3525164559484ace2e64425b0cec1db08.*\.png|"
r"51/45/51453640a5e0a4d23fbf57fb85390f9c5ec154.*\.gif)",
+ "keyword": {"hash": "re:e377e3525164559484ace2e64425b0cec1db08"
+ "|51453640a5e0a4d23fbf57fb85390f9c5ec154"},
"count": ">= 2",
}),
# 'inline' files
(("https://kemono.party/discord"
"/server/315262215055736843/channel/315262215055736843#general"), {
"pattern": r"https://cdn\.discordapp\.com/attachments/\d+/\d+/.+$",
- "range": "1-5",
"options": (("image-filter", "type == 'inline'"),),
+ "keyword": {"hash": ""},
+ "range": "1-5",
}),
)
@@ -383,6 +388,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
find_inline = re.compile(
r"https?://(?:cdn\.discordapp.com|media\.discordapp\.net)"
r"(/[A-Za-z0-9-._~:/?#\[\]@!$&'()*+,;%=]+)").findall
+ find_hash = re.compile(HASH_PATTERN).match
posts = self.posts()
max_posts = self.config("max-posts")
@@ -393,11 +399,13 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
files = []
append = files.append
for attachment in post["attachments"]:
+ match = find_hash(attachment["path"])
+ attachment["hash"] = match.group(1) if match else ""
attachment["type"] = "attachment"
append(attachment)
for path in find_inline(post["content"] or ""):
append({"path": "https://cdn.discordapp.com" + path,
- "name": path, "type": "inline"})
+ "name": path, "type": "inline", "hash": ""})
post["channel_name"] = self.channel_name
post["date"] = text.parse_datetime(
@@ -406,6 +414,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
yield Message.Directory, post
for post["num"], file in enumerate(files, 1):
+ post["hash"] = file["hash"]
post["type"] = file["type"]
url = file["path"]
diff --git a/gallery_dl/extractor/lexica.py b/gallery_dl/extractor/lexica.py
new file mode 100644
index 0000000..ad93625
--- /dev/null
+++ b/gallery_dl/extractor/lexica.py
@@ -0,0 +1,104 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://lexica.art/"""
+
+from .common import Extractor, Message
+from .. import text
+
+
+class LexicaSearchExtractor(Extractor):
+ """Extractor for lexica.art search results"""
+ category = "lexica"
+ subcategory = "search"
+ root = "https://lexica.art"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?lexica\.art/?\?q=([^&#]+)"
+ test = (
+ ("https://lexica.art/?q=tree", {
+ "pattern": r"https://lexica-serve-encoded-images2\.sharif\."
+ r"workers.dev/full_jpg/[0-9a-f-]{36}$",
+ "range": "1-80",
+ "count": 80,
+ "keyword": {
+ "height": int,
+ "id": str,
+ "upscaled_height": int,
+ "upscaled_width": int,
+ "userid": str,
+ "width": int,
+ "prompt": {
+ "c": int,
+ "grid": bool,
+ "height": int,
+ "id": str,
+ "images": list,
+ "initImage": None,
+ "initImageStrength": None,
+ "model": "lexica-aperture-v2",
+ "negativePrompt": str,
+ "prompt": str,
+ "seed": str,
+ "timestamp": r"re:\d{4}-\d\d-\d\dT\d\d:\d\d:\d\d.\d\d\dZ",
+ "width": int,
+ },
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1)
+ self.text = text.unquote(self.query).replace("+", " ")
+
+ def items(self):
+ base = ("https://lexica-serve-encoded-images2.sharif.workers.dev"
+ "/full_jpg/")
+ tags = self.text
+
+ for image in self.posts():
+ image["filename"] = image["id"]
+ image["extension"] = "jpg"
+ image["search_tags"] = tags
+ yield Message.Directory, image
+ yield Message.Url, base + image["id"], image
+
+ def posts(self):
+ url = self.root + "/api/infinite-prompts"
+ headers = {
+ "Accept" : "application/json, text/plain, */*",
+ "Referer": "{}/?q={}".format(self.root, self.query),
+ }
+ json = {
+ "text" : self.text,
+ "searchMode": "images",
+ "source" : "search",
+ "cursor" : 0,
+ "model" : "lexica-aperture-v2",
+ }
+
+ while True:
+ data = self.request(
+ url, method="POST", headers=headers, json=json).json()
+
+ prompts = {
+ prompt["id"]: prompt
+ for prompt in data["prompts"]
+ }
+
+ for image in data["images"]:
+ image["prompt"] = prompts[image["promptid"]]
+ del image["promptid"]
+ yield image
+
+ cursor = data.get("nextCursor")
+ if not cursor:
+ return
+
+ json["cursor"] = cursor
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 049e0af..e49d29a 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2019-2022 Mike Fährmann
+# Copyright 2019-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -31,8 +31,8 @@ class MastodonExtractor(BaseExtractor):
def items(self):
for status in self.statuses():
- if self._check_move:
- self._check_move(status["account"])
+ if self._check_moved:
+ self._check_moved(status["account"])
if not self.reblogs and status["reblog"]:
self.log.debug("Skipping %s (reblog)", status["id"])
continue
@@ -48,12 +48,13 @@ class MastodonExtractor(BaseExtractor):
status["instance_remote"] = \
acct.rpartition("@")[2] if "@" in acct else None
+ status["count"] = len(attachments)
status["tags"] = [tag["name"] for tag in status["tags"]]
status["date"] = text.parse_datetime(
status["created_at"][:19], "%Y-%m-%dT%H:%M:%S")
yield Message.Directory, status
- for media in attachments:
+ for status["num"], media in enumerate(attachments, 1):
status["media"] = media
url = media["url"]
yield Message.Url, url, text.nameext_from_url(url, status)
@@ -62,8 +63,8 @@ class MastodonExtractor(BaseExtractor):
"""Return an iterable containing all relevant Status objects"""
return ()
- def _check_move(self, account):
- self._check_move = None
+ def _check_moved(self, account):
+ self._check_moved = None
if "moved" in account:
self.log.warning("Account '%s' moved to '%s'",
account["acct"], account["moved"]["acct"])
@@ -181,6 +182,10 @@ class MastodonStatusExtractor(MastodonExtractor):
test = (
("https://mastodon.social/@jk/103794036899778366", {
"count": 4,
+ "keyword": {
+ "count": 4,
+ "num": int,
+ },
}),
("https://pawoo.net/@yoru_nine/105038878897832922", {
"content": "b52e807f8ab548d6f896b09218ece01eba83987a",
@@ -229,7 +234,7 @@ class MastodonAPI():
for account in self.account_search(handle, 1):
if account["acct"] == username:
- self.extractor._check_move(account)
+ self.extractor._check_moved(account)
return account["id"]
raise exception.NotFoundError("account")
diff --git a/gallery_dl/extractor/nudecollect.py b/gallery_dl/extractor/nudecollect.py
new file mode 100644
index 0000000..3159919
--- /dev/null
+++ b/gallery_dl/extractor/nudecollect.py
@@ -0,0 +1,142 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://nudecollect.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+
+class NudecollectExtractor(GalleryExtractor):
+ """Base class for Nudecollect extractors"""
+ category = "nudecollect"
+ directory_fmt = ("{category}", "{title}")
+ filename_fmt = "{slug}_{num:>03}.{extension}"
+ archive_fmt = "{slug}_{num}"
+ root = "https://www.nudecollect.com"
+
+ def request(self, url, **kwargs):
+ kwargs["allow_redirects"] = False
+ return GalleryExtractor.request(self, url, **kwargs)
+
+ @staticmethod
+ def get_title(page):
+ return text.unescape(text.extr(page, "<title>", "</title>"))[31:]
+
+ @staticmethod
+ def get_image(page):
+ return text.extr(page, '<img src="', '"')
+
+
+class NudecollectImageExtractor(NudecollectExtractor):
+ """Extractor for individual images from nudecollect.com"""
+ subcategory = "image"
+ pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
+ r"(/content/([^/?#]+)/image-(\d+)-pics-(\d+)"
+ r"-mirror-(\d+)\.html)")
+ test = (
+ (("https://www.nudecollect.com/content/20201220_Teenpornstorage_"
+ "Patritcy_Vanessa_Lesbian_Lust/image-4-pics-108-mirror-43.html"), {
+ "pattern": (r"https://mirror\d+\.nudecollect\.com/showimage"
+ r"/nudecollect-8769086487/image00004-5896498214-43"
+ r"-9689595623/20201220_Teenpornstorage_Patritcy_Vaness"
+ r"a_Lesbian_Lust/9879560327/nudecollect\.com\.jpg"),
+ "keyword": {
+ "slug" : ("20201220_Teenpornstorage_Patritcy"
+ "_Vanessa_Lesbian_Lust"),
+ "title" : ("20201220 Teenpornstorage Patritcy"
+ " Vanessa Lesbian Lust"),
+ "num" : 4,
+ "count" : 108,
+ "mirror": 43,
+ },
+ }),
+ (("https://www.nudecollect.com/content/20201220_Teenpornstorage_"
+ "Patritcy_Vanessa_Lesbian_Lust/image-10-pics-108-mirror-43.html")),
+ )
+
+ def __init__(self, match):
+ NudecollectExtractor.__init__(self, match)
+ _, self.slug, self.num, self.count, self.mirror = match.groups()
+
+ def metadata(self, page):
+ return {
+ "slug" : self.slug,
+ "title" : self.get_title(page),
+ "count" : text.parse_int(self.count),
+ "mirror": text.parse_int(self.mirror),
+ }
+
+ def images(self, page):
+ return ((self.get_image(page), {"num": text.parse_int(self.num)}),)
+
+
+class NudecollectAlbumExtractor(NudecollectExtractor):
+ """Extractor for image albums on nudecollect.com"""
+ subcategory = "album"
+ pattern = (r"(?:https?://)?(?:www\.)?nudecollect\.com"
+ r"/content/([^/?#]+)/(?:index-mirror-(\d+)-(\d+)"
+ r"|page-\d+-pics-(\d+)-mirror-(\d+))\.html")
+ test = (
+ (("https://www.nudecollect.com/content/20170219_TheWhiteBoxxx_"
+ "Caprice_Tracy_Loves_Hot_ass_fingering_and_sensual_lesbian_sex"
+ "_with_alluring_Czech_babes_x125_1080px/index-mirror-67-125.html"), {
+ "pattern": (r"https://mirror\d+\.nudecollect\.com/showimage"
+ r"/nudecollect-8769086487/image00\d\d\d-5896498214-67"
+ r"-9689595623/20170219_TheWhiteBoxxx_Caprice"
+ r"_Tracy_Loves_Hot_ass_fingering_and_sensual_"
+ r"lesbian_sex_with_alluring_Czech_babes_x125_1080px"
+ r"/9879560327/nudecollect\.com\.jpg"),
+ "count" : 125,
+ "keyword": {
+ "slug" : ("20170219_TheWhiteBoxxx_Caprice_Tracy_Loves_Hot_"
+ "ass_fingering_and_sensual_lesbian_sex_with_"
+ "alluring_Czech_babes_x125_1080px"),
+ "title" : ("20170219 TheWhiteBoxxx Caprice Tracy Loves Hot ass"
+ " fingering and sensual lesbian sex with alluring"
+ " Czech babes x125 1080px"),
+ "num" : int,
+ "mirror": 67,
+ },
+ }),
+ (("https://www.nudecollect.com/content/20201220_Teenpornstorage_"
+ "Patritcy_Vanessa_Lesbian_Lust/page-1-pics-108-mirror-43.html"), {
+ "pattern": (r"https://mirror\d+\.nudecollect\.com/showimage"
+ r"/nudecollect-8769086487/image00\d\d\d-5896498214-43"
+ r"-9689595623/20201220_Teenpornstorage_Patritcy_Vaness"
+ r"a_Lesbian_Lust/9879560327/nudecollect\.com\.jpg"),
+ "count" : 108,
+ "keyword": {
+ "slug" : ("20201220_Teenpornstorage_Patritcy"
+ "_Vanessa_Lesbian_Lust"),
+ "title" : ("20201220 Teenpornstorage Patritcy"
+ " Vanessa Lesbian Lust"),
+ "num" : int,
+ "mirror": 43,
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ self.slug = match.group(1)
+ self.mirror = match.group(2) or match.group(5)
+ self.count = text.parse_int(match.group(3) or match.group(4))
+ url = "{}/content/{}/image-1-pics-{}-mirror-{}.html".format(
+ self.root, self.slug, self.count, self.mirror)
+ NudecollectExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ return {
+ "slug" : self.slug,
+ "title" : self.get_title(page),
+ "mirror": text.parse_int(self.mirror),
+ }
+
+ def images(self, page):
+ url = self.get_image(page)
+ p1, _, p2 = url.partition("/image0")
+ ufmt = p1 + "/image{:>05}" + p2[4:]
+ return [(ufmt.format(num), None) for num in range(1, self.count + 1)]
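
The album extractor fetches only the first page and derives every other image URL from it: partition at "/image0" and drop the rest of the five-digit counter. A sketch with a made-up URL following the pattern in the tests above:

    url = ("https://mirror43.nudecollect.com/showimage/nudecollect-123"
           "/image00001-456-43-789/Some_Gallery/000/nudecollect.com.jpg")

    p1, _, p2 = url.partition("/image0")
    ufmt = p1 + "/image{:>05}" + p2[4:]  # p2[4:] drops the leftover "0001"
    print(ufmt.format(2))
    # .../image00002-456-43-789/Some_Gallery/000/nudecollect.com.jpg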
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index d6628c4..9270f33 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2017-2022 Mike Fährmann
+# Copyright 2017-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -68,11 +68,19 @@ class OAuthBase(Extractor):
def open(self, url, params, recv=None):
"""Open 'url' in browser amd return response parameters"""
- import webbrowser
url += "?" + urllib.parse.urlencode(params)
- if not self.config("browser", True) or not webbrowser.open(url):
- stdout_write(
- "Please open this URL in your browser:\n\n" + url + "\n\n")
+
+ browser = self.config("browser", True)
+ if browser:
+ import webbrowser
+ browser = webbrowser.get()
+
+ if browser and browser.open(url):
+ self.log.info("Opening URL in %s:", browser.name.capitalize())
+ else:
+ self.log.info("Please open this URL in your browser:")
+
+ stdout_write("\n{}\n\n".format(url))
return (recv or self.recv)()
def error(self, msg):
@@ -80,8 +88,18 @@ class OAuthBase(Extractor):
"Remote server reported an error:\n\n{}\n".format(msg))
def _oauth1_authorization_flow(
- self, request_token_url, authorize_url, access_token_url):
+ self, default_key, default_secret,
+ request_token_url, authorize_url, access_token_url):
"""Perform the OAuth 1.0a authorization flow"""
+
+ api_key = self.oauth_config("api-key") or default_key
+ api_secret = self.oauth_config("api-secret") or default_secret
+ self.session = oauth.OAuth1Session(api_key, api_secret)
+
+ self.log.info("Using %s %s API key (%s)",
+ "default" if api_key == default_key else "custom",
+ self.subcategory, api_key)
+
# get a request token
params = {"oauth_callback": self.redirect_uri}
data = self.session.get(request_token_url, params=params).text
@@ -112,11 +130,18 @@ class OAuthBase(Extractor):
))
def _oauth2_authorization_code_grant(
- self, client_id, client_secret, auth_url, token_url, *,
- scope="read", key="refresh_token", auth=True,
- cache=None, instance=None):
+ self, client_id, client_secret, default_id, default_secret,
+ auth_url, token_url, *, scope="read", duration="permanent",
+ key="refresh_token", auth=True, cache=None, instance=None):
"""Perform an OAuth2 authorization code grant"""
+ client_id = str(client_id) if client_id else default_id
+ client_secret = client_secret or default_secret
+
+ self.log.info("Using %s %s client ID (%s)",
+ "default" if client_id == default_id else "custom",
+ instance or self.subcategory, client_id)
+
state = "gallery-dl_{}_{}".format(
self.subcategory,
oauth.nonce(8),
@@ -127,7 +152,7 @@ class OAuthBase(Extractor):
"response_type": "code",
"state" : state,
"redirect_uri" : self.redirect_uri,
- "duration" : "permanent",
+ "duration" : duration,
"scope" : scope,
}
@@ -137,13 +162,12 @@ class OAuthBase(Extractor):
# check authorization response
if state != params.get("state"):
self.send("'state' mismatch: expected {}, got {}.\n".format(
- state, params.get("state")
- ))
+ state, params.get("state")))
return
if "error" in params:
return self.error(params)
- # exchange the authorization code for a token
+ # exchange authorization code for a token
data = {
"grant_type" : "authorization_code",
"code" : params["code"],
@@ -208,81 +232,36 @@ class OAuthBase(Extractor):
return msg
-class OAuthDeviantart(OAuthBase):
- subcategory = "deviantart"
- pattern = "oauth:deviantart$"
- redirect_uri = REDIRECT_URI_HTTPS
-
- def items(self):
- yield Message.Version, 1
-
- self._oauth2_authorization_code_grant(
- self.oauth_config(
- "client-id", deviantart.DeviantartOAuthAPI.CLIENT_ID),
- self.oauth_config(
- "client-secret", deviantart.DeviantartOAuthAPI.CLIENT_SECRET),
- "https://www.deviantart.com/oauth2/authorize",
- "https://www.deviantart.com/oauth2/token",
- scope="browse user.manage",
- cache=deviantart._refresh_token_cache,
- )
-
+# --------------------------------------------------------------------
+# OAuth 1.0a
class OAuthFlickr(OAuthBase):
subcategory = "flickr"
pattern = "oauth:flickr$"
redirect_uri = REDIRECT_URI_HTTPS
- def __init__(self, match):
- OAuthBase.__init__(self, match)
- self.session = oauth.OAuth1Session(
- self.oauth_config("api-key", flickr.FlickrAPI.API_KEY),
- self.oauth_config("api-secret", flickr.FlickrAPI.API_SECRET),
- )
-
def items(self):
yield Message.Version, 1
self._oauth1_authorization_flow(
+ flickr.FlickrAPI.API_KEY,
+ flickr.FlickrAPI.API_SECRET,
"https://www.flickr.com/services/oauth/request_token",
"https://www.flickr.com/services/oauth/authorize",
"https://www.flickr.com/services/oauth/access_token",
)
-class OAuthReddit(OAuthBase):
- subcategory = "reddit"
- pattern = "oauth:reddit$"
-
- def items(self):
- yield Message.Version, 1
-
- self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT
- self._oauth2_authorization_code_grant(
- self.oauth_config("client-id", reddit.RedditAPI.CLIENT_ID),
- "",
- "https://www.reddit.com/api/v1/authorize",
- "https://www.reddit.com/api/v1/access_token",
- scope="read history",
- cache=reddit._refresh_token_cache,
- )
-
-
class OAuthSmugmug(OAuthBase):
subcategory = "smugmug"
pattern = "oauth:smugmug$"
- def __init__(self, match):
- OAuthBase.__init__(self, match)
- self.session = oauth.OAuth1Session(
- self.oauth_config("api-key", smugmug.SmugmugAPI.API_KEY),
- self.oauth_config("api-secret", smugmug.SmugmugAPI.API_SECRET),
- )
-
def items(self):
yield Message.Version, 1
self._oauth1_authorization_flow(
+ smugmug.SmugmugAPI.API_KEY,
+ smugmug.SmugmugAPI.API_SECRET,
"https://api.smugmug.com/services/oauth/1.0a/getRequestToken",
"https://api.smugmug.com/services/oauth/1.0a/authorize",
"https://api.smugmug.com/services/oauth/1.0a/getAccessToken",
@@ -293,23 +272,61 @@ class OAuthTumblr(OAuthBase):
subcategory = "tumblr"
pattern = "oauth:tumblr$"
- def __init__(self, match):
- OAuthBase.__init__(self, match)
- self.session = oauth.OAuth1Session(
- self.oauth_config("api-key", tumblr.TumblrAPI.API_KEY),
- self.oauth_config("api-secret", tumblr.TumblrAPI.API_SECRET),
- )
-
def items(self):
yield Message.Version, 1
self._oauth1_authorization_flow(
+ tumblr.TumblrAPI.API_KEY,
+ tumblr.TumblrAPI.API_SECRET,
"https://www.tumblr.com/oauth/request_token",
"https://www.tumblr.com/oauth/authorize",
"https://www.tumblr.com/oauth/access_token",
)
+# --------------------------------------------------------------------
+# OAuth 2.0
+
+class OAuthDeviantart(OAuthBase):
+ subcategory = "deviantart"
+ pattern = "oauth:deviantart$"
+ redirect_uri = REDIRECT_URI_HTTPS
+
+ def items(self):
+ yield Message.Version, 1
+
+ self._oauth2_authorization_code_grant(
+ self.oauth_config("client-id"),
+ self.oauth_config("client-secret"),
+ deviantart.DeviantartOAuthAPI.CLIENT_ID,
+ deviantart.DeviantartOAuthAPI.CLIENT_SECRET,
+ "https://www.deviantart.com/oauth2/authorize",
+ "https://www.deviantart.com/oauth2/token",
+ scope="browse user.manage",
+ cache=deviantart._refresh_token_cache,
+ )
+
+
+class OAuthReddit(OAuthBase):
+ subcategory = "reddit"
+ pattern = "oauth:reddit$"
+
+ def items(self):
+ yield Message.Version, 1
+
+ self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT
+ self._oauth2_authorization_code_grant(
+ self.oauth_config("client-id"),
+ "",
+ reddit.RedditAPI.CLIENT_ID,
+ "",
+ "https://www.reddit.com/api/v1/authorize",
+ "https://www.reddit.com/api/v1/access_token",
+ scope="read history",
+ cache=reddit._refresh_token_cache,
+ )
+
+
class OAuthMastodon(OAuthBase):
subcategory = "mastodon"
pattern = "oauth:mastodon:(?:https?://)?([^/?#]+)"
@@ -330,6 +347,8 @@ class OAuthMastodon(OAuthBase):
self._oauth2_authorization_code_grant(
application["client-id"],
application["client-secret"],
+ application["client-id"],
+ application["client-secret"],
"https://{}/oauth/authorize".format(self.instance),
"https://{}/oauth/token".format(self.instance),
instance=self.instance,
@@ -362,6 +381,8 @@ class OAuthMastodon(OAuthBase):
return data
+# --------------------------------------------------------------------
+
class OAuthPixiv(OAuthBase):
subcategory = "pixiv"
pattern = "oauth:pixiv$"
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index fc85125..df85b96 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2021-2022 Mike Fährmann
+# Copyright 2021-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -56,12 +56,12 @@ class PhilomenaExtractor(BooruExtractor):
INSTANCES = {
"derpibooru": {
"root": "https://derpibooru.org",
- "pattern": r"derpibooru\.org",
+ "pattern": r"(?:www\.)?derpibooru\.org",
"filter_id": "56027",
},
"ponybooru": {
"root": "https://ponybooru.org",
- "pattern": r"ponybooru\.org",
+ "pattern": r"(?:www\.)?ponybooru\.org",
"filter_id": "2",
},
"furbooru": {
@@ -128,9 +128,14 @@ class PhilomenaPostExtractor(PhilomenaExtractor):
},
}),
("https://derpibooru.org/1"),
+ ("https://www.derpibooru.org/1"),
+ ("https://www.derpibooru.org/images/1"),
+
("https://ponybooru.org/images/1", {
"content": "bca26f58fafd791fe07adcd2a28efd7751824605",
}),
+ ("https://www.ponybooru.org/images/1"),
+
("https://furbooru.org/images/1", {
"content": "9eaa1e1b32fa0f16520912257dbefaff238d5fd2",
}),
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index 7013f1b..ea4cf43 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
-# Copyright 2014-2022 Mike Fährmann
+# Copyright 2014-2023 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@@ -80,15 +80,19 @@ class SankakuTagExtractor(SankakuExtractor):
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
archive_fmt = "t_{search_tags}_{id}"
- pattern = BASE_PATTERN + r"/\?([^#]*)"
+ pattern = BASE_PATTERN + r"/?\?([^#]*)"
test = (
("https://sankaku.app/?tags=bonocho", {
"count": 5,
"pattern": r"https://s\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
- r"/[^/]{32}\.\w+\?e=\d+&(expires=\d+&)?m=[^&#]+",
+ r"/[0-9a-f]{32}\.\w+\?e=\d+&(expires=\d+&)?m=[^&#]+",
}),
("https://beta.sankakucomplex.com/?tags=bonocho"),
("https://chan.sankakucomplex.com/?tags=bonocho"),
+ ("https://black.sankakucomplex.com/?tags=bonocho"),
+ ("https://white.sankakucomplex.com/?tags=bonocho"),
+ ("https://sankaku.app/ja?tags=order%3Apopularity"),
+ ("https://sankaku.app/no/?tags=order%3Apopularity"),
# error on five or more tags
("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", {
"options": (("username", None),),
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index c2d8247..17a2202 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -26,6 +26,7 @@ class TwitterExtractor(Extractor):
cookiedomain = ".twitter.com"
cookienames = ("auth_token",)
root = "https://twitter.com"
+ browser = "firefox"
def __init__(self, match):
Extractor.__init__(self, match)
@@ -945,16 +946,31 @@ class TwitterAPI():
def __init__(self, extractor):
self.extractor = extractor
- self.root = "https://twitter.com/i/api"
+ self.root = "https://api.twitter.com"
+ cookies = extractor.session.cookies
+ cookiedomain = extractor.cookiedomain
+
+ csrf = extractor.config("csrf")
+ if csrf is None or csrf == "cookies":
+ csrf_token = cookies.get("ct0", domain=cookiedomain)
+ else:
+ csrf_token = None
+ if not csrf_token:
+ csrf_token = util.generate_token()
+ cookies.set("ct0", csrf_token, domain=cookiedomain)
+
+ auth_token = cookies.get("auth_token", domain=cookiedomain)
+
self.headers = {
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejR"
"COuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu"
"4FA33AGWWjCpTnA",
"x-guest-token": None,
- "x-twitter-auth-type": None,
+ "x-twitter-auth-type": "OAuth2Session" if auth_token else None,
"x-twitter-client-language": "en",
"x-twitter-active-user": "yes",
- "x-csrf-token": None,
+ "x-csrf-token": csrf_token,
+ "Origin": "https://twitter.com",
"Referer": "https://twitter.com/",
}
self.params = {
@@ -967,24 +983,36 @@ class TwitterAPI():
"include_can_dm": "1",
"include_can_media_tag": "1",
"include_ext_has_nft_avatar": "1",
+ "include_ext_is_blue_verified": "1",
+ "include_ext_verified_type": "1",
"skip_status": "1",
"cards_platform": "Web-12",
"include_cards": "1",
"include_ext_alt_text": "true",
+ "include_ext_limited_action_results": "false",
"include_quote_count": "true",
"include_reply_count": "1",
"tweet_mode": "extended",
+ "include_ext_collab_control": "true",
+ "include_ext_views": "true",
"include_entities": "true",
"include_user_entities": "true",
"include_ext_media_color": "true",
"include_ext_media_availability": "true",
"include_ext_sensitive_media_warning": "true",
+ "include_ext_trusted_friends_metadata": "true",
"send_error_codes": "true",
"simple_quoted_tweet": "true",
+ "q": None,
"count": "100",
+ "query_source": None,
"cursor": None,
- "ext": "mediaStats,highlightedLabel,hasNftAvatar,"
- "voiceInfo,superFollowMetadata",
+ "pc": None,
+ "spelling_corrections": None,
+ "include_ext_edit_control": "true",
+ "ext": "mediaStats,highlightedLabel,hasNftAvatar,voiceInfo,"
+ "enrichments,superFollowMetadata,unmentionInfo,editControl,"
+ "collab_control,vibe",
}
self.variables = {
"includePromotedContent": False,
@@ -1006,28 +1034,6 @@ class TwitterAPI():
self._syndication = self.extractor.syndication
self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode
- cookies = extractor.session.cookies
- cookiedomain = extractor.cookiedomain
-
- csrf = extractor.config("csrf")
- if csrf is None or csrf == "cookies":
- csrf_token = cookies.get("ct0", domain=cookiedomain)
- else:
- csrf_token = None
- if not csrf_token:
- csrf_token = util.generate_token()
- cookies.set("ct0", csrf_token, domain=cookiedomain)
- self.headers["x-csrf-token"] = csrf_token
-
- if cookies.get("auth_token", domain=cookiedomain):
- # logged in
- self.headers["x-twitter-auth-type"] = "OAuth2Session"
- else:
- # guest
- guest_token = self._guest_token()
- cookies.set("gt", guest_token, domain=cookiedomain)
- self.headers["x-guest-token"] = guest_token
-
def tweet_detail(self, tweet_id):
endpoint = "/graphql/ItejhtHVxU7ksltgMmyaLA/TweetDetail"
variables = {
@@ -1183,17 +1189,26 @@ class TwitterAPI():
@cache(maxage=3600)
def _guest_token(self):
- root = "https://api.twitter.com"
endpoint = "/1.1/guest/activate.json"
- return str(self._call(endpoint, None, root, "POST")["guest_token"])
+ self.extractor.log.info("Requesting guest token")
+ return str(self._call(endpoint, None, "POST", False)["guest_token"])
+
+ def _authenticate_guest(self):
+ guest_token = self._guest_token()
+ if guest_token != self.headers["x-guest-token"]:
+ self.headers["x-guest-token"] = guest_token
+ self.extractor.session.cookies.set(
+ "gt", guest_token, domain=self.extractor.cookiedomain)

- def _call(self, endpoint, params, root=None, method="GET"):
- if root is None:
- root = self.root
+ def _call(self, endpoint, params, method="GET", auth=True):
+ url = self.root + endpoint
while True:
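+ # without a logged-in session, make sure a guest token is set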
+ if not self.headers["x-twitter-auth-type"] and auth:
+ self._authenticate_guest()
+
response = self.extractor.request(
- root + endpoint, method=method, params=params,
+ url, method=method, params=params,
headers=self.headers, fatal=None)

# update 'x-csrf-token' header (#1170)
@@ -1226,21 +1241,33 @@ class TwitterAPI():
def _pagination_legacy(self, endpoint, params):
original_retweets = (self.extractor.retweets == "original")
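+ # entryId prefixes that mark the bottom cursor entry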
+ bottom = ("cursor-bottom-", "sq-cursor-bottom")
while True:
data = self._call(endpoint, params)
- instr = data["timeline"]["instructions"]
- if not instr:
+ instructions = data["timeline"]["instructions"]
+ if not instructions:
return
tweets = data["globalObjects"]["tweets"]
users = data["globalObjects"]["users"]
tweet_id = cursor = None
tweet_ids = []
+ entries = ()
+
+ # process instructions
+ for instr in instructions:
+ if "addEntries" in instr:
+ entries = instr["addEntries"]["entries"]
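+ # a 'replaceEntry' instruction may update the bottom cursor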
+ elif "replaceEntry" in instr:
+ entry = instr["replaceEntry"]["entry"]
+ if entry["entryId"].startswith(bottom):
+ cursor = (entry["content"]["operation"]
+ ["cursor"]["value"])

# collect tweet IDs and cursor value
- for entry in instr[0]["addEntries"]["entries"]:
+ for entry in entries:
entry_startswith = entry["entryId"].startswith
if entry_startswith(("tweet-", "sq-I-t-")):
@@ -1252,7 +1279,7 @@ class TwitterAPI():
entry["content"]["timelineModule"]["metadata"]
["conversationMetadata"]["allTweetIds"][::-1])
- elif entry_startswith(("cursor-bottom-", "sq-cursor-bottom")):
+ elif entry_startswith(bottom):
cursor = entry["content"]["operation"]["cursor"]
if not cursor.get("stopOnEmptyResponse", True):
# keep going even if there are no tweets
@@ -1300,11 +1327,7 @@ class TwitterAPI():
quoted["quoted_by_id_str"] = tweet["id_str"]
yield quoted

- # update cursor value
- if "replaceEntry" in instr[-1]:
- cursor = (instr[-1]["replaceEntry"]["entry"]
- ["content"]["operation"]["cursor"]["value"])
-
+ # stop on empty response
if not cursor or (not tweets and not tweet_id):
return
params["cursor"] = cursor
@@ -1346,12 +1369,8 @@ class TwitterAPI():
if user.get("blocked_by"):
if self.headers["x-twitter-auth-type"] and \
extr.config("logout"):
- guest_token = self._guest_token()
- extr.session.cookies.set(
- "gt", guest_token, domain=extr.cookiedomain)
extr._cookiefile = None
del extr.session.cookies["auth_token"]
- self.headers["x-guest-token"] = guest_token
self.headers["x-twitter-auth-type"] = None
extr.log.info("Retrying API request as guest")
continue
@@ -1578,8 +1597,6 @@ def _login_impl(extr, username, password):
"Login with email is no longer possible. "
"You need to provide your username or phone number instead.")

- extr.log.info("Logging in as %s", username)
-
def process(response):
try:
data = response.json()
@@ -1598,8 +1615,10 @@ def _login_impl(extr, username, password):
extr.session.cookies.clear()
api = TwitterAPI(extr)
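+ # the login flow requires a valid guest token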
+ api._authenticate_guest()
headers = api.headers
- headers["Referer"] = "https://twitter.com/i/flow/login"
+
+ extr.log.info("Logging in as %s", username)

# init
data = {
@@ -1653,7 +1672,7 @@ def _login_impl(extr, username, password):
"web_modal": 1,
},
}
- url = "https://twitter.com/i/api/1.1/onboarding/task.json?flow_name=login"
+ url = "https://api.twitter.com/1.1/onboarding/task.json?flow_name=login"
response = extr.request(url, method="POST", headers=headers, json=data)
data = {
@@ -1668,7 +1687,7 @@ def _login_impl(extr, username, password):
},
],
}
- url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ url = "https://api.twitter.com/1.1/onboarding/task.json"
response = extr.request(
url, method="POST", headers=headers, json=data, fatal=None)
@@ -1692,7 +1711,7 @@ def _login_impl(extr, username, password):
},
],
}
- # url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ # url = "https://api.twitter.com/1.1/onboarding/task.json"
extr.sleep(random.uniform(2.0, 4.0), "login (username)")
response = extr.request(
url, method="POST", headers=headers, json=data, fatal=None)
@@ -1710,7 +1729,7 @@ def _login_impl(extr, username, password):
},
],
}
- # url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ # url = "https://api.twitter.com/1.1/onboarding/task.json"
extr.sleep(random.uniform(2.0, 4.0), "login (password)")
response = extr.request(
url, method="POST", headers=headers, json=data, fatal=None)
@@ -1727,7 +1746,7 @@ def _login_impl(extr, username, password):
},
],
}
- # url = "https://twitter.com/i/api/1.1/onboarding/task.json"
+ # url = "https://api.twitter.com/1.1/onboarding/task.json"
response = extr.request(
url, method="POST", headers=headers, json=data, fatal=None)
process(response)
diff --git a/gallery_dl/extractor/wikifeet.py b/gallery_dl/extractor/wikifeet.py
new file mode 100644
index 0000000..70e9646
--- /dev/null
+++ b/gallery_dl/extractor/wikifeet.py
@@ -0,0 +1,118 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.wikifeet.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+import json
+
+
+class WikifeetGalleryExtractor(GalleryExtractor):
+ """Extractor for image galleries from wikifeet.com"""
+ category = "wikifeet"
+ directory_fmt = ("{category}", "{celebrity}")
+ filename_fmt = "{category}_{celeb}_{pid}.{extension}"
+ archive_fmt = "{type}_{celeb}_{pid}"
+ pattern = (r"(?:https?://)(?:(?:www\.)?wikifeetx?|"
+ r"men\.wikifeet)\.com/([^/?#]+)")
+ test = (
+ ("https://www.wikifeet.com/Madison_Beer", {
+ "pattern": (r"https://pics\.wikifeet\.com/Madison_Beer"
+ r"-Feet-\d+\.jpg"),
+ "count" : ">= 352",
+ "keyword": {
+ "celeb" : "Madison_Beer",
+ "celebrity" : "Madison Beer",
+ "birthday" : "dt:1999-03-05 00:00:00",
+ "birthplace": "United States",
+ "rating" : float,
+ "pid" : int,
+ "width" : int,
+ "height" : int,
+ "shoesize" : "7.5 US",
+ "type" : "women",
+ "tags" : list,
+ },
+ }),
+ ("https://www.wikifeetx.com/Tifa_Quinn", {
+ "pattern": (r"https://pics\.wikifeet\.com/Tifa_Quinn"
+ r"-Feet-\d+\.jpg"),
+ "count" : ">= 9",
+ "keyword": {
+ "celeb" : "Tifa_Quinn",
+ "celebrity" : "Tifa Quinn",
+ "birthday" : "[NOT SET]",
+ "birthplace": "United States",
+ "rating" : float,
+ "pid" : int,
+ "width" : int,
+ "height" : int,
+ "shoesize" : "[NOT SET]",
+ "type" : "women",
+ "tags" : list,
+ },
+ }),
+ ("https://men.wikifeet.com/Chris_Hemsworth", {
+ "pattern": (r"https://pics\.wikifeet\.com/Chris_Hemsworth"
+ r"-Feet-\d+\.jpg"),
+ "count" : ">= 860",
+ "keyword": {
+ "celeb" : "Chris_Hemsworth",
+ "celebrity" : "Chris Hemsworth",
+ "birthday" : "dt:1983-08-11 00:00:00",
+ "birthplace": "Australia",
+ "rating" : float,
+ "pid" : int,
+ "width" : int,
+ "height" : int,
+ "shoesize" : "12.5 US",
+ "type" : "men",
+ "tags" : list,
+ },
+ }),
+ )
+
+ def __init__(self, match):
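+ # derive subcategory and gallery type from the (sub)domain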
+ self.root = text.root_from_url(match.group(0))
+ if "wikifeetx.com" in self.root:
+ self.category = "wikifeetx"
+ self.type = "men" if "://men." in self.root else "women"
+ self.celeb = match.group(1)
+ GalleryExtractor.__init__(self, match, self.root + "/" + self.celeb)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
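+ # values are extracted in the order they appear in the page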
+ return {
+ "celeb" : self.celeb,
+ "type" : self.type,
+ "rating" : text.parse_float(extr('"ratingValue": "', '"')),
+ "celebrity" : text.unescape(extr("times'>", "</h1>")),
+ "shoesize" : text.remove_html(extr("Shoe Size:", "edit")),
+ "birthplace": text.remove_html(extr("Birthplace:", "edit")),
+ "birthday" : text.parse_datetime(text.remove_html(
+ extr("Birth Date:", "edit")), "%Y-%m-%d"),
+ }
+
+ def images(self, page):
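+ # map wikifeet's single-letter tag codes to readable tag names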
+ tagmap = {
+ "C": "Close-up",
+ "T": "Toenails",
+ "N": "Nylons",
+ "A": "Arches",
+ "S": "Soles",
+ "B": "Barefoot",
+ }
+ ufmt = "https://pics.wikifeet.com/" + self.celeb + "-Feet-{}.jpg"
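+ # image metadata is embedded in the page as a 'gdata' JS array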
+ return [
+ (ufmt.format(data["pid"]), {
+ "pid" : data["pid"],
+ "width" : data["pw"],
+ "height": data["ph"],
+ "tags" : [tagmap[tag] for tag in data["tags"]],
+ })
+ for data in json.loads(text.extr(page, "['gdata'] = ", ";"))
+ ]