Diffstat (limited to 'gallery_dl')
-rw-r--r--  gallery_dl/__init__.py                  |  36
-rw-r--r--  gallery_dl/downloader/ytdl.py           |   2
-rw-r--r--  gallery_dl/extractor/2chan.py           |   2
-rw-r--r--  gallery_dl/extractor/500px.py           | 169
-rw-r--r--  gallery_dl/extractor/__init__.py        |   3
-rw-r--r--  gallery_dl/extractor/artstation.py      |   2
-rw-r--r--  gallery_dl/extractor/blogger.py         |   2
-rw-r--r--  gallery_dl/extractor/common.py          |  11
-rw-r--r--  gallery_dl/extractor/cyberdrop.py       |  23
-rw-r--r--  gallery_dl/extractor/deviantart.py      |  60
-rw-r--r--  gallery_dl/extractor/exhentai.py        |  16
-rw-r--r--  gallery_dl/extractor/fanbox.py          |  23
-rw-r--r--  gallery_dl/extractor/fantia.py          |   2
-rw-r--r--  gallery_dl/extractor/flickr.py          |   2
-rw-r--r--  gallery_dl/extractor/furaffinity.py     |   7
-rw-r--r--  gallery_dl/extractor/generic.py         | 208
-rw-r--r--  gallery_dl/extractor/hitomi.py          |  37
-rw-r--r--  gallery_dl/extractor/imgbb.py           |   2
-rw-r--r--  gallery_dl/extractor/inkbunny.py        |  22
-rw-r--r--  gallery_dl/extractor/instagram.py       |  31
-rw-r--r--  gallery_dl/extractor/keenspot.py        |   2
-rw-r--r--  gallery_dl/extractor/kemonoparty.py     |  43
-rw-r--r--  gallery_dl/extractor/lolisafe.py        |  79
-rw-r--r--  gallery_dl/extractor/myportfolio.py     |   4
-rw-r--r--  gallery_dl/extractor/newgrounds.py      |   4
-rw-r--r--  gallery_dl/extractor/patreon.py         |   2
-rw-r--r--  gallery_dl/extractor/philomena.py       |   2
-rw-r--r--  gallery_dl/extractor/photobucket.py     |  10
-rw-r--r--  gallery_dl/extractor/pixiv.py           |  15
-rw-r--r--  gallery_dl/extractor/pixnet.py          |   2
-rw-r--r--  gallery_dl/extractor/pornhub.py         |   2
-rw-r--r--  gallery_dl/extractor/rule34us.py        | 130
-rw-r--r--  gallery_dl/extractor/sexcom.py          |   9
-rw-r--r--  gallery_dl/extractor/slickpic.py        |   2
-rw-r--r--  gallery_dl/extractor/smugmug.py         |   2
-rw-r--r--  gallery_dl/extractor/tumblr.py          |   2
-rw-r--r--  gallery_dl/extractor/tumblrgallery.py   | 115
-rw-r--r--  gallery_dl/extractor/twitter.py         |   2
-rw-r--r--  gallery_dl/extractor/wordpress.py       |  41
-rw-r--r--  gallery_dl/extractor/xhamster.py        |   2
-rw-r--r--  gallery_dl/extractor/ytdl.py            |  10
-rw-r--r--  gallery_dl/option.py                    |  52
-rw-r--r--  gallery_dl/output.py                    |  30
-rw-r--r--  gallery_dl/path.py                      |   7
-rw-r--r--  gallery_dl/util.py                      |  20
-rw-r--r--  gallery_dl/version.py                   |   2
-rw-r--r--  gallery_dl/ytdl.py                      |  41
47 files changed, 1059 insertions(+), 233 deletions(-)
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 2cad029..ad8286e 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -115,6 +115,13 @@ def main():
config.load(args.cfgfiles, strict=True)
if args.yamlfiles:
config.load(args.yamlfiles, strict=True, fmt="yaml")
+ if args.filename:
+ if args.filename == "/O":
+ args.filename = "{filename}.{extension}"
+ config.set((), "filename", args.filename)
+ if args.directory:
+ config.set((), "base-directory", args.directory)
+ config.set((), "directory", ())
if args.postprocessors:
config.set((), "postprocessors", args.postprocessors)
if args.abort:
@@ -142,20 +149,23 @@ def main():
import os.path
import requests
- head = ""
- try:
- out, err = subprocess.Popen(
- ("git", "rev-parse", "--short", "HEAD"),
- stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- cwd=os.path.dirname(os.path.abspath(__file__)),
- ).communicate()
- if out and not err:
- head = " - Git HEAD: " + out.decode().rstrip()
- except (OSError, subprocess.SubprocessError):
- pass
+ extra = ""
+ if getattr(sys, "frozen", False):
+ extra = " - Executable"
+ else:
+ try:
+ out, err = subprocess.Popen(
+ ("git", "rev-parse", "--short", "HEAD"),
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ cwd=os.path.dirname(os.path.abspath(__file__)),
+ ).communicate()
+ if out and not err:
+ extra = " - Git HEAD: " + out.decode().rstrip()
+ except (OSError, subprocess.SubprocessError):
+ pass
- log.debug("Version %s%s", __version__, head)
+ log.debug("Version %s%s", __version__, extra)
log.debug("Python %s - %s",
platform.python_version(), platform.platform())
try:
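
The `getattr(sys, "frozen", False)` test above is how the new code tells a bundled executable apart from a source checkout. A minimal sketch of the check, assuming a PyInstaller-style bundle:

    import sys

    # PyInstaller-style bundlers set sys.frozen on the embedded interpreter.
    # A frozen build ships without a .git directory, so asking git for the
    # current commit would only fail; report "- Executable" instead.
    if getattr(sys, "frozen", False):
        extra = " - Executable"
    else:
        extra = ""  # fall through to "git rev-parse --short HEAD" as above
    print(extra)
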
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 8416ca0..30f628e 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -39,7 +39,7 @@ class YoutubeDLDownloader(DownloaderBase):
if not ytdl_instance:
ytdl_instance = self.ytdl_instance
if not ytdl_instance:
- module = __import__(self.config("module") or "youtube_dl")
+ module = ytdl.import_module(self.config("module"))
self.ytdl_instance = ytdl_instance = ytdl.construct_YoutubeDL(
module, self, self.ytdl_opts)
if self.outtmpl == "default":
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index c92969b..38b2d5a 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -20,7 +20,7 @@ class _2chanThreadExtractor(Extractor):
filename_fmt = "{tim}.{extension}"
archive_fmt = "{board}_{thread}_{tim}"
url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
- pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
+ pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/]+)/res/(\d+)"
test = ("http://dec.2chan.net/70/res/4752.htm", {
"url": "f49aa31340e9a3429226af24e19e01f5b819ca1f",
"keyword": "44599c21b248e79692b2eb2da12699bd0ed5640a",
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 8c6fa09..88ceaeb 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -21,13 +21,13 @@ class _500pxExtractor(Extractor):
filename_fmt = "{id}_{name}.{extension}"
archive_fmt = "{id}"
root = "https://500px.com"
+ cookiedomain = ".500px.com"
def __init__(self, match):
Extractor.__init__(self, match)
self.session.headers["Referer"] = self.root + "/"
def items(self):
- first = True
data = self.metadata()
for photo in self.photos():
@@ -35,9 +35,7 @@ class _500pxExtractor(Extractor):
photo["extension"] = photo["image_format"]
if data:
photo.update(data)
- if first:
- first = False
- yield Message.Directory, photo
+ yield Message.Directory, photo
yield Message.Url, url, photo
def metadata(self):
@@ -72,24 +70,33 @@ class _500pxExtractor(Extractor):
self.log.warning("Unable to fetch photo %s", pid)
]
- def _request_api(self, url, params, csrf_token=None):
- headers = {"Origin": self.root, "X-CSRF-Token": csrf_token}
+ def _request_api(self, url, params):
+ headers = {
+ "Origin": self.root,
+ "x-csrf-token": self.session.cookies.get(
+ "x-csrf-token", domain=".500px.com"),
+ }
return self.request(url, headers=headers, params=params).json()
def _request_graphql(self, opname, variables):
url = "https://api.500px.com/graphql"
+ headers = {
+ "x-csrf-token": self.session.cookies.get(
+ "x-csrf-token", domain=".500px.com"),
+ }
data = {
"operationName": opname,
"variables" : json.dumps(variables),
"query" : QUERIES[opname],
}
- return self.request(url, method="POST", json=data).json()["data"]
+ return self.request(
+ url, method="POST", headers=headers, json=data).json()["data"]
class _500pxUserExtractor(_500pxExtractor):
"""Extractor for photos from a user's photostream on 500px.com"""
subcategory = "user"
- pattern = BASE_PATTERN + r"/(?!photo/)(?:p/)?([^/?#]+)/?(?:$|[?#])"
+ pattern = BASE_PATTERN + r"/(?!photo/|liked)(?:p/)?([^/?#]+)/?(?:$|[?#])"
test = (
("https://500px.com/p/light_expression_photography", {
"pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D4096/v2",
@@ -137,10 +144,6 @@ class _500pxGalleryExtractor(_500pxExtractor):
"user": dict,
},
}),
- # unavailable photos (#1335)
- ("https://500px.com/p/Light_Expression_Photography/galleries/street", {
- "count": 4,
- }),
("https://500px.com/fashvamp/galleries/lera"),
)
@@ -194,6 +197,30 @@ class _500pxGalleryExtractor(_500pxExtractor):
)["galleryByOwnerIdAndSlugOrToken"]["photos"]
+class _500pxFavoriteExtractor(_500pxExtractor):
+ """Extractor for favorite 500px photos"""
+ subcategory = "favorite"
+ pattern = BASE_PATTERN + r"/liked/?$"
+ test = ("https://500px.com/liked",)
+
+ def photos(self):
+ variables = {"pageSize": 20}
+ photos = self._request_graphql(
+ "LikedPhotosQueryRendererQuery", variables,
+ )["likedPhotos"]
+
+ while True:
+ yield from self._extend(photos["edges"])
+
+ if not photos["pageInfo"]["hasNextPage"]:
+ return
+
+ variables["cursor"] = photos["pageInfo"]["endCursor"]
+ photos = self._request_graphql(
+ "LikedPhotosPaginationContainerQuery", variables,
+ )["likedPhotos"]
+
+
class _500pxImageExtractor(_500pxExtractor):
"""Extractor for individual images from 500px.com"""
subcategory = "image"
@@ -640,4 +667,122 @@ fragment GalleriesDetailPaginationContainer_gallery_3e6UuE on Gallery {
}
""",
+ "LikedPhotosQueryRendererQuery": """\
+query LikedPhotosQueryRendererQuery($pageSize: Int) {
+ ...LikedPhotosPaginationContainer_query_RlXb8
+}
+
+fragment LikedPhotosPaginationContainer_query_RlXb8 on Query {
+ likedPhotos(first: $pageSize) {
+ edges {
+ node {
+ id
+ legacyId
+ canonicalPath
+ name
+ description
+ category
+ uploadedAt
+ location
+ width
+ height
+ isLikedByMe
+ notSafeForWork
+ tags
+ photographer: uploader {
+ id
+ legacyId
+ username
+ displayName
+ canonicalPath
+ avatar {
+ images {
+ url
+ id
+ }
+ id
+ }
+ followedByUsers {
+ totalCount
+ isFollowedByMe
+ }
+ }
+ images(sizes: [33, 35]) {
+ size
+ url
+ jpegUrl
+ webpUrl
+ id
+ }
+ __typename
+ }
+ cursor
+ }
+ pageInfo {
+ endCursor
+ hasNextPage
+ }
+ }
+}
+""",
+
+ "LikedPhotosPaginationContainerQuery": """\
+query LikedPhotosPaginationContainerQuery($cursor: String, $pageSize: Int) {
+ ...LikedPhotosPaginationContainer_query_3e6UuE
+}
+
+fragment LikedPhotosPaginationContainer_query_3e6UuE on Query {
+ likedPhotos(first: $pageSize, after: $cursor) {
+ edges {
+ node {
+ id
+ legacyId
+ canonicalPath
+ name
+ description
+ category
+ uploadedAt
+ location
+ width
+ height
+ isLikedByMe
+ notSafeForWork
+ tags
+ photographer: uploader {
+ id
+ legacyId
+ username
+ displayName
+ canonicalPath
+ avatar {
+ images {
+ url
+ id
+ }
+ id
+ }
+ followedByUsers {
+ totalCount
+ isFollowedByMe
+ }
+ }
+ images(sizes: [33, 35]) {
+ size
+ url
+ jpegUrl
+ webpUrl
+ id
+ }
+ __typename
+ }
+ cursor
+ }
+ pageInfo {
+ endCursor
+ hasNextPage
+ }
+ }
+}
+""",
+
}
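
Both request helpers now read the CSRF token from the `x-csrf-token` cookie that 500px sets for `.500px.com` and echo it back as a request header (a double-submit-cookie check). The same flow with plain requests, assuming the site has already set that cookie on a prior response:

    import requests

    session = requests.Session()
    session.get("https://500px.com/")  # expected to set the x-csrf-token cookie

    token = session.cookies.get("x-csrf-token", domain=".500px.com")
    headers = {"Origin": "https://500px.com", "x-csrf-token": token}

    # The value from the cookie jar is echoed back as a header so the server
    # can verify that both copies of the token match.
    response = session.post(
        "https://api.500px.com/graphql", headers=headers,
        json={"operationName": "...", "variables": "{}", "query": "..."})
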
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index dd9da01..65c994d 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -108,6 +108,7 @@ modules = [
"readcomiconline",
"reddit",
"redgifs",
+ "rule34us",
"sankaku",
"sankakucomplex",
"seiga",
@@ -144,12 +145,14 @@ modules = [
"foolslide",
"mastodon",
"shopify",
+ "lolisafe",
"imagehosts",
"directlink",
"recursive",
"oauth",
"test",
"ytdl",
+ "generic",
]
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index f687ff8..5675081 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -29,12 +29,12 @@ class ArtstationExtractor(Extractor):
def items(self):
data = self.metadata()
- yield Message.Directory, data
for project in self.projects():
for asset in self.get_project_assets(project["hash_id"]):
asset.update(data)
adict = asset["asset"]
+ yield Message.Directory, asset
if adict["has_embedded_player"] and self.external:
player = adict["player_embedded"]
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 7e7c282..9a86cc4 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -15,7 +15,7 @@ import re
BASE_PATTERN = (
r"(?:blogger:(?:https?://)?([^/]+)|"
- r"(?:https?://)?([^.]+\.blogspot\.com))")
+ r"(?:https?://)?([\w-]+\.blogspot\.com))")
class BloggerExtractor(Extractor):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index e80366e..c440aee 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -571,7 +571,11 @@ class BaseExtractor(Extractor):
if not self.category:
for index, group in enumerate(match.groups()):
if group is not None:
- self.category, self.root = self.instances[index]
+ if index:
+ self.category, self.root = self.instances[index-1]
+ else:
+ self.root = group
+ self.category = group.partition("://")[2]
break
Extractor.__init__(self, match)
@@ -594,7 +598,10 @@ class BaseExtractor(Extractor):
pattern = re.escape(root[root.index(":") + 3:])
pattern_list.append(pattern + "()")
- return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")"
+ return (
+ r"(?:" + cls.basecategory + r":(https?://[^/?#]+)|"
+ r"(?:https?://)?(?:" + "|".join(pattern_list) + r"))"
+ )
class HTTPSAdapter(HTTPAdapter):
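
The `update()` change gives every BaseExtractor pattern a new first alternative, `basecategory:<root URL>`, for instances not on the built-in list; the old per-instance empty groups shift right by one, which is why `__init__` now indexes `instances[index-1]`. A simplified sketch of both outcomes (instance domains here are illustrative):

    import re

    # Shape of the combined pattern for a basecategory with two instances:
    pattern = (r"(?:lolisafe:(https?://[^/?#]+)|"
               r"(?:https?://)?(?:bunkr\.is()|zz\.ht()))")

    m = re.match(pattern, "lolisafe:https://files.example.org")
    print(m.groups())  # ('https://files.example.org', None, None)
    # index 0 is set: the root comes from the URL itself, and the category
    # becomes group.partition("://")[2], i.e. 'files.example.org'

    m = re.match(pattern, "https://zz.ht")
    print(m.groups())  # (None, None, '') -> index 2 -> instances[2-1]
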
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index dbaa97e..6d6e192 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -6,16 +6,13 @@
"""Extractors for https://cyberdrop.me/"""
-from .common import Extractor, Message
+from . import lolisafe
from .. import text
-class CyberdropAlbumExtractor(Extractor):
+class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
category = "cyberdrop"
- subcategory = "album"
root = "https://cyberdrop.me"
- directory_fmt = ("{category}", "{album_name} ({album_id})")
- archive_fmt = "{album_id}_{id}"
pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.me/a/([^/?#]+)"
test = (
# images
@@ -44,11 +41,7 @@ class CyberdropAlbumExtractor(Extractor):
}),
)
- def __init__(self, match):
- Extractor.__init__(self, match)
- self.album_id = match.group(1)
-
- def items(self):
+ def fetch_album(self, album_id):
url = self.root + "/a/" + self.album_id
extr = text.extract_from(self.request(url).text)
@@ -58,9 +51,9 @@ class CyberdropAlbumExtractor(Extractor):
url = extr('id="file" href="', '"')
if not url:
break
- append(text.unescape(url))
+ append({"file": text.unescape(url)})
- data = {
+ return files, {
"album_id" : self.album_id,
"album_name" : extr("name: '", "'"),
"date" : text.parse_timestamp(extr("timestamp: ", ",")),
@@ -68,9 +61,3 @@ class CyberdropAlbumExtractor(Extractor):
"description": extr("description: `", "`"),
"count" : len(files),
}
-
- yield Message.Directory, data
- for url in files:
- text.nameext_from_url(url, data)
- data["filename"], _, data["id"] = data["filename"].rpartition("-")
- yield Message.Url, url, data
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 61affb5..94fec16 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -772,6 +772,7 @@ class DeviantartPopularExtractor(DeviantartExtractor):
if trange.startswith("popular-"):
trange = trange[8:]
self.time_range = {
+ "newest" : "now",
"most-recent" : "now",
"this-week" : "1week",
"this-month" : "1month",
@@ -786,6 +787,8 @@ class DeviantartPopularExtractor(DeviantartExtractor):
}
def deviations(self):
+ if self.time_range == "now":
+ return self.api.browse_newest(self.search_term, self.offset)
return self.api.browse_popular(
self.search_term, self.time_range, self.offset)
@@ -1034,21 +1037,32 @@ class DeviantartOAuthAPI():
def browse_deviantsyouwatch(self, offset=0):
"""Yield deviations from users you watch"""
- endpoint = "browse/deviantsyouwatch"
+ endpoint = "/browse/deviantsyouwatch"
params = {"limit": "50", "offset": offset,
"mature_content": self.mature}
return self._pagination(endpoint, params, public=False)
def browse_posts_deviantsyouwatch(self, offset=0):
"""Yield posts from users you watch"""
- endpoint = "browse/posts/deviantsyouwatch"
+ endpoint = "/browse/posts/deviantsyouwatch"
params = {"limit": "50", "offset": offset,
"mature_content": self.mature}
return self._pagination(endpoint, params, public=False, unpack=True)
+ def browse_newest(self, query=None, offset=0):
+ """Browse newest deviations"""
+ endpoint = "/browse/newest"
+ params = {
+ "q" : query,
+ "limit" : 50 if self.metadata else 120,
+ "offset" : offset,
+ "mature_content": self.mature,
+ }
+ return self._pagination(endpoint, params)
+
def browse_popular(self, query=None, timerange=None, offset=0):
"""Yield popular deviations"""
- endpoint = "browse/popular"
+ endpoint = "/browse/popular"
params = {
"q" : query,
"limit" : 50 if self.metadata else 120,
@@ -1060,7 +1074,7 @@ class DeviantartOAuthAPI():
def browse_tags(self, tag, offset=0):
""" Browse a tag """
- endpoint = "browse/tags"
+ endpoint = "/browse/tags"
params = {
"tag" : tag,
"offset" : offset,
@@ -1071,14 +1085,14 @@ class DeviantartOAuthAPI():
def browse_user_journals(self, username, offset=0):
"""Yield all journal entries of a specific user"""
- endpoint = "browse/user/journals"
+ endpoint = "/browse/user/journals"
params = {"username": username, "offset": offset, "limit": 50,
"mature_content": self.mature, "featured": "false"}
return self._pagination(endpoint, params)
def collections(self, username, folder_id, offset=0):
"""Yield all Deviation-objects contained in a collection folder"""
- endpoint = "collections/" + folder_id
+ endpoint = "/collections/" + folder_id
params = {"username": username, "offset": offset, "limit": 24,
"mature_content": self.mature}
return self._pagination(endpoint, params)
@@ -1086,21 +1100,21 @@ class DeviantartOAuthAPI():
@memcache(keyarg=1)
def collections_folders(self, username, offset=0):
"""Yield all collection folders of a specific user"""
- endpoint = "collections/folders"
+ endpoint = "/collections/folders"
params = {"username": username, "offset": offset, "limit": 50,
"mature_content": self.mature}
return self._pagination_list(endpoint, params)
def comments_deviation(self, deviation_id, offset=0):
"""Fetch comments posted on a deviation"""
- endpoint = "comments/deviation/" + deviation_id
+ endpoint = "/comments/deviation/" + deviation_id
params = {"maxdepth": "5", "offset": offset, "limit": 50,
"mature_content": self.mature}
return self._pagination_list(endpoint, params=params, key="thread")
def deviation(self, deviation_id, public=True):
"""Query and return info about a single Deviation"""
- endpoint = "deviation/" + deviation_id
+ endpoint = "/deviation/" + deviation_id
deviation = self._call(endpoint, public=public)
if self.metadata:
self._metadata((deviation,))
@@ -1110,13 +1124,13 @@ class DeviantartOAuthAPI():
def deviation_content(self, deviation_id, public=False):
"""Get extended content of a single Deviation"""
- endpoint = "deviation/content"
+ endpoint = "/deviation/content"
params = {"deviationid": deviation_id}
return self._call(endpoint, params=params, public=public)
def deviation_download(self, deviation_id, public=True):
"""Get the original file download (if allowed)"""
- endpoint = "deviation/download/" + deviation_id
+ endpoint = "/deviation/download/" + deviation_id
params = {"mature_content": self.mature}
return self._call(endpoint, params=params, public=public)
@@ -1124,7 +1138,7 @@ class DeviantartOAuthAPI():
""" Fetch deviation metadata for a set of deviations"""
if not deviations:
return []
- endpoint = "deviation/metadata?" + "&".join(
+ endpoint = "/deviation/metadata?" + "&".join(
"deviationids[{}]={}".format(num, deviation["deviationid"])
for num, deviation in enumerate(deviations)
)
@@ -1133,14 +1147,14 @@ class DeviantartOAuthAPI():
def gallery(self, username, folder_id, offset=0, extend=True, public=True):
"""Yield all Deviation-objects contained in a gallery folder"""
- endpoint = "gallery/" + folder_id
+ endpoint = "/gallery/" + folder_id
params = {"username": username, "offset": offset, "limit": 24,
"mature_content": self.mature, "mode": "newest"}
return self._pagination(endpoint, params, extend, public)
def gallery_all(self, username, offset=0):
"""Yield all Deviation-objects of a specific user"""
- endpoint = "gallery/all"
+ endpoint = "/gallery/all"
params = {"username": username, "offset": offset, "limit": 24,
"mature_content": self.mature}
return self._pagination(endpoint, params)
@@ -1148,7 +1162,7 @@ class DeviantartOAuthAPI():
@memcache(keyarg=1)
def gallery_folders(self, username, offset=0):
"""Yield all gallery folders of a specific user"""
- endpoint = "gallery/folders"
+ endpoint = "/gallery/folders"
params = {"username": username, "offset": offset, "limit": 50,
"mature_content": self.mature}
return self._pagination_list(endpoint, params)
@@ -1156,12 +1170,12 @@ class DeviantartOAuthAPI():
@memcache(keyarg=1)
def user_profile(self, username):
"""Get user profile information"""
- endpoint = "user/profile/" + username
+ endpoint = "/user/profile/" + username
return self._call(endpoint, fatal=False)
def user_friends_watch(self, username):
"""Watch a user"""
- endpoint = "user/friends/watch/" + username
+ endpoint = "/user/friends/watch/" + username
data = {
"watch[friend]" : "0",
"watch[deviations]" : "0",
@@ -1179,7 +1193,7 @@ class DeviantartOAuthAPI():
def user_friends_unwatch(self, username):
"""Unwatch a user"""
- endpoint = "user/friends/unwatch/" + username
+ endpoint = "/user/friends/unwatch/" + username
return self._call(
endpoint, method="POST", public=False, fatal=False,
).get("success")
@@ -1217,7 +1231,7 @@ class DeviantartOAuthAPI():
def _call(self, endpoint, fatal=True, public=True, **kwargs):
"""Call an API endpoint"""
- url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
+ url = "https://www.deviantart.com/api/v1/oauth2" + endpoint
kwargs["fatal"] = None
while True:
@@ -1357,7 +1371,7 @@ class DeviantartEclipseAPI():
self.log = extractor.log
def deviation_extended_fetch(self, deviation_id, user=None, kind=None):
- endpoint = "da-browse/shared_api/deviation/extended_fetch"
+ endpoint = "/da-browse/shared_api/deviation/extended_fetch"
params = {
"deviationid" : deviation_id,
"username" : user,
@@ -1367,7 +1381,7 @@ class DeviantartEclipseAPI():
return self._call(endpoint, params)
def gallery_scraps(self, user, offset=None):
- endpoint = "da-user-profile/api/gallery/contents"
+ endpoint = "/da-user-profile/api/gallery/contents"
params = {
"username" : user,
"offset" : offset,
@@ -1377,7 +1391,7 @@ class DeviantartEclipseAPI():
return self._pagination(endpoint, params)
def user_watching(self, user, offset=None):
- endpoint = "da-user-profile/api/module/watching"
+ endpoint = "/da-user-profile/api/module/watching"
params = {
"username": user,
"moduleid": self._module_id_watching(user),
@@ -1387,7 +1401,7 @@ class DeviantartEclipseAPI():
return self._pagination(endpoint, params)
def _call(self, endpoint, params=None):
- url = "https://www.deviantart.com/_napi/" + endpoint
+ url = "https://www.deviantart.com/_napi" + endpoint
headers = {"Referer": "https://www.deviantart.com/"}
response = self.extractor._limited_request(
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 7ffb214..cf9706b 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -176,6 +176,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
self.image_token = match.group(4)
self.image_num = text.parse_int(match.group(6), 1)
+ source = self.config("source")
+ if source == "hitomi":
+ self.items = self._items_hitomi
+
def items(self):
self.login()
@@ -221,6 +225,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
data["_http_validate"] = None
yield Message.Url, url, data
+ def _items_hitomi(self):
+ if self.config("metadata", False):
+ data = self.metadata_from_api()
+ data["date"] = text.parse_timestamp(data["posted"])
+ else:
+ data = {}
+
+ from .hitomi import HitomiGalleryExtractor
+ url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
+ data["_extractor"] = HitomiGalleryExtractor
+ yield Message.Queue, url, data
+
def get_metadata(self, page):
"""Extract gallery metadata"""
data = self.metadata_from_page(page)
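
`_items_hitomi` delegates the whole gallery by yielding a `Message.Queue` whose metadata carries an explicit `_extractor` class, so the queued URL skips pattern matching and goes straight to that extractor. A minimal sketch of the hand-off, reusing the gallery id from the hitomi tests:

    from gallery_dl.extractor.common import Message
    from gallery_dl.extractor.hitomi import HitomiGalleryExtractor

    gallery_id = 867789
    url = "https://hitomi.la/galleries/{}.html".format(gallery_id)
    data = {"_extractor": HitomiGalleryExtractor}

    # An extractor yields this tuple; the job then processes the URL with
    # HitomiGalleryExtractor directly instead of re-matching it.
    message = (Message.Queue, url, data)
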
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index cc6ee97..ef79808 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -33,7 +33,7 @@ class FanboxExtractor(Extractor):
def items(self):
if self._warning:
- if "FANBOXSESSID" not in self.session.cookies:
+ if not self._check_cookies(("FANBOXSESSID",)):
self.log.warning("no 'FANBOXSESSID' cookie set")
FanboxExtractor._warning = False
@@ -280,3 +280,24 @@ class FanboxPostExtractor(FanboxExtractor):
def posts(self):
return (self._get_post_data_from_id(self.post_id),)
+
+
+class FanboxRedirectExtractor(Extractor):
+ """Extractor for pixiv redirects to fanbox.cc"""
+ category = "fanbox"
+ subcategory = "redirect"
+ pattern = r"(?:https?://)?(?:www\.)?pixiv\.net/fanbox/creator/(\d+)"
+ test = ("https://www.pixiv.net/fanbox/creator/52336352", {
+ "pattern": FanboxCreatorExtractor.pattern,
+ })
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.user_id = match.group(1)
+
+ def items(self):
+ url = "https://www.pixiv.net/fanbox/creator/" + self.user_id
+ data = {"_extractor": FanboxCreatorExtractor}
+ response = self.request(
+ url, method="HEAD", allow_redirects=False, notfound="user")
+ yield Message.Queue, response.headers["Location"], data
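
The redirect extractor resolves `pixiv.net/fanbox/creator/<id>` to its `*.fanbox.cc` home without downloading any page body: a HEAD request with `allow_redirects=False` returns the 3xx response itself, and its `Location` header gets queued. The same idea with plain requests:

    import requests

    # allow_redirects=False keeps requests from following the redirect, so
    # the 3xx response (and its Location header) is returned as-is.
    response = requests.head(
        "https://www.pixiv.net/fanbox/creator/52336352",
        allow_redirects=False)
    print(response.status_code)          # 3xx
    print(response.headers["Location"])  # e.g. https://<creator>.fanbox.cc/
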
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 62f7429..89a965f 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -22,7 +22,7 @@ class FantiaExtractor(Extractor):
def items(self):
if self._warning:
- if "_session_id" not in self.session.cookies:
+ if not self._check_cookies(("_session_id",)):
self.log.warning("no '_session_id' cookie set")
FantiaExtractor._warning = False
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 6c5c7df..2bd8c6b 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -56,7 +56,7 @@ class FlickrImageExtractor(FlickrExtractor):
subcategory = "image"
pattern = (r"(?:https?://)?(?:"
r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
- r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
+ r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
r"|flic\.kr/p/([A-Za-z1-9]+))")
test = (
("https://www.flickr.com/photos/departingyyz/16089302239", {
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index b5ecbd6..891e0c1 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -22,6 +22,7 @@ class FuraffinityExtractor(Extractor):
archive_fmt = "{id}"
cookiedomain = ".furaffinity.net"
root = "https://www.furaffinity.net"
+ _warning = True
def __init__(self, match):
Extractor.__init__(self, match)
@@ -32,6 +33,12 @@ class FuraffinityExtractor(Extractor):
self._process_description = str.strip
def items(self):
+
+ if self._warning:
+ if not self._check_cookies(("a", "b")):
+ self.log.warning("no 'a' and 'b' session cookies set")
+ FuraffinityExtractor._warning = False
+
external = self.config("external", False)
metadata = self.metadata()
for post_id in util.advance(self.posts(), self.offset):
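
fanbox, fantia, patreon, and now furaffinity all route their login checks through `_check_cookies()` instead of testing `name in self.session.cookies`, so cookies scoped to the extractor's domain are found reliably. The helper's implementation is not part of this diff; a standalone sketch of what such a check amounts to:

    def check_cookies(jar, names, domain=".furaffinity.net"):
        """Return True if every cookie in 'names' exists for 'domain'."""
        return all(jar.get(name, domain=domain) is not None for name in names)

    # Usage mirroring the warning pattern above: warn once per run, then
    # flip the class attribute so later instances stay quiet.
    #   if self._warning:
    #       if not check_cookies(self.session.cookies, ("a", "b")):
    #           self.log.warning("no 'a' and 'b' session cookies set")
    #       FuraffinityExtractor._warning = False
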
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
new file mode 100644
index 0000000..bece905
--- /dev/null
+++ b/gallery_dl/extractor/generic.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+
+"""Extractor for images in a generic web page."""
+
+from .common import Extractor, Message
+from .. import config, text
+import re
+import os.path
+
+
+class GenericExtractor(Extractor):
+ """Extractor for images in a generic web page."""
+
+ category = "generic"
+ directory_fmt = ("{category}", "{pageurl}")
+ archive_fmt = "{imageurl}"
+
+ # By default, the generic extractor is disabled
+ # and the "g(eneric):" prefix in url is required.
+ # If the extractor is enabled, make the prefix optional
+ pattern = r"(?ix)(?P<generic>g(?:eneric)?:)"
+ if config.get(("extractor", "generic"), "enabled"):
+ pattern += r"?"
+
+ # The generic extractor pattern should match (almost) any valid url
+ # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
+ pattern += r"""
+ (?P<scheme>https?://)? # optional http(s) scheme
+ (?P<domain>[-\w\.]+) # required domain
+ (?P<path>/[^?&#]*)? # optional path
+ (?:\?(?P<query>[^/?#]*))? # optional query
+ (?:\#(?P<fragment>.*))?$ # optional fragment
+ """
+
+ def __init__(self, match):
+ """Init."""
+ Extractor.__init__(self, match)
+
+ # Strip the "g(eneric):" prefix
+ # and inform about "forced" or "fallback" mode
+ if match.group('generic'):
+ self.log.info("Forcing use of generic information extractor.")
+ self.url = match.group(0).partition(":")[2]
+ else:
+ self.log.info("Falling back on generic information extractor.")
+ self.url = match.group(0)
+
+ # Make sure we have a scheme, or use https
+ if match.group('scheme'):
+ self.scheme = match.group('scheme')
+ else:
+ self.scheme = 'https://'
+ self.url = self.scheme + self.url
+
+ # Used to resolve relative image urls
+ self.root = self.scheme + match.group('domain')
+
+ def items(self):
+ """Get page, extract metadata & images, yield them in suitable messages.
+
+ Adapted from common.GalleryExtractor.items()
+
+ """
+ page = self.request(self.url).text
+ data = self.metadata(page)
+ imgs = self.images(page)
+
+ try:
+ data["count"] = len(imgs)
+ except TypeError:
+ pass
+ images = enumerate(imgs, 1)
+
+ yield Message.Version, 1
+ yield Message.Directory, data
+
+ for data["num"], (url, imgdata) in images:
+ if imgdata:
+ data.update(imgdata)
+ if "extension" not in imgdata:
+ text.nameext_from_url(url, data)
+ else:
+ text.nameext_from_url(url, data)
+ yield Message.Url, url, data
+
+ def metadata(self, page):
+ """Extract generic webpage metadata, return them in a dict."""
+ data = {}
+ data['pageurl'] = self.url
+ data['title'] = text.extract(page, '<title>', "</title>")[0] or ""
+ data['description'] = text.extract(
+ page, '<meta name="description" content="', '"')[0] or ""
+ data['keywords'] = text.extract(
+ page, '<meta name="keywords" content="', '"')[0] or ""
+ data['language'] = text.extract(
+ page, '<meta name="language" content="', '"')[0] or ""
+ data['name'] = text.extract(
+ page, '<meta itemprop="name" content="', '"')[0] or ""
+ data['copyright'] = text.extract(
+ page, '<meta name="copyright" content="', '"')[0] or ""
+ data['og_site'] = text.extract(
+ page, '<meta property="og:site" content="', '"')[0] or ""
+ data['og_site_name'] = text.extract(
+ page, '<meta property="og:site_name" content="', '"')[0] or ""
+ data['og_title'] = text.extract(
+ page, '<meta property="og:title" content="', '"')[0] or ""
+ data['og_description'] = text.extract(
+ page, '<meta property="og:description" content="', '"')[0] or ""
+
+ data = {k: text.unescape(data[k]) for k in data if data[k] != ""}
+
+ return data
+
+ def images(self, page):
+ """Extract image urls, return a list of (image url, metadata) tuples.
+
+ The extractor aims at finding as many _likely_ image urls as possible,
+ using two strategies (see below); since these often overlap, any
+ duplicate urls will be removed at the end of the process.
+
+ Note: since we are using re.findall() (see below), it's essential that
+ the following patterns contain 0 or at most 1 capturing group, so that
+ re.findall() returns a list of urls (instead of a list of tuples of
+ matching groups). All other groups used in the pattern should be
+ non-capturing (?:...).
+
+ 1: Look in src/srcset attributes of img/video/source elements
+
+ See:
+ https://www.w3schools.com/tags/att_src.asp
+ https://www.w3schools.com/tags/att_source_srcset.asp
+
+ We allow both absolute and relative urls here.
+
+ Note that srcset attributes often contain multiple space separated
+ image urls; this pattern matches only the first url; remaining urls
+ will be matched by the "imageurl_pattern_ext" pattern below.
+ """
+ imageurl_pattern_src = r"""(?ix)
+ <(?:img|video|source)\s.*? # <img>, <video> or <source>
+ src(?:set)?=["']? # src or srcset attributes
+ (?P<URL>[^"'\s>]+) # url
+ """
+
+ """
+ 2: Look anywhere for urls containing common image/video extensions
+
+ The list of allowed extensions is borrowed from the directlink.py
+ extractor; others could be added, see
+ https://en.wikipedia.org/wiki/List_of_file_formats
+
+ Compared to the "pattern" class variable, here we must exclude also
+ other special characters (space, ", ', >), since we are looking for
+ urls in html tags.
+ """
+
+ imageurl_pattern_ext = r"""(?ix)
+ (?:[^?&#"'>\s]+) # anything until dot+extension
+ \.(?:jpe?g|jpe|png|gif
+ |web[mp]|mp4|mkv|og[gmv]|opus) # dot + image/video extensions
+ (?:[^"'>\s]*)? # optional query and fragment
+ """
+
+ imageurls_src = re.findall(imageurl_pattern_src, page)
+ imageurls_ext = re.findall(imageurl_pattern_ext, page)
+ imageurls = imageurls_src + imageurls_ext
+
+ # Resolve relative urls
+ #
+ # Image urls caught so far may be relative, so we must resolve them
+ # by prepending a suitable base url.
+ #
+ # If the page contains a <base> element, use it as base url
+ basematch = re.search(
+ r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)", page)
+ if basematch:
+ self.baseurl = basematch.group('url').rstrip('/')
+ # Otherwise, extract the base url from self.url
+ else:
+ if self.url.endswith("/"):
+ self.baseurl = self.url.rstrip('/')
+ else:
+ self.baseurl = os.path.dirname(self.url)
+
+ # Build the list of absolute image urls
+ absimageurls = []
+ for u in imageurls:
+ # Absolute urls are taken as-is
+ if u.startswith('http'):
+ absimageurls.append(u)
+ # // relative urls are prefixed with current scheme
+ elif u.startswith('//'):
+ absimageurls.append(self.scheme + u.lstrip('/'))
+ # / relative urls are prefixed with current scheme+domain
+ elif u.startswith('/'):
+ absimageurls.append(self.root + u)
+ # other relative urls are prefixed with baseurl
+ else:
+ absimageurls.append(self.baseurl + '/' + u)
+
+ # Remove duplicates
+ absimageurls = set(absimageurls)
+
+ # Create the image metadata dict and add imageurl to it
+ # (image filename and extension are added by items())
+ images = [(u, {'imageurl': u}) for u in absimageurls]
+
+ return images
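
Since "generic" sits at the end of the module list in extractor/__init__.py, it only ever sees URLs no other extractor claimed, and with the default configuration the `g:`/`generic:` prefix stays mandatory. A quick check of the accepted forms against the prefix-required pattern:

    import re

    # Pattern as built while the generic extractor is disabled (default):
    pattern = re.compile(r"""(?ix)(?P<generic>g(?:eneric)?:)
        (?P<scheme>https?://)?(?P<domain>[-\w\.]+)(?P<path>/[^?&#]*)?
        (?:\?(?P<query>[^/?#]*))?(?:\#(?P<fragment>.*))?$""")

    print(bool(pattern.match("generic:https://example.org/gallery/")))  # True
    print(bool(pattern.match("g:https://example.org/gallery/")))        # True
    print(bool(pattern.match("https://example.org/gallery/")))          # False
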
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index a4ce925..88cf98c 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -10,9 +10,11 @@
from .common import GalleryExtractor, Extractor, Message
from .nozomi import decode_nozomi
+from ..cache import memcache
from .. import text, util
import string
import json
+import re
class HitomiGalleryExtractor(GalleryExtractor):
@@ -24,8 +26,10 @@ class HitomiGalleryExtractor(GalleryExtractor):
r"/(?:[^/?#]+-)?(\d+)")
test = (
("https://hitomi.la/galleries/867789.html", {
- "pattern": r"https://[a-c]b.hitomi.la/images/./../[0-9a-f]+.jpg",
+ "pattern": r"https://[a-c]b.hitomi.la/images/1639745412/\d+"
+ r"/[0-9a-f]{64}\.jpg",
"keyword": "4873ef9a523621fc857b114e0b2820ba4066e9ae",
+ "options": (("metadata", True),),
"count": 16,
}),
# download test
@@ -35,12 +39,12 @@ class HitomiGalleryExtractor(GalleryExtractor):
}),
# Game CG with scenes (#321)
("https://hitomi.la/galleries/733697.html", {
- "url": "0cb629ab2bfe93d994a7972f68ad2a5a64ecc161",
+ "url": "479d16fe92117a6a2ce81b4e702e6347922c81e3",
"count": 210,
}),
# fallback for galleries only available through /reader/ URLs
("https://hitomi.la/galleries/1045954.html", {
- "url": "b420755d56a1135104ca8ca0765f44e290db70c3",
+ "url": "ebc1415c5d7f634166ef7e2635b77735de1ea7a2",
"count": 1413,
}),
# gallery with "broken" redirect
@@ -71,7 +75,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
self.info = info = json.loads(page.partition("=")[2])
data = self._data_from_gallery_info(info)
- if self.config("metadata", True):
+ if self.config("metadata", False):
data.update(self._data_from_gallery_page(info))
return data
@@ -133,19 +137,19 @@ class HitomiGalleryExtractor(GalleryExtractor):
}
def images(self, _):
+ # see https://ltn.hitomi.la/gg.js
+ gg_m, gg_b = _parse_gg(self)
+
result = []
for image in self.info["files"]:
ihash = image["hash"]
idata = text.nameext_from_url(image["name"])
# see https://ltn.hitomi.la/common.js
- inum = int(ihash[-3:-1], 16)
- offset = 1 if inum < 0x7c else 0
-
+ inum = int(ihash[-1] + ihash[-3:-1], 16)
url = "https://{}b.hitomi.la/images/{}/{}/{}.{}".format(
- chr(97 + offset),
- ihash[-1], ihash[-3:-1], ihash,
- idata["extension"],
+ chr(97 + gg_m.get(inum, 0)),
+ gg_b, inum, ihash, idata["extension"],
)
result.append((url, idata))
return result
@@ -185,3 +189,16 @@ class HitomiTagExtractor(Extractor):
for gallery_id in decode_nozomi(self.request(url).content):
url = "https://hitomi.la/galleries/{}.html".format(gallery_id)
yield Message.Queue, url, data
+
+
+@memcache()
+def _parse_gg(extr):
+ page = extr.request("https://ltn.hitomi.la/gg.js").text
+
+ m = {
+ int(match.group(1)): int(match.group(2))
+ for match in re.finditer(r"case (\d+): o = (\d+); break;", page)
+ }
+ b = re.search(r"b:\s*[\"'](.+)[\"']", page)
+
+ return m, b.group(1).strip("/")
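
`_parse_gg` scrapes two values from hitomi's gg.js: the `case N: o = M; break;` table that selects the image subdomain, and the `b:` path component; `@memcache()` keeps it to one fetch per run. A self-contained run against a hypothetical fragment in the same shape:

    import re

    page = """
    case 1024: o = 1; break; case 2048: o = 1; break;
    b: '1639745412/'
    """

    m = {int(match.group(1)): int(match.group(2))
         for match in re.finditer(r"case (\d+): o = (\d+); break;", page)}
    b = re.search(r"b:\s*[\"'](.+)[\"']", page)

    print(m)                      # {1024: 1, 2048: 1}
    print(b.group(1).strip("/"))  # '1639745412'
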
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index 1e875f0..f32093a 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -169,7 +169,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
class ImgbbUserExtractor(ImgbbExtractor):
"""Extractor for user profiles in imgbb.com"""
subcategory = "user"
- pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
+ pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"
test = ("https://folkie.imgbb.com", {
"range": "1-80",
"pattern": r"https?://i\.ibb\.co/\w+/[^/?#]+",
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 3d09d79..8ee8ca9 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -205,6 +205,28 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
return self.api.search(params)
+class InkbunnySearchExtractor(InkbunnyExtractor):
+ """Extractor for inkbunny search results"""
+ subcategory = "search"
+ pattern = (BASE_PATTERN +
+ r"/submissionsviewall\.php\?([^#]+&mode=search&[^#]+)")
+ test = (("https://inkbunny.net/submissionsviewall.php?rid=ffffffffff"
+ "&mode=search&page=1&orderby=create_datetime&text=cute"
+ "&stringtype=and&keywords=yes&title=yes&description=no&artist="
+ "&favsby=&type=&days=&keyword_id=&user_id=&random=&md5="), {
+ "range": "1-10",
+ "count": 10,
+ })
+
+ def __init__(self, match):
+ InkbunnyExtractor.__init__(self, match)
+ self.params = text.parse_query(match.group(1))
+ self.params.pop("rid", None)
+
+ def posts(self):
+ return self.api.search(self.params)
+
+
class InkbunnyFollowingExtractor(InkbunnyExtractor):
"""Extractor for inkbunny user watches"""
subcategory = "following"
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index a1dd465..781bf01 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -174,10 +174,16 @@ class InstagramExtractor(Extractor):
if post.get("is_video") and "video_url" not in post:
url = "{}/tv/{}/".format(self.root, post["shortcode"])
post = self._extract_post_page(url)
+ if "items" in post:
+ return self._parse_post_api({"media": post["items"][0]})
+ post = post["graphql"]["shortcode_media"]
elif typename == "GraphSidecar" and \
"edge_sidecar_to_children" not in post:
url = "{}/p/{}/".format(self.root, post["shortcode"])
post = self._extract_post_page(url)
+ if "items" in post:
+ return self._parse_post_api({"media": post["items"][0]})
+ post = post["graphql"]["shortcode_media"]
owner = post["owner"]
data = {
@@ -347,7 +353,7 @@ class InstagramExtractor(Extractor):
data = self._extract_shared_data(url)["entry_data"]
if "HttpErrorPage" in data:
raise exception.NotFoundError("post")
- return data["PostPage"][0]["graphql"]["shortcode_media"]
+ return data["PostPage"][0]
def _get_edge_data(self, user, key):
cursor = self.config("cursor")
@@ -564,7 +570,7 @@ class InstagramPostExtractor(InstagramExtractor):
"""Extractor for an Instagram post"""
subcategory = "post"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/(?:p|tv|reel)/([^/?#]+)")
+ r"/(?:[^/?#]+/)?(?:p|tv|reel)/([^/?#]+)")
test = (
# GraphImage
("https://www.instagram.com/p/BqvsDleB3lV/", {
@@ -663,6 +669,9 @@ class InstagramPostExtractor(InstagramExtractor):
}
}),
+ # URL with username (#2085)
+ ("https://www.instagram.com/dm/p/CW042g7B9CY/"),
+
("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
)
@@ -686,14 +695,15 @@ class InstagramStoriesExtractor(InstagramExtractor):
"""Extractor for Instagram stories"""
subcategory = "stories"
pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
- r"/stories/(?:highlights/(\d+)|([^/?#]+))")
+ r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)")
test = (
("https://www.instagram.com/stories/instagram/"),
("https://www.instagram.com/stories/highlights/18042509488170095/"),
+ ("https://instagram.com/stories/geekmig/2724343156064789461"),
)
def __init__(self, match):
- self.highlight_id, self.user = match.groups()
+ self.highlight_id, self.user, self.media_id = match.groups()
if self.highlight_id:
self.subcategory = InstagramHighlightsExtractor.subcategory
InstagramExtractor.__init__(self, match)
@@ -712,7 +722,18 @@ class InstagramStoriesExtractor(InstagramExtractor):
endpoint = "/v1/feed/reels_media/"
params = {"reel_ids": reel_id}
- return self._request_api(endpoint, params=params)["reels"].values()
+ reels = self._request_api(endpoint, params=params)["reels"]
+
+ if self.media_id:
+ reel = reels[reel_id]
+ for item in reel["items"]:
+ if item["pk"] == self.media_id:
+ reel["items"] = (item,)
+ break
+ else:
+ raise exception.NotFoundError("story")
+
+ return reels.values()
class InstagramHighlightsExtractor(InstagramExtractor):
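
When a story URL carries a trailing media id, the new code narrows the reel to that one item with a for/else: the else branch runs only if the loop never hit break, i.e. no item matched. The shape of that filter, with hypothetical reel data:

    # Hypothetical reel payload; 'pk' values are illustrative.
    reel = {"items": [{"pk": "111"}, {"pk": "2724343156064789461"}]}
    media_id = "2724343156064789461"

    for item in reel["items"]:
        if item["pk"] == media_id:
            reel["items"] = (item,)
            break
    else:
        raise LookupError("story")  # the extractor raises NotFoundError here

    print(reel["items"])  # ({'pk': '2724343156064789461'},)
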
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
index 4012760..50ce0d3 100644
--- a/gallery_dl/extractor/keenspot.py
+++ b/gallery_dl/extractor/keenspot.py
@@ -19,7 +19,7 @@ class KeenspotComicExtractor(Extractor):
directory_fmt = ("{category}", "{comic}")
filename_fmt = "{filename}.{extension}"
archive_fmt = "{comic}_{filename}"
- pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?"
+ pattern = r"(?:https?://)?(?!www\.|forums\.)([\w-]+)\.keenspot\.com(/.+)?"
test = (
("http://marksmen.keenspot.com/", { # link
"range": "1-3",
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 6483278..f1d7bcf 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -14,7 +14,7 @@ from ..cache import cache
import itertools
import re
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?kemono\.party"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?(kemono|coomer)\.party"
USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"
@@ -27,17 +27,30 @@ class KemonopartyExtractor(Extractor):
archive_fmt = "{service}_{user}_{id}_{num}"
cookiedomain = ".kemono.party"
+ def __init__(self, match):
+ if match.group(1) == "coomer":
+ self.category = "coomerparty"
+ self.root = "https://coomer.party"
+ self.cookiedomain = ".coomer.party"
+ Extractor.__init__(self, match)
+
def items(self):
self._prepare_ddosguard_cookies()
self._find_inline = re.compile(
- r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
+ r'src="(?:https?://(?:kemono|coomer)\.party)?(/inline/[^"]+'
r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
generators = self._build_file_generators(self.config("files"))
comments = self.config("comments")
username = dms = None
+ # prevent files from coomer.party from being sent with gzip compression
+ if "coomer" in self.root:
+ headers = {"Accept-Encoding": "identity"}
+ else:
+ headers = None
+
if self.config("metadata"):
username = text.unescape(text.extract(
self.request(self.user_url).text,
@@ -83,10 +96,11 @@ class KemonopartyExtractor(Extractor):
post["type"] = file["type"]
post["num"] += 1
+ post["_http_headers"] = headers
if url[0] == "/":
url = self.root + "/data" + url
- elif url.startswith("https://kemono.party"):
+ elif url.startswith(self.root):
url = self.root + "/data" + url[20:]
text.nameext_from_url(file["name"], post)
@@ -129,7 +143,7 @@ class KemonopartyExtractor(Extractor):
def _build_file_generators(self, filetypes):
if filetypes is None:
- return (self._file, self._attachments, self._inline)
+ return (self._attachments, self._file, self._inline)
genmap = {
"file" : self._file,
"attachments": self._attachments,
@@ -191,8 +205,9 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
)
def __init__(self, match):
+ _, service, user_id, offset = match.groups()
+ self.subcategory = service
KemonopartyExtractor.__init__(self, match)
- service, user_id, offset = match.groups()
self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id)
self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
self.offset = text.parse_int(offset)
@@ -233,7 +248,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
"published": "Sun, 11 Aug 2019 02:09:04 GMT",
"service": "fanbox",
"shared_file": False,
- "subcategory": "post",
+ "subcategory": "fanbox",
"title": "c96取り置き",
"type": "file",
"user": "6993449",
@@ -249,7 +264,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
# kemono.party -> data.kemono.party
("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
"pattern": r"https://kemono\.party/data/("
- r"files/gumroad/trylsc/IURjT/reward8\.jpg|"
+ r"a4/7b/a47bfe938d8c1682eef06e885927484cd8df1b.+\.jpg|"
r"c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)",
}),
# username (#1548, #1652)
@@ -272,13 +287,19 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
"date": "2021-07-31 02:47:51.327865",
}]},
}),
+ # coomer.party (#2100)
+ ("https://coomer.party/onlyfans/user/alinity/post/125962203", {
+ "pattern": r"https://coomer\.party/data/7d/3f/7d3fd9804583dc224968"
+ r"c0591163ec91794552b04f00a6c2f42a15b68231d5a8\.jpg",
+ }),
("https://kemono.party/subscribestar/user/alcorart/post/184330"),
("https://www.kemono.party/subscribestar/user/alcorart/post/184330"),
)
def __init__(self, match):
+ _, service, user_id, post_id = match.groups()
+ self.subcategory = service
KemonopartyExtractor.__init__(self, match)
- service, user_id, post_id = match.groups()
self.api_url = "{}/api/{}/user/{}/post/{}".format(
self.root, service, user_id, post_id)
self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
@@ -319,7 +340,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
def __init__(self, match):
KemonopartyExtractor.__init__(self, match)
- self.server, self.channel, self.channel_name = match.groups()
+ _, self.server, self.channel, self.channel_name = match.groups()
def items(self):
self._prepare_ddosguard_cookies()
@@ -353,7 +374,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
url = file["path"]
if url[0] == "/":
url = self.root + "/data" + url
- elif url.startswith("https://kemono.party"):
+ elif url.startswith(self.root):
url = self.root + "/data" + url[20:]
text.nameext_from_url(file["name"], post)
@@ -392,7 +413,7 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):
def __init__(self, match):
KemonopartyExtractor.__init__(self, match)
- self.server = match.group(1)
+ self.server = match.group(2)
def items(self):
url = "{}/api/discord/channels/lookup?q={}".format(
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
new file mode 100644
index 0000000..cdaf22b
--- /dev/null
+++ b/gallery_dl/extractor/lolisafe.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for lolisafe/chibisafe instances"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class LolisafeExtractor(BaseExtractor):
+ """Base class for lolisafe extractors"""
+ basecategory = "lolisafe"
+ directory_fmt = ("{category}", "{album_name} ({album_id})")
+ archive_fmt = "{album_id}_{id}"
+
+
+BASE_PATTERN = LolisafeExtractor.update({
+ "bunkr": {"root": "https://bunkr.is", "pattern": r"bunkr\.(?:is|to)"},
+ "zzzz" : {"root": "https://zz.ht" , "pattern": r"zz\.(?:ht|fo)"},
+})
+
+
+class LolisafeAlbumExtractor(LolisafeExtractor):
+ subcategory = "album"
+ pattern = BASE_PATTERN + "/a/([^/?#]+)"
+ test = (
+ ("https://bunkr.is/a/Lktg9Keq", {
+ "pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
+ "content": "0c8768055e4e20e7c7259608b67799171b691140",
+ "keyword": {
+ "album_id": "Lktg9Keq",
+ "album_name": 'test テスト "&>',
+ "count": 1,
+ "filename": 'test-テスト-"&>-QjgneIQv',
+ "id": "QjgneIQv",
+ "name": 'test-テスト-"&>',
+ "num": int,
+ },
+ }),
+ ("https://bunkr.to/a/Lktg9Keq"),
+ ("https://zz.ht/a/lop7W6EZ", {
+ "pattern": r"https://z\.zz\.fo/(4anuY|ih560)\.png",
+ "count": 2,
+ "keyword": {
+ "album_id": "lop7W6EZ",
+ "album_name": "ferris",
+ },
+ }),
+ ("https://zz.fo/a/lop7W6EZ"),
+ )
+
+ def __init__(self, match):
+ LolisafeExtractor.__init__(self, match)
+ self.album_id = match.group(match.lastindex)
+
+ def items(self):
+ files, data = self.fetch_album(self.album_id)
+
+ yield Message.Directory, data
+ for data["num"], file in enumerate(files, 1):
+ url = file["file"]
+ text.nameext_from_url(url, data)
+ data["name"], sep, data["id"] = data["filename"].rpartition("-")
+ yield Message.Url, url, data
+
+ def fetch_album(self, album_id):
+ url = "{}/api/album/get/{}".format(self.root, album_id)
+ data = self.request(url).json()
+
+ return data["files"], {
+ "album_id" : self.album_id,
+ "album_name": text.unescape(data["title"]),
+ "count" : data["count"],
+ }
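
`fetch_album()` is the entire subclass contract: return an iterable of dicts that each carry a "file" URL, plus the shared album metadata; the base `items()` then numbers each file, derives filename/extension, and splits the trailing `-id` token off the basename. A sketch of a subclass for a hypothetical host, assuming lolisafe.py's namespace (class name, domain, and file list are illustrative only):

    class ExampleAlbumExtractor(LolisafeAlbumExtractor):
        category = "example"
        root = "https://safe.example.org"
        pattern = r"(?:https?://)?safe\.example\.org/a/([^/?#]+)"

        def fetch_album(self, album_id):
            # any source works, as long as each entry carries a "file" URL
            files = [
                {"file": self.root + "/files/abc-QjgneIQv.png"},
                {"file": self.root + "/files/def-4anuY.mp4"},
            ]
            return files, {
                "album_id"  : album_id,
                "album_name": "example album",
                "count"     : len(files),
            }
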
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index 5c202f3..f06ab70 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -20,8 +20,8 @@ class MyportfolioGalleryExtractor(Extractor):
filename_fmt = "{num:>02}.{extension}"
archive_fmt = "{user}_{filename}"
pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
- r"(?:https?://)?([^.]+\.myportfolio\.com))"
- r"(/[^/?#]+)?")
+ r"(?:https?://)?([\w-]+\.myportfolio\.com))"
+ r"(/[^/?&#]+)?")
test = (
("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", {
"url": "acea0690c76db0e5cf267648cefd86e921bc3499",
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index a699401..4351b3e 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -420,7 +420,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
"""Extractor for posts favorited by a newgrounds user"""
subcategory = "favorite"
directory_fmt = ("{category}", "{user}", "Favorites")
- pattern = (r"(?:https?://)?([^.]+)\.newgrounds\.com"
+ pattern = (r"(?:https?://)?([\w-]+)\.newgrounds\.com"
r"/favorites(?!/following)(?:/(art|audio|movies))?/?")
test = (
("https://tomfulp.newgrounds.com/favorites/art", {
@@ -475,7 +475,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
"""Extractor for a newgrounds user's favorited users"""
subcategory = "following"
- pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/favorites/(following)"
+ pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/favorites/(following)"
test = ("https://tomfulp.newgrounds.com/favorites/following", {
"pattern": NewgroundsUserExtractor.pattern,
"range": "76-125",
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 62e4f58..f8c80ef 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -29,7 +29,7 @@ class PatreonExtractor(Extractor):
def items(self):
if self._warning:
- if "session_id" not in self.session.cookies:
+ if not self._check_cookies(("session_id",)):
self.log.warning("no 'session_id' cookie set")
PatreonExtractor._warning = False
generators = self._build_file_generators(self.config("files"))
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 51a0d38..6377fb0 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -46,7 +46,7 @@ class PhilomenaExtractor(BooruExtractor):
try:
params["filter_id"] = INSTANCES[self.category]["filter_id"]
except (KeyError, TypeError):
- pass
+ params["filter_id"] = "2"
while True:
data = self.request(url, params=params).json()
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
index bea0276..1993ab6 100644
--- a/gallery_dl/extractor/photobucket.py
+++ b/gallery_dl/extractor/photobucket.py
@@ -21,8 +21,8 @@ class PhotobucketAlbumExtractor(Extractor):
directory_fmt = ("{category}", "{username}", "{location}")
filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
archive_fmt = "{id}"
- pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)"
- r"/user/[^/?#]+/library(?:/[^?#]*)?")
+ pattern = (r"(?:https?://)?((?:[\w-]+\.)?photobucket\.com)"
+ r"/user/[^/?&#]+/library(?:/[^?&#]*)?")
test = (
("https://s369.photobucket.com/user/CrpyLrkr/library", {
"pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/",
@@ -109,9 +109,9 @@ class PhotobucketImageExtractor(Extractor):
directory_fmt = ("{category}", "{username}")
filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
archive_fmt = "{username}_{id}"
- pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com"
- r"(?:/gallery/user/([^/?#]+)/media/([^/?#]+)"
- r"|/user/([^/?#]+)/media/[^?#]+\.html)")
+ pattern = (r"(?:https?://)?(?:[\w-]+\.)?photobucket\.com"
+ r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
+ r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
test = (
(("https://s271.photobucket.com/user/lakerfanryan"
"/media/Untitled-3-1.jpg.html"), {
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8e47e2e..8943747 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -456,7 +456,9 @@ class PixivSearchExtractor(PixivExtractor):
self.sort = self.target = None
def works(self):
- return self.api.search_illust(self.word, self.sort, self.target)
+ return self.api.search_illust(
+ self.word, self.sort, self.target,
+ date_start=self.date_start, date_end=self.date_end)
def metadata(self):
query = text.parse_query(self.query)
@@ -489,10 +491,15 @@ class PixivSearchExtractor(PixivExtractor):
target = "s_tag"
self.target = target_map[target]
+ self.date_start = query.get("scd")
+ self.date_end = query.get("ecd")
+
return {"search": {
"word": self.word,
"sort": self.sort,
"target": self.target,
+ "date_start": self.date_start,
+ "date_end": self.date_end,
}}
@@ -710,9 +717,11 @@ class PixivAppAPI():
params = {"illust_id": illust_id}
return self._pagination("v2/illust/related", params)
- def search_illust(self, word, sort=None, target=None, duration=None):
+ def search_illust(self, word, sort=None, target=None, duration=None,
+ date_start=None, date_end=None):
params = {"word": word, "search_target": target,
- "sort": sort, "duration": duration}
+ "sort": sort, "duration": duration,
+ "start_date": date_start, "end_date": date_end}
return self._pagination("v1/search/illust", params)
def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
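
The search extractor now forwards the `scd`/`ecd` date bounds from the URL query to the app API's `start_date`/`end_date` fields. How the values travel, sketched with a hypothetical query string:

    from gallery_dl import text

    query = text.parse_query("word=landscape&scd=2021-01-01&ecd=2021-12-31")

    params = {
        "word"      : query.get("word"),
        "start_date": query.get("scd"),  # passed as date_start
        "end_date"  : query.get("ecd"),  # passed as date_end
    }
    print(params)
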
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
index 98928d6..a52071e 100644
--- a/gallery_dl/extractor/pixnet.py
+++ b/gallery_dl/extractor/pixnet.py
@@ -12,7 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
-BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet.net"
+BASE_PATTERN = r"(?:https?://)?(?!www\.)([\w-]+)\.pixnet.net"
class PixnetExtractor(Extractor):
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index f976e82..f8497c0 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -12,7 +12,7 @@ from .common import Extractor, Message
from .. import text, exception
-BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com"
+BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com"
class PornhubExtractor(Extractor):
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
new file mode 100644
index 0000000..00b6972
--- /dev/null
+++ b/gallery_dl/extractor/rule34us.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://rule34.us/"""
+
+from .booru import BooruExtractor
+from .. import text
+import re
+import collections
+
+
+class Rule34usExtractor(BooruExtractor):
+ category = "rule34us"
+ root = "https://rule34.us"
+ per_page = 42
+
+ def __init__(self, match):
+ BooruExtractor.__init__(self, match)
+ self._find_tags = re.compile(
+ r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall
+
+ def _parse_post(self, post_id):
+ url = "{}/index.php?r=posts/view&id={}".format(self.root, post_id)
+ page = self.request(url).text
+ extr = text.extract_from(page)
+
+ post = {
+ "id" : post_id,
+ "tags" : text.unescape(extr(
+ 'name="keywords" content="', '"').rstrip(", ")),
+ "uploader": text.extract(extr('Added by: ', '</li>'), ">", "<")[0],
+ "score" : text.extract(extr('Score: ', '> - <'), ">", "<")[0],
+ "width" : extr('Size: ', 'w'),
+ "height" : extr(' x ', 'h'),
+ "file_url": extr(' src="', '"'),
+ }
+ post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+
+ tags = collections.defaultdict(list)
+ for tag_type, tag_name in self._find_tags(page):
+ tags[tag_type].append(text.unquote(tag_name))
+ for key, value in tags.items():
+ post["tags_" + key] = " ".join(value)
+
+ return post
+
+
+class Rule34usTagExtractor(Rule34usExtractor):
+ subcategory = "tag"
+ directory_fmt = ("{category}", "{search_tags}")
+ archive_fmt = "t_{search_tags}_{id}"
+ pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]+)"
+ test = ("https://rule34.us/index.php?r=posts/index&q=[terios]_elysion", {
+ "pattern": r"https://img\d*\.rule34\.us"
+ r"/images/../../[0-9a-f]{32}\.\w+",
+ "count": 10,
+ })
+
+ def __init__(self, match):
+ Rule34usExtractor.__init__(self, match)
+ self.tags = text.unquote(match.group(1).replace("+", " "))
+
+ def metadata(self):
+ return {"search_tags": self.tags}
+
+ def posts(self):
+ url = self.root + "/index.php"
+ params = {
+ "r" : "posts/index",
+ "q" : self.tags,
+ "page": self.page_start,
+ }
+
+ while True:
+ page = self.request(url, params=params).text
+
+ cnt = 0
+ for post_id in text.extract_iter(page, '><a id="', '"'):
+ yield self._parse_post(post_id)
+ cnt += 1
+
+ if cnt < self.per_page:
+ return
+
+ if "page" in params:
+ del params["page"]
+ params["q"] = self.tags + " id:<" + post_id
+
+
+class Rule34usPostExtractor(Rule34usExtractor):
+ subcategory = "post"
+ archive_fmt = "{id}"
+ pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/view&id=(\d+)"
+ test = (
+ ("https://rule34.us/index.php?r=posts/view&id=3709005", {
+ "pattern": r"https://img\d*\.rule34\.us/images/14/7b"
+ r"/147bee6fc2e13f73f5f9bac9d4930b13\.png",
+ "content": "d714342ea84050f82dda5f0c194d677337abafc5",
+ }),
+ ("https://rule34.us/index.php?r=posts/view&id=4576310", {
+ "pattern": r"https://video\.rule34\.us/images/a2/94"
+ r"/a294ff8e1f8e0efa041e5dc9d1480011\.mp4",
+ "keyword": {
+ "extension": "mp4",
+ "file_url": str,
+ "filename": "a294ff8e1f8e0efa041e5dc9d1480011",
+ "height": "3982",
+ "id": "4576310",
+ "md5": "a294ff8e1f8e0efa041e5dc9d1480011",
+ "score": r"re:\d+",
+ "tags": "tagme, video",
+ "tags_general": "video",
+ "tags_metadata": "tagme",
+ "uploader": "Anonymous",
+ "width": "3184",
+ },
+ }),
+ )
+
+ def __init__(self, match):
+ Rule34usExtractor.__init__(self, match)
+ self.post_id = match.group(1)
+
+ def posts(self):
+ return (self._parse_post(self.post_id),)
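
Editor's note: a sketch of the pagination scheme in Rule34usTagExtractor.posts() above. After the first page, the numeric "page" parameter is dropped and an "id:<last" tag filter acts as a cursor; fetch_ids() here is a hypothetical stand-in for one search-results request.

    def paginate(fetch_ids, tags, per_page=42):
        params = {"q": tags, "page": 0}
        while True:
            ids = fetch_ids(params)
            yield from ids
            if len(ids) < per_page:
                return
            # switch from page numbers to an ID cursor
            params.pop("page", None)
            params["q"] = tags + " id:<" + ids[-1]

    # Demo against a fake backend of 100 post IDs, newest first:
    posts = list(map(str, range(100, 0, -1)))

    def fetch_ids(params):
        _, sep, tail = params["q"].rpartition("id:<")
        pool = [p for p in posts if not sep or int(p) < int(tail)]
        return pool[:42]

    print(sum(1 for _ in paginate(fetch_ids, "example_tag")))  # 100
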
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index ccedff3..199b1ba 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -78,9 +78,14 @@ class SexcomExtractor(Extractor):
path += "/hd"
data["url"] = self.root + path
else:
+ iframe = extr('<iframe', '>')
+ src = (text.extract(iframe, ' src="', '"')[0] or
+ text.extract(iframe, " src='", "'")[0])
+ if not src:
+ self.log.warning("Unable to fetch media from %s", url)
+ return None
data["extension"] = None
- data["url"] = "ytdl:" + text.extract(
- extr('<iframe', '>'), ' src="', '"')[0]
+ data["url"] = "ytdl:" + src
else:
data["url"] = text.unescape(extr(' src="', '"').partition("?")[0])
text.nameext_from_url(data["url"], data)
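
Editor's note: a standalone sketch of the quote-tolerant src extraction added above; some embeds use src="..." and others src='...', so both quote styles are tried and None is returned when neither is present (URLs are illustrative).

    def iframe_src(iframe):
        for quote in ('"', "'"):
            _, sep, rest = iframe.partition(" src=" + quote)
            if sep:
                return rest.partition(quote)[0]
        return None

    print(iframe_src('<iframe src="https://example.org/embed/1">'))  # double quotes
    print(iframe_src("<iframe src='https://example.org/embed/2'>"))  # single quotes
    print(iframe_src("<iframe>"))                                    # None
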
diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py
index b5fbdc2..7b5982a 100644
--- a/gallery_dl/extractor/slickpic.py
+++ b/gallery_dl/extractor/slickpic.py
@@ -13,7 +13,7 @@ from .. import text
import time
-BASE_PATTERN = r"(?:https?://)?([^.]+)\.slickpic\.com"
+BASE_PATTERN = r"(?:https?://)?([\w-]+)\.slickpic\.com"
class SlickpicExtractor(Extractor):
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 5d582b5..bdf6036 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -13,7 +13,7 @@ from .. import text, oauth, exception
BASE_PATTERN = (
r"(?:smugmug:(?!album:)(?:https?://)?([^/]+)|"
- r"(?:https?://)?([^.]+)\.smugmug\.com)")
+ r"(?:https?://)?([\w-]+)\.smugmug\.com)")
class SmugmugExtractor(Extractor):
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 243710d..358bc95 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -35,7 +35,7 @@ POST_TYPES = frozenset((
BASE_PATTERN = (
r"(?:tumblr:(?:https?://)?([^/]+)|"
- r"(?:https?://)?([^.]+\.tumblr\.com))")
+ r"(?:https?://)?([\w-]+\.tumblr\.com))")
class TumblrExtractor(Extractor):
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
index 849dc49..e790613 100644
--- a/gallery_dl/extractor/tumblrgallery.py
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -19,6 +19,20 @@ class TumblrgalleryExtractor(GalleryExtractor):
directory_fmt = ("{category}", "{gallery_id} {title}")
root = "https://tumblrgallery.xyz"
+ @staticmethod
+ def _urls_from_page(page):
+ return text.extract_iter(
+ page, '<div class="report"> <a class="xx-co-me" href="', '"')
+
+ @staticmethod
+ def _data_from_url(url):
+ filename = text.nameext_from_url(url)["filename"]
+ parts = filename.split("_")
+ try:
+ return {"id": parts[1] if parts[1] != "inline" else parts[2]}
+ except IndexError:
+ return {"id": filename}
+
class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
"""Extractor for Tumblrblog on tumblrgallery.xyz"""
@@ -39,34 +53,27 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
def images(self, _):
page_num = 1
while True:
- response = self.request(
- "{}/tumblrblog/gallery/{}/{}.html"
- .format(self.root, self.gallery_id, page_num),
- allow_redirects=False
- )
- if response.status_code != 200:
+ url = "{}/tumblrblog/gallery/{}/{}.html".format(
+ self.root, self.gallery_id, page_num)
+ response = self.request(url, allow_redirects=False, fatal=False)
+
+ if response.status_code >= 300:
return
- page = response.text
+ for url in self._urls_from_page(response.text):
+ yield url, self._data_from_url(url)
page_num += 1
- urls = list(text.extract_iter(
- page,
- '<div class="report xx-co-me"> <a href="',
- '" data-fancybox="gallery"'
- ))
-
- for image_src in urls:
- yield image_src, {
- "id": text.extract(image_src, "tumblr_", "_")[0]
- }
-
class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
"""Extractor for Posts on tumblrgallery.xyz"""
subcategory = "post"
pattern = BASE_PATTERN + r"(/post/(\d+)\.html)"
- test = ("https://tumblrgallery.xyz/post/405674.html",)
+ test = ("https://tumblrgallery.xyz/post/405674.html", {
+ "pattern": r"https://78\.media\.tumblr\.com/bec67072219c1f3bc04fd9711d"
+ r"ec42ef/tumblr_p51qq1XCHS1txhgk3o1_1280\.jpg",
+ "count": 3,
+ })
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
@@ -81,17 +88,8 @@ class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
}
def images(self, page):
- urls = list(text.extract_iter(
- page,
- '<div class="report xx-co-me"> <a href="',
- '" data-fancybox="gallery"'
- ))
-
- for image_src in urls:
- yield image_src, {
- "id": text.extract(image_src, "tumblr_", "_")[0] or
- text.nameext_from_url(image_src)["filename"]
- }
+ for url in self._urls_from_page(page):
+ yield url, self._data_from_url(url)
class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
@@ -100,7 +98,10 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
filename_fmt = "{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}"
directory_fmt = ("{category}", "{search_term}")
pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))"
- test = ("https://tumblrgallery.xyz/s.php?q=everyday-life",)
+ test = ("https://tumblrgallery.xyz/s.php?q=everyday-life", {
+ "pattern": r"https://\d+\.media\.tumblr\.com/.+",
+ "count": "< 1000",
+ })
def __init__(self, match):
TumblrgalleryExtractor.__init__(self, match)
@@ -112,38 +113,26 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
}
def images(self, _):
- page_num = 1
+ page_url = "s.php?q=" + self.search_term
while True:
-            response = self.request(
-                "{}/s.php?q={}&page={}"
-                .format(self.root, self.search_term, page_num),
-                allow_redirects=False
-            )
-            if response.status_code != 200:
-                return
-            page = response.text
-            page_num += 1
-            gallery_ids = list(text.extract_iter(
-                page,
-                '<div class="title"><a href="post/',
-                '.html'
-            ))
-
-            for gallery_id in gallery_ids:
-                post_page = self.request(
-                    "{}/post/{}.html"
-                    .format(self.root, gallery_id),
-                    allow_redirects=False
-                ).text
-                for image_src in TumblrgalleryPostExtractor.images(
-                    self, post_page
-                ):
-                    image_src[1]["title"] = text.remove_html(
-                        text.unescape(
-                            text.extract(post_page, "<title>", "</title>")[0]
-                        )
-                    ).replace("_", "-")
-                    image_src[1]["gallery_id"] = gallery_id
-                    yield image_src
+            page = self.request(self.root + "/" + page_url).text
+            for gallery_id in text.extract_iter(
+                    page, '<div class="title"><a href="post/', '.html'):
+ url = "{}/post/{}.html".format(self.root, gallery_id)
+ post_page = self.request(url).text
+
+ for url in self._urls_from_page(post_page):
+ data = self._data_from_url(url)
+ data["gallery_id"] = gallery_id
+ data["title"] = text.remove_html(text.unescape(
+ text.extract(post_page, "<title>", "</title>")[0]
+ )).replace("_", "-")
+ yield url, data
+
+ next_url = text.extract(
+ page, '</span> <a class="btn btn-primary" href="', '"')[0]
+ if not next_url or page_url == next_url:
+ return
+ page_url = next_url
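
Editor's note: a standalone sketch of _data_from_url()'s ID logic above. Tumblr media filenames look like "tumblr_<id>_<width>" or "tumblr_inline_<id>_<width>"; the first example is the filename from the post test above, the others are illustrative.

    def media_id(filename):
        parts = filename.split("_")
        try:
            return parts[1] if parts[1] != "inline" else parts[2]
        except IndexError:
            return filename

    print(media_id("tumblr_p51qq1XCHS1txhgk3o1_1280"))  # p51qq1XCHS1txhgk3o1
    print(media_id("tumblr_inline_abc123_500"))         # abc123
    print(media_id("unrelated"))                        # unrelated
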
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index f1c392d..a49f1f2 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -47,7 +47,7 @@ class TwitterExtractor(Extractor):
size = self.config("size")
if size is None:
self._size_image = "orig"
- self._size_fallback = ("large", "medium", "small")
+ self._size_fallback = ("4096x4096", "large", "medium", "small")
else:
if isinstance(size, str):
size = size.split(",")
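
Editor's note: a sketch of how a size-fallback chain like the one above is typically consumed; exists() is a hypothetical availability check, not part of gallery-dl, and the media URL is illustrative.

    def best_variant(base_url, exists,
                     names=("orig", "4096x4096", "large", "medium", "small")):
        # try the preferred variant first, then each fallback in order
        for name in names:
            url = "{}?format=jpg&name={}".format(base_url, name)
            if exists(url):
                return url
        return None

    print(best_variant("https://pbs.twimg.com/media/XYZ",
                       lambda url: "large" in url))
    # https://pbs.twimg.com/media/XYZ?format=jpg&name=large
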
diff --git a/gallery_dl/extractor/wordpress.py b/gallery_dl/extractor/wordpress.py
new file mode 100644
index 0000000..dd7d28a
--- /dev/null
+++ b/gallery_dl/extractor/wordpress.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for WordPress blogs"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class WordpressExtractor(BaseExtractor):
+ """Base class for wordpress extractors"""
+ basecategory = "wordpress"
+
+ def items(self):
+ for post in self.posts():
+            yield Message.Directory, post
+
+
+BASE_PATTERN = WordpressExtractor.update({})
+
+
+class WordpressBlogExtractor(WordpressExtractor):
+ """Extractor for WordPress blogs"""
+ subcategory = "blog"
+ directory_fmt = ("{category}", "{blog}")
+ pattern = BASE_PATTERN + r"/?$"
+
+ def posts(self):
+ url = self.root + "/wp-json/wp/v2/posts"
+ params = {"page": 1, "per_page": "100"}
+
+ while True:
+ data = self.request(url, params=params).json()
+            yield from data
+            if len(data) < 100:
+                return
+            params["page"] += 1
diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py
index f7a0a7e..146ab04 100644
--- a/gallery_dl/extractor/xhamster.py
+++ b/gallery_dl/extractor/xhamster.py
@@ -13,7 +13,7 @@ from .. import text
import json
-BASE_PATTERN = (r"(?:https?://)?((?:[^.]+\.)?xhamster"
+BASE_PATTERN = (r"(?:https?://)?((?:[\w-]+\.)?xhamster"
r"(?:\d?\.(?:com|one|desi)|\.porncache\.net))")
diff --git a/gallery_dl/extractor/ytdl.py b/gallery_dl/extractor/ytdl.py
index 8eb0c83..8f3ef9a 100644
--- a/gallery_dl/extractor/ytdl.py
+++ b/gallery_dl/extractor/ytdl.py
@@ -23,9 +23,9 @@ class YoutubeDLExtractor(Extractor):
def __init__(self, match):
# import main youtube_dl module
- module_name = self.ytdl_module_name = config.get(
- ("extractor", "ytdl"), "module") or "youtube_dl"
- module = __import__(module_name)
+ ytdl_module = ytdl.import_module(config.get(
+ ("extractor", "ytdl"), "module"))
+ self.ytdl_module_name = ytdl_module.__name__
# find suitable youtube_dl extractor
self.ytdl_url = url = match.group(1)
@@ -34,7 +34,7 @@ class YoutubeDLExtractor(Extractor):
self.ytdl_ie_key = "Generic"
self.force_generic_extractor = True
else:
- for ie in module.extractor.gen_extractor_classes():
+ for ie in ytdl_module.extractor.gen_extractor_classes():
if ie.suitable(url):
self.ytdl_ie_key = ie.ie_key()
break
@@ -48,7 +48,7 @@ class YoutubeDLExtractor(Extractor):
def items(self):
# import subcategory module
- ytdl_module = __import__(
+ ytdl_module = ytdl.import_module(
config.get(("extractor", "ytdl", self.subcategory), "module") or
self.ytdl_module_name)
self.log.debug("Using %s", ytdl_module)
diff --git a/gallery_dl/option.py b/gallery_dl/option.py
index 5f7b281..1967bf7 100644
--- a/gallery_dl/option.py
+++ b/gallery_dl/option.py
@@ -92,9 +92,9 @@ def build_parser():
help="Print program version and exit",
)
general.add_argument(
- "-d", "--dest",
+ "--dest",
dest="base-directory", metavar="DEST", action=ConfigAction,
- help="Destination directory",
+ help=argparse.SUPPRESS,
)
general.add_argument(
"-i", "--input-file",
@@ -103,6 +103,17 @@ def build_parser():
"More than one --input-file can be specified"),
)
general.add_argument(
+ "-f", "--filename",
+ dest="filename", metavar="FORMAT",
+ help=("Filename format string for downloaded files "
+ "('/O' for \"original\" filenames)"),
+ )
+ general.add_argument(
+ "-d", "--directory",
+ dest="directory", metavar="PATH",
+ help="Target location for file downloads",
+ )
+ general.add_argument(
"--cookies",
dest="cookies", metavar="FILE", action=ConfigAction,
help="File to load additional cookies from",
@@ -211,8 +222,22 @@ def build_parser():
)
downloader.add_argument(
"--sleep",
- dest="sleep", metavar="SECONDS", type=float, action=ConfigAction,
- help="Number of seconds to sleep before each download",
+ dest="sleep", metavar="SECONDS", action=ConfigAction,
+ help=("Number of seconds to wait before each download. "
+ "This can be either a constant value or a range "
+ "(e.g. 2.7 or 2.0-3.5)"),
+ )
+ downloader.add_argument(
+ "--sleep-request",
+ dest="sleep-request", metavar="SECONDS", action=ConfigAction,
+ help=("Number of seconds to wait between HTTP requests "
+ "during data extraction"),
+ )
+ downloader.add_argument(
+ "--sleep-extractor",
+ dest="sleep-extractor", metavar="SECONDS", action=ConfigAction,
+ help=("Number of seconds to wait before starting data extraction "
+ "for an input URL"),
)
downloader.add_argument(
"--filesize-min",
@@ -337,6 +362,11 @@ def build_parser():
"and other delegated URLs"),
)
+ infojson = {
+ "name" : "metadata",
+ "event" : "init",
+ "filename": "info.json",
+ }
postprocessor = parser.add_argument_group("Post-processing Options")
postprocessor.add_argument(
"--zip",
@@ -372,16 +402,18 @@ def build_parser():
help="Write metadata to separate JSON files",
)
postprocessor.add_argument(
- "--write-infojson",
+ "--write-info-json",
dest="postprocessors",
- action="append_const", const={
- "name" : "metadata",
- "event" : "init",
- "filename": "info.json",
- },
+ action="append_const", const=infojson,
help="Write gallery metadata to a info.json file",
)
postprocessor.add_argument(
+ "--write-infojson",
+ dest="postprocessors",
+ action="append_const", const=infojson,
+ help=argparse.SUPPRESS,
+ )
+ postprocessor.add_argument(
"--write-tags",
dest="postprocessors",
action="append_const", const={"name": "metadata", "mode": "tags"},
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index d4d295f..7e00e1a 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -265,10 +265,14 @@ class NullOutput():
class PipeOutput(NullOutput):
def skip(self, path):
- print(CHAR_SKIP, path, sep="", flush=True)
+ stdout = sys.stdout
+ stdout.write(CHAR_SKIP + path + "\n")
+ stdout.flush()
def success(self, path, tries):
- print(path, flush=True)
+ stdout = sys.stdout
+ stdout.write(path + "\n")
+ stdout.flush()
class TerminalOutput(NullOutput):
@@ -284,34 +288,38 @@ class TerminalOutput(NullOutput):
self.shorten = util.identity
def start(self, path):
- print(self.shorten(" " + path), end="", flush=True)
+ stdout = sys.stdout
+ stdout.write(self.shorten(" " + path))
+ stdout.flush()
def skip(self, path):
- print(self.shorten(CHAR_SKIP + path))
+ sys.stdout.write(self.shorten(CHAR_SKIP + path) + "\n")
def success(self, path, tries):
- print("\r", self.shorten(CHAR_SUCCESS + path), sep="")
+ sys.stdout.write("\r" + self.shorten(CHAR_SUCCESS + path) + "\n")
def progress(self, bytes_total, bytes_downloaded, bytes_per_second):
bdl = util.format_value(bytes_downloaded)
bps = util.format_value(bytes_per_second)
if bytes_total is None:
- print("\r{:>7}B {:>7}B/s ".format(bdl, bps), end="")
+ sys.stderr.write("\r{:>7}B {:>7}B/s ".format(bdl, bps))
else:
- print("\r{:>3}% {:>7}B {:>7}B/s ".format(
- bytes_downloaded * 100 // bytes_total, bdl, bps), end="")
+ sys.stderr.write("\r{:>3}% {:>7}B {:>7}B/s ".format(
+ bytes_downloaded * 100 // bytes_total, bdl, bps))
class ColorOutput(TerminalOutput):
def start(self, path):
- print(self.shorten(path), end="", flush=True)
+ stdout = sys.stdout
+ stdout.write(self.shorten(path))
+ stdout.flush()
def skip(self, path):
- print("\033[2m", self.shorten(path), "\033[0m", sep="")
+ sys.stdout.write("\033[2m" + self.shorten(path) + "\033[0m\n")
def success(self, path, tries):
- print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")
+ sys.stdout.write("\r\033[1;32m" + self.shorten(path) + "\033[0m\n")
class EAWCache(dict):
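
Editor's note: a sketch of the output pattern above — assemble each line first and emit it with a single write() call (flushing where needed), with final paths on stdout and transient progress on stderr, so piped stdout stays machine-readable. The path is illustrative.

    import sys

    def success(path):
        # one atomic write per finished file
        sys.stdout.write(path + "\n")
        sys.stdout.flush()

    def progress(percent):
        # transient, self-overwriting progress line on stderr
        sys.stderr.write("\r{:>3}% ".format(percent))

    progress(50)
    progress(100)
    success("gallery-dl/example/file.jpg")
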
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 12ce8ad..9e9e983 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -177,8 +177,11 @@ class PathFormat():
self.directory = directory = self.basedirectory
if WINDOWS:
- # Enable longer-than-260-character paths on Windows
- directory = "\\\\?\\" + os.path.abspath(directory)
+ # Enable longer-than-260-character paths
+ if directory.startswith("\\\\"):
+ directory = "\\\\?\\UNC\\" + directory[2:]
+ else:
+ directory = "\\\\?\\" + os.path.abspath(directory)
# abspath() in Python 3.7+ removes trailing path separators (#402)
if directory[-1] != sep:
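
Editor's note: a standalone sketch of the Windows prefix logic above — UNC shares need the \\?\UNC\server\share form, while local paths take the plain \\?\ prefix (the share name is illustrative).

    import os

    def extended_length(directory):
        if directory.startswith("\\\\"):
            # \\server\share -> \\?\UNC\server\share
            return "\\\\?\\UNC\\" + directory[2:]
        return "\\\\?\\" + os.path.abspath(directory)

    print(extended_length(r"\\server\share\downloads"))
    # \\?\UNC\server\share\downloads
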
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index d25194e..bccae2d 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -428,18 +428,26 @@ def build_duration_func(duration, min=0.0):
if not duration:
return None
- try:
- lower, upper = duration
- except TypeError:
- pass
+ if isinstance(duration, str):
+ lower, _, upper = duration.partition("-")
+ lower = float(lower)
else:
+ try:
+ lower, upper = duration
+ except TypeError:
+ lower, upper = duration, None
+
+ if upper:
+ upper = float(upper)
return functools.partial(
random.uniform,
lower if lower > min else min,
upper if upper > min else min,
)
-
- return functools.partial(identity, duration if duration > min else min)
+ else:
+ if lower < min:
+ lower = min
+ return lambda: lower
def build_extractor_filter(categories, negate=True, special=None):
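
Editor's note: a consolidated, runnable sketch of the duration logic above (the min parameter is renamed to avoid shadowing the builtin): accept 2.7, (2.0, 3.5) or "2.0-3.5" and return a callable yielding the sleep time.

    import functools
    import random

    def duration_func(duration, minimum=0.0):
        if not duration:
            return None
        if isinstance(duration, str):
            lower, _, upper = duration.partition("-")
            lower = float(lower)
        else:
            try:
                lower, upper = duration
            except TypeError:
                lower, upper = duration, None
        if upper:
            # range: draw a uniformly random value per call
            upper = float(upper)
            return functools.partial(
                random.uniform, max(lower, minimum), max(upper, minimum))
        # constant: always return the same value
        return lambda: max(lower, minimum)

    assert duration_func(2.7)() == 2.7
    assert 2.0 <= duration_func("2.0-3.5")() <= 3.5
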
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index a363a97..b5114e8 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-__version__ = "1.19.3"
+__version__ = "1.20.0"
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index 4266f48..e6953eb 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -14,6 +14,15 @@ import itertools
from . import text, util, exception
+def import_module(module_name):
+ if module_name is None:
+ try:
+ return __import__("yt_dlp")
+ except ImportError:
+ return __import__("youtube_dl")
+ return __import__(module_name.replace("-", "_"))
+
+
def construct_YoutubeDL(module, obj, user_opts, system_opts=None):
opts = argv = None
config = obj.config
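
Editor's note: a usage sketch for the import_module() fallback added above, assuming at least one of yt-dlp / youtube-dl is installed:

    mod = import_module(None)        # yt_dlp if available, else youtube_dl
    print(mod.__name__)
    print(import_module("yt-dlp"))   # "-" is normalized: __import__("yt_dlp")
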
@@ -95,6 +104,8 @@ def parse_command_line(module, argv):
opts.continue_dl = False
if opts.retries is not None:
opts.retries = parse_retries(opts.retries)
+ if getattr(opts, "file_access_retries", None) is not None:
+ opts.file_access_retries = parse_retries(opts.file_access_retries)
if opts.fragment_retries is not None:
opts.fragment_retries = parse_retries(opts.fragment_retries)
if getattr(opts, "extractor_retries", None) is not None:
@@ -111,6 +122,10 @@ def parse_command_line(module, argv):
opts.recodevideo = opts.recodevideo.replace(" ", "")
if getattr(opts, "remuxvideo", None) is not None:
opts.remuxvideo = opts.remuxvideo.replace(" ", "")
+ if getattr(opts, "wait_for_video", None) is not None:
+ min_wait, _, max_wait = opts.wait_for_video.partition("-")
+ opts.wait_for_video = (module.parse_duration(min_wait),
+ module.parse_duration(max_wait))
if opts.date is not None:
date = module.DateRange.day(opts.date)
@@ -207,10 +222,6 @@ def parse_command_line(module, argv):
opts.sponsorblock_remove = \
getattr(opts, "sponsorblock_remove", None) or set()
sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove
-
- addchapters = getattr(opts, "addchapters", None)
- if (opts.addmetadata or opts.sponsorblock_mark) and addchapters is None:
- addchapters = True
opts.remove_chapters = getattr(opts, "remove_chapters", None) or ()
# PostProcessors
@@ -297,11 +308,17 @@ def parse_command_line(module, argv):
"sponsorblock_chapter_title": opts.sponsorblock_chapter_title,
"force_keyframes": opts.force_keyframes_at_cuts,
})
- if opts.addmetadata or addchapters:
+ addchapters = getattr(opts, "addchapters", None)
+ embed_infojson = getattr(opts, "embed_infojson", None)
+ if opts.addmetadata or addchapters or embed_infojson:
pp = {"key": "FFmpegMetadata"}
if ytdlp:
- pp["add_chapters"] = addchapters
+ if embed_infojson is None:
+ embed_infojson = "if_exists"
pp["add_metadata"] = opts.addmetadata
+ pp["add_chapters"] = addchapters
+ pp["add_infojson"] = embed_infojson
+
postprocessors.append(pp)
if getattr(opts, "sponskrub", False) is not False:
postprocessors.append({
@@ -311,10 +328,11 @@ def parse_command_line(module, argv):
"cut": opts.sponskrub_cut,
"force": opts.sponskrub_force,
"ignoreerror": opts.sponskrub is None,
+ "_from_cli": True,
})
if opts.embedthumbnail:
already_have_thumbnail = (opts.writethumbnail or
- opts.write_all_thumbnails)
+ getattr(opts, "write_all_thumbnails", False))
postprocessors.append({
"key": "EmbedThumbnail",
"already_have_thumbnail": already_have_thumbnail,
@@ -395,6 +413,7 @@ def parse_command_line(module, argv):
"throttledratelimit": getattr(opts, "throttledratelimit", None),
"overwrites": getattr(opts, "overwrites", None),
"retries": opts.retries,
+ "file_access_retries": getattr(opts, "file_access_retries", None),
"fragment_retries": opts.fragment_retries,
"extractor_retries": getattr(opts, "extractor_retries", None),
"skip_unavailable_fragments": opts.skip_unavailable_fragments,
@@ -421,8 +440,9 @@ def parse_command_line(module, argv):
"allow_playlist_files": opts.allow_playlist_files,
"clean_infojson": opts.clean_infojson,
"getcomments": getattr(opts, "getcomments", None),
- "writethumbnail": opts.writethumbnail,
- "write_all_thumbnails": opts.write_all_thumbnails,
+ "writethumbnail": opts.writethumbnail is True,
+ "write_all_thumbnails": getattr(opts, "write_all_thumbnails", None) or
+ opts.writethumbnail == "all",
"writelink": getattr(opts, "writelink", None),
"writeurllink": getattr(opts, "writeurllink", None),
"writewebloclink": getattr(opts, "writewebloclink", None),
@@ -454,6 +474,7 @@ def parse_command_line(module, argv):
"download_archive": download_archive_fn,
"break_on_existing": getattr(opts, "break_on_existing", None),
"break_on_reject": getattr(opts, "break_on_reject", None),
+ "break_per_url": getattr(opts, "break_per_url", None),
"skip_playlist_after_errors": getattr(
opts, "skip_playlist_after_errors", None),
"cookiefile": opts.cookiefile,
@@ -475,6 +496,8 @@ def parse_command_line(module, argv):
opts, "youtube_include_hls_manifest", None),
"encoding": opts.encoding,
"extract_flat": opts.extract_flat,
+ "live_from_start": getattr(opts, "live_from_start", None),
+ "wait_for_video": getattr(opts, "wait_for_video", None),
"mark_watched": opts.mark_watched,
"merge_output_format": opts.merge_output_format,
"postprocessors": postprocessors,