Diffstat (limited to 'gallery_dl')
47 files changed, 1059 insertions, 233 deletions
diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py
index 2cad029..ad8286e 100644
--- a/gallery_dl/__init__.py
+++ b/gallery_dl/__init__.py
@@ -115,6 +115,13 @@ def main():
             config.load(args.cfgfiles, strict=True)
         if args.yamlfiles:
             config.load(args.yamlfiles, strict=True, fmt="yaml")
+        if args.filename:
+            if args.filename == "/O":
+                args.filename = "{filename}.{extension}"
+            config.set((), "filename", args.filename)
+        if args.directory:
+            config.set((), "base-directory", args.directory)
+            config.set((), "directory", ())
         if args.postprocessors:
             config.set((), "postprocessors", args.postprocessors)
         if args.abort:
@@ -142,20 +149,23 @@ def main():
             import os.path
             import requests

-            head = ""
-            try:
-                out, err = subprocess.Popen(
-                    ("git", "rev-parse", "--short", "HEAD"),
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    cwd=os.path.dirname(os.path.abspath(__file__)),
-                ).communicate()
-                if out and not err:
-                    head = " - Git HEAD: " + out.decode().rstrip()
-            except (OSError, subprocess.SubprocessError):
-                pass
+            extra = ""
+            if getattr(sys, "frozen", False):
+                extra = " - Executable"
+            else:
+                try:
+                    out, err = subprocess.Popen(
+                        ("git", "rev-parse", "--short", "HEAD"),
+                        stdout=subprocess.PIPE,
+                        stderr=subprocess.PIPE,
+                        cwd=os.path.dirname(os.path.abspath(__file__)),
+                    ).communicate()
+                    if out and not err:
+                        extra = " - Git HEAD: " + out.decode().rstrip()
+                except (OSError, subprocess.SubprocessError):
+                    pass

-            log.debug("Version %s%s", __version__, head)
+            log.debug("Version %s%s", __version__, extra)
             log.debug("Python %s - %s",
                       platform.python_version(), platform.platform())
             try:
diff --git a/gallery_dl/downloader/ytdl.py b/gallery_dl/downloader/ytdl.py
index 8416ca0..30f628e 100644
--- a/gallery_dl/downloader/ytdl.py
+++ b/gallery_dl/downloader/ytdl.py
@@ -39,7 +39,7 @@ class YoutubeDLDownloader(DownloaderBase):
         if not ytdl_instance:
             ytdl_instance = self.ytdl_instance
             if not ytdl_instance:
-                module = __import__(self.config("module") or "youtube_dl")
+                module = ytdl.import_module(self.config("module"))
                 self.ytdl_instance = ytdl_instance = ytdl.construct_YoutubeDL(
                     module, self, self.ytdl_opts)
         if self.outtmpl == "default":
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py
index c92969b..38b2d5a 100644
--- a/gallery_dl/extractor/2chan.py
+++ b/gallery_dl/extractor/2chan.py
@@ -20,7 +20,7 @@ class _2chanThreadExtractor(Extractor):
     filename_fmt = "{tim}.{extension}"
     archive_fmt = "{board}_{thread}_{tim}"
     url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
-    pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
+    pattern = r"(?:https?://)?([\w-]+)\.2chan\.net/([^/]+)/res/(\d+)"
     test = ("http://dec.2chan.net/70/res/4752.htm", {
         "url": "f49aa31340e9a3429226af24e19e01f5b819ca1f",
         "keyword": "44599c21b248e79692b2eb2da12699bd0ed5640a",
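The version-logging change in __init__.py above reports " - Executable" for frozen builds instead of shelling out to git, which would fail (or report a stale commit) inside a bundled executable. A minimal standalone sketch of the same detection; the build_info() helper name is made up here:

    import sys

    def build_info():
        # PyInstaller and py2exe set sys.frozen on bundled executables;
        # getattr() keeps this safe on a normal interpreter, where the
        # attribute does not exist at all.
        if getattr(sys, "frozen", False):
            return " - Executable"
        return ""  # a git checkout could shell out to rev-parse here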
diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py
index 8c6fa09..88ceaeb 100644
--- a/gallery_dl/extractor/500px.py
+++ b/gallery_dl/extractor/500px.py
@@ -21,13 +21,13 @@ class _500pxExtractor(Extractor):
     filename_fmt = "{id}_{name}.{extension}"
     archive_fmt = "{id}"
     root = "https://500px.com"
+    cookiedomain = ".500px.com"

     def __init__(self, match):
         Extractor.__init__(self, match)
         self.session.headers["Referer"] = self.root + "/"

     def items(self):
-        first = True
         data = self.metadata()

         for photo in self.photos():
@@ -35,9 +35,7 @@ class _500pxExtractor(Extractor):
             photo["extension"] = photo["image_format"]
             if data:
                 photo.update(data)
-            if first:
-                first = False
-                yield Message.Directory, photo
+            yield Message.Directory, photo
             yield Message.Url, url, photo

     def metadata(self):
@@ -72,24 +70,33 @@ class _500pxExtractor(Extractor):
                 self.log.warning("Unable to fetch photo %s", pid)
         ]

-    def _request_api(self, url, params, csrf_token=None):
-        headers = {"Origin": self.root, "X-CSRF-Token": csrf_token}
+    def _request_api(self, url, params):
+        headers = {
+            "Origin": self.root,
+            "x-csrf-token": self.session.cookies.get(
+                "x-csrf-token", domain=".500px.com"),
+        }
         return self.request(url, headers=headers, params=params).json()

     def _request_graphql(self, opname, variables):
         url = "https://api.500px.com/graphql"
+        headers = {
+            "x-csrf-token": self.session.cookies.get(
+                "x-csrf-token", domain=".500px.com"),
+        }
         data = {
             "operationName": opname,
             "variables"    : json.dumps(variables),
             "query"        : QUERIES[opname],
         }
-        return self.request(url, method="POST", json=data).json()["data"]
+        return self.request(
+            url, method="POST", headers=headers, json=data).json()["data"]


 class _500pxUserExtractor(_500pxExtractor):
     """Extractor for photos from a user's photostream on 500px.com"""
     subcategory = "user"
-    pattern = BASE_PATTERN + r"/(?!photo/)(?:p/)?([^/?#]+)/?(?:$|[?#])"
+    pattern = BASE_PATTERN + r"/(?!photo/|liked)(?:p/)?([^/?#]+)/?(?:$|[?#])"
     test = (
         ("https://500px.com/p/light_expression_photography", {
             "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D4096/v2",
@@ -137,10 +144,6 @@ class _500pxGalleryExtractor(_500pxExtractor):
                 "user": dict,
             },
         }),
-        # unavailable photos (#1335)
-        ("https://500px.com/p/Light_Expression_Photography/galleries/street", {
-            "count": 4,
-        }),
         ("https://500px.com/fashvamp/galleries/lera"),
     )
@@ -194,6 +197,30 @@ class _500pxGalleryExtractor(_500pxExtractor):
         )["galleryByOwnerIdAndSlugOrToken"]["photos"]


+class _500pxFavoriteExtractor(_500pxExtractor):
+    """Extractor for favorite 500px photos"""
+    subcategory = "favorite"
+    pattern = BASE_PATTERN + r"/liked/?$"
+    test = ("https://500px.com/liked",)
+
+    def photos(self):
+        variables = {"pageSize": 20}
+        photos = self._request_graphql(
+            "LikedPhotosQueryRendererQuery", variables,
+        )["likedPhotos"]
+
+        while True:
+            yield from self._extend(photos["edges"])
+
+            if not photos["pageInfo"]["hasNextPage"]:
+                return
+
+            variables["cursor"] = photos["pageInfo"]["endCursor"]
+            photos = self._request_graphql(
+                "LikedPhotosPaginationContainerQuery", variables,
+            )["likedPhotos"]
+
+
 class _500pxImageExtractor(_500pxExtractor):
     """Extractor for individual images from 500px.com"""
     subcategory = "image"
@@ -640,4 +667,122 @@ fragment GalleriesDetailPaginationContainer_gallery_3e6UuE on Gallery {
   }
 """,

+    "LikedPhotosQueryRendererQuery": """\
+query LikedPhotosQueryRendererQuery($pageSize: Int) {
+  ...LikedPhotosPaginationContainer_query_RlXb8
+}
+
+fragment LikedPhotosPaginationContainer_query_RlXb8 on Query {
+  likedPhotos(first: $pageSize) {
+    edges {
+      node {
+        id
+        legacyId
+        canonicalPath
+        name
+        description
+        category
+        uploadedAt
+        location
+        width
+        height
+        isLikedByMe
+        notSafeForWork
+        tags
+        photographer: uploader {
+          id
+          legacyId
+          username
+          displayName
+          canonicalPath
+          avatar {
+            images {
+              url
+              id
+            }
+            id
+          }
+          followedByUsers {
+            totalCount
+            isFollowedByMe
+          }
+        }
+        images(sizes: [33, 35]) {
+          size
+          url
+          jpegUrl
+          webpUrl
+          id
+        }
+        __typename
+      }
+      cursor
+    }
+    pageInfo {
+      endCursor
+      hasNextPage
+    }
+  }
+}
+""",
+
+    "LikedPhotosPaginationContainerQuery": """\
+query LikedPhotosPaginationContainerQuery($cursor: String, $pageSize: Int) {
+  ...LikedPhotosPaginationContainer_query_3e6UuE
+}
+
+fragment LikedPhotosPaginationContainer_query_3e6UuE on Query {
+  likedPhotos(first: $pageSize, after: $cursor) {
+    edges {
+      node {
+        id
+        legacyId
+        canonicalPath
+        name
+        description
+        category
+        uploadedAt
+        location
+        width
+        height
+        isLikedByMe
+        notSafeForWork
+        tags
+        photographer: uploader {
+          id
+          legacyId
+          username
+          displayName
+          canonicalPath
+          avatar {
+            images {
+              url
+              id
+            }
+            id
+          }
+          followedByUsers {
+            totalCount
+            isFollowedByMe
+          }
+        }
+        images(sizes: [33, 35]) {
+          size
+          url
+          jpegUrl
+          webpUrl
+          id
+        }
+        __typename
+      }
+      cursor
+    }
+    pageInfo {
+      endCursor
+      hasNextPage
+    }
+  }
+}
+""",
+
 }
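Instead of threading an explicit csrf_token argument through every API call, the 500px extractor above now reads the token from the session's cookie jar each time, so a token refreshed by any earlier response is picked up automatically. A rough plain-requests sketch of the same double-submit pattern (the GraphQL payload here is a placeholder):

    import requests

    session = requests.Session()
    session.get("https://500px.com/")  # response sets the x-csrf-token cookie

    token = session.cookies.get("x-csrf-token", domain=".500px.com")
    response = session.post(
        "https://api.500px.com/graphql",
        headers={"x-csrf-token": token},  # echo the cookie back as a header
        json={"operationName": "...", "variables": "{}", "query": "..."},
    )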
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index dd9da01..65c994d 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -108,6 +108,7 @@ modules = [
     "readcomiconline",
     "reddit",
     "redgifs",
+    "rule34us",
     "sankaku",
     "sankakucomplex",
     "seiga",
@@ -144,12 +145,14 @@ modules = [
     "foolslide",
     "mastodon",
     "shopify",
+    "lolisafe",
     "imagehosts",
     "directlink",
     "recursive",
     "oauth",
     "test",
     "ytdl",
+    "generic",
 ]
diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py
index f687ff8..5675081 100644
--- a/gallery_dl/extractor/artstation.py
+++ b/gallery_dl/extractor/artstation.py
@@ -29,12 +29,12 @@ class ArtstationExtractor(Extractor):

     def items(self):
         data = self.metadata()
-        yield Message.Directory, data

         for project in self.projects():
             for asset in self.get_project_assets(project["hash_id"]):
                 asset.update(data)
                 adict = asset["asset"]
+                yield Message.Directory, asset

                 if adict["has_embedded_player"] and self.external:
                     player = adict["player_embedded"]
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index 7e7c282..9a86cc4 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -15,7 +15,7 @@ import re

 BASE_PATTERN = (
     r"(?:blogger:(?:https?://)?([^/]+)|"
-    r"(?:https?://)?([^.]+\.blogspot\.com))")
+    r"(?:https?://)?([\w-]+\.blogspot\.com))")


 class BloggerExtractor(Extractor):
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index e80366e..c440aee 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -571,7 +571,11 @@ class BaseExtractor(Extractor):
         if not self.category:
             for index, group in enumerate(match.groups()):
                 if group is not None:
-                    self.category, self.root = self.instances[index]
+                    if index:
+                        self.category, self.root = self.instances[index-1]
+                    else:
+                        self.root = group
+                        self.category = group.partition("://")[2]
                     break
         Extractor.__init__(self, match)

@@ -594,7 +598,10 @@ class BaseExtractor(Extractor):
             pattern = re.escape(root[root.index(":") + 3:])
             pattern_list.append(pattern + "()")

-        return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")"
+        return (
+            r"(?:" + cls.basecategory + r":(https?://[^/?#]+)|"
+            r"(?:https?://)?(?:" + "|".join(pattern_list) + r"))"
+        )


 class HTTPSAdapter(HTTPAdapter):
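With the updated common.py pattern, every BaseExtractor subclass also matches a "basecategory:URL" prefix form; the new first capture group then carries the explicit root, and the __init__ branch above derives a category name from it. A sketch of that derivation for a hypothetical, unregistered host:

    root = "https://files.example.org"   # captured by the new first group
    category = root.partition("://")[2]  # -> "files.example.org"
    # known instances keep their configured (category, root) pair instead,
    # looked up via self.instances[index-1]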
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index dbaa97e..6d6e192 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -6,16 +6,13 @@

 """Extractors for https://cyberdrop.me/"""

-from .common import Extractor, Message
+from . import lolisafe
 from .. import text


-class CyberdropAlbumExtractor(Extractor):
+class CyberdropAlbumExtractor(lolisafe.LolisafelbumExtractor):
     category = "cyberdrop"
-    subcategory = "album"
     root = "https://cyberdrop.me"
-    directory_fmt = ("{category}", "{album_name} ({album_id})")
-    archive_fmt = "{album_id}_{id}"
     pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.me/a/([^/?#]+)"
     test = (
         # images
@@ -44,11 +41,7 @@ class CyberdropAlbumExtractor(Extractor):
         }),
     )

-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.album_id = match.group(1)
-
-    def items(self):
+    def fetch_album(self, album_id):
         url = self.root + "/a/" + self.album_id
         extr = text.extract_from(self.request(url).text)

@@ -58,9 +51,9 @@ class CyberdropAlbumExtractor(Extractor):
             url = extr('id="file" href="', '"')
             if not url:
                 break
-            append(text.unescape(url))
+            append({"file": text.unescape(url)})

-        data = {
+        return files, {
             "album_id"   : self.album_id,
             "album_name" : extr("name: '", "'"),
             "date"       : text.parse_timestamp(extr("timestamp: ", ",")),
@@ -68,9 +61,3 @@ class CyberdropAlbumExtractor(Extractor):
             "description": extr("description: `", "`"),
             "count"      : len(files),
         }
-
-        yield Message.Directory, data
-        for url in files:
-            text.nameext_from_url(url, data)
-            data["filename"], _, data["id"] = data["filename"].rpartition("-")
-            yield Message.Url, url, data
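Cyberdrop file names carry the file id after the last dash, and the lolisafe base class that items() now lives in (added further down in this patch) recovers both halves with rpartition. In isolation, with a hypothetical filename:

    filename = "test-QjgneIQv"                   # hypothetical cyberdrop name
    name, _, file_id = filename.rpartition("-")
    # name == "test", file_id == "QjgneIQv"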
diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py
index 61affb5..94fec16 100644
--- a/gallery_dl/extractor/deviantart.py
+++ b/gallery_dl/extractor/deviantart.py
@@ -772,6 +772,7 @@ class DeviantartPopularExtractor(DeviantartExtractor):
         if trange.startswith("popular-"):
             trange = trange[8:]
         self.time_range = {
+            "newest"      : "now",
             "most-recent" : "now",
             "this-week"   : "1week",
             "this-month"  : "1month",
@@ -786,6 +787,8 @@ class DeviantartPopularExtractor(DeviantartExtractor):
         }

     def deviations(self):
+        if self.time_range == "now":
+            return self.api.browse_newest(self.search_term, self.offset)
         return self.api.browse_popular(
             self.search_term, self.time_range, self.offset)
@@ -1034,21 +1037,32 @@ class DeviantartOAuthAPI():

     def browse_deviantsyouwatch(self, offset=0):
         """Yield deviations from users you watch"""
-        endpoint = "browse/deviantsyouwatch"
+        endpoint = "/browse/deviantsyouwatch"
         params = {"limit": "50", "offset": offset,
                   "mature_content": self.mature}
         return self._pagination(endpoint, params, public=False)

     def browse_posts_deviantsyouwatch(self, offset=0):
         """Yield posts from users you watch"""
-        endpoint = "browse/posts/deviantsyouwatch"
+        endpoint = "/browse/posts/deviantsyouwatch"
         params = {"limit": "50", "offset": offset,
                   "mature_content": self.mature}
         return self._pagination(endpoint, params, public=False, unpack=True)

+    def browse_newest(self, query=None, offset=0):
+        """Browse newest deviations"""
+        endpoint = "/browse/newest"
+        params = {
+            "q"             : query,
+            "limit"         : 50 if self.metadata else 120,
+            "offset"        : offset,
+            "mature_content": self.mature,
+        }
+        return self._pagination(endpoint, params)
+
     def browse_popular(self, query=None, timerange=None, offset=0):
         """Yield popular deviations"""
-        endpoint = "browse/popular"
+        endpoint = "/browse/popular"
         params = {
             "q"             : query,
             "limit"         : 50 if self.metadata else 120,
@@ -1060,7 +1074,7 @@ class DeviantartOAuthAPI():

     def browse_tags(self, tag, offset=0):
         """ Browse a tag """
-        endpoint = "browse/tags"
+        endpoint = "/browse/tags"
         params = {
             "tag"           : tag,
             "offset"        : offset,
@@ -1071,14 +1085,14 @@ class DeviantartOAuthAPI():

     def browse_user_journals(self, username, offset=0):
         """Yield all journal entries of a specific user"""
-        endpoint = "browse/user/journals"
+        endpoint = "/browse/user/journals"
         params = {"username": username, "offset": offset, "limit": 50,
                   "mature_content": self.mature, "featured": "false"}
         return self._pagination(endpoint, params)

     def collections(self, username, folder_id, offset=0):
         """Yield all Deviation-objects contained in a collection folder"""
-        endpoint = "collections/" + folder_id
+        endpoint = "/collections/" + folder_id
         params = {"username": username, "offset": offset, "limit": 24,
                   "mature_content": self.mature}
         return self._pagination(endpoint, params)
@@ -1086,21 +1100,21 @@ class DeviantartOAuthAPI():
     @memcache(keyarg=1)
     def collections_folders(self, username, offset=0):
         """Yield all collection folders of a specific user"""
-        endpoint = "collections/folders"
+        endpoint = "/collections/folders"
         params = {"username": username, "offset": offset, "limit": 50,
                   "mature_content": self.mature}
         return self._pagination_list(endpoint, params)

     def comments_deviation(self, deviation_id, offset=0):
         """Fetch comments posted on a deviation"""
-        endpoint = "comments/deviation/" + deviation_id
+        endpoint = "/comments/deviation/" + deviation_id
         params = {"maxdepth": "5", "offset": offset, "limit": 50,
                   "mature_content": self.mature}
         return self._pagination_list(endpoint, params=params, key="thread")

     def deviation(self, deviation_id, public=True):
         """Query and return info about a single Deviation"""
-        endpoint = "deviation/" + deviation_id
+        endpoint = "/deviation/" + deviation_id
         deviation = self._call(endpoint, public=public)
         if self.metadata:
             self._metadata((deviation,))
@@ -1110,13 +1124,13 @@ class DeviantartOAuthAPI():

     def deviation_content(self, deviation_id, public=False):
         """Get extended content of a single Deviation"""
-        endpoint = "deviation/content"
+        endpoint = "/deviation/content"
         params = {"deviationid": deviation_id}
         return self._call(endpoint, params=params, public=public)

     def deviation_download(self, deviation_id, public=True):
         """Get the original file download (if allowed)"""
-        endpoint = "deviation/download/" + deviation_id
+        endpoint = "/deviation/download/" + deviation_id
         params = {"mature_content": self.mature}
         return self._call(endpoint, params=params, public=public)

@@ -1124,7 +1138,7 @@ class DeviantartOAuthAPI():
         """ Fetch deviation metadata for a set of deviations"""
         if not deviations:
             return []
-        endpoint = "deviation/metadata?" + "&".join(
+        endpoint = "/deviation/metadata?" + "&".join(
             "deviationids[{}]={}".format(num, deviation["deviationid"])
             for num, deviation in enumerate(deviations)
         )
@@ -1133,14 +1147,14 @@ class DeviantartOAuthAPI():

     def gallery(self, username, folder_id, offset=0, extend=True, public=True):
         """Yield all Deviation-objects contained in a gallery folder"""
-        endpoint = "gallery/" + folder_id
+        endpoint = "/gallery/" + folder_id
         params = {"username": username, "offset": offset, "limit": 24,
                   "mature_content": self.mature, "mode": "newest"}
         return self._pagination(endpoint, params, extend, public)

     def gallery_all(self, username, offset=0):
         """Yield all Deviation-objects of a specific user"""
-        endpoint = "gallery/all"
+        endpoint = "/gallery/all"
         params = {"username": username, "offset": offset, "limit": 24,
                   "mature_content": self.mature}
         return self._pagination(endpoint, params)
@@ -1148,7 +1162,7 @@ class DeviantartOAuthAPI():
     @memcache(keyarg=1)
     def gallery_folders(self, username, offset=0):
         """Yield all gallery folders of a specific user"""
-        endpoint = "gallery/folders"
+        endpoint = "/gallery/folders"
         params = {"username": username, "offset": offset, "limit": 50,
                   "mature_content": self.mature}
         return self._pagination_list(endpoint, params)
@@ -1156,12 +1170,12 @@ class DeviantartOAuthAPI():
     @memcache(keyarg=1)
     def user_profile(self, username):
         """Get user profile information"""
-        endpoint = "user/profile/" + username
+        endpoint = "/user/profile/" + username
         return self._call(endpoint, fatal=False)

     def user_friends_watch(self, username):
         """Watch a user"""
-        endpoint = "user/friends/watch/" + username
+        endpoint = "/user/friends/watch/" + username
         data = {
             "watch[friend]"       : "0",
             "watch[deviations]"   : "0",
@@ -1179,7 +1193,7 @@ class DeviantartOAuthAPI():

     def user_friends_unwatch(self, username):
         """Unwatch a user"""
-        endpoint = "user/friends/unwatch/" + username
+        endpoint = "/user/friends/unwatch/" + username
         return self._call(
             endpoint, method="POST", public=False, fatal=False,
         ).get("success")
@@ -1217,7 +1231,7 @@ class DeviantartOAuthAPI():

     def _call(self, endpoint, fatal=True, public=True, **kwargs):
         """Call an API endpoint"""
-        url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint
+        url = "https://www.deviantart.com/api/v1/oauth2" + endpoint
         kwargs["fatal"] = None

         while True:
@@ -1357,7 +1371,7 @@ class DeviantartEclipseAPI():
         self.log = extractor.log

     def deviation_extended_fetch(self, deviation_id, user=None, kind=None):
-        endpoint = "da-browse/shared_api/deviation/extended_fetch"
+        endpoint = "/da-browse/shared_api/deviation/extended_fetch"
         params = {
             "deviationid"   : deviation_id,
             "username"      : user,
@@ -1367,7 +1381,7 @@ class DeviantartEclipseAPI():
         return self._call(endpoint, params)

     def gallery_scraps(self, user, offset=None):
-        endpoint = "da-user-profile/api/gallery/contents"
+        endpoint = "/da-user-profile/api/gallery/contents"
         params = {
             "username"      : user,
             "offset"        : offset,
@@ -1377,7 +1391,7 @@ class DeviantartEclipseAPI():
         return self._pagination(endpoint, params)

     def user_watching(self, user, offset=None):
-        endpoint = "da-user-profile/api/module/watching"
+        endpoint = "/da-user-profile/api/module/watching"
         params = {
             "username": user,
             "moduleid": self._module_id_watching(user),
@@ -1387,7 +1401,7 @@ class DeviantartEclipseAPI():
         return self._pagination(endpoint, params)

     def _call(self, endpoint, params=None):
-        url = "https://www.deviantart.com/_napi/" + endpoint
+        url = "https://www.deviantart.com/_napi" + endpoint
         headers = {"Referer": "https://www.deviantart.com/"}
         response = self.extractor._limited_request(
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 7ffb214..cf9706b 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -176,6 +176,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         self.image_token = match.group(4)
         self.image_num = text.parse_int(match.group(6), 1)

+        source = self.config("source")
+        if source == "hitomi":
+            self.items = self._items_hitomi
+
     def items(self):
         self.login()

@@ -221,6 +225,18 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             data["_http_validate"] = None
             yield Message.Url, url, data

+    def _items_hitomi(self):
+        if self.config("metadata", False):
+            data = self.metadata_from_api()
+            data["date"] = text.parse_timestamp(data["posted"])
+        else:
+            data = {}
+
+        from .hitomi import HitomiGalleryExtractor
+        url = "https://hitomi.la/galleries/{}.html".format(self.gallery_id)
+        data["_extractor"] = HitomiGalleryExtractor
+        yield Message.Queue, url, data
+
     def get_metadata(self, page):
         """Extract gallery metadata"""
         data = self.metadata_from_page(page)
diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py
index cc6ee97..ef79808 100644
--- a/gallery_dl/extractor/fanbox.py
+++ b/gallery_dl/extractor/fanbox.py
@@ -33,7 +33,7 @@ class FanboxExtractor(Extractor):

     def items(self):
         if self._warning:
-            if "FANBOXSESSID" not in self.session.cookies:
+            if not self._check_cookies(("FANBOXSESSID",)):
                 self.log.warning("no 'FANBOXSESSID' cookie set")
             FanboxExtractor._warning = False

@@ -280,3 +280,24 @@ class FanboxPostExtractor(FanboxExtractor):

     def posts(self):
         return (self._get_post_data_from_id(self.post_id),)
+
+
+class FanboxRedirectExtractor(Extractor):
+    """Extractor for pixiv redirects to fanbox.cc"""
+    category = "fanbox"
+    subcategory = "redirect"
+    pattern = r"(?:https?://)?(?:www\.)?pixiv\.net/fanbox/creator/(\d+)"
+    test = ("https://www.pixiv.net/fanbox/creator/52336352", {
+        "pattern": FanboxCreatorExtractor.pattern,
+    })
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.user_id = match.group(1)
+
+    def items(self):
+        url = "https://www.pixiv.net/fanbox/creator/" + self.user_id
+        data = {"_extractor": FanboxCreatorExtractor}
+        response = self.request(
+            url, method="HEAD", allow_redirects=False, notfound="user")
+        yield Message.Queue, response.headers["Location"], data
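FanboxRedirectExtractor above resolves the pixiv redirect without fetching a response body: a HEAD request with redirects disabled exposes the target in the Location header. The same idea with plain requests:

    import requests

    response = requests.head(
        "https://www.pixiv.net/fanbox/creator/52336352",
        allow_redirects=False,  # keep the 3xx response instead of following it
    )
    if 300 <= response.status_code < 400:
        target = response.headers["Location"]  # e.g. a *.fanbox.cc URL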
diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py
index 62f7429..89a965f 100644
--- a/gallery_dl/extractor/fantia.py
+++ b/gallery_dl/extractor/fantia.py
@@ -22,7 +22,7 @@ class FantiaExtractor(Extractor):

     def items(self):
         if self._warning:
-            if "_session_id" not in self.session.cookies:
+            if not self._check_cookies(("_session_id",)):
                 self.log.warning("no '_session_id' cookie set")
             FantiaExtractor._warning = False
diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py
index 6c5c7df..2bd8c6b 100644
--- a/gallery_dl/extractor/flickr.py
+++ b/gallery_dl/extractor/flickr.py
@@ -56,7 +56,7 @@ class FlickrImageExtractor(FlickrExtractor):
     subcategory = "image"
     pattern = (r"(?:https?://)?(?:"
                r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/"
-               r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
+               r"|[\w-]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)"
                r"|flic\.kr/p/([A-Za-z1-9]+))")
     test = (
         ("https://www.flickr.com/photos/departingyyz/16089302239", {
diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py
index b5ecbd6..891e0c1 100644
--- a/gallery_dl/extractor/furaffinity.py
+++ b/gallery_dl/extractor/furaffinity.py
@@ -22,6 +22,7 @@ class FuraffinityExtractor(Extractor):
     archive_fmt = "{id}"
     cookiedomain = ".furaffinity.net"
     root = "https://www.furaffinity.net"
+    _warning = True

     def __init__(self, match):
         Extractor.__init__(self, match)
@@ -32,6 +33,12 @@ class FuraffinityExtractor(Extractor):
             self._process_description = str.strip

     def items(self):
+
+        if self._warning:
+            if not self._check_cookies(("a", "b")):
+                self.log.warning("no 'a' and 'b' session cookies set")
+            FuraffinityExtractor._warning = False
+
         external = self.config("external", False)
         metadata = self.metadata()
         for post_id in util.advance(self.posts(), self.offset):
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
new file mode 100644
index 0000000..bece905
--- /dev/null
+++ b/gallery_dl/extractor/generic.py
@@ -0,0 +1,208 @@
+# -*- coding: utf-8 -*-
+
+"""Extractor for images in a generic web page."""
+
+from .common import Extractor, Message
+from .. import config, text
+import re
+import os.path
+
+
+class GenericExtractor(Extractor):
+    """Extractor for images in a generic web page."""
+
+    category = "generic"
+    directory_fmt = ("{category}", "{pageurl}")
+    archive_fmt = "{imageurl}"
+
+    # By default, the generic extractor is disabled
+    # and the "g(eneric):" prefix in url is required.
+    # If the extractor is enabled, make the prefix optional
+    pattern = r"(?ix)(?P<generic>g(?:eneric)?:)"
+    if config.get(("extractor", "generic"), "enabled"):
+        pattern += r"?"
+
+    # The generic extractor pattern should match (almost) any valid url
+    # Based on: https://tools.ietf.org/html/rfc3986#appendix-B
+    pattern += r"""
+        (?P<scheme>https?://)?          # optional http(s) scheme
+        (?P<domain>[-\w\.]+)            # required domain
+        (?P<path>/[^?&#]*)?             # optional path
+        (?:\?(?P<query>[^/?#]*))?       # optional query
+        (?:\#(?P<fragment>.*))?$        # optional fragment
+    """
+
+    def __init__(self, match):
+        """Init."""
+        Extractor.__init__(self, match)
+
+        # Strip the "g(eneric):" prefix
+        # and inform about "forced" or "fallback" mode
+        if match.group('generic'):
+            self.log.info("Forcing use of generic information extractor.")
+            self.url = match.group(0).partition(":")[2]
+        else:
+            self.log.info("Falling back on generic information extractor.")
+            self.url = match.group(0)
+
+        # Make sure we have a scheme, or use https
+        if match.group('scheme'):
+            self.scheme = match.group('scheme')
+        else:
+            self.scheme = 'https://'
+            self.url = self.scheme + self.url
+
+        # Used to resolve relative image urls
+        self.root = self.scheme + match.group('domain')
+
+    def items(self):
+        """Get page, extract metadata & images, yield them in suitable messages.
+
+        Adapted from common.GalleryExtractor.items()
+
+        """
+        page = self.request(self.url).text
+        data = self.metadata(page)
+        imgs = self.images(page)
+
+        try:
+            data["count"] = len(imgs)
+        except TypeError:
+            pass
+        images = enumerate(imgs, 1)
+
+        yield Message.Version, 1
+        yield Message.Directory, data
+
+        for data["num"], (url, imgdata) in images:
+            if imgdata:
+                data.update(imgdata)
+                if "extension" not in imgdata:
+                    text.nameext_from_url(url, data)
+            else:
+                text.nameext_from_url(url, data)
+            yield Message.Url, url, data
+
+    def metadata(self, page):
+        """Extract generic webpage metadata, return them in a dict."""
+        data = {}
+        data['pageurl'] = self.url
+        data['title'] = text.extract(page, '<title>', "</title>")[0] or ""
+        data['description'] = text.extract(
+            page, '<meta name="description" content="', '"')[0] or ""
+        data['keywords'] = text.extract(
+            page, '<meta name="keywords" content="', '"')[0] or ""
+        data['language'] = text.extract(
+            page, '<meta name="language" content="', '"')[0] or ""
+        data['name'] = text.extract(
+            page, '<meta itemprop="name" content="', '"')[0] or ""
+        data['copyright'] = text.extract(
+            page, '<meta name="copyright" content="', '"')[0] or ""
+        data['og_site'] = text.extract(
+            page, '<meta property="og:site" content="', '"')[0] or ""
+        data['og_site_name'] = text.extract(
+            page, '<meta property="og:site_name" content="', '"')[0] or ""
+        data['og_title'] = text.extract(
+            page, '<meta property="og:title" content="', '"')[0] or ""
+        data['og_description'] = text.extract(
+            page, '<meta property="og:description" content="', '"')[0] or ""
+
+        data = {k: text.unescape(data[k]) for k in data if data[k] != ""}
+
+        return data
+
+    def images(self, page):
+        """Extract image urls, return a list of (image url, metadata) tuples.
+
+        The extractor aims at finding as many _likely_ image urls as possible,
+        using two strategies (see below); since these often overlap, any
+        duplicate urls will be removed at the end of the process.
+
+        Note: since we are using re.findall() (see below), it's essential that
+        the following patterns contain 0 or at most 1 capturing group, so that
+        re.findall() return a list of urls (instead of a list of tuples of
+        matching groups). All other groups used in the pattern should be
+        non-capturing (?:...).
+
+        1: Look in src/srcset attributes of img/video/source elements
+
+        See:
+        https://www.w3schools.com/tags/att_src.asp
+        https://www.w3schools.com/tags/att_source_srcset.asp
+
+        We allow both absolute and relative urls here.
+
+        Note that srcset attributes often contain multiple space separated
+        image urls; this pattern matches only the first url; remaining urls
+        will be matched by the "imageurl_pattern_ext" pattern below.
+        """
+        imageurl_pattern_src = r"""(?ix)
+            <(?:img|video|source)\s.*?      # <img>, <video> or <source>
+            src(?:set)?=["']?               # src or srcset attributes
+            (?P<URL>[^"'\s>]+)              # url
+        """
+
+        """
+        2: Look anywhere for urls containing common image/video extensions
+
+        The list of allowed extensions is borrowed from the directlink.py
+        extractor; other could be added, see
+        https://en.wikipedia.org/wiki/List_of_file_formats
+
+        Compared to the "pattern" class variable, here we must exclude also
+        other special characters (space, ", ', >), since we are looking for
+        urls in html tags.
+        """
+
+        imageurl_pattern_ext = r"""(?ix)
+            (?:[^?&#"'>\s]+)                    # anything until dot+extension
+            \.(?:jpe?g|jpe|png|gif
+               |web[mp]|mp4|mkv|og[gmv]|opus)   # dot + image/video extensions
+            (?:[^"'>\s]*)?                      # optional query and fragment
+        """
+
+        imageurls_src = re.findall(imageurl_pattern_src, page)
+        imageurls_ext = re.findall(imageurl_pattern_ext, page)
+        imageurls = imageurls_src + imageurls_ext
+
+        # Resolve relative urls
+        #
+        # Image urls catched so far may be relative, so we must resolve them
+        # by prepending a suitable base url.
+        #
+        # If the page contains a <base> element, use it as base url
+        basematch = re.search(
+            r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)", page)
+        if basematch:
+            self.baseurl = basematch.group('url').rstrip('/')
+        # Otherwise, extract the base url from self.url
+        else:
+            if self.url.endswith("/"):
+                self.baseurl = self.url.rstrip('/')
+            else:
+                self.baseurl = os.path.dirname(self.url)
+
+        # Build the list of absolute image urls
+        absimageurls = []
+        for u in imageurls:
+            # Absolute urls are taken as-is
+            if u.startswith('http'):
+                absimageurls.append(u)
+            # // relative urls are prefixed with current scheme
+            elif u.startswith('//'):
+                absimageurls.append(self.scheme + u.lstrip('/'))
+            # / relative urls are prefixed with current scheme+domain
+            elif u.startswith('/'):
+                absimageurls.append(self.root + u)
+            # other relative urls are prefixed with baseurl
+            else:
+                absimageurls.append(self.baseurl + '/' + u)
+
+        # Remove duplicates
+        absimageurls = set(absimageurls)
+
+        # Create the image metadata dict and add imageurl to it
+        # (image filename and extension are added by items())
+        images = [(u, {'imageurl': u}) for u in absimageurls]
+
+        return images
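The hand-rolled URL resolution in generic.py (scheme-relative, root-relative, and path-relative branches) closely mirrors what urllib.parse.urljoin does; keeping it inline keeps the base-URL handling next to the <base>-tag logic. For comparison, a standard-library sketch with a hypothetical page URL:

    from urllib.parse import urljoin

    base = "https://example.org/gallery/page.html"
    for u in ("//cdn.example.org/a.jpg", "/img/b.png", "c.gif"):
        print(urljoin(base, u))
    # https://cdn.example.org/a.jpg
    # https://example.org/img/b.png
    # https://example.org/gallery/c.gif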
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index a4ce925..88cf98c 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -10,9 +10,11 @@

 from .common import GalleryExtractor, Extractor, Message
 from .nozomi import decode_nozomi
+from ..cache import memcache
 from .. import text, util
 import string
 import json
+import re


 class HitomiGalleryExtractor(GalleryExtractor):
@@ -24,8 +26,10 @@ class HitomiGalleryExtractor(GalleryExtractor):
                r"/(?:[^/?#]+-)?(\d+)")
     test = (
         ("https://hitomi.la/galleries/867789.html", {
-            "pattern": r"https://[a-c]b.hitomi.la/images/./../[0-9a-f]+.jpg",
+            "pattern": r"https://[a-c]b.hitomi.la/images/1639745412/\d+"
+                       r"/[0-9a-f]{64}\.jpg",
             "keyword": "4873ef9a523621fc857b114e0b2820ba4066e9ae",
+            "options": (("metadata", True),),
             "count": 16,
         }),
         # download test
@@ -35,12 +39,12 @@ class HitomiGalleryExtractor(GalleryExtractor):
         }),
         # Game CG with scenes (#321)
         ("https://hitomi.la/galleries/733697.html", {
-            "url": "0cb629ab2bfe93d994a7972f68ad2a5a64ecc161",
+            "url": "479d16fe92117a6a2ce81b4e702e6347922c81e3",
             "count": 210,
         }),
         # fallback for galleries only available through /reader/ URLs
         ("https://hitomi.la/galleries/1045954.html", {
-            "url": "b420755d56a1135104ca8ca0765f44e290db70c3",
+            "url": "ebc1415c5d7f634166ef7e2635b77735de1ea7a2",
             "count": 1413,
         }),
         # gallery with "broken" redirect
@@ -71,7 +75,7 @@ class HitomiGalleryExtractor(GalleryExtractor):
         self.info = info = json.loads(page.partition("=")[2])

         data = self._data_from_gallery_info(info)
-        if self.config("metadata", True):
+        if self.config("metadata", False):
             data.update(self._data_from_gallery_page(info))
         return data

@@ -133,19 +137,19 @@ class HitomiGalleryExtractor(GalleryExtractor):
         }

     def images(self, _):
+        # see https://ltn.hitomi.la/gg.js
+        gg_m, gg_b = _parse_gg(self)
+
         result = []
         for image in self.info["files"]:
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])

             # see https://ltn.hitomi.la/common.js
-            inum = int(ihash[-3:-1], 16)
-            offset = 1 if inum < 0x7c else 0
-
+            inum = int(ihash[-1] + ihash[-3:-1], 16)
             url = "https://{}b.hitomi.la/images/{}/{}/{}.{}".format(
-                chr(97 + offset),
-                ihash[-1], ihash[-3:-1], ihash,
-                idata["extension"],
+                chr(97 + gg_m.get(inum, 0)),
+                gg_b, inum, ihash, idata["extension"],
             )
             result.append((url, idata))
         return result
@@ -185,3 +189,16 @@ class HitomiTagExtractor(Extractor):
         for gallery_id in decode_nozomi(self.request(url).content):
             url = "https://hitomi.la/galleries/{}.html".format(gallery_id)
             yield Message.Queue, url, data
+
+
+@memcache()
+def _parse_gg(extr):
+    page = extr.request("https://ltn.hitomi.la/gg.js").text
+
+    m = {
+        int(match.group(1)): int(match.group(2))
+        for match in re.finditer(r"case (\d+): o = (\d+); break;", page)
+    }
+    b = re.search(r"b:\s*[\"'](.+)[\"']", page)
+
+    return m, b.group(1).strip("/")
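The rewritten hitomi URL logic depends on two values scraped from ltn.hitomi.la/gg.js: a case-to-offset map (m) for picking the subdomain and a path prefix (b), both cached via @memcache. A toy run of the same regexes against a made-up gg.js body:

    import re

    gg_js = "case 10: o = 1; break; case 123: o = 1; break; b: '1639745412/'"

    m = {
        int(match.group(1)): int(match.group(2))
        for match in re.finditer(r"case (\d+): o = (\d+); break;", gg_js)
    }
    b = re.search(r"b:\s*[\"'](.+)[\"']", gg_js)
    print(m)                      # {10: 1, 123: 1}
    print(b.group(1).strip("/"))  # 1639745412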
diff --git a/gallery_dl/extractor/imgbb.py b/gallery_dl/extractor/imgbb.py
index 1e875f0..f32093a 100644
--- a/gallery_dl/extractor/imgbb.py
+++ b/gallery_dl/extractor/imgbb.py
@@ -169,7 +169,7 @@ class ImgbbAlbumExtractor(ImgbbExtractor):
 class ImgbbUserExtractor(ImgbbExtractor):
     """Extractor for user profiles in imgbb.com"""
     subcategory = "user"
-    pattern = r"(?:https?://)?([^.]+)\.imgbb\.com/?(?:\?([^#]+))?$"
+    pattern = r"(?:https?://)?([\w-]+)\.imgbb\.com/?(?:\?([^#]+))?$"
     test = ("https://folkie.imgbb.com", {
         "range": "1-80",
         "pattern": r"https?://i\.ibb\.co/\w+/[^/?#]+",
diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py
index 3d09d79..8ee8ca9 100644
--- a/gallery_dl/extractor/inkbunny.py
+++ b/gallery_dl/extractor/inkbunny.py
@@ -205,6 +205,28 @@ class InkbunnyFavoriteExtractor(InkbunnyExtractor):
         return self.api.search(params)


+class InkbunnySearchExtractor(InkbunnyExtractor):
+    """Extractor for inkbunny search results"""
+    subcategory = "search"
+    pattern = (BASE_PATTERN +
+               r"/submissionsviewall\.php\?([^#]+&mode=search&[^#]+)")
+    test = (("https://inkbunny.net/submissionsviewall.php?rid=ffffffffff"
+             "&mode=search&page=1&orderby=create_datetime&text=cute"
+             "&stringtype=and&keywords=yes&title=yes&description=no&artist="
+             "&favsby=&type=&days=&keyword_id=&user_id=&random=&md5="), {
+        "range": "1-10",
+        "count": 10,
+    })
+
+    def __init__(self, match):
+        InkbunnyExtractor.__init__(self, match)
+        self.params = text.parse_query(match.group(1))
+        self.params.pop("rid", None)
+
+    def posts(self):
+        return self.api.search(self.params)
+
+
 class InkbunnyFollowingExtractor(InkbunnyExtractor):
     """Extractor for inkbunny user watches"""
     subcategory = "following"
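The new inkbunny search extractor reuses a result page's full query string as API parameters, dropping only rid, which appears to identify a cached result set on the server and would otherwise pin the search to a stale session. A sketch of that cleanup with the standard library (gallery-dl itself uses text.parse_query):

    from urllib.parse import parse_qsl

    qs = "rid=ffffffffff&mode=search&text=cute&keywords=yes"
    params = dict(parse_qsl(qs))
    params.pop("rid", None)
    # params == {'mode': 'search', 'text': 'cute', 'keywords': 'yes'}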
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index a1dd465..781bf01 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -174,10 +174,16 @@ class InstagramExtractor(Extractor):
         if post.get("is_video") and "video_url" not in post:
             url = "{}/tv/{}/".format(self.root, post["shortcode"])
             post = self._extract_post_page(url)
+            if "items" in post:
+                return self._parse_post_api({"media": post["items"][0]})
+            post = post["graphql"]["shortcode_media"]

         elif typename == "GraphSidecar" and \
                 "edge_sidecar_to_children" not in post:
             url = "{}/p/{}/".format(self.root, post["shortcode"])
             post = self._extract_post_page(url)
+            if "items" in post:
+                return self._parse_post_api({"media": post["items"][0]})
+            post = post["graphql"]["shortcode_media"]

         owner = post["owner"]
         data = {
@@ -347,7 +353,7 @@ class InstagramExtractor(Extractor):
         data = self._extract_shared_data(url)["entry_data"]
         if "HttpErrorPage" in data:
             raise exception.NotFoundError("post")
-        return data["PostPage"][0]["graphql"]["shortcode_media"]
+        return data["PostPage"][0]

     def _get_edge_data(self, user, key):
         cursor = self.config("cursor")
@@ -564,7 +570,7 @@ class InstagramPostExtractor(InstagramExtractor):
     """Extractor for an Instagram post"""
     subcategory = "post"
     pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/(?:p|tv|reel)/([^/?#]+)")
+               r"/(?:[^/?#]+/)?(?:p|tv|reel)/([^/?#]+)")
     test = (
         # GraphImage
         ("https://www.instagram.com/p/BqvsDleB3lV/", {
@@ -663,6 +669,9 @@ class InstagramPostExtractor(InstagramExtractor):
             }
         }),

+        # URL with username (#2085)
+        ("https://www.instagram.com/dm/p/CW042g7B9CY/"),
+
         ("https://www.instagram.com/reel/CDg_6Y1pxWu/"),
     )

@@ -686,14 +695,15 @@ class InstagramStoriesExtractor(InstagramExtractor):
     """Extractor for Instagram stories"""
     subcategory = "stories"
     pattern = (r"(?:https?://)?(?:www\.)?instagram\.com"
-               r"/stories/(?:highlights/(\d+)|([^/?#]+))")
+               r"/stories/(?:highlights/(\d+)|([^/?#]+)(?:/(\d+))?)")
     test = (
         ("https://www.instagram.com/stories/instagram/"),
         ("https://www.instagram.com/stories/highlights/18042509488170095/"),
+        ("https://instagram.com/stories/geekmig/2724343156064789461"),
     )

     def __init__(self, match):
-        self.highlight_id, self.user = match.groups()
+        self.highlight_id, self.user, self.media_id = match.groups()
         if self.highlight_id:
             self.subcategory = InstagramHighlightsExtractor.subcategory
         InstagramExtractor.__init__(self, match)
@@ -712,7 +722,18 @@ class InstagramStoriesExtractor(InstagramExtractor):

         endpoint = "/v1/feed/reels_media/"
         params = {"reel_ids": reel_id}
-        return self._request_api(endpoint, params=params)["reels"].values()
+        reels = self._request_api(endpoint, params=params)["reels"]
+
+        if self.media_id:
+            reel = reels[reel_id]
+            for item in reel["items"]:
+                if item["pk"] == self.media_id:
+                    reel["items"] = (item,)
+                    break
+            else:
+                raise exception.NotFoundError("story")
+
+        return reels.values()
diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py
index 4012760..50ce0d3 100644
--- a/gallery_dl/extractor/keenspot.py
+++ b/gallery_dl/extractor/keenspot.py
@@ -19,7 +19,7 @@ class KeenspotComicExtractor(Extractor):
     directory_fmt = ("{category}", "{comic}")
     filename_fmt = "{filename}.{extension}"
     archive_fmt = "{comic}_{filename}"
-    pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?"
+    pattern = r"(?:https?://)?(?!www\.|forums\.)([\w-]+)\.keenspot\.com(/.+)?"
     test = (
         ("http://marksmen.keenspot.com/", {  # link
             "range": "1-3",
diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py
index 6483278..f1d7bcf 100644
--- a/gallery_dl/extractor/kemonoparty.py
+++ b/gallery_dl/extractor/kemonoparty.py
@@ -14,7 +14,7 @@ from ..cache import cache
 import itertools
 import re

-BASE_PATTERN = r"(?:https?://)?(?:www\.)?kemono\.party"
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?(kemono|coomer)\.party"
 USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)"

@@ -27,17 +27,30 @@ class KemonopartyExtractor(Extractor):
     archive_fmt = "{service}_{user}_{id}_{num}"
     cookiedomain = ".kemono.party"

+    def __init__(self, match):
+        if match.group(1) == "coomer":
+            self.category = "coomerparty"
+            self.root = "https://coomer.party"
+            self.cookiedomain = ".coomer.party"
+        Extractor.__init__(self, match)
+
     def items(self):
         self._prepare_ddosguard_cookies()

         self._find_inline = re.compile(
-            r'src="(?:https?://kemono\.party)?(/inline/[^"]+'
+            r'src="(?:https?://(?:kemono|coomer)\.party)?(/inline/[^"]+'
            r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall
         find_hash = re.compile("/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})").match
         generators = self._build_file_generators(self.config("files"))
         comments = self.config("comments")
         username = dms = None

+        # prevent files from coomer.party to be sent with gzip compression
+        if "coomer" in self.root:
+            headers = {"Accept-Encoding": "identity"}
+        else:
+            headers = None
+
         if self.config("metadata"):
             username = text.unescape(text.extract(
                 self.request(self.user_url).text,
@@ -83,10 +96,11 @@ class KemonopartyExtractor(Extractor):
                 post["type"] = file["type"]
                 post["num"] += 1
+                post["_http_headers"] = headers

                 if url[0] == "/":
                     url = self.root + "/data" + url
-                elif url.startswith("https://kemono.party"):
+                elif url.startswith(self.root):
                     url = self.root + "/data" + url[20:]

                 text.nameext_from_url(file["name"], post)
@@ -129,7 +143,7 @@ class KemonopartyExtractor(Extractor):

     def _build_file_generators(self, filetypes):
         if filetypes is None:
-            return (self._file, self._attachments, self._inline)
+            return (self._attachments, self._file, self._inline)
         genmap = {
             "file"       : self._file,
             "attachments": self._attachments,
@@ -191,8 +205,9 @@ class KemonopartyUserExtractor(KemonopartyExtractor):
     )

     def __init__(self, match):
+        _, service, user_id, offset = match.groups()
+        self.subcategory = service
         KemonopartyExtractor.__init__(self, match)
-        service, user_id, offset = match.groups()
         self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id)
         self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
         self.offset = text.parse_int(offset)
@@ -233,7 +248,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
             "published": "Sun, 11 Aug 2019 02:09:04 GMT",
             "service": "fanbox",
             "shared_file": False,
-            "subcategory": "post",
+            "subcategory": "fanbox",
             "title": "c96取り置き",
             "type": "file",
             "user": "6993449",
@@ -249,7 +264,7 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
         # kemono.party -> data.kemono.party
         ("https://kemono.party/gumroad/user/trylsc/post/IURjT", {
             "pattern": r"https://kemono\.party/data/("
-                       r"files/gumroad/trylsc/IURjT/reward8\.jpg|"
+                       r"a4/7b/a47bfe938d8c1682eef06e885927484cd8df1b.+\.jpg|"
                        r"c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)",
         }),
         # username (#1548, #1652)
@@ -272,13 +287,19 @@ class KemonopartyPostExtractor(KemonopartyExtractor):
                 "date": "2021-07-31 02:47:51.327865",
             }]},
         }),
+        # coomer.party (#2100)
+        ("https://coomer.party/onlyfans/user/alinity/post/125962203", {
+            "pattern": r"https://coomer\.party/data/7d/3f/7d3fd9804583dc224968"
+                       r"c0591163ec91794552b04f00a6c2f42a15b68231d5a8\.jpg",
+        }),
         ("https://kemono.party/subscribestar/user/alcorart/post/184330"),
         ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"),
     )

     def __init__(self, match):
+        _, service, user_id, post_id = match.groups()
+        self.subcategory = service
         KemonopartyExtractor.__init__(self, match)
-        service, user_id, post_id = match.groups()
         self.api_url = "{}/api/{}/user/{}/post/{}".format(
             self.root, service, user_id, post_id)
         self.user_url = "{}/{}/user/{}".format(self.root, service, user_id)
@@ -319,7 +340,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):

     def __init__(self, match):
         KemonopartyExtractor.__init__(self, match)
-        self.server, self.channel, self.channel_name = match.groups()
+        _, self.server, self.channel, self.channel_name = match.groups()

     def items(self):
         self._prepare_ddosguard_cookies()
@@ -353,7 +374,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor):
             url = file["path"]
             if url[0] == "/":
                 url = self.root + "/data" + url
-            elif url.startswith("https://kemono.party"):
+            elif url.startswith(self.root):
                 url = self.root + "/data" + url[20:]

             text.nameext_from_url(file["name"], post)
@@ -392,7 +413,7 @@ class KemonopartyDiscordServerExtractor(KemonopartyExtractor):

     def __init__(self, match):
         KemonopartyExtractor.__init__(self, match)
-        self.server = match.group(1)
+        self.server = match.group(2)

     def items(self):
         url = "{}/api/discord/channels/lookup?q={}".format(
diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py
new file mode 100644
index 0000000..cdaf22b
--- /dev/null
+++ b/gallery_dl/extractor/lolisafe.py
@@ -0,0 +1,79 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for lolisafe/chibisafe instances"""
+
+from .common import BaseExtractor, Message
+from .. import text
+
+
+class LolisafeExtractor(BaseExtractor):
+    """Base class for lolisafe extractors"""
+    basecategory = "lolisafe"
+    directory_fmt = ("{category}", "{album_name} ({album_id})")
+    archive_fmt = "{album_id}_{id}"
+
+
+BASE_PATTERN = LolisafeExtractor.update({
+    "bunkr": {"root": "https://bunkr.is", "pattern": r"bunkr\.(?:is|to)"},
+    "zzzz" : {"root": "https://zz.ht" , "pattern": r"zz\.(?:ht|fo)"},
+})
+
+
+class LolisafelbumExtractor(LolisafeExtractor):
+    subcategory = "album"
+    pattern = BASE_PATTERN + "/a/([^/?#]+)"
+    test = (
+        ("https://bunkr.is/a/Lktg9Keq", {
+            "pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
+            "content": "0c8768055e4e20e7c7259608b67799171b691140",
+            "keyword": {
+                "album_id": "Lktg9Keq",
+                "album_name": 'test テスト "&>',
+                "count": 1,
+                "filename": 'test-テスト-"&>-QjgneIQv',
+                "id": "QjgneIQv",
+                "name": 'test-テスト-"&>',
+                "num": int,
+            },
+        }),
+        ("https://bunkr.to/a/Lktg9Keq"),
+        ("https://zz.ht/a/lop7W6EZ", {
+            "pattern": r"https://z\.zz\.fo/(4anuY|ih560)\.png",
+            "count": 2,
+            "keyword": {
+                "album_id": "lop7W6EZ",
+                "album_name": "ferris",
+            },
+        }),
+        ("https://zz.fo/a/lop7W6EZ"),
+    )
+
+    def __init__(self, match):
+        LolisafeExtractor.__init__(self, match)
+        self.album_id = match.group(match.lastindex)
+
+    def items(self):
+        files, data = self.fetch_album(self.album_id)
+
+        yield Message.Directory, data
+        for data["num"], file in enumerate(files, 1):
+            url = file["file"]
+            text.nameext_from_url(url, data)
+            data["name"], sep, data["id"] = data["filename"].rpartition("-")
+            yield Message.Url, url, data
+
+    def fetch_album(self, album_id):
+        url = "{}/api/album/get/{}".format(self.root, album_id)
+        data = self.request(url).json()
+
+        return data["files"], {
+            "album_id" : self.album_id,
+            "album_name": text.unescape(data["title"]),
+            "count"    : data["count"],
+        }
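Extractors built on this base only implement fetch_album() and return a (files, metadata) pair, as the reworked cyberdrop extractor earlier in this patch does; items() then handles the directory message, name/id splitting, and numbering. A minimal hypothetical subclass for an imaginary lolisafe host (a real instance would instead be registered through LolisafeExtractor.update() as above):

    class ExampleAlbumExtractor(LolisafelbumExtractor):
        category = "example"                  # hypothetical host
        root = "https://safe.example.org"
        pattern = r"(?:https?://)?safe\.example\.org/a/([^/?#]+)"

        def fetch_album(self, album_id):
            url = "{}/api/album/get/{}".format(self.root, album_id)
            data = self.request(url).json()
            return data["files"], {
                "album_id"  : album_id,
                "album_name": data["title"],
                "count"     : data["count"],
            }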
diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py
index 5c202f3..f06ab70 100644
--- a/gallery_dl/extractor/myportfolio.py
+++ b/gallery_dl/extractor/myportfolio.py
@@ -20,8 +20,8 @@ class MyportfolioGalleryExtractor(Extractor):
     filename_fmt = "{num:>02}.{extension}"
     archive_fmt = "{user}_{filename}"
     pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
-               r"(?:https?://)?([^.]+\.myportfolio\.com))"
-               r"(/[^/?#]+)?")
+               r"(?:https?://)?([\w-]+\.myportfolio\.com))"
+               r"(/[^/?&#]+)?")
     test = (
         ("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", {
             "url": "acea0690c76db0e5cf267648cefd86e921bc3499",
diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py
index a699401..4351b3e 100644
--- a/gallery_dl/extractor/newgrounds.py
+++ b/gallery_dl/extractor/newgrounds.py
@@ -420,7 +420,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
     """Extractor for posts favorited by a newgrounds user"""
     subcategory = "favorite"
     directory_fmt = ("{category}", "{user}", "Favorites")
-    pattern = (r"(?:https?://)?([^.]+)\.newgrounds\.com"
+    pattern = (r"(?:https?://)?([\w-]+)\.newgrounds\.com"
               r"/favorites(?!/following)(?:/(art|audio|movies))?/?")
     test = (
         ("https://tomfulp.newgrounds.com/favorites/art", {
@@ -475,7 +475,7 @@ class NewgroundsFavoriteExtractor(NewgroundsExtractor):
 class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
     """Extractor for a newgrounds user's favorited users"""
     subcategory = "following"
-    pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/favorites/(following)"
+    pattern = r"(?:https?://)?([\w-]+)\.newgrounds\.com/favorites/(following)"
     test = ("https://tomfulp.newgrounds.com/favorites/following", {
         "pattern": NewgroundsUserExtractor.pattern,
         "range": "76-125",
diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py
index 62e4f58..f8c80ef 100644
--- a/gallery_dl/extractor/patreon.py
+++ b/gallery_dl/extractor/patreon.py
@@ -29,7 +29,7 @@ class PatreonExtractor(Extractor):

     def items(self):
         if self._warning:
-            if "session_id" not in self.session.cookies:
+            if not self._check_cookies(("session_id",)):
                 self.log.warning("no 'session_id' cookie set")
             PatreonExtractor._warning = False
         generators = self._build_file_generators(self.config("files"))
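patreon.py joins fanbox, fantia, and furaffinity above in replacing ad-hoc "name in self.session.cookies" tests with a shared _check_cookies() helper. Its implementation is not part of this diff; presumably it verifies that every named cookie is present, roughly like this sketch (the real helper lives elsewhere in extractor/common.py):

    def _check_cookies(self, cookienames):
        # Sketch only: return True when every named cookie exists in the
        # session's cookie jar; the real helper can additionally scope the
        # lookup to self.cookiedomain, which a plain "in" test cannot.
        for name in cookienames:
            if not any(cookie.name == name for cookie in self.session.cookies):
                return False
        return True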
diff --git a/gallery_dl/extractor/philomena.py b/gallery_dl/extractor/philomena.py
index 51a0d38..6377fb0 100644
--- a/gallery_dl/extractor/philomena.py
+++ b/gallery_dl/extractor/philomena.py
@@ -46,7 +46,7 @@ class PhilomenaExtractor(BooruExtractor):
         try:
             params["filter_id"] = INSTANCES[self.category]["filter_id"]
         except (KeyError, TypeError):
-            pass
+            params["filter_id"] = "2"

         while True:
             data = self.request(url, params=params).json()
diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py
index bea0276..1993ab6 100644
--- a/gallery_dl/extractor/photobucket.py
+++ b/gallery_dl/extractor/photobucket.py
@@ -21,8 +21,8 @@ class PhotobucketAlbumExtractor(Extractor):
     directory_fmt = ("{category}", "{username}", "{location}")
     filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}"
     archive_fmt = "{id}"
-    pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)"
-               r"/user/[^/?#]+/library(?:/[^?#]*)?")
+    pattern = (r"(?:https?://)?((?:[\w-]+\.)?photobucket\.com)"
+               r"/user/[^/?&#]+/library(?:/[^?&#]*)?")
     test = (
         ("https://s369.photobucket.com/user/CrpyLrkr/library", {
             "pattern": r"https?://[oi]+\d+.photobucket.com/albums/oo139/",
@@ -109,9 +109,9 @@ class PhotobucketImageExtractor(Extractor):
     directory_fmt = ("{category}", "{username}")
     filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}"
     archive_fmt = "{username}_{id}"
-    pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com"
-               r"(?:/gallery/user/([^/?#]+)/media/([^/?#]+)"
-               r"|/user/([^/?#]+)/media/[^?#]+\.html)")
+    pattern = (r"(?:https?://)?(?:[\w-]+\.)?photobucket\.com"
+               r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)"
+               r"|/user/([^/?&#]+)/media/[^?&#]+\.html)")
     test = (
         (("https://s271.photobucket.com/user/lakerfanryan"
           "/media/Untitled-3-1.jpg.html"), {
diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py
index 8e47e2e..8943747 100644
--- a/gallery_dl/extractor/pixiv.py
+++ b/gallery_dl/extractor/pixiv.py
@@ -456,7 +456,9 @@ class PixivSearchExtractor(PixivExtractor):
         self.sort = self.target = None

     def works(self):
-        return self.api.search_illust(self.word, self.sort, self.target)
+        return self.api.search_illust(
+            self.word, self.sort, self.target,
+            date_start=self.date_start, date_end=self.date_end)

     def metadata(self):
         query = text.parse_query(self.query)
@@ -489,10 +491,15 @@ class PixivSearchExtractor(PixivExtractor):
                 target = "s_tag"
             self.target = target_map[target]

+        self.date_start = query.get("scd")
+        self.date_end = query.get("ecd")
+
         return {"search": {
             "word": self.word,
             "sort": self.sort,
             "target": self.target,
+            "date_start": self.date_start,
+            "date_end": self.date_end,
         }}

@@ -710,9 +717,11 @@ class PixivAppAPI():
         params = {"illust_id": illust_id}
         return self._pagination("v2/illust/related", params)

-    def search_illust(self, word, sort=None, target=None, duration=None):
+    def search_illust(self, word, sort=None, target=None, duration=None,
+                      date_start=None, date_end=None):
         params = {"word": word, "search_target": target,
-                  "sort": sort, "duration": duration}
+                  "sort": sort, "duration": duration,
+                  "start_date": date_start, "end_date": date_end}
         return self._pagination("v1/search/illust", params)

     def user_bookmarks_illust(self, user_id, tag=None, restrict="public"):
diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py
index 98928d6..a52071e 100644
--- a/gallery_dl/extractor/pixnet.py
+++ b/gallery_dl/extractor/pixnet.py
@@ -12,7 +12,7 @@ from .common import Extractor, Message
 from .. import text, exception

-BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet.net"
+BASE_PATTERN = r"(?:https?://)?(?!www\.)([\w-]+)\.pixnet.net"


 class PixnetExtractor(Extractor):
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index f976e82..f8497c0 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -12,7 +12,7 @@ from .common import Extractor, Message
 from .. import text, exception

-BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com"
+BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com"


 class PornhubExtractor(Extractor):
diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py
new file mode 100644
index 0000000..00b6972
--- /dev/null
+++ b/gallery_dl/extractor/rule34us.py
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2021 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://rule34.us/"""
+
+from .booru import BooruExtractor
+from .. import text
+import re
+import collections
+
+
+class Rule34usExtractor(BooruExtractor):
+    category = "rule34us"
+    root = "https://rule34.us"
+    per_page = 42
+
+    def __init__(self, match):
+        BooruExtractor.__init__(self, match)
+        self._find_tags = re.compile(
+            r'<li class="([^-"]+)-tag"[^>]*><a href="[^;"]+;q=([^"]+)').findall
+
+    def _parse_post(self, post_id):
+        url = "{}/index.php?r=posts/view&id={}".format(self.root, post_id)
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        post = {
+            "id"      : post_id,
+            "tags"    : text.unescape(extr(
+                'name="keywords" content="', '"').rstrip(", ")),
+            "uploader": text.extract(extr('Added by: ', '</li>'), ">", "<")[0],
+            "score"   : text.extract(extr('Score: ', '> - <'), ">", "<")[0],
+            "width"   : extr('Size: ', 'w'),
+            "height"  : extr(' x ', 'h'),
+            "file_url": extr(' src="', '"'),
+        }
+        post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0]
+
+        tags = collections.defaultdict(list)
+        for tag_type, tag_name in self._find_tags(page):
+            tags[tag_type].append(text.unquote(tag_name))
+        for key, value in tags.items():
+            post["tags_" + key] = " ".join(value)
+
+        return post
+
+
+class Rule34usTagExtractor(Rule34usExtractor):
+    subcategory = "tag"
+    directory_fmt = ("{category}", "{search_tags}")
+    archive_fmt = "t_{search_tags}_{id}"
+    pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/index&q=([^&#]+)"
+    test = ("https://rule34.us/index.php?r=posts/index&q=[terios]_elysion", {
+        "pattern": r"https://img\d*\.rule34\.us"
+                   r"/images/../../[0-9a-f]{32}\.\w+",
+        "count": 10,
+    })
+
+    def __init__(self, match):
+        Rule34usExtractor.__init__(self, match)
+        self.tags = text.unquote(match.group(1).replace("+", " "))
+
+    def metadata(self):
+        return {"search_tags": self.tags}
+
+    def posts(self):
+        url = self.root + "/index.php"
+        params = {
+            "r"   : "posts/index",
+            "q"   : self.tags,
+            "page": self.page_start,
+        }
+
+        while True:
+            page = self.request(url, params=params).text
+
+            cnt = 0
+            for post_id in text.extract_iter(page, '><a id="', '"'):
+                yield self._parse_post(post_id)
+                cnt += 1
+
+            if cnt < self.per_page:
+                return
+
+            if "page" in params:
+                del params["page"]
+            params["q"] = self.tags + " id:<" + post_id
+
+
+class Rule34usPostExtractor(Rule34usExtractor):
+    subcategory = "post"
+    archive_fmt = "{id}"
+    pattern = r"(?:https?://)?rule34\.us/index\.php\?r=posts/view&id=(\d+)"
+    test = (
+        ("https://rule34.us/index.php?r=posts/view&id=3709005", {
+            "pattern": r"https://img\d*\.rule34\.us/images/14/7b"
+                       r"/147bee6fc2e13f73f5f9bac9d4930b13\.png",
+            "content": "d714342ea84050f82dda5f0c194d677337abafc5",
+        }),
+        ("https://rule34.us/index.php?r=posts/view&id=4576310", {
+            "pattern": r"https://video\.rule34\.us/images/a2/94"
+                       r"/a294ff8e1f8e0efa041e5dc9d1480011\.mp4",
+            "keyword": {
+                "extension": "mp4",
+                "file_url": str,
+                "filename": "a294ff8e1f8e0efa041e5dc9d1480011",
+                "height": "3982",
+                "id": "4576310",
+                "md5": "a294ff8e1f8e0efa041e5dc9d1480011",
+                "score": r"re:\d+",
+                "tags": "tagme, video",
+                "tags_general": "video",
+                "tags_metadata": "tagme",
+                "uploader": "Anonymous",
+                "width": "3184",
+            },
+        }),
+    )
+
+    def __init__(self, match):
+        Rule34usExtractor.__init__(self, match)
+        self.post_id = match.group(1)
+
+    def posts(self):
+        return (self._parse_post(self.post_id),)
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py
index ccedff3..199b1ba 100644
--- a/gallery_dl/extractor/sexcom.py
+++ b/gallery_dl/extractor/sexcom.py
@@ -78,9 +78,14 @@ class SexcomExtractor(Extractor):
                 path += "/hd"
             data["url"] = self.root + path
         else:
+            iframe = extr('<iframe', '>')
+            src = (text.extract(iframe, ' src="', '"')[0] or
+                   text.extract(iframe, " src='", "'")[0])
+            if not src:
+                self.log.warning("Unable to fetch media from %s", url)
+                return None
             data["extension"] = None
-            data["url"] = "ytdl:" + text.extract(
-                extr('<iframe', '>'), ' src="', '"')[0]
+            data["url"] = "ytdl:" + src
     else:
         data["url"] = text.unescape(extr(' src="', '"').partition("?")[0])
         text.nameext_from_url(data["url"], data)
diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py
index b5fbdc2..7b5982a 100644
--- a/gallery_dl/extractor/slickpic.py
+++ b/gallery_dl/extractor/slickpic.py
@@ -13,7 +13,7 @@ from .. import text
 import time
 
 
-BASE_PATTERN = r"(?:https?://)?([^.]+)\.slickpic\.com"
+BASE_PATTERN = r"(?:https?://)?([\w-]+)\.slickpic\.com"
 
 
 class SlickpicExtractor(Extractor):
diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py
index 5d582b5..bdf6036 100644
--- a/gallery_dl/extractor/smugmug.py
+++ b/gallery_dl/extractor/smugmug.py
@@ -13,7 +13,7 @@ from .. import text, oauth, exception
 
 BASE_PATTERN = (
     r"(?:smugmug:(?!album:)(?:https?://)?([^/]+)|"
-    r"(?:https?://)?([^.]+)\.smugmug\.com)")
+    r"(?:https?://)?([\w-]+)\.smugmug\.com)")
 
 
 class SmugmugExtractor(Extractor):
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 243710d..358bc95 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -35,7 +35,7 @@ POST_TYPES = frozenset((
 
 BASE_PATTERN = (
     r"(?:tumblr:(?:https?://)?([^/]+)|"
-    r"(?:https?://)?([^.]+\.tumblr\.com))")
+    r"(?:https?://)?([\w-]+\.tumblr\.com))")
 
 
 class TumblrExtractor(Extractor):
diff --git a/gallery_dl/extractor/tumblrgallery.py b/gallery_dl/extractor/tumblrgallery.py
index 849dc49..e790613 100644
--- a/gallery_dl/extractor/tumblrgallery.py
+++ b/gallery_dl/extractor/tumblrgallery.py
@@ -19,6 +19,20 @@ class TumblrgalleryExtractor(GalleryExtractor):
     directory_fmt = ("{category}", "{gallery_id} {title}")
     root = "https://tumblrgallery.xyz"
 
+    @staticmethod
+    def _urls_from_page(page):
+        return text.extract_iter(
+            page, '<div class="report"> <a class="xx-co-me" href="', '"')
+
+    @staticmethod
+    def _data_from_url(url):
+        filename = text.nameext_from_url(url)["filename"]
+        parts = filename.split("_")
+        try:
+            return {"id": parts[1] if parts[1] != "inline" else parts[2]}
+        except IndexError:
+            return {"id": filename}
+
 
 class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
     """Extractor for Tumblrblog on tumblrgallery.xyz"""
@@ -39,34 +53,27 @@ class TumblrgalleryTumblrblogExtractor(TumblrgalleryExtractor):
     def images(self, _):
         page_num = 1
         while True:
-            response = self.request(
-                "{}/tumblrblog/gallery/{}/{}.html"
-                .format(self.root, self.gallery_id, page_num),
-                allow_redirects=False
-            )
-            if response.status_code != 200:
+            url = "{}/tumblrblog/gallery/{}/{}.html".format(
+                self.root, self.gallery_id, page_num)
+            response = self.request(url, allow_redirects=False, fatal=False)
+
+            if response.status_code >= 300:
                 return
 
-            page = response.text
+            for url in self._urls_from_page(response.text):
+                yield url, self._data_from_url(url)
 
             page_num += 1
 
-            urls = list(text.extract_iter(
-                page,
-                '<div class="report xx-co-me"> <a href="',
-                '" data-fancybox="gallery"'
-            ))
-
-            for image_src in urls:
-                yield image_src, {
-                    "id": text.extract(image_src, "tumblr_", "_")[0]
-                }
-
 
 class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
     """Extractor for Posts on tumblrgallery.xyz"""
     subcategory = "post"
     pattern = BASE_PATTERN + r"(/post/(\d+)\.html)"
-    test = ("https://tumblrgallery.xyz/post/405674.html",)
+    test = ("https://tumblrgallery.xyz/post/405674.html", {
+        "pattern": r"https://78\.media\.tumblr\.com/bec67072219c1f3bc04fd9711d"
+                   r"ec42ef/tumblr_p51qq1XCHS1txhgk3o1_1280\.jpg",
+        "count": 3,
+    })
 
     def __init__(self, match):
         TumblrgalleryExtractor.__init__(self, match)
@@ -81,17 +88,8 @@ class TumblrgalleryPostExtractor(TumblrgalleryExtractor):
         }
 
     def images(self, page):
-        urls = list(text.extract_iter(
-            page,
-            '<div class="report xx-co-me"> <a href="',
-            '" data-fancybox="gallery"'
-        ))
-
-        for image_src in urls:
-            yield image_src, {
-                "id": text.extract(image_src, "tumblr_", "_")[0] or
-                      text.nameext_from_url(image_src)["filename"]
-            }
+        for url in self._urls_from_page(page):
+            yield url, self._data_from_url(url)
 
 
 class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
@@ -100,7 +98,10 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor):
     filename_fmt = "{category}_{num:>03}_{gallery_id}_{id}_{title}.{extension}"
     directory_fmt = ("{category}",
"{search_term}") pattern = BASE_PATTERN + r"(/s\.php\?q=([^&#]+))" - test = ("https://tumblrgallery.xyz/s.php?q=everyday-life",) + test = ("https://tumblrgallery.xyz/s.php?q=everyday-life", { + "pattern": r"https://\d+\.media\.tumblr\.com/.+", + "count": "< 1000", + }) def __init__(self, match): TumblrgalleryExtractor.__init__(self, match) @@ -112,38 +113,26 @@ class TumblrgallerySearchExtractor(TumblrgalleryExtractor): } def images(self, _): - page_num = 1 + page_url = "s.php?q=" + self.search_term while True: - response = self.request( - "{}/s.php?q={}&page={}" - .format(self.root, self.search_term, page_num), - allow_redirects=False - ) - if response.status_code != 200: - return + page = self.request(self.root + "/" + page_url).text - page = response.text - page_num += 1 + for gallery_id in text.extract_iter( + page, '<div class="title"><a href="post/', '.html'): - gallery_ids = list(text.extract_iter( - page, - '<div class="title"><a href="post/', - '.html' - )) - - for gallery_id in gallery_ids: - post_page = self.request( - "{}/post/{}.html" - .format(self.root, gallery_id), - allow_redirects=False - ).text - for image_src in TumblrgalleryPostExtractor.images( - self, post_page - ): - image_src[1]["title"] = text.remove_html( - text.unescape( - text.extract(post_page, "<title>", "</title>")[0] - ) - ).replace("_", "-") - image_src[1]["gallery_id"] = gallery_id - yield image_src + url = "{}/post/{}.html".format(self.root, gallery_id) + post_page = self.request(url).text + + for url in self._urls_from_page(post_page): + data = self._data_from_url(url) + data["gallery_id"] = gallery_id + data["title"] = text.remove_html(text.unescape( + text.extract(post_page, "<title>", "</title>")[0] + )).replace("_", "-") + yield url, data + + next_url = text.extract( + page, '</span> <a class="btn btn-primary" href="', '"')[0] + if not next_url or page_url == next_url: + return + page_url = next_url diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index f1c392d..a49f1f2 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -47,7 +47,7 @@ class TwitterExtractor(Extractor): size = self.config("size") if size is None: self._size_image = "orig" - self._size_fallback = ("large", "medium", "small") + self._size_fallback = ("4096x4096", "large", "medium", "small") else: if isinstance(size, str): size = size.split(",") diff --git a/gallery_dl/extractor/wordpress.py b/gallery_dl/extractor/wordpress.py new file mode 100644 index 0000000..dd7d28a --- /dev/null +++ b/gallery_dl/extractor/wordpress.py @@ -0,0 +1,41 @@ +# -*- coding: utf-8 -*- + +# Copyright 2021 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for WordPress blogs""" + +from .common import BaseExtractor, Message +from .. 
import text
 
 
+class WordpressExtractor(BaseExtractor):
+    """Base class for wordpress extractors"""
+    basecategory = "wordpress"
+
+    def items(self):
+        for post in self.posts():
+            yield Message.Directory, post
+
+
+BASE_PATTERN = WordpressExtractor.update({})
+
+
+class WordpressBlogExtractor(WordpressExtractor):
+    """Extractor for WordPress blogs"""
+    subcategory = "blog"
+    directory_fmt = ("{category}", "{blog}")
+    pattern = BASE_PATTERN + r"/?$"
+
+    def posts(self):
+        url = self.root + "/wp-json/wp/v2/posts"
+        params = {"page": 1, "per_page": "100"}
+
+        while True:
+            data = self.request(url, params=params).json()
+            if not data:
+                # an empty page means there are no further posts
+                return
+            yield from data
+            params["page"] += 1
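The posts() loop in WordpressBlogExtractor above originally ended in a
placeholder (exit() / yield 1); the pagination shown is a minimal completion
and assumes standard WordPress REST behavior: /wp-json/wp/v2/posts returns a
JSON array per page and an empty or error response past the last one. The
same traversal outside gallery-dl, as a requests-based sketch with a
hypothetical blog URL:

    import requests

    def wp_posts(root):
        """Yield every post object of a WordPress blog via its REST API."""
        page = 1
        while True:
            resp = requests.get(root + "/wp-json/wp/v2/posts",
                                params={"page": page, "per_page": 100})
            if resp.status_code != 200:  # WP responds 400 past the last page
                return
            posts = resp.json()
            if not posts:
                return
            yield from posts
            page += 1

    for post in wp_posts("https://blog.example.org"):
        print(post["id"], post["link"])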
download. " + "This can be either a constant value or a range " + "(e.g. 2.7 or 2.0-3.5)"), + ) + downloader.add_argument( + "--sleep-request", + dest="sleep-request", metavar="SECONDS", action=ConfigAction, + help=("Number of seconds to wait between HTTP requests " + "during data extraction"), + ) + downloader.add_argument( + "--sleep-extractor", + dest="sleep-extractor", metavar="SECONDS", action=ConfigAction, + help=("Number of seconds to wait before starting data extraction " + "for an input URL"), ) downloader.add_argument( "--filesize-min", @@ -337,6 +362,11 @@ def build_parser(): "and other delegated URLs"), ) + infojson = { + "name" : "metadata", + "event" : "init", + "filename": "info.json", + } postprocessor = parser.add_argument_group("Post-processing Options") postprocessor.add_argument( "--zip", @@ -372,16 +402,18 @@ def build_parser(): help="Write metadata to separate JSON files", ) postprocessor.add_argument( - "--write-infojson", + "--write-info-json", dest="postprocessors", - action="append_const", const={ - "name" : "metadata", - "event" : "init", - "filename": "info.json", - }, + action="append_const", const=infojson, help="Write gallery metadata to a info.json file", ) postprocessor.add_argument( + "--write-infojson", + dest="postprocessors", + action="append_const", const=infojson, + help=argparse.SUPPRESS, + ) + postprocessor.add_argument( "--write-tags", dest="postprocessors", action="append_const", const={"name": "metadata", "mode": "tags"}, diff --git a/gallery_dl/output.py b/gallery_dl/output.py index d4d295f..7e00e1a 100644 --- a/gallery_dl/output.py +++ b/gallery_dl/output.py @@ -265,10 +265,14 @@ class NullOutput(): class PipeOutput(NullOutput): def skip(self, path): - print(CHAR_SKIP, path, sep="", flush=True) + stdout = sys.stdout + stdout.write(CHAR_SKIP + path + "\n") + stdout.flush() def success(self, path, tries): - print(path, flush=True) + stdout = sys.stdout + stdout.write(path + "\n") + stdout.flush() class TerminalOutput(NullOutput): @@ -284,34 +288,38 @@ class TerminalOutput(NullOutput): self.shorten = util.identity def start(self, path): - print(self.shorten(" " + path), end="", flush=True) + stdout = sys.stdout + stdout.write(self.shorten(" " + path)) + stdout.flush() def skip(self, path): - print(self.shorten(CHAR_SKIP + path)) + sys.stdout.write(self.shorten(CHAR_SKIP + path) + "\n") def success(self, path, tries): - print("\r", self.shorten(CHAR_SUCCESS + path), sep="") + sys.stdout.write("\r" + self.shorten(CHAR_SUCCESS + path) + "\n") def progress(self, bytes_total, bytes_downloaded, bytes_per_second): bdl = util.format_value(bytes_downloaded) bps = util.format_value(bytes_per_second) if bytes_total is None: - print("\r{:>7}B {:>7}B/s ".format(bdl, bps), end="") + sys.stderr.write("\r{:>7}B {:>7}B/s ".format(bdl, bps)) else: - print("\r{:>3}% {:>7}B {:>7}B/s ".format( - bytes_downloaded * 100 // bytes_total, bdl, bps), end="") + sys.stderr.write("\r{:>3}% {:>7}B {:>7}B/s ".format( + bytes_downloaded * 100 // bytes_total, bdl, bps)) class ColorOutput(TerminalOutput): def start(self, path): - print(self.shorten(path), end="", flush=True) + stdout = sys.stdout + stdout.write(self.shorten(path)) + stdout.flush() def skip(self, path): - print("\033[2m", self.shorten(path), "\033[0m", sep="") + sys.stdout.write("\033[2m" + self.shorten(path) + "\033[0m\n") def success(self, path, tries): - print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="") + sys.stdout.write("\r\033[1;32m" + self.shorten(path) + "\033[0m\n") class EAWCache(dict): diff 
diff --git a/gallery_dl/output.py b/gallery_dl/output.py
index d4d295f..7e00e1a 100644
--- a/gallery_dl/output.py
+++ b/gallery_dl/output.py
@@ -265,10 +265,14 @@ class NullOutput():
 
 class PipeOutput(NullOutput):
 
     def skip(self, path):
-        print(CHAR_SKIP, path, sep="", flush=True)
+        stdout = sys.stdout
+        stdout.write(CHAR_SKIP + path + "\n")
+        stdout.flush()
 
     def success(self, path, tries):
-        print(path, flush=True)
+        stdout = sys.stdout
+        stdout.write(path + "\n")
+        stdout.flush()
 
 
 class TerminalOutput(NullOutput):
@@ -284,34 +288,38 @@ class TerminalOutput(NullOutput):
             self.shorten = util.identity
 
     def start(self, path):
-        print(self.shorten(" " + path), end="", flush=True)
+        stdout = sys.stdout
+        stdout.write(self.shorten(" " + path))
+        stdout.flush()
 
     def skip(self, path):
-        print(self.shorten(CHAR_SKIP + path))
+        sys.stdout.write(self.shorten(CHAR_SKIP + path) + "\n")
 
     def success(self, path, tries):
-        print("\r", self.shorten(CHAR_SUCCESS + path), sep="")
+        sys.stdout.write("\r" + self.shorten(CHAR_SUCCESS + path) + "\n")
 
     def progress(self, bytes_total, bytes_downloaded, bytes_per_second):
         bdl = util.format_value(bytes_downloaded)
         bps = util.format_value(bytes_per_second)
         if bytes_total is None:
-            print("\r{:>7}B {:>7}B/s ".format(bdl, bps), end="")
+            sys.stderr.write("\r{:>7}B {:>7}B/s ".format(bdl, bps))
         else:
-            print("\r{:>3}% {:>7}B {:>7}B/s ".format(
-                bytes_downloaded * 100 // bytes_total, bdl, bps), end="")
+            sys.stderr.write("\r{:>3}% {:>7}B {:>7}B/s ".format(
+                bytes_downloaded * 100 // bytes_total, bdl, bps))
 
 
 class ColorOutput(TerminalOutput):
 
     def start(self, path):
-        print(self.shorten(path), end="", flush=True)
+        stdout = sys.stdout
+        stdout.write(self.shorten(path))
+        stdout.flush()
 
     def skip(self, path):
-        print("\033[2m", self.shorten(path), "\033[0m", sep="")
+        sys.stdout.write("\033[2m" + self.shorten(path) + "\033[0m\n")
 
     def success(self, path, tries):
-        print("\r\033[1;32m", self.shorten(path), "\033[0m", sep="")
+        sys.stdout.write("\r\033[1;32m" + self.shorten(path) + "\033[0m\n")
 
 
 class EAWCache(dict):
diff --git a/gallery_dl/path.py b/gallery_dl/path.py
index 12ce8ad..9e9e983 100644
--- a/gallery_dl/path.py
+++ b/gallery_dl/path.py
@@ -177,8 +177,11 @@ class PathFormat():
         self.directory = directory = self.basedirectory
 
         if WINDOWS:
-            # Enable longer-than-260-character paths on Windows
-            directory = "\\\\?\\" + os.path.abspath(directory)
+            # Enable longer-than-260-character paths
+            if directory.startswith("\\\\"):
+                directory = "\\\\?\\UNC\\" + directory[2:]
+            else:
+                directory = "\\\\?\\" + os.path.abspath(directory)
 
         # abspath() in Python 3.7+ removes trailing path separators (#402)
         if directory[-1] != sep:
diff --git a/gallery_dl/util.py b/gallery_dl/util.py
index d25194e..bccae2d 100644
--- a/gallery_dl/util.py
+++ b/gallery_dl/util.py
@@ -428,18 +428,26 @@ def build_duration_func(duration, min=0.0):
     if not duration:
         return None
 
-    try:
-        lower, upper = duration
-    except TypeError:
-        pass
+    if isinstance(duration, str):
+        lower, _, upper = duration.partition("-")
+        lower = float(lower)
     else:
+        try:
+            lower, upper = duration
+        except TypeError:
+            lower, upper = duration, None
+
+    if upper:
+        upper = float(upper)
         return functools.partial(
             random.uniform,
             lower if lower > min else min,
             upper if upper > min else min,
         )
-
-    return functools.partial(identity, duration if duration > min else min)
+    else:
+        if lower < min:
+            lower = min
+        return lambda: lower
diff --git a/gallery_dl/version.py b/gallery_dl/version.py
index a363a97..b5114e8 100644
--- a/gallery_dl/version.py
+++ b/gallery_dl/version.py
@@ -6,4 +6,4 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-__version__ = "1.19.3"
+__version__ = "1.20.0"
diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py
index 4266f48..e6953eb 100644
--- a/gallery_dl/ytdl.py
+++ b/gallery_dl/ytdl.py
@@ -14,6 +14,15 @@ import itertools
 from .
import text, util, exception +def import_module(module_name): + if module_name is None: + try: + return __import__("yt_dlp") + except ImportError: + return __import__("youtube_dl") + return __import__(module_name.replace("-", "_")) + + def construct_YoutubeDL(module, obj, user_opts, system_opts=None): opts = argv = None config = obj.config @@ -95,6 +104,8 @@ def parse_command_line(module, argv): opts.continue_dl = False if opts.retries is not None: opts.retries = parse_retries(opts.retries) + if getattr(opts, "file_access_retries", None) is not None: + opts.file_access_retries = parse_retries(opts.file_access_retries) if opts.fragment_retries is not None: opts.fragment_retries = parse_retries(opts.fragment_retries) if getattr(opts, "extractor_retries", None) is not None: @@ -111,6 +122,10 @@ def parse_command_line(module, argv): opts.recodevideo = opts.recodevideo.replace(" ", "") if getattr(opts, "remuxvideo", None) is not None: opts.remuxvideo = opts.remuxvideo.replace(" ", "") + if getattr(opts, "wait_for_video", None) is not None: + min_wait, _, max_wait = opts.wait_for_video.partition("-") + opts.wait_for_video = (module.parse_duration(min_wait), + module.parse_duration(max_wait)) if opts.date is not None: date = module.DateRange.day(opts.date) @@ -207,10 +222,6 @@ def parse_command_line(module, argv): opts.sponsorblock_remove = \ getattr(opts, "sponsorblock_remove", None) or set() sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove - - addchapters = getattr(opts, "addchapters", None) - if (opts.addmetadata or opts.sponsorblock_mark) and addchapters is None: - addchapters = True opts.remove_chapters = getattr(opts, "remove_chapters", None) or () # PostProcessors @@ -297,11 +308,17 @@ def parse_command_line(module, argv): "sponsorblock_chapter_title": opts.sponsorblock_chapter_title, "force_keyframes": opts.force_keyframes_at_cuts, }) - if opts.addmetadata or addchapters: + addchapters = getattr(opts, "addchapters", None) + embed_infojson = getattr(opts, "embed_infojson", None) + if opts.addmetadata or addchapters or embed_infojson: pp = {"key": "FFmpegMetadata"} if ytdlp: - pp["add_chapters"] = addchapters + if embed_infojson is None: + embed_infojson = "if_exists" pp["add_metadata"] = opts.addmetadata + pp["add_chapters"] = addchapters + pp["add_infojson"] = embed_infojson + postprocessors.append(pp) if getattr(opts, "sponskrub", False) is not False: postprocessors.append({ @@ -311,10 +328,11 @@ def parse_command_line(module, argv): "cut": opts.sponskrub_cut, "force": opts.sponskrub_force, "ignoreerror": opts.sponskrub is None, + "_from_cli": True, }) if opts.embedthumbnail: already_have_thumbnail = (opts.writethumbnail or - opts.write_all_thumbnails) + getattr(opts, "write_all_thumbnails", False)) postprocessors.append({ "key": "EmbedThumbnail", "already_have_thumbnail": already_have_thumbnail, @@ -395,6 +413,7 @@ def parse_command_line(module, argv): "throttledratelimit": getattr(opts, "throttledratelimit", None), "overwrites": getattr(opts, "overwrites", None), "retries": opts.retries, + "file_access_retries": getattr(opts, "file_access_retries", None), "fragment_retries": opts.fragment_retries, "extractor_retries": getattr(opts, "extractor_retries", None), "skip_unavailable_fragments": opts.skip_unavailable_fragments, @@ -421,8 +440,9 @@ def parse_command_line(module, argv): "allow_playlist_files": opts.allow_playlist_files, "clean_infojson": opts.clean_infojson, "getcomments": getattr(opts, "getcomments", None), - "writethumbnail": opts.writethumbnail, - 
"write_all_thumbnails": opts.write_all_thumbnails, + "writethumbnail": opts.writethumbnail is True, + "write_all_thumbnails": getattr(opts, "write_all_thumbnails", None) or + opts.writethumbnail == "all", "writelink": getattr(opts, "writelink", None), "writeurllink": getattr(opts, "writeurllink", None), "writewebloclink": getattr(opts, "writewebloclink", None), @@ -454,6 +474,7 @@ def parse_command_line(module, argv): "download_archive": download_archive_fn, "break_on_existing": getattr(opts, "break_on_existing", None), "break_on_reject": getattr(opts, "break_on_reject", None), + "break_per_url": getattr(opts, "break_per_url", None), "skip_playlist_after_errors": getattr( opts, "skip_playlist_after_errors", None), "cookiefile": opts.cookiefile, @@ -475,6 +496,8 @@ def parse_command_line(module, argv): opts, "youtube_include_hls_manifest", None), "encoding": opts.encoding, "extract_flat": opts.extract_flat, + "live_from_start": getattr(opts, "live_from_start", None), + "wait_for_video": getattr(opts, "wait_for_video", None), "mark_watched": opts.mark_watched, "merge_output_format": opts.merge_output_format, "postprocessors": postprocessors, |
