Diffstat (limited to 'gallery_dl/extractor')
31 files changed, 453 insertions, 146 deletions
diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py
index 22e4fe3..72239d5 100644
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@@ -117,6 +117,7 @@ modules = [
     "piczel",
     "pillowfort",
     "pinterest",
+    "pixeldrain",
     "pixiv",
     "pixnet",
     "plurk",
@@ -147,6 +148,7 @@ modules = [
     "tapas",
     "tcbscans",
     "telegraph",
+    "tmohentai",
     "toyhouse",
     "tsumino",
     "tumblr",
diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py
index a92918e..ad0caf9 100644
--- a/gallery_dl/extractor/behance.py
+++ b/gallery_dl/extractor/behance.py
@@ -89,6 +89,17 @@ class BehanceGalleryExtractor(BehanceExtractor):
         BehanceExtractor.__init__(self, match)
         self.gallery_id = match.group(1)

+    def _init(self):
+        BehanceExtractor._init(self)
+
+        modules = self.config("modules")
+        if modules:
+            if isinstance(modules, str):
+                modules = modules.split(",")
+            self.modules = set(modules)
+        else:
+            self.modules = {"image", "video", "mediacollection", "embed"}
+
     def items(self):
         data = self.get_gallery_data()
         imgs = self.get_images(data)
@@ -97,7 +108,8 @@ class BehanceGalleryExtractor(BehanceExtractor):
         yield Message.Directory, data
         for data["num"], (url, module) in enumerate(imgs, 1):
             data["module"] = module
-            data["extension"] = text.ext_from_url(url)
+            data["extension"] = (module.get("extension") or
+                                 text.ext_from_url(url))
             yield Message.Url, url, data

     def get_gallery_data(self):
@@ -133,13 +145,17 @@ class BehanceGalleryExtractor(BehanceExtractor):
         append = result.append

         for module in data["modules"]:
-            mtype = module["__typename"]
+            mtype = module["__typename"][:-6].lower()

-            if mtype == "ImageModule":
+            if mtype not in self.modules:
+                self.log.debug("Skipping '%s' module", mtype)
+                continue
+
+            if mtype == "image":
                 url = module["imageSizes"]["size_original"]["url"]
                 append((url, module))

-            elif mtype == "VideoModule":
+            elif mtype == "video":
                 try:
                     renditions = module["videoData"]["renditions"]
                 except Exception:
@@ -158,7 +174,7 @@ class BehanceGalleryExtractor(BehanceExtractor):

                 append((url, module))

-            elif mtype == "MediaCollectionModule":
+            elif mtype == "mediacollection":
                 for component in module["components"]:
                     for size in component["imageSizes"].values():
                         if size:
@@ -167,12 +183,17 @@ class BehanceGalleryExtractor(BehanceExtractor):
                             append(("/".join(parts), module))
                             break

-            elif mtype == "EmbedModule":
+            elif mtype == "embed":
                 embed = module.get("originalEmbed") or module.get("fluidEmbed")
                 if embed:
                     embed = text.unescape(text.extr(embed, 'src="', '"'))
+                    module["extension"] = "mp4"
                     append(("ytdl:" + embed, module))

+            elif mtype == "text":
+                module["extension"] = "txt"
+                append(("text:" + module["text"], module))
+
         return result
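The new Behance "modules" option above accepts either a list or a comma-separated string of module type names. A minimal standalone sketch of that normalization (helper name hypothetical, default set taken from the diff):

    def normalize_modules(value):
        # Mirrors BehanceGalleryExtractor._init(): accept a list or a
        # comma-separated string, fall back to the default module set.
        if not value:
            return {"image", "video", "mediacollection", "embed"}
        if isinstance(value, str):
            value = value.split(",")
        return set(value)

    print(normalize_modules("image,text"))   # {'image', 'text'}
    print(normalize_modules(None))           # full default set
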
diff --git a/gallery_dl/extractor/blogger.py b/gallery_dl/extractor/blogger.py
index d75c349..58ae59d 100644
--- a/gallery_dl/extractor/blogger.py
+++ b/gallery_dl/extractor/blogger.py
@@ -8,30 +8,22 @@

 """Extractors for Blogger blogs"""

-from .common import Extractor, Message
+from .common import BaseExtractor, Message
 from .. import text, util
 import re

-BASE_PATTERN = (
-    r"(?:blogger:(?:https?://)?([^/]+)|"
-    r"(?:https?://)?([\w-]+\.blogspot\.com))")
-

-class BloggerExtractor(Extractor):
+class BloggerExtractor(BaseExtractor):
     """Base class for blogger extractors"""
-    category = "blogger"
-    directory_fmt = ("{category}", "{blog[name]}",
+    basecategory = "blogger"
+    directory_fmt = ("blogger", "{blog[name]}",
                      "{post[date]:%Y-%m-%d} {post[title]}")
     filename_fmt = "{num:>03}.{extension}"
     archive_fmt = "{post[id]}_{num}"
-    root = "https://www.blogger.com"
-
-    def __init__(self, match):
-        Extractor.__init__(self, match)
-        self.blog = match.group(1) or match.group(2)

     def _init(self):
         self.api = BloggerAPI(self)
+        self.blog = self.root.rpartition("/")[2]
         self.videos = self.config("videos", True)

     def items(self):
@@ -92,6 +84,18 @@ class BloggerExtractor(BaseExtractor):
         """Return additional metadata"""


+BASE_PATTERN = BloggerExtractor.update({
+    "blogspot": {
+        "root": None,
+        "pattern": r"[\w-]+\.blogspot\.com",
+    },
+    "micmicidol": {
+        "root": "https://www.micmicidol.club",
+        "pattern": r"(?:www\.)?micmicidol\.club",
+    },
+})
+
+
 class BloggerPostExtractor(BloggerExtractor):
     """Extractor for a single blog post"""
     subcategory = "post"
@@ -100,7 +104,7 @@ class BloggerPostExtractor(BloggerExtractor):

     def __init__(self, match):
         BloggerExtractor.__init__(self, match)
-        self.path = match.group(3)
+        self.path = match.group(match.lastindex)

     def posts(self, blog):
         return (self.api.post_by_path(blog["id"], self.path),)
@@ -124,7 +128,7 @@ class BloggerSearchExtractor(BloggerExtractor):

     def __init__(self, match):
         BloggerExtractor.__init__(self, match)
-        self.query = text.unquote(match.group(3))
+        self.query = text.unquote(match.group(match.lastindex))

     def posts(self, blog):
         return self.api.blog_search(blog["id"], self.query)
@@ -141,7 +145,7 @@ class BloggerLabelExtractor(BloggerExtractor):

     def __init__(self, match):
         BloggerExtractor.__init__(self, match)
-        self.label = text.unquote(match.group(3))
+        self.label = text.unquote(match.group(match.lastindex))

     def posts(self, blog):
         return self.api.blog_posts(blog["id"], self.label)
diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py
index 3bec424..f378427 100644
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@@ -78,6 +78,12 @@ class Extractor():
     def config(self, key, default=None):
         return config.interpolate(self._cfgpath, key, default)

+    def config2(self, key, key2, default=None, sentinel=util.SENTINEL):
+        value = self.config(key, sentinel)
+        if value is not sentinel:
+            return value
+        return self.config(key2, default)
+
     def config_deprecated(self, key, deprecated, default=None,
                           sentinel=util.SENTINEL, history=set()):
         value = self.config(deprecated, sentinel)
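The new Extractor.config2() above looks up a primary key and falls back to a second key only when the first is entirely unset, so an explicit false value on the primary key still wins. A dict-based sketch of the same lookup order (standalone, not the actual class):

    _SENTINEL = object()

    def config2(options, key, key2, default=None):
        # Return options[key] if set, else options[key2], else the default --
        # the same order Extractor.config2() uses via its sentinel check.
        value = options.get(key, _SENTINEL)
        if value is not _SENTINEL:
            return value
        return options.get(key2, default)

    opts = {"reblogs": True}
    print(config2(opts, "retweets", "reblogs", False))  # True
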
diff --git a/gallery_dl/extractor/cyberdrop.py b/gallery_dl/extractor/cyberdrop.py
index 59fd1e5..d864960 100644
--- a/gallery_dl/extractor/cyberdrop.py
+++ b/gallery_dl/extractor/cyberdrop.py
@@ -7,6 +7,7 @@
 """Extractors for https://cyberdrop.me/"""

 from . import lolisafe
+from .common import Message
 from .. import text


@@ -16,24 +17,43 @@ class CyberdropAlbumExtractor(lolisafe.LolisafeAlbumExtractor):
     pattern = r"(?:https?://)?(?:www\.)?cyberdrop\.(?:me|to)/a/([^/?#]+)"
     example = "https://cyberdrop.me/a/ID"

+    def items(self):
+        files, data = self.fetch_album(self.album_id)
+
+        yield Message.Directory, data
+        for data["num"], file in enumerate(files, 1):
+            file.update(data)
+            text.nameext_from_url(file["name"], file)
+            file["name"], sep, file["id"] = file["filename"].rpartition("-")
+            yield Message.Url, file["url"], file
+
     def fetch_album(self, album_id):
-        url = self.root + "/a/" + self.album_id
-        extr = text.extract_from(self.request(url).text)
-
-        files = []
-        append = files.append
-        while True:
-            url = text.unescape(extr('id="file" href="', '"'))
-            if not url:
-                break
-            append({"file": url,
-                    "_fallback": (self.root + url[url.find("/", 8):],)})
-
-        return files, {
+        url = "{}/a/{}".format(self.root, album_id)
+        page = self.request(url).text
+        extr = text.extract_from(page)
+
+        desc = extr('property="og:description" content="', '"')
+        if desc.startswith("A privacy-focused censorship-resistant file "
+                           "sharing platform free for everyone."):
+            desc = ""
+        extr('id="title"', "")
+
+        album = {
             "album_id"   : self.album_id,
-            "album_name" : extr("name: '", "'"),
-            "date"       : text.parse_timestamp(extr("timestamp: ", ",")),
-            "album_size" : text.parse_int(extr("totalSize: ", ",")),
-            "description": extr("description: `", "`"),
-            "count"      : len(files),
+            "album_name" : text.unescape(extr('title="', '"')),
+            "album_size" : text.parse_bytes(extr(
+                '<p class="title">', "B")),
+            "date"       : text.parse_datetime(extr(
+                '<p class="title">', '<'), "%d.%m.%Y"),
+            "description": text.unescape(text.unescape(  # double
+                desc.rpartition(" [R")[0])),
         }
+
+        file_ids = list(text.extract_iter(page, 'id="file" href="/f/', '"'))
+        album["count"] = len(file_ids)
+        return self._extract_files(file_ids), album
+
+    def _extract_files(self, file_ids):
+        for file_id in file_ids:
+            url = "{}/api/f/{}".format(self.root, file_id)
+            yield self.request(url).json()
diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py
index 2aed678..6a0e069 100644
--- a/gallery_dl/extractor/erome.py
+++ b/gallery_dl/extractor/erome.py
@@ -44,11 +44,15 @@ class EromeExtractor(Extractor):
             pos = page.index('<div class="user-profile', pos)
             user, pos = text.extract(
                 page, 'href="https://www.erome.com/', '"', pos)
+            count, pos = text.extract(
+                page, 'fa-camera"></i>', '</span>', pos)
+
             data = {
                 "album_id"     : album_id,
                 "title"        : text.unescape(title),
                 "user"         : text.unquote(user),
                 "_http_headers": {"Referer": url},
+                "count"        : text.parse_int(count),
             }

             yield Message.Directory, data
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
index 182910c..5dc498f 100644
--- a/gallery_dl/extractor/exhentai.py
+++ b/gallery_dl/extractor/exhentai.py
@@ -40,6 +40,7 @@ class ExhentaiExtractor(Extractor):
         if domain == "auto":
             domain = ("ex" if self.version == "ex" else "e-") + "hentai.org"
         self.root = "https://" + domain
+        self.api_url = self.root + "/api.php"
         self.cookies_domain = "." + domain

         Extractor.initialize(self)
@@ -120,7 +121,6 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         self.key_start = None
         self.key_show = None
         self.key_next = None
-        self.api_url = ""
         self.count = 0

     def _init(self):
@@ -171,6 +171,21 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
             # declared inside 'items()' to be able to access 'data'
             if not response.history and response.headers.get(
                     "content-type", "").startswith("text/html"):
+                page = response.text
+                self.log.warning("'%s'", page)
+
+                if " requires GP" in page:
+                    gp = self.config("gp")
+                    if gp == "stop":
+                        raise exception.StopExtraction("Not enough GP")
+                    elif gp == "wait":
+                        input("Press ENTER to continue.")
+                        return response.url
+
+                    self.log.info("Falling back to non-original downloads")
+                    self.original = False
+                    return data["_url_1280"]
+
                 self._report_limits(data)
             return True
@@ -212,7 +227,10 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):

     def metadata_from_page(self, page):
         extr = text.extract_from(page)
-        self.api_url = extr('var api_url = "', '"') or (self.root + "/api.php")
+
+        api_url = extr('var api_url = "', '"')
+        if api_url:
+            self.api_url = api_url

         data = {
             "gid"      : self.gallery_id,
@@ -296,6 +314,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
         data["num"] = self.image_num
         data["image_token"] = self.key_start = extr('var startkey="', '";')
+        data["_url_1280"] = iurl
         self.key_show = extr('var showkey="', '";')

         self._check_509(iurl, data)
@@ -345,6 +364,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):

         data["num"] = request["page"]
         data["image_token"] = imgkey
+        data["_url_1280"] = imgurl

         self._check_509(imgurl, data)
         yield url, text.nameext_from_url(url, data)
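The exhentai change above introduces a "gp" option for galleries whose original files cost GP: "stop" aborts, "wait" pauses for user input, and the default silently falls back to the stored 1280px sample URL. A simplified standalone sketch of that decision logic (names hypothetical; the real code raises StopExtraction):

    def handle_gp_notice(gp_option, fallback_url):
        # Mirrors the new branch in the exhentai extractor.
        if gp_option == "stop":
            raise RuntimeError("Not enough GP")
        if gp_option == "wait":
            input("Press ENTER to continue.")
            return None          # caller retries the original URL
        print("Falling back to non-original downloads")
        return fallback_url      # the 1280px sample stored in "_url_1280"
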
diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py
index d4524e0..aff8e61 100644
--- a/gallery_dl/extractor/fapello.py
+++ b/gallery_dl/extractor/fapello.py
@@ -10,6 +10,9 @@
 from .common import Extractor, Message
 from .. import text, exception

+BASE_PATTERN = r"(?:https?://)?(?:www\.)?fapello\.(?:com|su)"
+
+
 class FapelloPostExtractor(Extractor):
     """Extractor for individual posts on fapello.com"""
     category = "fapello"
@@ -17,16 +20,16 @@ class FapelloPostExtractor(Extractor):
     directory_fmt = ("{category}", "{model}")
     filename_fmt = "{model}_{id}.{extension}"
     archive_fmt = "{type}_{model}_{id}"
-    pattern = (r"(?:https?://)?(?:www\.)?fapello\.com"
-               r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)")
+    pattern = BASE_PATTERN + r"/(?!search/|popular_videos/)([^/?#]+)/(\d+)"
     example = "https://fapello.com/MODEL/12345/"

     def __init__(self, match):
         Extractor.__init__(self, match)
+        self.root = text.root_from_url(match.group(0))
         self.model, self.id = match.groups()

     def items(self):
-        url = "https://fapello.com/{}/{}/".format(self.model, self.id)
+        url = "{}/{}/{}/".format(self.root, self.model, self.id)
         page = text.extr(
             self.request(url, allow_redirects=False).text,
             'class="uk-align-center"', "</div>", None)
@@ -48,27 +51,29 @@ class FapelloModelExtractor(Extractor):
     """Extractor for all posts from a fapello model"""
     category = "fapello"
     subcategory = "model"
-    pattern = (r"(?:https?://)?(?:www\.)?fapello\.com"
-               r"/(?!top-(?:likes|followers)|popular_videos"
+    pattern = (BASE_PATTERN + r"/(?!top-(?:likes|followers)|popular_videos"
                r"|videos|trending|search/?$)"
                r"([^/?#]+)/?$")
     example = "https://fapello.com/model/"

     def __init__(self, match):
         Extractor.__init__(self, match)
+        self.root = text.root_from_url(match.group(0))
         self.model = match.group(1)

     def items(self):
         num = 1
         data = {"_extractor": FapelloPostExtractor}
         while True:
-            url = "https://fapello.com/ajax/model/{}/page-{}/".format(
-                self.model, num)
+            url = "{}/ajax/model/{}/page-{}/".format(
+                self.root, self.model, num)
             page = self.request(url).text
             if not page:
                 return

             for url in text.extract_iter(page, '<a href="', '"'):
+                if url == "javascript:void(0);":
+                    continue
                 yield Message.Queue, url, data
             num += 1

@@ -77,13 +82,14 @@ class FapelloPathExtractor(Extractor):
     """Extractor for models and posts from fapello.com paths"""
     category = "fapello"
     subcategory = "path"
-    pattern = (r"(?:https?://)?(?:www\.)?fapello\.com"
+    pattern = (BASE_PATTERN +
                r"/(?!search/?$)(top-(?:likes|followers)|videos|trending"
                r"|popular_videos/[^/?#]+)/?$")
     example = "https://fapello.com/trending/"

     def __init__(self, match):
         Extractor.__init__(self, match)
+        self.root = text.root_from_url(match.group(0))
         self.path = match.group(1)

     def items(self):
@@ -93,9 +99,14 @@ class FapelloPathExtractor(Extractor):
         else:
             data = {"_extractor": FapelloPostExtractor}

+        if "fapello.su" in self.root:
+            self.path = self.path.replace("-", "/")
+            if self.path == "trending":
+                data = {"_extractor": FapelloModelExtractor}
+
         while True:
-            page = self.request("https://fapello.com/ajax/{}/page-{}/".format(
-                self.path, num)).text
+            page = self.request("{}/ajax/{}/page-{}/".format(
+                self.root, self.path, num)).text
             if not page:
                 return
diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py
index b0699b0..bb684c2 100644
--- a/gallery_dl/extractor/foolslide.py
+++ b/gallery_dl/extractor/foolslide.py
@@ -38,10 +38,6 @@ class FoolslideExtractor(BaseExtractor):


 BASE_PATTERN = FoolslideExtractor.update({
-    "powermanga": {
-        "root": "https://read.powermanga.org",
-        "pattern": r"read(?:er)?\.powermanga\.org",
-    },
 })
diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py
index 62df192..d5ff8c8 100644
--- a/gallery_dl/extractor/hentaicosplays.py
+++ b/gallery_dl/extractor/hentaicosplays.py
@@ -42,7 +42,7 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor):

     def images(self, page):
         return [
-            (url, None)
+            (url.replace("http:", "https:", 1), None)
             for url in text.extract_iter(
                 page, '<amp-img class="auto-style" src="', '"')
         ]
diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py
index 8ba23c2..c75c90d 100644
--- a/gallery_dl/extractor/hentaifoundry.py
+++ b/gallery_dl/extractor/hentaifoundry.py
@@ -133,9 +133,25 @@ class HentaifoundryExtractor(Extractor):

         return text.nameext_from_url(data["src"], data)

-    def _init_site_filters(self):
+    def _request_check(self, url, **kwargs):
+        self.request = self._request_original
+
+        # check for Enter button / front page
+        # and update PHPSESSID and content filters if necessary
+        response = self.request(url, **kwargs)
+        content = response.content
+        if len(content) < 5000 and \
+                b'<div id="entryButtonContainer"' in content:
+            self._init_site_filters(False)
+            response = self.request(url, **kwargs)
+        return response
+
+    def _init_site_filters(self, check_cookies=True):
         """Set site-internal filters to show all images"""
-        if self.cookies.get("PHPSESSID", domain=self.cookies_domain):
+        if check_cookies and self.cookies.get(
+                "PHPSESSID", domain=self.cookies_domain):
+            self._request_original = self.request
+            self.request = self._request_check
             return

         url = self.root + "/?enterAgree=1"
diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py
index 32ca151..20491b5 100644
--- a/gallery_dl/extractor/hiperdex.py
+++ b/gallery_dl/extractor/hiperdex.py
@@ -30,10 +30,10 @@ class HiperdexBase():
         extr = text.extract_from(page)

         return {
-            "manga"  : text.unescape(extr(
-                "<title>", "<").rpartition(" Manga - ")[0].strip()),
             "url"    : text.unescape(extr(
                 'property="og:url" content="', '"')),
+            "manga"  : text.unescape(extr(
+                '"headline": "', '"')),
             "score"  : text.parse_float(extr(
                 'id="averagerate">', '<')),
             "author" : text.remove_html(extr(
diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py
index b7b6ef1..5c7a1b3 100644
--- a/gallery_dl/extractor/idolcomplex.py
+++ b/gallery_dl/extractor/idolcomplex.py
@@ -15,15 +15,17 @@ from .. import text, util, exception
 import collections
 import re

+BASE_PATTERN = r"(?:https?://)?idol\.sankakucomplex\.com(?:/[a-z]{2})?"
+

 class IdolcomplexExtractor(SankakuExtractor):
     """Base class for idolcomplex extractors"""
     category = "idolcomplex"
+    root = "https://idol.sankakucomplex.com"
     cookies_domain = "idol.sankakucomplex.com"
-    cookies_names = ("login", "pass_hash")
-    root = "https://" + cookies_domain
+    cookies_names = ("_idolcomplex_session",)
     referer = False
-    request_interval = 5.0
+    request_interval = (4.0, 6.0)

     def __init__(self, match):
         SankakuExtractor.__init__(self, match)
@@ -32,14 +34,16 @@ class IdolcomplexExtractor(SankakuExtractor):
         self.start_post = 0

     def _init(self):
-        self.extags = self.config("tags", False)
+        self.find_tags = re.compile(
+            r'tag-type-([^"]+)">\s*<div [^>]+>\s*<a href="/\?tags=([^"]+)'
+        ).findall

     def items(self):
         self.login()
         data = self.metadata()

         for post_id in util.advance(self.post_ids(), self.start_post):
-            post = self._parse_post(post_id)
+            post = self._extract_post(post_id)
             url = post["file_url"]
             post.update(data)
             text.nameext_from_url(url, post)
@@ -67,63 +71,75 @@ class IdolcomplexExtractor(SankakuExtractor):
     def _login_impl(self, username, password):
         self.log.info("Logging in as %s", username)

-        url = self.root + "/user/authenticate"
+        url = self.root + "/users/login"
+        page = self.request(url).text
+
+        headers = {
+            "Referer": url,
+        }
+        url = self.root + (text.extr(page, '<form action="', '"') or
+                           "/en/user/authenticate")
         data = {
+            "authenticity_token": text.unescape(text.extr(
+                page, 'name="authenticity_token" value="', '"')),
             "url"           : "",
             "user[name]"    : username,
             "user[password]": password,
             "commit"        : "Login",
         }
-        response = self.request(url, method="POST", data=data)
+        response = self.request(url, method="POST", headers=headers, data=data)

-        if not response.history or response.url != self.root + "/user/home":
+        if not response.history or response.url.endswith("/user/home"):
             raise exception.AuthenticationError()

-        cookies = response.history[0].cookies
-        return {c: cookies[c] for c in self.cookies_names}
+        return {c.name: c.value for c in response.history[0].cookies}

-    def _parse_post(self, post_id):
-        """Extract metadata of a single post"""
-        url = self.root + "/post/show/" + post_id
+    def _extract_post(self, post_id):
+        url = self.root + "/posts/" + post_id
         page = self.request(url, retries=10).text
-        extr = text.extract
+        extr = text.extract_from(page)

-        tags   , pos = extr(page, "<title>", " | ")
-        vavg   , pos = extr(page, "itemprop=ratingValue>", "<", pos)
-        vcnt   , pos = extr(page, "itemprop=reviewCount>", "<", pos)
-        _      , pos = extr(page, "Posted: <", "", pos)
-        created, pos = extr(page, ' title="', '"', pos)
-        rating = extr(page, "<li>Rating: ", "<", pos)[0]
+        tags    = extr("<title>", " | ")
+        vavg    = extr('itemprop="ratingValue">', "<")
+        vcnt    = extr('itemprop="reviewCount">', "<")
+        pid     = extr(">Post ID:", "<")
+        created = extr(' title="', '"')

-        file_url, pos = extr(page, '<li>Original: <a href="', '"', pos)
+        file_url = extr('>Original:', 'id=')
         if file_url:
-            width , pos = extr(page, '>', 'x', pos)
-            height, pos = extr(page, '', ' ', pos)
+            file_url = extr(' href="', '"')
+            width  = extr(">", "x")
+            height = extr("", " ")
         else:
-            width , pos = extr(page, '<object width=', ' ', pos)
-            height, pos = extr(page, 'height=', '>', pos)
-            file_url = extr(page, '<embed src="', '"', pos)[0]
+            width  = extr('<object width=', ' ')
+            height = extr('height=', '>')
+            file_url = extr('<embed src="', '"')
+
+        rating = extr(">Rating:", "<br")

         data = {
-            "id": text.parse_int(post_id),
-            "md5": file_url.rpartition("/")[2].partition(".")[0],
-            "tags": text.unescape(tags),
+            "id"          : text.parse_int(pid),
+            "md5"         : file_url.rpartition("/")[2].partition(".")[0],
+            "tags"        : text.unescape(tags),
             "vote_average": text.parse_float(vavg),
-            "vote_count": text.parse_int(vcnt),
-            "created_at": created,
-            "rating": (rating or "?")[0].lower(),
-            "file_url": "https:" + text.unescape(file_url),
-            "width": text.parse_int(width),
-            "height": text.parse_int(height),
+            "vote_count"  : text.parse_int(vcnt),
+            "created_at"  : created,
+            "date"        : text.parse_datetime(
+                created, "%Y-%m-%d %H:%M:%S.%f"),
+            "rating"      : text.remove_html(rating).lower(),
+            "file_url"    : "https:" + text.unescape(file_url),
+            "width"       : text.parse_int(width),
+            "height"      : text.parse_int(height),
         }

-        if self.extags:
-            tags = collections.defaultdict(list)
-            tags_html = text.extr(page, '<ul id=tag-sidebar>', '</ul>')
-            pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)')
-            for tag_type, tag_name in pattern.findall(tags_html or ""):
-                tags[tag_type].append(text.unquote(tag_name))
-            for key, value in tags.items():
-                data["tags_" + key] = " ".join(value)
+        tags = collections.defaultdict(list)
+        tags_list = []
+        tags_html = text.extr(page, '<ul id="tag-sidebar"', '</ul>')
+        for tag_type, tag_name in self.find_tags(tags_html or ""):
+            tags[tag_type].append(text.unquote(tag_name))
+        for key, value in tags.items():
+            data["tags_" + key] = " ".join(value)
+            tags_list += value
+        data["tags"] = " ".join(tags_list)

         return data
@@ -178,15 +194,16 @@ class IdolcomplexTagExtractor(IdolcomplexExtractor):
         while True:
             page = self.request(self.root, params=params, retries=10).text

-            pos = page.find("<div id=more-popular-posts-link>") + 1
-            yield from text.extract_iter(page, '" id=p', '>', pos)
+            pos = ((page.find('id="more-popular-posts-link"') + 1) or
+                   (page.find('<span class="thumb') + 1))
+            yield from text.extract_iter(page, ' href="/posts/', '"', pos)

             next_url = text.extract(page, 'next-page-url="', '"', pos)[0]
             if not next_url:
                 return

-            next_params = text.parse_query(text.unescape(
-                next_url).lstrip("?/"))
+            next_params = text.parse_query(text.unescape(text.unescape(
+                next_url).lstrip("?/")))

             if "next" in next_params:
                 # stop if the same "next" value occurs twice in a row (#265)
@@ -201,8 +218,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
     subcategory = "pool"
     directory_fmt = ("{category}", "pool", "{pool}")
     archive_fmt = "p_{pool}_{id}"
-    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)"
-    example = "https://idol.sankakucomplex.com/pool/show/12345"
+    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pools?/show/(\d+)"
+    example = "https://idol.sankakucomplex.com/pools/show/12345"
     per_page = 24

     def __init__(self, match):
@@ -219,15 +236,17 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
         return {"pool": self.pool_id}

     def post_ids(self):
-        url = self.root + "/pool/show/" + self.pool_id
+        url = self.root + "/pools/show/" + self.pool_id
         params = {"page": self.start_page}

         while True:
             page = self.request(url, params=params, retries=10).text
-            ids = list(text.extract_iter(page, '" id=p', '>'))
+            pos = page.find('id="pool-show"') + 1
+            post_ids = list(text.extract_iter(
+                page, ' href="/posts/', '"', pos))

-            yield from ids
-            if len(ids) < self.per_page:
+            yield from post_ids
+            if len(post_ids) < self.per_page:
                 return
             params["page"] += 1

@@ -236,8 +255,8 @@ class IdolcomplexPostExtractor(IdolcomplexExtractor):
     """Extractor for single images from idol.sankakucomplex.com"""
     subcategory = "post"
     archive_fmt = "{id}"
-    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)"
-    example = "https://idol.sankakucomplex.com/post/show/12345"
+    pattern = BASE_PATTERN + r"/posts?/(?:show/)?([0-9a-f]+)"
+    example = "https://idol.sankakucomplex.com/posts/0123456789abcdef"

     def __init__(self, match):
         IdolcomplexExtractor.__init__(self, match)
diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py
index aca101e..3bdcfdf 100644
--- a/gallery_dl/extractor/imagefap.py
+++ b/gallery_dl/extractor/imagefap.py
@@ -126,14 +126,15 @@ class ImagefapImageExtractor(ImagefapExtractor):
         url = "{}/photo/{}/".format(self.root, self.image_id)
         page = self.request(url).text

+        url, pos = text.extract(
+            page, 'original="', '"')
         info, pos = text.extract(
-            page, '<script type="application/ld+json">', '</script>')
+            page, '<script type="application/ld+json">', '</script>', pos)
         image_id, pos = text.extract(
             page, 'id="imageid_input" value="', '"', pos)
         gallery_id, pos = text.extract(
             page, 'id="galleryid_input" value="', '"', pos)
         info = util.json_loads(info)
-        url = info["contentUrl"]

         return url, text.nameext_from_url(url, {
             "title": text.unescape(info["name"]),
diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py
index b0789be..8ec6741 100644
--- a/gallery_dl/extractor/instagram.py
+++ b/gallery_dl/extractor/instagram.py
@@ -217,9 +217,10 @@ class InstagramExtractor(Extractor):
                     data["post_shortcode"])
                 continue

-            if "video_versions" in item:
+            video_versions = item.get("video_versions")
+            if video_versions:
                 video = max(
-                    item["video_versions"],
+                    video_versions,
                     key=lambda x: (x["width"], x["height"], x["type"]),
                 )
                 media = video
diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py
index 8f19374..4b017dc 100644
--- a/gallery_dl/extractor/mangaread.py
+++ b/gallery_dl/extractor/mangaread.py
@@ -50,8 +50,8 @@ class MangareadChapterExtractor(MangareadBase, ChapterExtractor):
         page = text.extr(
             page, '<div class="reading-content">', '<div class="entry-header')
         return [
-            (url.strip(), None)
-            for url in text.extract_iter(page, 'data-src="', '"')
+            (text.extr(img, 'src="', '"').strip(), None)
+            for img in text.extract_iter(page, '<img id="image-', '>')
         ]
diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py
index 3c2b03e..c5fe840 100644
--- a/gallery_dl/extractor/mastodon.py
+++ b/gallery_dl/extractor/mastodon.py
@@ -45,6 +45,9 @@ class MastodonExtractor(BaseExtractor):
             attachments = status["media_attachments"]
             del status["media_attachments"]

+            if status["reblog"]:
+                attachments.extend(status["reblog"]["media_attachments"])
+
             status["instance"] = self.instance
             acct = status["account"]["acct"]
             status["instance_remote"] = \
@@ -113,7 +116,10 @@ class MastodonUserExtractor(MastodonExtractor):

         return api.account_statuses(
             api.account_id_by_username(self.item),
-            only_media=not self.config("text-posts", False),
+            only_media=(
+                not self.reblogs and
+                not self.config("text-posts", False)
+            ),
             exclude_replies=not self.replies,
         )
diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py
index 9f5cc9d..bc7b308 100644
--- a/gallery_dl/extractor/nitter.py
+++ b/gallery_dl/extractor/nitter.py
@@ -96,6 +96,8 @@ class NitterExtractor(BaseExtractor):

                 for url in text.extract_iter(
                         attachments, '<source src="', '"'):
+                    if url[0] == "/":
+                        url = self.root + url
                     append(text.nameext_from_url(url, {"url": url}))

             else:
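In the idolcomplex rewrite above, tag grouping now always runs (the old "tags" option is gone) and the combined "tags" string is rebuilt from the grouped values. A standalone sketch of that grouping step, using the same compiled regex as the diff:

    import collections
    import re

    find_tags = re.compile(
        r'tag-type-([^"]+)">\s*<div [^>]+>\s*<a href="/\?tags=([^"]+)').findall

    def group_tags(sidebar_html):
        # Group sidebar tags by type ("tags_idol", "tags_general", ...)
        # and join everything into a single "tags" string, like the new
        # _extract_post() does.
        tags = collections.defaultdict(list)
        for tag_type, tag_name in find_tags(sidebar_html or ""):
            tags[tag_type].append(tag_name)
        data = {"tags_" + k: " ".join(v) for k, v in tags.items()}
        data["tags"] = " ".join(n for v in tags.values() for n in v)
        return data
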
diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py
index 45313c5..d1f135d 100644
--- a/gallery_dl/extractor/oauth.py
+++ b/gallery_dl/extractor/oauth.py
@@ -11,7 +11,7 @@
 from .common import Extractor, Message
 from .. import text, oauth, util, config, exception
 from ..output import stdout_write
-from ..cache import cache
+from ..cache import cache, memcache
 import urllib.parse
 import binascii
 import hashlib
@@ -31,6 +31,9 @@ class OAuthBase(Extractor):

     def _init(self):
         self.cache = config.get(("extractor", self.category), "cache", True)
+        if self.cache and cache is memcache:
+            self.log.warning("cache file is not writeable")
+            self.cache = False

     def oauth_config(self, key, default=None):
         value = config.interpolate(("extractor", self.subcategory), key)
diff --git a/gallery_dl/extractor/pixeldrain.py b/gallery_dl/extractor/pixeldrain.py
new file mode 100644
index 0000000..34b4ebf
--- /dev/null
+++ b/gallery_dl/extractor/pixeldrain.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2023 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://pixeldrain.com/"""
+
+from .common import Extractor, Message
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?pixeldrain\.com"
+
+
+class PixeldrainExtractor(Extractor):
+    """Base class for pixeldrain extractors"""
+    category = "pixeldrain"
+    root = "https://pixeldrain.com"
+    archive_fmt = "{id}"
+
+    def _init(self):
+        api_key = self.config("api-key")
+        if api_key:
+            self.session.auth = ("", api_key)
+
+    def parse_datetime(self, date_string):
+        return text.parse_datetime(
+            date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
+
+
+class PixeldrainFileExtractor(PixeldrainExtractor):
+    """Extractor for pixeldrain files"""
+    subcategory = "file"
+    filename_fmt = "{filename[:230]} ({id}).{extension}"
+    pattern = BASE_PATTERN + r"/(?:u|api/file)/(\w+)"
+    example = "https://pixeldrain.com/u/abcdefgh"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.file_id = match.group(1)
+
+    def items(self):
+        url = "{}/api/file/{}".format(self.root, self.file_id)
+        file = self.request(url + "/info").json()
+
+        file["url"] = url + "?download"
+        file["date"] = self.parse_datetime(file["date_upload"])
+
+        text.nameext_from_url(file["name"], file)
+        yield Message.Directory, file
+        yield Message.Url, file["url"], file
+
+
+class PixeldrainAlbumExtractor(PixeldrainExtractor):
+    """Extractor for pixeldrain albums"""
+    subcategory = "album"
+    directory_fmt = ("{category}",
+                     "{album[date]:%Y-%m-%d} {album[title]} ({album[id]})")
+    filename_fmt = "{num:>03} {filename[:230]} ({id}).{extension}"
+    pattern = BASE_PATTERN + r"/(?:l|api/list)/(\w+)"
+    example = "https://pixeldrain.com/l/abcdefgh"
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.album_id = match.group(1)
+
+    def items(self):
+        url = "{}/api/list/{}".format(self.root, self.album_id)
+        album = self.request(url).json()
+
+        files = album["files"]
+        album["count"] = album["file_count"]
+        album["date"] = self.parse_datetime(album["date_created"])
+
+        del album["files"]
+        del album["file_count"]
+
+        yield Message.Directory, {"album": album}
+        for num, file in enumerate(files, 1):
+            file["album"] = album
+            file["num"] = num
+            file["url"] = url = "{}/api/file/{}?download".format(
+                self.root, file["id"])
+            file["date"] = self.parse_datetime(file["date_upload"])
+            text.nameext_from_url(file["name"], file)
+            yield Message.Url, url, file
diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py
index c5ce832..7ff40a3 100644
--- a/gallery_dl/extractor/pornhub.py
+++ b/gallery_dl/extractor/pornhub.py
@@ -143,7 +143,7 @@ class PornhubGifExtractor(PornhubExtractor):
             "url"  : extr('"contentUrl": "', '"'),
             "date" : text.parse_datetime(
                 extr('"uploadDate": "', '"'), "%Y-%m-%d"),
-            "user" : extr('data-mxptext="', '"'),
+            "user" : text.remove_html(extr("Created by:", "</div>")),
         }

         yield Message.Directory, gif
diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py
index bebea2a..8941258 100644
--- a/gallery_dl/extractor/sankaku.py
+++ b/gallery_dl/extractor/sankaku.py
@@ -117,7 +117,7 @@ class SankakuPoolExtractor(SankakuExtractor):
     subcategory = "pool"
     directory_fmt = ("{category}", "pool", "{pool[id]} {pool[name_en]}")
     archive_fmt = "p_{pool}_{id}"
-    pattern = BASE_PATTERN + r"/(?:books|pool/show)/(\d+)"
+    pattern = BASE_PATTERN + r"/(?:books|pools?/show)/(\d+)"
     example = "https://sankaku.app/books/12345"

     def __init__(self, match):
@@ -143,7 +143,7 @@ class SankakuPostExtractor(SankakuExtractor):
     """Extractor for single posts from sankaku.app"""
    subcategory = "post"
     archive_fmt = "{id}"
-    pattern = BASE_PATTERN + r"/post(?:s|/show)/([0-9a-f]+)"
+    pattern = BASE_PATTERN + r"/posts?(?:/show)?/([0-9a-f]+)"
     example = "https://sankaku.app/post/show/12345"

     def __init__(self, match):
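The new pixeldrain extractor above authenticates by sending the API key as HTTP Basic auth with an empty username, and reads metadata from the file's "/info" endpoint. A requests-only sketch of the same calls (API key placeholder; file ID taken from the diff's example URL):

    import requests

    session = requests.Session()
    session.auth = ("", "YOUR_API_KEY")   # empty user, key as password

    # "/info" returns JSON metadata; "?download" fetches the file itself
    info = session.get(
        "https://pixeldrain.com/api/file/abcdefgh/info").json()
    print(info["name"], info["date_upload"])
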
diff --git a/gallery_dl/extractor/tmohentai.py b/gallery_dl/extractor/tmohentai.py
new file mode 100644
index 0000000..9c29727
--- /dev/null
+++ b/gallery_dl/extractor/tmohentai.py
@@ -0,0 +1,48 @@
+# -*- coding: utf-8 -*-
+
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://tmohentai.com/"""
+
+from .common import GalleryExtractor
+from .. import text
+
+BASE_PATTERN = r"(?:https?://)?tmohentai\.com"
+
+
+class TmohentaiGalleryExtractor(GalleryExtractor):
+    category = "tmohentai"
+    root = "http://tmohentai.com"
+    directory_fmt = ("{category}", "{title} ({gallery_id})")
+    pattern = BASE_PATTERN + r"/(?:contents|reader)/(\w+)"
+    example = "https://tmohentai.com/contents/12345a67b89c0"
+
+    def __init__(self, match):
+        self.gallery_id = match.group(1)
+        url = "{}/contents/{}".format(self.root, self.gallery_id)
+        GalleryExtractor.__init__(self, match, url)
+
+    def images(self, page):
+        fmt = "https://imgrojo.tmohentai.com/contents/{}/{{:>03}}.webp".format(
+            self.gallery_id).format
+        cnt = page.count('class="lanzador')
+        return [(fmt(i), None) for i in range(0, cnt)]
+
+    def metadata(self, page):
+        extr = text.extract_from(page)
+
+        return {
+            "gallery_id": self.gallery_id,
+            "title"     : text.unescape(extr("<h3>", "<").strip()),
+            "artists"   : text.split_html(extr(
+                "<label>Artists and Artists Groups</label>", "</ul>")),
+            "genres"    : text.split_html(extr(
+                "<label>Genders</label>", "</ul>")),
+            "tags"      : text.split_html(extr(
+                "<label>Tags</label>", "</ul>")),
+            "uploader"  : text.remove_html(extr(
+                "<label>Uploaded By</label>", "</ul>")),
+            "language"  : extr("&nbsp;", "\n"),
+        }
diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py
index 3dab16e..f50ddb7 100644
--- a/gallery_dl/extractor/tumblr.py
+++ b/gallery_dl/extractor/tumblr.py
@@ -322,12 +322,15 @@ class TumblrDayExtractor(TumblrExtractor):
     def __init__(self, match):
         TumblrExtractor.__init__(self, match)
         year, month, day = match.group(4).split("/")
-        self.date_min = (
-            # 719163 == date(1970, 1, 1).toordinal()
-            date(int(year), int(month), int(day)).toordinal() - 719163) * 86400
+        self.ordinal = date(int(year), int(month), int(day)).toordinal()

     def _init(self):
         TumblrExtractor._init(self)
+
+        self.date_min = (
+            # 719163 == date(1970, 1, 1).toordinal()
+            (self.ordinal - 719163) * 86400)
+
         self.api.before = self.date_min + 86400

     def posts(self):
diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py
index 4766ae5..ca1e906 100644
--- a/gallery_dl/extractor/twitter.py
+++ b/gallery_dl/extractor/twitter.py
@@ -43,6 +43,7 @@ class TwitterExtractor(Extractor):
         self.quoted = self.config("quoted", False)
         self.videos = self.config("videos", True)
         self.cards = self.config("cards", False)
+        self.ads = self.config("ads", False)
         self.cards_blacklist = self.config("cards-blacklist")
         self.syndication = self.config("syndication")

@@ -1034,7 +1035,7 @@ class TwitterAPI():
             "focalTweetId": tweet_id,
             "referrer": "profile",
             "with_rux_injections": False,
-            "includePromotedContent": True,
+            "includePromotedContent": False,
             "withCommunity": True,
             "withQuickPromoteEligibilityTweetFields": True,
             "withBirdwatchNotes": True,
@@ -1049,7 +1050,7 @@ class TwitterAPI():
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
-            "includePromotedContent": True,
+            "includePromotedContent": False,
             "withQuickPromoteEligibilityTweetFields": True,
             "withVoice": True,
             "withV2Timeline": True,
@@ -1061,7 +1062,7 @@ class TwitterAPI():
         variables = {
             "userId": self._user_id_by_screen_name(screen_name),
             "count": 100,
-            "includePromotedContent": True,
+            "includePromotedContent": False,
             "withCommunity": True,
             "withVoice": True,
             "withV2Timeline": True,
@@ -1498,13 +1499,21 @@ class TwitterAPI():

             for entry in tweets:
                 try:
-                    tweet = ((entry.get("content") or entry["item"])
-                             ["itemContent"]["tweet_results"]["result"])
+                    item = ((entry.get("content") or entry["item"])
+                            ["itemContent"])
+                    if "promotedMetadata" in item and not extr.ads:
+                        extr.log.debug(
+                            "Skipping %s (ad)",
+                            (entry.get("entryId") or "").rpartition("-")[2])
+                        continue
+
+                    tweet = item["tweet_results"]["result"]
                     if "tombstone" in tweet:
                         tweet = self._process_tombstone(
                             entry, tweet["tombstone"])
                         if not tweet:
                             continue
+
                     if "tweet" in tweet:
                         tweet = tweet["tweet"]
                     legacy = tweet["legacy"]
diff --git a/gallery_dl/extractor/wallpapercave.py b/gallery_dl/extractor/wallpapercave.py
index bce1026..faf3b0d 100644
--- a/gallery_dl/extractor/wallpapercave.py
+++ b/gallery_dl/extractor/wallpapercave.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-

 # Copyright 2021 David Hoppenbrouwers
+# Copyright 2023 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -22,7 +23,20 @@ class WallpapercaveImageExtractor(Extractor):

     def items(self):
         page = self.request(text.ensure_http_scheme(self.url)).text
+
+        path = None
         for path in text.extract_iter(page, 'class="download" href="', '"'):
             image = text.nameext_from_url(path)
             yield Message.Directory, image
             yield Message.Url, self.root + path, image
+
+        if path is None:
+            try:
+                path = text.rextract(
+                    page, 'href="', '"', page.index('id="tdownload"'))[0]
+            except Exception:
+                pass
+            else:
+                image = text.nameext_from_url(path)
+                yield Message.Directory, image
+                yield Message.Url, self.root + path, image
diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py
index 8e6b842..3bb635d 100644
--- a/gallery_dl/extractor/warosu.py
+++ b/gallery_dl/extractor/warosu.py
@@ -90,4 +90,7 @@ class WarosuThreadExtractor(Extractor):
         data["filename"] = text.unquote(extr(
             "", "<").rstrip().rpartition(".")[0])
         extr("<br>", "")
-        data["image"] = self.root + extr("<a href=", ">")
+
+        data["image"] = url = extr("<a href=", ">")
+        if url[0] == "/":
+            data["image"] = self.root + url
diff --git a/gallery_dl/extractor/webtoons.py b/gallery_dl/extractor/webtoons.py
index dc9a4f1..3f2f410 100644
--- a/gallery_dl/extractor/webtoons.py
+++ b/gallery_dl/extractor/webtoons.py
@@ -146,7 +146,12 @@ class WebtoonsComicExtractor(WebtoonsBase, Extractor):
             if page and path not in page:
                 return

-            page = self.request(self.root + path).text
+            response = self.request(self.root + path)
+            if response.history:
+                parts = response.url.split("/")
+                self.path = "/".join(parts[3:-1])
+
+            page = response.text
             data["page"] = self.page_no

             for url in self.get_episode_urls(page):
diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py
index ed05e1f..7413b5a 100644
--- a/gallery_dl/extractor/weibo.py
+++ b/gallery_dl/extractor/weibo.py
@@ -41,9 +41,14 @@ class WeiboExtractor(Extractor):
     def request(self, url, **kwargs):
         response = Extractor.request(self, url, **kwargs)

-        if response.history and "passport.weibo.com" in response.url:
-            self._sina_visitor_system(response)
-            response = Extractor.request(self, url, **kwargs)
+        if response.history:
+            if "login.sina.com" in response.url:
+                raise exception.StopExtraction(
+                    "HTTP redirect to login page (%s)",
+                    response.url.partition("?")[0])
+            if "passport.weibo.com" in response.url:
+                self._sina_visitor_system(response)
+                response = Extractor.request(self, url, **kwargs)

         return response
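The twitter.py change above requests timelines with includePromotedContent disabled and additionally drops any entry carrying promotedMetadata unless the new "ads" option is enabled. A standalone sketch of that filter (entry layout as in the diff; error handling omitted):

    def filter_ads(entries, ads=False):
        # Skip timeline entries marked as promoted, like the new
        # TwitterAPI loop does when the "ads" option is disabled.
        for entry in entries:
            item = (entry.get("content") or entry["item"])["itemContent"]
            if "promotedMetadata" in item and not ads:
                continue
            yield item["tweet_results"]["result"]
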
diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py
index a28d8f5..46e574e 100644
--- a/gallery_dl/extractor/xvideos.py
+++ b/gallery_dl/extractor/xvideos.py
@@ -38,13 +38,13 @@ class XvideosGalleryExtractor(XvideosBase, GalleryExtractor):

     def metadata(self, page):
         extr = text.extract_from(page)
-        title = extr('"title":"', '"')
         user = {
             "id"     : text.parse_int(extr('"id_user":', ',')),
             "display": extr('"display":"', '"'),
             "sex"    : extr('"sex":"', '"'),
             "name"   : self.user,
         }
+        title = extr('"title":"', '"')

         user["description"] = extr(
             '<small class="mobile-hide">', '</small>').strip()
         tags = extr('<em>Tagged:</em>', '<').strip()
diff --git a/gallery_dl/extractor/zerochan.py b/gallery_dl/extractor/zerochan.py
index 5fe1943..1307399 100644
--- a/gallery_dl/extractor/zerochan.py
+++ b/gallery_dl/extractor/zerochan.py
@@ -63,14 +63,14 @@ class ZerochanExtractor(BooruExtractor):
         data = {
             "id"      : text.parse_int(entry_id),
-            "author"  : extr('"author": "', '"'),
+            "author"  : text.parse_unicode_escapes(extr(' "name": "', '"')),
             "file_url": extr('"contentUrl": "', '"'),
             "date"    : text.parse_datetime(extr('"datePublished": "', '"')),
             "width"   : text.parse_int(extr('"width": "', ' ')),
             "height"  : text.parse_int(extr('"height": "', ' ')),
             "size"    : text.parse_bytes(extr('"contentSize": "', 'B')),
             "path"    : text.split_html(extr(
-                'class="breadcrumbs', '</p>'))[2:],
+                'class="breadcrumbs', '</nav>'))[2:],
             "uploader": extr('href="/user/', '"'),
             "tags"    : extr('<ul id="tags"', '</ul>'),
             "source"  : extr('<h2>Source</h2>', '</p><h2>').rpartition(
@@ -80,9 +80,9 @@ class ZerochanExtractor(BooruExtractor):
         html = data["tags"]
         tags = data["tags"] = []
         for tag in html.split("<li class=")[1:]:
-            category = text.extr(tag, 'alt="', '"')
-            name = text.extr(tag, ">-->", "</a>")
-            tags.append(category + ":" + name.strip())
+            category = text.extr(tag, 'data-type="', '"')
+            name = text.extr(tag, 'data-tag="', '"')
+            tags.append(category.capitalize() + ":" + name)
         return data
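The zerochan change above reads tag entries from data attributes instead of the anchor text. A minimal sketch of the new parsing using gallery_dl's own text helpers (sample HTML hypothetical):

    from gallery_dl import text  # assumes gallery_dl is installed

    tag_html = '<li class="tag" data-type="character" data-tag="Hatsune Miku">'
    category = text.extr(tag_html, 'data-type="', '"')
    name = text.extr(tag_html, 'data-tag="', '"')
    print(category.capitalize() + ":" + name)   # Character:Hatsune Miku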
