diff options
Diffstat (limited to 'gallery_dl/extractor/imagehosts.py')
| -rw-r--r-- | gallery_dl/extractor/imagehosts.py | 101 |
1 files changed, 70 insertions, 31 deletions
diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 817d2c4..21e6cf8 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -19,6 +19,7 @@ class ImagehostImageExtractor(Extractor): basecategory = "imagehost" subcategory = "image" archive_fmt = "{token}" + parent = True _https = True _params = None _cookies = None @@ -27,7 +28,10 @@ class ImagehostImageExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.page_url = f"http{'s' if self._https else ''}://{match[1]}" + if self.root: + self.page_url = f"{self.root}{match[1]}" + else: + self.page_url = f"http{'s' if self._https else ''}://{match[1]}" self.token = match[2] if self._params == "simple": @@ -53,14 +57,25 @@ class ImagehostImageExtractor(Extractor): ).text url, filename = self.get_info(page) - data = text.nameext_from_url(filename, {"token": self.token}) + if not url: + return + + if filename: + data = text.nameext_from_name(filename) + if not data["extension"]: + data["extension"] = text.ext_from_url(url) + else: + data = text.nameext_from_url(url) + data["token"] = self.token + data["post_url"] = self.page_url data.update(self.metadata(page)) + if self._https and url.startswith("http:"): url = "https:" + url[5:] if self._validate is not None: data["_http_validate"] = self._validate - yield Message.Directory, data + yield Message.Directory, "", data yield Message.Url, url, data def get_info(self, page): @@ -70,6 +85,9 @@ class ImagehostImageExtractor(Extractor): """Return additional metadata""" return () + def not_found(self, resource=None): + raise exception.NotFoundError(resource or self.__class__.subcategory) + class ImxtoImageExtractor(ImagehostImageExtractor): """Extractor for single images from imx.to""" @@ -92,7 +110,7 @@ class ImxtoImageExtractor(ImagehostImageExtractor): url, pos = text.extract( page, '<div style="text-align:center;"><a href="', '"') if not url: - raise exception.NotFoundError("image") + self.not_found() filename, pos = text.extract(page, ' title="', '"', pos) if self.url_ext and filename: filename += splitext(url)[1] @@ -152,7 +170,7 @@ class AcidimgImageExtractor(ImagehostImageExtractor): if not url: url, pos = text.extract(page, '<img class="centred" src="', '"') if not url: - raise exception.NotFoundError("image") + self.not_found() filename, pos = text.extract(page, "alt='", "'", pos) if not filename: @@ -169,7 +187,11 @@ class ImagevenueImageExtractor(ImagehostImageExtractor): example = "https://www.imagevenue.com/ME123456789" def get_info(self, page): - pos = page.index('class="card-body') + try: + pos = page.index('class="card-body') + except ValueError: + self.not_found() + url, pos = text.extract(page, '<img src="', '"', pos) if url.endswith("/loader.svg"): url, pos = text.extract(page, '<img src="', '"', pos) @@ -199,6 +221,8 @@ class ImagetwistImageExtractor(ImagehostImageExtractor): def get_info(self, page): url , pos = text.extract(page, '<img src="', '"') + if url and url.startswith("/imgs/"): + self.not_found() filename, pos = text.extract(page, ' alt="', '"', pos) return url, filename @@ -249,7 +273,7 @@ class ImgspiceImageExtractor(ImagehostImageExtractor): def get_info(self, page): pos = page.find('id="imgpreview"') if pos < 0: - raise exception.NotFoundError("image") + self.not_found() url , pos = text.extract(page, 'src="', '"', pos) name, pos = text.extract(page, 'alt="', '"', pos) return url, text.unescape(name) @@ -258,23 +282,26 @@ class ImgspiceImageExtractor(ImagehostImageExtractor): class PixhostImageExtractor(ImagehostImageExtractor): """Extractor for single images from pixhost.to""" category = "pixhost" - pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)" - r"/show/\d+/(\d+)_[^/?#]+)") + root = "https://pixhost.to" + pattern = (r"(?:https?://)?(?:www\.)?pixhost\.(?:to|org)" + r"(/show/\d+/(\d+)_[^/?#]+)") example = "https://pixhost.to/show/123/12345_NAME.EXT" _cookies = {"pixhostads": "1", "pixhosttest": "1"} def get_info(self, page): - url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"") - filename, pos = text.extract(page, "alt=\"", "\"", pos) - return url, filename + self.kwdict["directory"] = self.page_url.rsplit("/")[-2] + url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"") + name, pos = text.extract(page, "alt=\"", "\"", pos) + return url, text.unescape(name) if name else None class PixhostGalleryExtractor(ImagehostImageExtractor): """Extractor for image galleries from pixhost.to""" category = "pixhost" subcategory = "gallery" - pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)" - r"/gallery/([^/?#]+))") + root = "https://pixhost.to" + pattern = (r"(?:https?://)?(?:www\.)?pixhost\.(?:to|org)" + r"(/gallery/([^/?#]+))") example = "https://pixhost.to/gallery/ID" def items(self): @@ -288,29 +315,39 @@ class PixhostGalleryExtractor(ImagehostImageExtractor): class PostimgImageExtractor(ImagehostImageExtractor): """Extractor for single images from postimages.org""" category = "postimg" - pattern = (r"(?:https?://)?((?:www\.)?(?:postim(?:ages|g)|pixxxels)" - r"\.(?:cc|org)/(?!gallery/)(?:image/)?([^/?#]+)/?)") - example = "https://postimages.org/ID" + root = "https://postimg.cc" + pattern = (r"(?:https?://)?(?:www\.)?(?:postim(?:ages|g)|pixxxels)" + r"\.(?:cc|org)(/(?!gallery/)(?:image/)?([^/?#]+)/?)") + example = "https://postimg.cc/ID" def get_info(self, page): pos = page.index(' id="download"') url , pos = text.rextract(page, ' href="', '"', pos) - filename, pos = text.extract(page, 'class="imagename">', '<', pos) - return url, text.unescape(filename) + filename, pos = text.extract(page, ' class="my-4">', '<', pos) + return url, text.unescape(filename) if filename else None class PostimgGalleryExtractor(ImagehostImageExtractor): """Extractor for images galleries from postimages.org""" category = "postimg" subcategory = "gallery" - pattern = (r"(?:https?://)?((?:www\.)?(?:postim(?:ages|g)|pixxxels)" - r"\.(?:cc|org)/gallery/([^/?#]+))") - example = "https://postimages.org/gallery/ID" + root = "https://postimg.cc" + pattern = (r"(?:https?://)?(?:www\.)?(?:postim(?:ages|g)|pixxxels)" + r"\.(?:cc|org)(/gallery/([^/?#]+))") + example = "https://postimg.cc/gallery/ID" def items(self): page = self.request(self.page_url).text - data = {"_extractor": PostimgImageExtractor} - for url in text.extract_iter(page, ' class="thumb"><a href="', '"'): + title = text.extr( + page, 'property="og:title" content="', ' — Postimages"') + + data = { + "_extractor" : PostimgImageExtractor, + "gallery_title": text.unescape(title), + } + + for token in text.extract_iter(page, 'data-image="', '"'): + url = f"{self.root}/{token}" yield Message.Queue, url, data @@ -323,7 +360,7 @@ class TurboimagehostImageExtractor(ImagehostImageExtractor): def get_info(self, page): url = text.extract(page, 'src="', '"', page.index("<img "))[0] - return url, url + return url, None class TurboimagehostGalleryExtractor(ImagehostImageExtractor): @@ -343,7 +380,7 @@ class TurboimagehostGalleryExtractor(ImagehostImageExtractor): if params["p"] == 1 and \ "Requested gallery don`t exist on our website." in page: - raise exception.NotFoundError("gallery") + self.not_found() thumb_url = None for thumb_url in text.extract_iter(page, '"><a href="', '"'): @@ -362,7 +399,7 @@ class ViprImageExtractor(ImagehostImageExtractor): def get_info(self, page): url = text.extr(page, '<img src="', '"') - return url, url + return url, None class ImgclickImageExtractor(ImagehostImageExtractor): @@ -439,14 +476,16 @@ class ImgdriveImageExtractor(ImagehostImageExtractor): class SilverpicImageExtractor(ImagehostImageExtractor): """Extractor for single images from silverpic.com""" category = "silverpic" - pattern = (r"(?:https?://)?((?:www\.)?silverpic\.com" - r"/([a-z0-9]{10,})/[\S]+\.html)") - example = "https://silverpic.com/a1b2c3d4f5g6/NAME.EXT.html" + root = "https://silverpic.net" + _params = "complex" + pattern = (r"(?:https?://)?(?:www\.)?silverpic\.(?:net|com)" + r"(/([a-z0-9]{10,})/[\S]+\.html)") + example = "https://silverpic.net/a1b2c3d4f5g6/NAME.EXT.html" def get_info(self, page): url, pos = text.extract(page, '<img src="/img/', '"') alt, pos = text.extract(page, 'alt="', '"', pos) - return f"https://silverpic.com/img/{url}", alt + return f"{self.root}/img/{url}", alt def metadata(self, page): pos = page.find('<img src="/img/') |
