diff options
Diffstat (limited to 'gallery_dl/extractor/sankakucomplex.py')
| -rw-r--r-- | gallery_dl/extractor/sankakucomplex.py | 79 |
1 file changed, 50 insertions, 29 deletions
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index f6ad327..972750c 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
 """Extractors for https://www.sankakucomplex.com/"""
 
 from .common import Extractor, Message
-from .. import text
+from .. import text, util
 import re
 
 
@@ -40,6 +40,21 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
             "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
             "keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
         }),
+        # videos (#308)
+        (("https://www.sankakucomplex.com/2019/06/11"
+          "/darling-ol-goddess-shows-off-her-plump-lower-area/"), {
+            "pattern": r"/wp-content/uploads/2019/06/[^/]+\d\.mp4",
+            "range": "26-",
+            "count": 5,
+        }),
+        # youtube embeds (#308)
+        (("https://www.sankakucomplex.com/2015/02/12"
+          "/snow-miku-2015-live-magical-indeed/"), {
+            "options": (("embeds", True),),
+            "pattern": r"https://www.youtube.com/embed/",
+            "range": "2-",
+            "count": 2,
+        }),
     )
 
     def items(self):
@@ -53,38 +68,44 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
             "date" : text.parse_datetime(
                 extr('property="article:published_time" content="', '"')),
         }
-        imgs = self.images(extr)
-        data["count"] = len(imgs)
+        content = extr('<div class="entry-content">', '</article>')
         data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
 
-        yield Message.Version, 1
-        yield Message.Directory, data
-        for img in imgs:
-            img.update(data)
-            yield Message.Url, img["url"], img
+        files = self._extract_images(content)
+        if self.config("videos", True):
+            files += self._extract_videos(content)
+        if self.config("embeds", False):
+            files += self._extract_embeds(content)
+        data["count"] = len(files)
 
-    def images(self, extr):
-        num = 0
-        imgs = []
-        urls = set()
-        orig = re.compile(r"-\d+x\d+\.")
-
-        extr('<div class="entry-content">', '')
-        while True:
-            url = extr('data-lazy-src="', '"')
-            if not url:
-                return imgs
-            if url in urls:
-                continue
+        yield Message.Directory, data
+        for num, url in enumerate(files, 1):
+            file = text.nameext_from_url(url)
             if url[0] == "/":
                 url = text.urljoin(self.root, url)
-            url = orig.sub(".", url)
-            num += 1
-            imgs.append(text.nameext_from_url(url, {
-                "url" : url,
-                "num" : num,
-            }))
-            urls.add(url)
+            file["url"] = url
+            file["num"] = num
+            file.update(data)
+            yield Message.Url, url, file
+
+    @staticmethod
+    def _extract_images(content):
+        orig_sub = re.compile(r"-\d+x\d+\.").sub
+        return [
+            orig_sub(".", url) for url in
+            util.unique(text.extract_iter(content, 'data-lazy-src="', '"'))
+        ]
+
+    @staticmethod
+    def _extract_videos(content):
+        return re.findall(r"<source [^>]*src=[\"']([^\"']+)", content)
+
+    @staticmethod
+    def _extract_embeds(content):
+        return [
+            "ytdl:" + url for url in
+            re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content)
+        ]
 
 
 class SankakucomplexTagExtractor(SankakucomplexExtractor):
