Diffstat (limited to 'gallery_dl/extractor/sankakucomplex.py')
 gallery_dl/extractor/sankakucomplex.py | 79 +++++++++++++++++++-----------
 1 file changed, 50 insertions(+), 29 deletions(-)
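The change below (issue #308 in the test comments) reworks SankakucomplexArticleExtractor: the old images() helper, which walked the page with repeated text.extract() calls and manual de-duplication, is replaced by three static methods that scan the captured entry-content block, and two options are added, "videos" (on by default) for <source> URLs and "embeds" (off by default) for iframe embeds returned as ytdl: URLs. The following standalone sketch mirrors that extraction logic using only the standard library; the sample markup and the extract_* helper names are illustrative and not part of the commit:

import re

# Made-up sample of an article body, for illustration only.
SAMPLE = """
<div class="entry-content">
  <img data-lazy-src="/wp-content/uploads/2019/06/photo-1-150x105.jpg">
  <img data-lazy-src="/wp-content/uploads/2019/06/photo-1-150x105.jpg">
  <video><source type="video/mp4" src="/wp-content/uploads/2019/06/clip-1.mp4"></video>
  <iframe src="https://www.youtube.com/embed/abc123"></iframe>
</div>
"""

def extract_images(content):
    # data-lazy-src holds a downscaled copy such as 'photo-1-150x105.jpg';
    # stripping the '-<width>x<height>.' suffix yields the original upload.
    # dict.fromkeys() de-duplicates while keeping order, much like util.unique.
    urls = re.findall(r'data-lazy-src="([^"]+)"', content)
    return [re.sub(r"-\d+x\d+\.", ".", url) for url in dict.fromkeys(urls)]

def extract_videos(content):
    # same pattern the new _extract_videos() uses for <source> elements
    return re.findall(r"<source [^>]*src=[\"']([^\"']+)", content)

def extract_embeds(content):
    # iframe embeds get a 'ytdl:' prefix so gallery-dl forwards them to youtube-dl
    return ["ytdl:" + url for url in
            re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content)]

print(extract_images(SAMPLE))   # ['/wp-content/uploads/2019/06/photo-1.jpg']
print(extract_videos(SAMPLE))   # ['/wp-content/uploads/2019/06/clip-1.mp4']
print(extract_embeds(SAMPLE))   # ['ytdl:https://www.youtube.com/embed/abc123']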
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
index f6ad327..972750c 100644
--- a/gallery_dl/extractor/sankakucomplex.py
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-# Copyright 2019 Mike Fährmann
+# Copyright 2019-2020 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -9,7 +9,7 @@
"""Extractors for https://www.sankakucomplex.com/"""
from .common import Extractor, Message
-from .. import text
+from .. import text, util
import re
@@ -40,6 +40,21 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
"url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
"keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
}),
+ # videos (#308)
+ (("https://www.sankakucomplex.com/2019/06/11"
+ "/darling-ol-goddess-shows-off-her-plump-lower-area/"), {
+ "pattern": r"/wp-content/uploads/2019/06/[^/]+\d\.mp4",
+ "range": "26-",
+ "count": 5,
+ }),
+ # youtube embeds (#308)
+ (("https://www.sankakucomplex.com/2015/02/12"
+ "/snow-miku-2015-live-magical-indeed/"), {
+ "options": (("embeds", True),),
+ "pattern": r"https://www.youtube.com/embed/",
+ "range": "2-",
+ "count": 2,
+ }),
)
def items(self):
@@ -53,38 +68,44 @@ class SankakucomplexArticleExtractor(SankakucomplexExtractor):
"date" : text.parse_datetime(
extr('property="article:published_time" content="', '"')),
}
- imgs = self.images(extr)
- data["count"] = len(imgs)
+ content = extr('<div class="entry-content">', '</article>')
data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
- yield Message.Version, 1
- yield Message.Directory, data
- for img in imgs:
- img.update(data)
- yield Message.Url, img["url"], img
+ files = self._extract_images(content)
+ if self.config("videos", True):
+ files += self._extract_videos(content)
+ if self.config("embeds", False):
+ files += self._extract_embeds(content)
+ data["count"] = len(files)
- def images(self, extr):
- num = 0
- imgs = []
- urls = set()
- orig = re.compile(r"-\d+x\d+\.")
-
- extr('<div class="entry-content">', '')
- while True:
- url = extr('data-lazy-src="', '"')
- if not url:
- return imgs
- if url in urls:
- continue
+ yield Message.Directory, data
+ for num, url in enumerate(files, 1):
+ file = text.nameext_from_url(url)
if url[0] == "/":
url = text.urljoin(self.root, url)
- url = orig.sub(".", url)
- num += 1
- imgs.append(text.nameext_from_url(url, {
- "url" : url,
- "num" : num,
- }))
- urls.add(url)
+ file["url"] = url
+ file["num"] = num
+ file.update(data)
+ yield Message.Url, url, file
+
+ @staticmethod
+ def _extract_images(content):
+ orig_sub = re.compile(r"-\d+x\d+\.").sub
+ return [
+ orig_sub(".", url) for url in
+ util.unique(text.extract_iter(content, 'data-lazy-src="', '"'))
+ ]
+
+ @staticmethod
+ def _extract_videos(content):
+ return re.findall(r"<source [^>]*src=[\"']([^\"']+)", content)
+
+ @staticmethod
+ def _extract_embeds(content):
+ return [
+ "ytdl:" + url for url in
+ re.findall(r"<iframe [^>]*src=[\"']([^\"']+)", content)
+ ]
class SankakucomplexTagExtractor(SankakucomplexExtractor):
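
Both new lookups go through self.config(), so they can be toggled per run without code changes. As a usage sketch, assuming the standard extractor.sankakucomplex.* option path and gallery-dl's -o <key>=<value> override (neither is shown in this diff): putting "embeds": true under extractor.sankakucomplex in the configuration file makes the extractor return the YouTube iframes as ytdl: URLs, "videos": false suppresses the .mp4 <source> URLs that are now collected by default, and a one-off run such as

    gallery-dl -o embeds=true "https://www.sankakucomplex.com/2015/02/12/snow-miku-2015-live-magical-indeed/"

achieves the same from the command line.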