Diffstat (limited to 'gallery_dl/extractor/sankakucomplex.py')
-rw-r--r--  gallery_dl/extractor/sankakucomplex.py  120
1 file changed, 120 insertions, 0 deletions
diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py
new file mode 100644
index 0000000..22b2b63
--- /dev/null
+++ b/gallery_dl/extractor/sankakucomplex.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.sankakucomplex.com/"""
+
+from .common import Extractor, Message
+from .. import text
+import re
+
+
+class SankakucomplexExtractor(Extractor):
+ """Base class for sankakucomplex extractors"""
+ category = "sankakucomplex"
+ root = "https://www.sankakucomplex.com"
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.path = match.group(1)
+
+
+class SankakucomplexArticleExtractor(SankakucomplexExtractor):
+ """Extractor for articles on www.sankakucomplex.com"""
+ subcategory = "article"
+ directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}")
+ filename_fmt = "{filename}.{extension}"
+ archive_fmt = "{date:%Y%m%d}_{filename}"
+ pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ r"/(\d{4}/\d\d/\d\d/[^/?&#]+)")
+ test = (
+ ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
+ "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
+ "keyword": "4b3b5766b277a5d0acbec90fa8f2343262b07efd",
+ }),
+ ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
+ "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
+ "keyword": "f47a416d680717855bbc3e4f0cd44479f61d9aa4",
+ }),
+ )
+
+    def items(self):
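+        # fetch the article page and read its metadata from the meta tags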
+ url = "{}/{}/?pg=X".format(self.root, self.path)
+ extr = text.extract_from(self.request(url).text)
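+        # extract_from() returns a helper that pulls out successive
+        # substrings, advancing through the document on each call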
+        data = {
+            "title"      : text.unescape(
+                extr('property="og:title" content="', '"')),
+            "description": text.unescape(
+                extr('property="og:description" content="', '"')),
+            "date"       : text.parse_datetime(
+                extr('property="article:published_time" content="', '"')),
+        }
+        imgs = self.images(extr)
+        data["count"] = len(imgs)
+ data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
+
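+        # emit the directory first, then one URL message per image,
+        # each carrying the article metadata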
+        yield Message.Version, 1
+        yield Message.Directory, data
+        for img in imgs:
+            img.update(data)
+            yield Message.Url, img["url"], img
+
+    def images(self, extr):
+        num = 0
+        imgs = []
+        urls = set()
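+        # matches "-<width>x<height>." size suffixes; removing them below
+        # turns thumbnail URLs into full-size image URLs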
+        orig = re.compile(r"-\d+x\d+\.")
+
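+        # skip ahead to the article body so only images inside the
+        # entry content are collected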
+ extr('<div class="entry-content">', '')
+ while True:
+ url = extr('data-lazy-src="', '"')
+ if not url:
+ return imgs
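+            # skip images that appear more than once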
+            if url in urls:
+                continue
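+            # resolve root-relative and protocol-relative URLs
+            # against the site root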
+            if url[0] == "/":
+                url = text.urljoin(self.root, url)
+            url = orig.sub(".", url)
+            num += 1
+            imgs.append(text.nameext_from_url(url, {
+                "url" : url,
+                "num" : num,
+            }))
+            urls.add(url)
+
+
+class SankakucomplexTagExtractor(SankakucomplexExtractor):
+ """Extractor for sankakucomplex blog articles by tag or author"""
+ subcategory = "tag"
+ pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
+ r"/((?:tag|category|author)/[^/&?#]+)")
+ test = (
+ ("https://www.sankakucomplex.com/tag/cosplay/", {
+ "range": "1-50",
+ "count": 50,
+ "pattern": SankakucomplexArticleExtractor.pattern,
+ }),
+ ("https://www.sankakucomplex.com/category/anime/"),
+ ("https://www.sankakucomplex.com/author/rift/page/5/"),
+ )
+
+    def items(self):
+        pnum = 1
+        last = None
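+        # "_extractor" sends the queued URLs straight to the article
+        # extractor instead of matching them against all patterns again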
+ data = {"_extractor": SankakucomplexArticleExtractor}
+
+ yield Message.Version, 1
+ while True:
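+            # request listing pages one by one; a 404 marks the end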
+ url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
+ response = self.request(url, expect=(404,))
+ if response.status_code == 404:
+ return
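+            # article links are stored in 'data-direct' attributes;
+            # comparing against 'last' drops immediate duplicates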
+            for url in text.extract_iter(response.text, 'data-direct="', '"'):
+                if url != last:
+                    last = url
+                    yield Message.Queue, url, data
+            pnum += 1