summaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/tsumino.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/tsumino.py')
-rw-r--r--gallery_dl/extractor/tsumino.py343
1 files changed, 343 insertions, 0 deletions
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
new file mode 100644
index 0000000..62a9173
--- /dev/null
+++ b/gallery_dl/extractor/tsumino.py
@@ -0,0 +1,343 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tsumino.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class TsuminoBase():
+ """Base class for tsumino extractors"""
+ category = "tsumino"
+ cookiedomain = "www.tsumino.com"
+ root = "https://www.tsumino.com"
+
+ def login(self):
+ username, password = self._get_auth_info()
+ if username:
+ self._update_cookies(self._login_impl(username, password))
+ else:
+ self.session.cookies.setdefault(
+ "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")
+
+ @cache(maxage=14*24*3600, keyarg=1)
+ def _login_impl(self, username, password):
+ self.log.info("Logging in as %s", username)
+ url = "{}/Account/Login".format(self.root)
+ headers = {"Referer": url}
+ data = {"Username": username, "Password": password}
+
+ response = self.request(url, method="POST", headers=headers, data=data)
+ if not response.history:
+ raise exception.AuthenticationError()
+ return {".aotsumino": response.history[0].cookies[".aotsumino"]}
+
+
+class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
+ """Extractor for image galleries on tsumino.com"""
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+ r"/(?:Book/Info|Read/View)/(\d+)")
+ test = (
+ ("https://www.tsumino.com/Book/Info/40996", {
+ "url": "84bf30a86623039fc87855680fada884dc8a1ddd",
+ "keyword": {
+ "title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+ "title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
+ "gallery_id": 40996,
+ "date" : "2018 June 29",
+ "count" : 42,
+ "collection": "",
+ "artist" : ["Itou Life"],
+ "group" : ["Itou Life"],
+ "parody" : ["Fate/Grand Order"],
+ "characters": list,
+ "tags" : list,
+ "type" : "Doujinshi",
+ "rating" : float,
+ "uploader" : "sehki",
+ "lang" : "en",
+ "language" : "English",
+ "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
+ },
+ }),
+ ("https://www.tsumino.com/Read/View/45834"),
+ )
+
+ def __init__(self, match):
+ self.gallery_id = match.group(1)
+ url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
+ GalleryExtractor.__init__(self, match, url)
+
+ def metadata(self, page):
+ extr = text.extract_from(page)
+ title = extr('"og:title" content="', '"')
+ title_en, _, title_jp = text.unescape(title).partition("/")
+ title_en = title_en.strip()
+ title_jp = title_jp.strip()
+
+ return {
+ "gallery_id": text.parse_int(self.gallery_id),
+ "title" : title_en or title_jp,
+ "title_en" : title_en,
+ "title_jp" : title_jp,
+ "thumbnail" : extr('"og:image" content="', '"'),
+ "uploader" : text.remove_html(extr('id="Uploader">', '</div>')),
+ "date" : extr('id="Uploaded">', '</div>').strip(),
+ "rating" : text.parse_float(extr(
+ 'id="Rating">', '</div>').partition(" ")[0]),
+ "type" : text.remove_html(extr('id="Category">' , '</div>')),
+ "collection": text.remove_html(extr('id="Collection">', '</div>')),
+ "group" : text.split_html(extr('id="Group">' , '</div>')),
+ "artist" : text.split_html(extr('id="Artist">' , '</div>')),
+ "parody" : text.split_html(extr('id="Parody">' , '</div>')),
+ "characters": text.split_html(extr('id="Character">' , '</div>')),
+ "tags" : text.split_html(extr('id="Tag">' , '</div>')),
+ "language" : "English",
+ "lang" : "en",
+ }
+
+ def images(self, page):
+ url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
+ headers = {"Referer": self.chapter_url}
+ response = self.request(url, headers=headers, expect=(404,))
+
+ if response.status_code == 404:
+ url = "{}/Read/View/{}".format(self.root, self.gallery_id)
+ self.log.error(
+ "Failed to get gallery JSON data. Visit '%s' in a browser "
+ "and solve the CAPTCHA to continue.", url)
+ raise exception.StopExtraction()
+
+ base = self.root + "/Image/Object?name="
+ return [
+ (base + text.quote(name), None)
+ for name in response.json()["reader_page_urls"]
+ ]
+
+
+class TsuminoSearchExtractor(TsuminoBase, Extractor):
+ """Extractor for search results on tsumino.com"""
+ subcategory = "search"
+ pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+ r"/(?:Books/?)?#(.+)")
+ test = (
+ ("https://www.tsumino.com/Books#?Character=Reimu+Hakurei", {
+ "pattern": TsuminoGalleryExtractor.pattern,
+ "range": "1-40",
+ "count": 40,
+ }),
+ (("http://www.tsumino.com/Books#~(Tags~(~"
+ "(Type~7~Text~'Reimu*20Hakurei~Exclude~false)~"
+ "(Type~'1~Text~'Pantyhose~Exclude~false)))#"), {
+ "pattern": TsuminoGalleryExtractor.pattern,
+ "count": ">= 3",
+ }),
+ )
+
+ def __init__(self, match):
+ Extractor.__init__(self, match)
+ self.query = match.group(1)
+
+ def items(self):
+ yield Message.Version, 1
+ for gallery in self.galleries():
+ url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
+ gallery["_extractor"] = TsuminoGalleryExtractor
+ yield Message.Queue, url, gallery
+
+ def galleries(self):
+ """Return all gallery results matching 'self.query'"""
+ url = "{}/Books/Operate".format(self.root)
+ headers = {
+ "Referer": "{}/".format(self.root),
+ "X-Requested-With": "XMLHttpRequest",
+ }
+ data = {
+ "PageNumber": 1,
+ "Text": "",
+ "Sort": "Newest",
+ "List": "0",
+ "Length": "0",
+ "MinimumRating": "0",
+ "ExcludeList": "0",
+ "CompletelyExcludeHated": "false",
+ }
+ data.update(self._parse(self.query))
+
+ while True:
+ info = self.request(
+ url, method="POST", headers=headers, data=data).json()
+
+ for gallery in info["Data"]:
+ yield gallery["Entry"]
+
+ if info["PageNumber"] >= info["PageCount"]:
+ return
+ data["PageNumber"] += 1
+
+ def _parse(self, query):
+ try:
+ if query.startswith("?"):
+ return self._parse_simple(query)
+ return self._parse_jsurl(query)
+ except Exception as exc:
+ self.log.error("Invalid search query: '%s' (%s)", query, exc)
+ raise exception.StopExtraction()
+
+ @staticmethod
+ def _parse_simple(query):
+ """Parse search query with format '?<key>=value>'"""
+ key, _, value = query.partition("=")
+ tag_types = {
+ "Tag": "1",
+ "Category": "2",
+ "Collection": "3",
+ "Group": "4",
+ "Artist": "5",
+ "Parody": "6",
+ "Character": "7",
+ "Uploader": "100",
+ }
+
+ return {
+ "Tags[0][Type]": tag_types[key[1:].capitalize()],
+ "Tags[0][Text]": text.unquote(value).replace("+", " "),
+ "Tags[0][Exclude]": "false",
+ }
+
+ @staticmethod
+ def _parse_jsurl(data):
+ """Parse search query in JSURL format
+
+ Nested lists and dicts are handled in a special way to deal
+ with the way Tsumino expects its parameters -> expand(...)
+
+ Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill))
+ Ref: https://github.com/Sage/jsurl
+ """
+ if not data:
+ return {}
+ i = 0
+ imax = len(data)
+
+ def eat(expected):
+ nonlocal i
+
+ if data[i] != expected:
+ error = "bad JSURL syntax: expected '{}', got {}".format(
+ expected, data[i])
+ raise ValueError(error)
+ i += 1
+
+ def decode():
+ nonlocal i
+
+ beg = i
+ result = ""
+
+ while i < imax:
+ ch = data[i]
+
+ if ch not in "~)*!":
+ i += 1
+
+ elif ch == "*":
+ if beg < i:
+ result += data[beg:i]
+ if data[i + 1] == "*":
+ result += chr(int(data[i+2:i+6], 16))
+ i += 6
+ else:
+ result += chr(int(data[i+1:i+3], 16))
+ i += 3
+ beg = i
+
+ elif ch == "!":
+ if beg < i:
+ result += data[beg:i]
+ result += "$"
+ i += 1
+ beg = i
+
+ else:
+ break
+
+ return result + data[beg:i]
+
+ def parse_one():
+ nonlocal i
+
+ eat('~')
+ result = ""
+ ch = data[i]
+
+ if ch == "(":
+ i += 1
+
+ if data[i] == "~":
+ result = []
+ if data[i+1] == ")":
+ i += 1
+ else:
+ result.append(parse_one())
+ while data[i] == "~":
+ result.append(parse_one())
+
+ else:
+ result = {}
+
+ if data[i] != ")":
+ while True:
+ key = decode()
+ value = parse_one()
+ for ekey, evalue in expand(key, value):
+ result[ekey] = evalue
+ if data[i] != "~":
+ break
+ i += 1
+ eat(")")
+
+ elif ch == "'":
+ i += 1
+ result = decode()
+
+ else:
+ beg = i
+ i += 1
+
+ while i < imax and data[i] not in "~)":
+ i += 1
+
+ sub = data[beg:i]
+ if ch in "0123456789-":
+ fval = float(sub)
+ ival = int(fval)
+ result = ival if ival == fval else fval
+ else:
+ if sub not in ("true", "false", "null"):
+ raise ValueError("bad value keyword: " + sub)
+ result = sub
+
+ return result
+
+ def expand(key, value):
+ if isinstance(value, list):
+ for index, cvalue in enumerate(value):
+ ckey = "{}[{}]".format(key, index)
+ yield from expand(ckey, cvalue)
+ elif isinstance(value, dict):
+ for ckey, cvalue in value.items():
+ ckey = "{}[{}]".format(key, ckey)
+ yield from expand(ckey, cvalue)
+ else:
+ yield key, value
+
+ return parse_one()