1 files changed, 343 insertions, 0 deletions
diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py
new file mode 100644
index 0000000..62a9173
--- /dev/null
+++ b/gallery_dl/extractor/tsumino.py
@@ -0,0 +1,343 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for https://www.tsumino.com/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, exception
+from ..cache import cache
+
+
+class TsuminoBase():
+    """Base class for tsumino extractors"""
+    category = "tsumino"
+    cookiedomain = "www.tsumino.com"
+    root = "https://www.tsumino.com"
+
+    def login(self):
+        username, password = self._get_auth_info()
+        if username:
+            self._update_cookies(self._login_impl(username, password))
+        else:
+            self.session.cookies.setdefault(
+                "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5")
+
+    @cache(maxage=14*24*3600, keyarg=1)
+    def _login_impl(self, username, password):
+        self.log.info("Logging in as %s", username)
+        url = "{}/Account/Login".format(self.root)
+        headers = {"Referer": url}
+        data = {"Username": username, "Password": password}
+
+        response = self.request(url, method="POST", headers=headers, data=data)
+        if not response.history:
+            raise exception.AuthenticationError()
+        return {".aotsumino": response.history[0].cookies[".aotsumino"]}
+
+
+class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor):
+    """Extractor for image galleries on tsumino.com"""
+    pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+               r"/(?:Book/Info|Read/View)/(\d+)")
+    test = (
+        ("https://www.tsumino.com/Book/Info/40996", {
+            "url": "84bf30a86623039fc87855680fada884dc8a1ddd",
+            "keyword": {
+                "title"     : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+                "title_en"  : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou",
+                "title_jp"  : "シコシコ大好きナイチンゲール + 会場限定おまけ本",
+                "gallery_id": 40996,
+                "date"      : "2018 June 29",
+                "count"     : 42,
+                "collection": "",
+                "artist"    : ["Itou Life"],
+                "group"     : ["Itou Life"],
+                "parody"    : ["Fate/Grand Order"],
+                "characters": list,
+                "tags"      : list,
+                "type"      : "Doujinshi",
+                "rating"    : float,
+                "uploader"  : "sehki",
+                "lang"      : "en",
+                "language"  : "English",
+                "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996",
+            },
+        }),
+        ("https://www.tsumino.com/Read/View/45834"),
+    )
+
+    def __init__(self, match):
+        self.gallery_id = match.group(1)
+        url = "{}/Book/Info/{}".format(self.root, self.gallery_id)
+        GalleryExtractor.__init__(self, match, url)
+
+    def metadata(self, page):
+        extr = text.extract_from(page)
+        title = extr('"og:title" content="', '"')
+        title_en, _, title_jp = text.unescape(title).partition("/")
+        title_en = title_en.strip()
+        title_jp = title_jp.strip()
+
+        return {
+            "gallery_id": text.parse_int(self.gallery_id),
+            "title"     : title_en or title_jp,
+            "title_en"  : title_en,
+            "title_jp"  : title_jp,
+            "thumbnail" : extr('"og:image" content="', '"'),
+            "uploader"  : text.remove_html(extr('id="Uploader">', '</div>')),
+            "date"      : extr('id="Uploaded">', '</div>').strip(),
+            "rating"    : text.parse_float(extr(
+                'id="Rating">', '</div>').partition(" ")[0]),
+            "type"      : text.remove_html(extr('id="Category">'  , '</div>')),
+            "collection": text.remove_html(extr('id="Collection">', '</div>')),
+            "group"     : text.split_html(extr('id="Group">'      , '</div>')),
+            "artist"    : text.split_html(extr('id="Artist">'     , '</div>')),
+            "parody"    : text.split_html(extr('id="Parody">'     , '</div>')),
+            "characters": text.split_html(extr('id="Character">'  , '</div>')),
+            "tags"      : text.split_html(extr('id="Tag">'        , '</div>')),
+            "language"  : "English",
+            "lang"      : "en",
+        }
+
+    def images(self, page):
+        url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id)
+        headers = {"Referer": self.chapter_url}
+        response = self.request(url, headers=headers, expect=(404,))
+
+        if response.status_code == 404:
+            url = "{}/Read/View/{}".format(self.root, self.gallery_id)
+            self.log.error(
+                "Failed to get gallery JSON data. Visit '%s' in a browser "
+                "and solve the CAPTCHA to continue.", url)
+            raise exception.StopExtraction()
+
+        base = self.root + "/Image/Object?name="
+        return [
+            (base + text.quote(name), None)
+            for name in response.json()["reader_page_urls"]
+        ]
+
+
+class TsuminoSearchExtractor(TsuminoBase, Extractor):
+    """Extractor for search results on tsumino.com"""
+    subcategory = "search"
+    pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com"
+               r"/(?:Books/?)?#(.+)")
+    test = (
+        ("https://www.tsumino.com/Books#?Character=Reimu+Hakurei", {
+            "pattern": TsuminoGalleryExtractor.pattern,
+            "range": "1-40",
+            "count": 40,
+        }),
+        (("http://www.tsumino.com/Books#~(Tags~(~"
+          "(Type~7~Text~'Reimu*20Hakurei~Exclude~false)~"
+          "(Type~'1~Text~'Pantyhose~Exclude~false)))#"), {
+            "pattern": TsuminoGalleryExtractor.pattern,
+            "count": ">= 3",
+        }),
+    )
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.query = match.group(1)
+
+    def items(self):
+        yield Message.Version, 1
+        for gallery in self.galleries():
+            url = "{}/Book/Info/{}".format(self.root, gallery["Id"])
+            gallery["_extractor"] = TsuminoGalleryExtractor
+            yield Message.Queue, url, gallery
+
+    def galleries(self):
+        """Return all gallery results matching 'self.query'"""
+        url = "{}/Books/Operate".format(self.root)
+        headers = {
+            "Referer": "{}/".format(self.root),
+            "X-Requested-With": "XMLHttpRequest",
+        }
+        data = {
+            "PageNumber": 1,
+            "Text": "",
+            "Sort": "Newest",
+            "List": "0",
+            "Length": "0",
+            "MinimumRating": "0",
+            "ExcludeList": "0",
+            "CompletelyExcludeHated": "false",
+        }
+        data.update(self._parse(self.query))
+
+        while True:
+            info = self.request(
+                url, method="POST", headers=headers, data=data).json()
+
+            for gallery in info["Data"]:
+                yield gallery["Entry"]
+
+            if info["PageNumber"] >= info["PageCount"]:
+                return
+            data["PageNumber"] += 1
+
+    def _parse(self, query):
+        try:
+            if query.startswith("?"):
+                return self._parse_simple(query)
+            return self._parse_jsurl(query)
+        except Exception as exc:
+            self.log.error("Invalid search query: '%s' (%s)", query, exc)
+            raise exception.StopExtraction()
+
+    @staticmethod
+    def _parse_simple(query):
+        """Parse search query with format '?<key>=value>'"""
+        key, _, value = query.partition("=")
+        tag_types = {
+            "Tag": "1",
+            "Category": "2",
+            "Collection": "3",
+            "Group": "4",
+            "Artist": "5",
+            "Parody": "6",
+            "Character": "7",
+            "Uploader": "100",
+        }
+
+        return {
+            "Tags[0][Type]": tag_types[key[1:].capitalize()],
+            "Tags[0][Text]": text.unquote(value).replace("+", " "),
+            "Tags[0][Exclude]": "false",
+        }
+
+    @staticmethod
+    def _parse_jsurl(data):
+        """Parse search query in JSURL format
+
+        Nested lists and dicts are handled in a special way to deal
+        with the way Tsumino expects its parameters -> expand(...)
+
+        Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill))
+        Ref: https://github.com/Sage/jsurl
+        """
+        if not data:
+            return {}
+        i = 0
+        imax = len(data)
+
+        def eat(expected):
+            nonlocal i
+
+            if data[i] != expected:
+                error = "bad JSURL syntax: expected '{}', got {}".format(
+                    expected, data[i])
+                raise ValueError(error)
+            i += 1
+
+        def decode():
+            nonlocal i
+
+            beg = i
+            result = ""
+
+            while i < imax:
+                ch = data[i]
+
+                if ch not in "~)*!":
+                    i += 1
+
+                elif ch == "*":
+                    if beg < i:
+                        result += data[beg:i]
+                    if data[i + 1] == "*":
+                        result += chr(int(data[i+2:i+6], 16))
+                        i += 6
+                    else:
+                        result += chr(int(data[i+1:i+3], 16))
+                        i += 3
+                    beg = i
+
+                elif ch == "!":
+                    if beg < i:
+                        result += data[beg:i]
+                    result += "$"
+                    i += 1
+                    beg = i
+
+                else:
+                    break
+
+            return result + data[beg:i]
+
+        def parse_one():
+            nonlocal i
+
+            eat('~')
+            result = ""
+            ch = data[i]
+
+            if ch == "(":
+                i += 1
+
+                if data[i] == "~":
+                    result = []
+                    if data[i+1] == ")":
+                        i += 1
+                    else:
+                        result.append(parse_one())
+                        while data[i] == "~":
+                            result.append(parse_one())
+
+                else:
+                    result = {}
+
+                    if data[i] != ")":
+                        while True:
+                            key = decode()
+                            value = parse_one()
+                            for ekey, evalue in expand(key, value):
+                                result[ekey] = evalue
+                            if data[i] != "~":
+                                break
+                            i += 1
+                eat(")")
+
+            elif ch == "'":
+                i += 1
+                result = decode()
+
+            else:
+                beg = i
+                i += 1
+
+                while i < imax and data[i] not in "~)":
+                    i += 1
+
+                sub = data[beg:i]
+                if ch in "0123456789-":
+                    fval = float(sub)
+                    ival = int(fval)
+                    result = ival if ival == fval else fval
+                else:
+                    if sub not in ("true", "false", "null"):
+                        raise ValueError("bad value keyword: " + sub)
+                    result = sub
+
+            return result
+
+        def expand(key, value):
+            if isinstance(value, list):
+                for index, cvalue in enumerate(value):
+                    ckey = "{}[{}]".format(key, index)
+                    yield from expand(ckey, cvalue)
+            elif isinstance(value, dict):
+                for ckey, cvalue in value.items():
+                    ckey = "{}[{}]".format(key, ckey)
+                    yield from expand(ckey, cvalue)
+            else:
+                yield key, value
+
+        return parse_one()