1 files changed, 135 insertions, 0 deletions
diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py
new file mode 100644
index 0000000..746144a
--- /dev/null
+++ b/gallery_dl/extractor/nhentai.py
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://nhentai.net/"""
+
+from .common import GalleryExtractor, Extractor, Message
+from .. import text, util
+import collections
+import json
+
+
+class NhentaiBase():
+    """Base class for nhentai extractors"""
+    category = "nhentai"
+    root = "https://nhentai.net"
+    media_url = "https://i.nhentai.net"
+
+
+class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor):
+    """Extractor for image galleries from nhentai.net"""
+    pattern = r"(?:https?://)?nhentai\.net(/g/(\d+))"
+    test = ("https://nhentai.net/g/147850/", {
+        "url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0",
+        "keyword": {
+            "title"     : r"re:\[Morris\] Amazon no Hiyaku \| Amazon Elixir",
+            "title_en"  : str,
+            "title_ja"  : str,
+            "gallery_id": 147850,
+            "media_id"  : 867789,
+            "count"     : 16,
+            "date"      : 1446050915,
+            "scanlator" : "",
+            "artist"    : ["morris"],
+            "group"     : list,
+            "parody"    : list,
+            "characters": list,
+            "tags"      : list,
+            "type"      : "manga",
+            "lang"      : "en",
+            "language"  : "English",
+            "width"     : int,
+            "height"    : int,
+        },
+    })
+
+    def __init__(self, match):
+        GalleryExtractor.__init__(self, match)
+        self.gallery_id = match.group(2)
+        self.data = None
+
+    def metadata(self, page):
+        data = json.loads(text.extract(page, "N.gallery(", ");")[0])
+        self.data = data
+
+        title_en = data["title"].get("english", "")
+        title_ja = data["title"].get("japanese", "")
+
+        info = collections.defaultdict(list)
+        for tag in data["tags"]:
+            info[tag["type"]].append(tag["name"])
+
+        language = ""
+        for language in info["language"]:
+            if language != "translated":
+                language = language.capitalize()
+                break
+
+        return {
+            "title"     : title_en or title_ja,
+            "title_en"  : title_en,
+            "title_ja"  : title_ja,
+            "gallery_id": data["id"],
+            "media_id"  : text.parse_int(data["media_id"]),
+            "date"      : data["upload_date"],
+            "scanlator" : data["scanlator"],
+            "artist"    : info["artist"],
+            "group"     : info["group"],
+            "parody"    : info["parody"],
+            "characters": info["character"],
+            "tags"      : info["tag"],
+            "type"      : info["category"][0] if info["category"] else "",
+            "lang"      : util.language_to_code(language),
+            "language"  : language,
+        }
+
+    def images(self, _):
+        ufmt = "{}/galleries/{}/{{}}.{{}}".format(
+            self.media_url, self.data["media_id"])
+        extdict = {"j": "jpg", "p": "png", "g": "gif"}
+
+        return [
+            (ufmt.format(num, extdict.get(img["t"], "jpg")), {
+                "width": img["w"], "height": img["h"],
+            })
+            for num, img in enumerate(self.data["images"]["pages"], 1)
+        ]
+
+
+class NhentaiSearchExtractor(NhentaiBase, Extractor):
+    """Extractor for nhentai search results"""
+    category = "nhentai"
+    subcategory = "search"
+    pattern = r"(?:https?://)?nhentai\.net/search/?\?([^#]+)"
+    test = ("https://nhentai.net/search/?q=touhou", {
+        "pattern": NhentaiGalleryExtractor.pattern,
+        "count": 30,
+        "range": "1-30",
+    })
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.params = text.parse_query(match.group(1))
+
+    def items(self):
+        yield Message.Version, 1
+        data = {"_extractor": NhentaiGalleryExtractor}
+        for gallery_id in self._pagination(self.params):
+            url = "{}/g/{}/".format(self.root, gallery_id)
+            yield Message.Queue, url, data
+
+    def _pagination(self, params):
+        url = "{}/search/".format(self.root)
+        params["page"] = text.parse_int(params.get("page"), 1)
+
+        while True:
+            page = self.request(url, params=params).text
+            yield from text.extract_iter(page, 'href="/g/', '/')
+            if 'class="next"' not in page:
+                return
+            params["page"] += 1