diff options
Diffstat (limited to 'gallery_dl/extractor/hitomi.py')
| -rw-r--r-- | gallery_dl/extractor/hitomi.py | 146 |
1 file changed, 78 insertions, 68 deletions
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index d6fdcf2..3baf819 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from https://hitomi.la/"""
+"""Extractors for https://hitomi.la/"""
 
 from .common import GalleryExtractor
 from .. import text, util
@@ -27,21 +27,25 @@ class HitomiGalleryExtractor(GalleryExtractor):
             "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
             "count": 16,
         }),
+        # download test
         ("https://hitomi.la/galleries/1401410.html", {
-            # download test
             "range": "1",
             "content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
         }),
+        # Game CG with scenes (#321)
         ("https://hitomi.la/galleries/733697.html", {
-            # Game CG with scenes (#321)
-            "url": "21064f9e3c244aca87f1a91967a3fbe79032c4ce",
+            "url": "b4cbc76032852db4a655bf6a2c4d58eae8153c8e",
             "count": 210,
         }),
+        # fallback for galleries only available through /reader/ URLs
         ("https://hitomi.la/galleries/1045954.html", {
-            # fallback for galleries only available through /reader/ URLs
-            "url": "0a67f5e6c3c6a384b578e328f4817fa6ccdf856a",
+            "url": "f3aa914ad148437f72d307268fa0d250eabe8dab",
             "count": 1413,
         }),
+        # gallery with "broken" redirect
+        ("https://hitomi.la/cg/scathacha-sama-okuchi-ecchi-1291900.html", {
+            "count": 10,
+        }),
         ("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"),
         ("https://hitomi.la/manga/867789.html"),
         ("https://hitomi.la/doujinshi/867789.html"),
@@ -51,84 +55,90 @@ class HitomiGalleryExtractor(GalleryExtractor):
     )
 
     def __init__(self, match):
-        self.gallery_id = match.group(1)
-        self.fallback = False
-        url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
+        gid = match.group(1)
+        url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
         GalleryExtractor.__init__(self, match, url)
+        self.info = None
+        self.session.headers["Referer"] = "{}/reader/{}.html".format(
+            self.root, gid)
+
+    def metadata(self, page):
+        self.info = info = json.loads(page.partition("=")[2])
+
+        data = self._data_from_gallery_info(info)
+        if self.config("metadata", True):
+            data.update(self._data_from_gallery_page(info))
+        return data
+
+    def _data_from_gallery_info(self, info):
+        language = info.get("language")
+        if language:
+            language = language.capitalize()
+
+        tags = []
+        for tinfo in info["tags"]:
+            tag = tinfo["tag"]
+            if tinfo.get("female"):
+                tag += " ♀"
+            elif tinfo.get("male"):
+                tag += " ♂"
+            tags.append(string.capwords(tag))
+
+        return {
+            "gallery_id": text.parse_int(info["id"]),
+            "title"     : info["title"],
+            "type"      : info["type"].capitalize(),
+            "language"  : language,
+            "lang"      : util.language_to_code(language),
+            "tags"      : tags,
+            "date"      : text.parse_datetime(
+                info["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
+        }
+
+    def _data_from_gallery_page(self, info):
+        url = "{}/galleries/{}.html".format(self.root, info["id"])
 
-    def request(self, url, **kwargs):
-        response = GalleryExtractor.request(self, url, fatal=False, **kwargs)
-        if response.status_code == 404:
-            self.fallback = True
-            url = url.replace("/galleries/", "/reader/")
-            response = GalleryExtractor.request(self, url, **kwargs)
-        elif b"<title>Redirect</title>" in response.content:
+        # follow redirects
+        while True:
+            response = self.request(url, fatal=False)
+            if b"<title>Redirect</title>" not in response.content:
+                break
             url = text.extract(response.text, "href='", "'")[0]
             if not url.startswith("http"):
                 url = text.urljoin(self.root, url)
-            response = self.request(url, **kwargs)
-        return response
 
-    def metadata(self, page):
-        if self.fallback:
-            return {
-                "gallery_id": text.parse_int(self.gallery_id),
-                "title": text.unescape(text.extract(
-                    page, "<title>", "<")[0].rpartition(" | ")[0]),
-            }
-
-        extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
-        data = {
-            "gallery_id": text.parse_int(self.gallery_id),
-            "title"     : text.unescape(extr('.html">', '<').strip()),
-            "artist"    : self._prep(extr('<h2>', '</h2>')),
-            "group"     : self._prep(extr('<td>Group</td><td>', '</td>')),
-            "type"      : self._prep_1(extr('<td>Type</td><td>', '</td>')),
-            "language"  : self._prep_1(extr('<td>Language</td><td>', '</td>')),
-            "parody"    : self._prep(extr('<td>Series</td><td>', '</td>')),
-            "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
-            "tags"      : self._prep(extr('<td>Tags</td><td>', '</td>')),
-            "date"      : self._date(extr('<span class="date">', '</span>')),
+        if response.status_code >= 400:
+            return {}
+
+        def prep(value):
+            return [
+                text.unescape(string.capwords(v))
+                for v in text.extract_iter(value or "", '.html">', '<')
+            ]
+
+        extr = text.extract_from(response.text)
+        return {
+            "artist"    : prep(extr('<h2>', '</h2>')),
+            "group"     : prep(extr('<td>Group</td><td>', '</td>')),
+            "parody"    : prep(extr('<td>Series</td><td>', '</td>')),
+            "characters": prep(extr('<td>Characters</td><td>', '</td>')),
         }
-        if data["language"] == "N/a":
-            data["language"] = None
-        data["lang"] = util.language_to_code(data["language"])
-        return data
-
-    def images(self, page):
-        # set Referer header before image downloads (#239)
-        self.session.headers["Referer"] = self.gallery_url
-
-        # get 'galleryinfo'
-        url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)
-        page = self.request(url).text
 
+    def images(self, _):
         result = []
-        for image in json.loads(page.partition("=")[2]):
+        for image in self.info["files"]:
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])
 
             # see https://ltn.hitomi.la/common.js
-            offset = int(ihash[-3:-1], 16) % 3
+            inum = int(ihash[-3:-1], 16)
+            frontends = 2 if inum < 0x30 else 3
+            inum = 1 if inum < 0x09 else inum
+
             url = "https://{}a.hitomi.la/images/{}/{}/{}.{}".format(
-                chr(97 + offset),
+                chr(97 + (inum % frontends)),
                 ihash[-1], ihash[-3:-1], ihash,
                 idata["extension"],
             )
             result.append((url, idata))
         return result
-
-    @staticmethod
-    def _prep(value):
-        return [
-            text.unescape(string.capwords(v))
-            for v in text.extract_iter(value or "", '.html">', '<')
-        ]
-
-    @staticmethod
-    def _prep_1(value):
-        return text.remove_html(value).capitalize()
-
-    @staticmethod
-    def _date(value):
-        return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")
