1 files changed, 78 insertions, 68 deletions
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index d6fdcf2..3baf819 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -6,7 +6,7 @@
 # it under the terms of the GNU General Public License version 2 as
 # published by the Free Software Foundation.
 
-"""Extract images from https://hitomi.la/"""
+"""Extractors for https://hitomi.la/"""
 
 from .common import GalleryExtractor
 from .. import text, util
@@ -27,21 +27,25 @@ class HitomiGalleryExtractor(GalleryExtractor):
             "keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
             "count": 16,
         }),
+        # download test
         ("https://hitomi.la/galleries/1401410.html", {
-            # download test
             "range": "1",
             "content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
         }),
+        # Game CG with scenes (#321)
         ("https://hitomi.la/galleries/733697.html", {
-            # Game CG with scenes (#321)
-            "url": "21064f9e3c244aca87f1a91967a3fbe79032c4ce",
+            "url": "b4cbc76032852db4a655bf6a2c4d58eae8153c8e",
             "count": 210,
         }),
+        # fallback for galleries only available through /reader/ URLs
         ("https://hitomi.la/galleries/1045954.html", {
-            # fallback for galleries only available through /reader/ URLs
-            "url": "0a67f5e6c3c6a384b578e328f4817fa6ccdf856a",
+            "url": "f3aa914ad148437f72d307268fa0d250eabe8dab",
             "count": 1413,
         }),
+        # gallery with "broken" redirect
+        ("https://hitomi.la/cg/scathacha-sama-okuchi-ecchi-1291900.html", {
+            "count": 10,
+        }),
         ("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"),
         ("https://hitomi.la/manga/867789.html"),
         ("https://hitomi.la/doujinshi/867789.html"),
@@ -51,84 +55,90 @@ class HitomiGalleryExtractor(GalleryExtractor):
     )
 
     def __init__(self, match):
-        self.gallery_id = match.group(1)
-        self.fallback = False
-        url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
+        gid = match.group(1)
+        url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
         GalleryExtractor.__init__(self, match, url)
+        self.info = None  # parsed 'galleryinfo'; filled in by metadata()
+        self.session.headers["Referer"] = "{}/reader/{}.html".format(
+            self.root, gid)  # set ahead of image downloads (#239)
+
+    def metadata(self, page):
+        """Parse the JSON after the first '=' in 'page' and build metadata"""
+        self.info = info = json.loads(page.partition("=")[2])
+        data = self._data_from_gallery_info(info)
+        if self.config("metadata", True):  # page scrape is optional
+            data.update(self._data_from_gallery_page(info))
+        return data
+
+    def _data_from_gallery_info(self, info):
+        """Build a metadata dict from the parsed 'galleryinfo' object"""
+        language = info.get("language")
+        if language:
+            language = language.capitalize()
+
+        tags = []
+        for tinfo in info.get("tags") or ():  # may be missing or null
+            tag = tinfo["tag"]
+            if tinfo.get("female"):
+                tag += " ♀"
+            elif tinfo.get("male"):
+                tag += " ♂"
+            tags.append(string.capwords(tag))
+        return {
+            "gallery_id": text.parse_int(info["id"]),
+            "title"     : info["title"],
+            "type"      : info["type"].capitalize(),
+            "language"  : language,
+            "lang"      : util.language_to_code(language),
+            "tags"      : tags,
+            "date"      : text.parse_datetime(
+                info["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
+        }
+
+    def _data_from_gallery_page(self, info):
+        """Scrape artist/group/parody/characters from the gallery page"""
+        url = "{}/galleries/{}.html".format(self.root, info["id"])
 
-    def request(self, url, **kwargs):
-        response = GalleryExtractor.request(self, url, fatal=False, **kwargs)
-        if response.status_code == 404:
-            self.fallback = True
-            url = url.replace("/galleries/", "/reader/")
-            response = GalleryExtractor.request(self, url, **kwargs)
-        elif b"<title>Redirect</title>" in response.content:
+        # follow redirect pages; NOTE(review): no limit on the number of hops
+        while True:
+            response = self.request(url, fatal=False)
+            if b"<title>Redirect</title>" not in response.content:
+                break
             url = text.extract(response.text, "href='", "'")[0]
             if not url.startswith("http"):
                 url = text.urljoin(self.root, url)
-            response = self.request(url, **kwargs)
-        return response
 
-    def metadata(self, page):
-        if self.fallback:
-            return {
-                "gallery_id": text.parse_int(self.gallery_id),
-                "title": text.unescape(text.extract(
-                    page, "<title>", "<")[0].rpartition(" | ")[0]),
-            }
-
-        extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
-        data = {
-            "gallery_id": text.parse_int(self.gallery_id),
-            "title"     : text.unescape(extr('.html">', '<').strip()),
-            "artist"    : self._prep(extr('<h2>', '</h2>')),
-            "group"     : self._prep(extr('<td>Group</td><td>', '</td>')),
-            "type"      : self._prep_1(extr('<td>Type</td><td>', '</td>')),
-            "language"  : self._prep_1(extr('<td>Language</td><td>', '</td>')),
-            "parody"    : self._prep(extr('<td>Series</td><td>', '</td>')),
-            "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
-            "tags"      : self._prep(extr('<td>Tags</td><td>', '</td>')),
-            "date"      : self._date(extr('<span class="date">', '</span>')),
+        if response.status_code >= 400:  # error response: no extra data
+            return {}
+
+        def prep(value):
+            return [
+                text.unescape(string.capwords(v))
+                for v in text.extract_iter(value or "", '.html">', '<')
+            ]
+        extr = text.extract_from(response.text)
+        return {
+            "artist"    : prep(extr('<h2>', '</h2>')),
+            "group"     : prep(extr('<td>Group</td><td>', '</td>')),
+            "parody"    : prep(extr('<td>Series</td><td>', '</td>')),
+            "characters": prep(extr('<td>Characters</td><td>', '</td>')),
         }
-        if data["language"] == "N/a":
-            data["language"] = None
-        data["lang"] = util.language_to_code(data["language"])
-        return data
-
-    def images(self, page):
-        # set Referer header before image downloads (#239)
-        self.session.headers["Referer"] = self.gallery_url
-
-        # get 'galleryinfo'
-        url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)
-        page = self.request(url).text
 
+    def images(self, _):
+        """Return (url, metadata) pairs for all entries in info['files']"""
         result = []
-        for image in json.loads(page.partition("=")[2]):
+        for image in self.info["files"]:  # self.info is set by metadata()
             ihash = image["hash"]
             idata = text.nameext_from_url(image["name"])
 
             # see https://ltn.hitomi.la/common.js
-            offset = int(ihash[-3:-1], 16) % 3
+            inum = int(ihash[-3:-1], 16)  # 2 hex digits before the last
+            frontends = 2 if inum < 0x30 else 3
+            inum = 1 if inum < 0x09 else inum
             url = "https://{}a.hitomi.la/images/{}/{}/{}.{}".format(
-                chr(97 + offset),
+                chr(97 + (inum % frontends)),  # 97 == ord("a")
                 ihash[-1], ihash[-3:-1], ihash,
                 idata["extension"],
             )
             result.append((url, idata))
         return result
-
-    @staticmethod
-    def _prep(value):
-        return [
-            text.unescape(string.capwords(v))
-            for v in text.extract_iter(value or "", '.html">', '<')
-        ]
-
-    @staticmethod
-    def _prep_1(value):
-        return text.remove_html(value).capitalize()
-
-    @staticmethod
-    def _date(value):
-        return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")