summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/hitomi.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/hitomi.py')
-rw-r--r-- gallery_dl/extractor/hitomi.py 146
1 files changed, 78 insertions, 68 deletions
diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py
index d6fdcf2..3baf819 100644
--- a/gallery_dl/extractor/hitomi.py
+++ b/gallery_dl/extractor/hitomi.py
@@ -6,7 +6,7 @@
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
-"""Extract images from https://hitomi.la/"""
+"""Extractors for https://hitomi.la/"""
from .common import GalleryExtractor
from .. import text, util
@@ -27,21 +27,25 @@ class HitomiGalleryExtractor(GalleryExtractor):
"keyword": "6701f8f588f119ef84cd29bdf99a399417b0a6a2",
"count": 16,
}),
+ # download test
("https://hitomi.la/galleries/1401410.html", {
- # download test
"range": "1",
"content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c",
}),
+ # Game CG with scenes (#321)
("https://hitomi.la/galleries/733697.html", {
- # Game CG with scenes (#321)
- "url": "21064f9e3c244aca87f1a91967a3fbe79032c4ce",
+ "url": "b4cbc76032852db4a655bf6a2c4d58eae8153c8e",
"count": 210,
}),
+ # fallback for galleries only available through /reader/ URLs
("https://hitomi.la/galleries/1045954.html", {
- # fallback for galleries only available through /reader/ URLs
- "url": "0a67f5e6c3c6a384b578e328f4817fa6ccdf856a",
+ "url": "f3aa914ad148437f72d307268fa0d250eabe8dab",
"count": 1413,
}),
+ # gallery with "broken" redirect
+ ("https://hitomi.la/cg/scathacha-sama-okuchi-ecchi-1291900.html", {
+ "count": 10,
+ }),
("https://hitomi.la/manga/amazon-no-hiyaku-867789.html"),
("https://hitomi.la/manga/867789.html"),
("https://hitomi.la/doujinshi/867789.html"),
@@ -51,84 +55,90 @@ class HitomiGalleryExtractor(GalleryExtractor):
)
def __init__(self, match):
- self.gallery_id = match.group(1)
- self.fallback = False
- url = "{}/galleries/{}.html".format(self.root, self.gallery_id)
+ gid = match.group(1)
+ url = "https://ltn.hitomi.la/galleries/{}.js".format(gid)
GalleryExtractor.__init__(self, match, url)
+ self.info = None
+ self.session.headers["Referer"] = "{}/reader/{}.html".format(
+ self.root, gid)
+
+ def metadata(self, page):
+ self.info = info = json.loads(page.partition("=")[2])
+
+ data = self._data_from_gallery_info(info)
+ if self.config("metadata", True):
+ data.update(self._data_from_gallery_page(info))
+ return data
+
+ def _data_from_gallery_info(self, info):
+ language = info.get("language")
+ if language:
+ language = language.capitalize()
+
+ tags = []
+ for tinfo in info["tags"]:
+ tag = tinfo["tag"]
+ if tinfo.get("female"):
+ tag += " ♀"
+ elif tinfo.get("male"):
+ tag += " ♂"
+ tags.append(string.capwords(tag))
+
+ return {
+ "gallery_id": text.parse_int(info["id"]),
+ "title" : info["title"],
+ "type" : info["type"].capitalize(),
+ "language" : language,
+ "lang" : util.language_to_code(language),
+ "tags" : tags,
+ "date" : text.parse_datetime(
+ info["date"] + ":00", "%Y-%m-%d %H:%M:%S%z"),
+ }
+
+ def _data_from_gallery_page(self, info):
+ url = "{}/galleries/{}.html".format(self.root, info["id"])
- def request(self, url, **kwargs):
- response = GalleryExtractor.request(self, url, fatal=False, **kwargs)
- if response.status_code == 404:
- self.fallback = True
- url = url.replace("/galleries/", "/reader/")
- response = GalleryExtractor.request(self, url, **kwargs)
- elif b"<title>Redirect</title>" in response.content:
+ # follow redirects
+ while True:
+ response = self.request(url, fatal=False)
+ if b"<title>Redirect</title>" not in response.content:
+ break
url = text.extract(response.text, "href='", "'")[0]
if not url.startswith("http"):
url = text.urljoin(self.root, url)
- response = self.request(url, **kwargs)
- return response
- def metadata(self, page):
- if self.fallback:
- return {
- "gallery_id": text.parse_int(self.gallery_id),
- "title": text.unescape(text.extract(
- page, "<title>", "<")[0].rpartition(" | ")[0]),
- }
-
- extr = text.extract_from(page, page.index('<h1><a href="/reader/'))
- data = {
- "gallery_id": text.parse_int(self.gallery_id),
- "title" : text.unescape(extr('.html">', '<').strip()),
- "artist" : self._prep(extr('<h2>', '</h2>')),
- "group" : self._prep(extr('<td>Group</td><td>', '</td>')),
- "type" : self._prep_1(extr('<td>Type</td><td>', '</td>')),
- "language" : self._prep_1(extr('<td>Language</td><td>', '</td>')),
- "parody" : self._prep(extr('<td>Series</td><td>', '</td>')),
- "characters": self._prep(extr('<td>Characters</td><td>', '</td>')),
- "tags" : self._prep(extr('<td>Tags</td><td>', '</td>')),
- "date" : self._date(extr('<span class="date">', '</span>')),
+ if response.status_code >= 400:
+ return {}
+
+ def prep(value):
+ return [
+ text.unescape(string.capwords(v))
+ for v in text.extract_iter(value or "", '.html">', '<')
+ ]
+
+ extr = text.extract_from(response.text)
+ return {
+ "artist" : prep(extr('<h2>', '</h2>')),
+ "group" : prep(extr('<td>Group</td><td>', '</td>')),
+ "parody" : prep(extr('<td>Series</td><td>', '</td>')),
+ "characters": prep(extr('<td>Characters</td><td>', '</td>')),
}
- if data["language"] == "N/a":
- data["language"] = None
- data["lang"] = util.language_to_code(data["language"])
- return data
-
- def images(self, page):
- # set Referer header before image downloads (#239)
- self.session.headers["Referer"] = self.gallery_url
-
- # get 'galleryinfo'
- url = "https://ltn.hitomi.la/galleries/{}.js".format(self.gallery_id)
- page = self.request(url).text
+ def images(self, _):
result = []
- for image in json.loads(page.partition("=")[2]):
+ for image in self.info["files"]:
ihash = image["hash"]
idata = text.nameext_from_url(image["name"])
# see https://ltn.hitomi.la/common.js
- offset = int(ihash[-3:-1], 16) % 3
+ inum = int(ihash[-3:-1], 16)
+ frontends = 2 if inum < 0x30 else 3
+ inum = 1 if inum < 0x09 else inum
+
url = "https://{}a.hitomi.la/images/{}/{}/{}.{}".format(
- chr(97 + offset),
+ chr(97 + (inum % frontends)),
ihash[-1], ihash[-3:-1], ihash,
idata["extension"],
)
result.append((url, idata))
return result
-
- @staticmethod
- def _prep(value):
- return [
- text.unescape(string.capwords(v))
- for v in text.extract_iter(value or "", '.html">', '<')
- ]
-
- @staticmethod
- def _prep_1(value):
- return text.remove_html(value).capitalize()
-
- @staticmethod
- def _date(value):
- return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z")