diff options
Diffstat (limited to 'gallery_dl/extractor/issuu.py')
| -rw-r--r-- | gallery_dl/extractor/issuu.py | 109 |
1 files changed, 109 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://issuu.com/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text, util
import json


class IssuuBase:
    """Mixin holding the attributes shared by all issuu extractors"""
    category = "issuu"
    root = "https://issuu.com"


class IssuuPublicationExtractor(IssuuBase, GalleryExtractor):
    """Extractor for the page images of one issuu publication"""
    subcategory = "publication"
    directory_fmt = (
        "{category}",
        "{document[userName]}",
        "{document[originalPublishDate]} {document[title]}",
    )
    filename_fmt = "{num:>03}.{extension}"
    archive_fmt = "{document[id]}_{num}"
    pattern = r"(?:https?://)?issuu\.com(/[^/?&#]+/docs/[^/?&#]+)"
    test = ("https://issuu.com/issuu/docs/motions-1-2019/", {
        "pattern": r"https://image.isu.pub/190916155301-\w+/jpg/page_\d+.jpg",
        "count": 36,
        "keyword": {
            "document": {
                "access": "public",
                "contentRating": dict,
                "date": "type:datetime",
                "description": "re:Motions, the brand new publication by Is",
                "documentId": r"re:\d+-d99ec95935f15091b040cb8060f05510",
                "documentName": "motions-1-2019",
                "downloadState": "NOT_AVAILABLE",
                "id": r"re:\d+-d99ec95935f15091b040cb8060f05510",
                "isConverting": False,
                "isQuarantined": False,
                "lang": "en",
                "language": "English",
                "pageCount": 36,
                "publicationId": "d99ec95935f15091b040cb8060f05510",
                "sections": list,
                "title": "Motions by Issuu - Issue 1",
                "userName": "issuu",
            },
            "extension": "jpg",
            "filename": r"re:page_\d+",
            "num": int,
        },
    })

    def metadata(self, page):
        """Build the metadata dict from the state object embedded in 'page'

        Besides returning {"document": ...}, this stores the page count
        and the image URL template on the instance for use by images().
        """
        raw_state = text.extract(
            page, 'window.__INITIAL_STATE__ =', ';\n')[0]
        state = json.loads(raw_state)

        document = state["document"]

        # keep the original language value under "lang" and replace
        # "language" with whatever util.code_to_language() returns for it
        lang_code = document["language"]
        document["lang"] = lang_code
        document["language"] = util.code_to_language(lang_code)
        document["date"] = text.parse_datetime(
            document["originalPublishDate"], "%Y-%m-%d")

        self._cnt = text.parse_int(document["pageCount"])

        # "{{}}" survives this format() call as "{}", leaving a
        # placeholder for the page number that images() fills in
        image_host = state["config"]["hosts"]["image"]
        self._tpl = "https://{}/{}/jpg/page_{{}}.jpg".format(
            image_host, document["id"])

        return {"document": document}

    def images(self, page):
        """Return a list of (url, None) tuples, one per page from 1 on"""
        template = self._tpl
        page_numbers = range(1, self._cnt + 1)
        return [(template.format(num), None) for num in page_numbers]


class IssuuUserExtractor(IssuuBase, Extractor):
    """Extractor queueing every publication of an issuu user/publisher"""
    subcategory = "user"
    pattern = r"(?:https?://)?issuu\.com/([^/?&#]+)/?$"
    test = ("https://issuu.com/issuu", {
        "pattern": IssuuPublicationExtractor.pattern,
        "count": "> 25",
    })

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.user = match.group(1)

    def items(self):
        """Yield one Queue message for each publication of the user

        Walks the paginated profile-documents endpoint until the
        response reports that there are no more results.
        """
        endpoint = "{}/call/profile/v1/documents/{}".format(
            self.root, self.user)
        query = {"offset": 0, "limit": "25"}

        yield Message.Version, 1

        has_more = True
        while has_more:
            response = self.request(endpoint, params=query).json()

            for entry in response["items"]:
                target = "{}/{}/docs/{}".format(
                    self.root, self.user, entry["uri"])
                entry["url"] = target
                entry["_extractor"] = IssuuPublicationExtractor
                yield Message.Queue, target, entry

            has_more = response["hasMore"]
            if has_more:
                # advance by the page size reported in the response
                query["offset"] += response["limit"]
