diff options
Diffstat (limited to 'gallery_dl/extractor/mangapark.py')
| -rw-r--r-- | gallery_dl/extractor/mangapark.py | 551 |
1 files changed, 430 insertions, 121 deletions
diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 168fbe8..a0d1e80 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -8,155 +8,464 @@ """Extractors for https://mangapark.net/""" -from .common import ChapterExtractor, MangaExtractor +from .common import ChapterExtractor, Extractor, Message from .. import text, util, exception import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangapark\.(?:net|com|org|io|me)" + class MangaparkBase(): """Base class for mangapark extractors""" category = "mangapark" - root_fmt = "https://v2.mangapark.{}" - browser = "firefox" - - @staticmethod - def parse_chapter_path(path, data): - """Get volume/chapter information from url-path of a chapter""" - data["volume"], data["chapter_minor"] = 0, "" - for part in path.split("/")[1:]: - key, value = part[0], part[1:] - if key == "c": - chapter, dot, minor = value.partition(".") - data["chapter"] = text.parse_int(chapter) - data["chapter_minor"] = dot + minor - elif key == "i": - data["chapter_id"] = text.parse_int(value) - elif key == "v": - data["volume"] = text.parse_int(value) - elif key == "s": - data["stream"] = text.parse_int(value) - elif key == "e": - data["chapter_minor"] = "v" + value - - @staticmethod - def parse_chapter_title(title, data): - match = re.search(r"(?i)(?:vol(?:ume)?[ .]*(\d+) )?" - r"ch(?:apter)?[ .]*(\d+)(\.\w+)?", title) - if match: - vol, ch, data["chapter_minor"] = match.groups() - data["volume"] = text.parse_int(vol) - data["chapter"] = text.parse_int(ch) + _match_title = None + + def _parse_chapter_title(self, title): + if not self._match_title: + MangaparkBase._match_title = re.compile( + r"(?i)" + r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" + r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" + r"(?:\s*:\s*(.*))?" + ).match + match = self._match_title(title) + return match.groups() if match else (0, 0, "", "") class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" - pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" - r"/manga/([^?#]+/i\d+)") + pattern = BASE_PATTERN + r"/title/[^/?#]+/(\d+)" test = ( - ("https://mangapark.net/manga/gosu/i811653/c055/1", { - "count": 50, - "keyword": "db1ed9af4f972756a25dbfa5af69a8f155b043ff", - }), - (("https://mangapark.net/manga" - "/ad-astra-per-aspera-hata-kenjirou/i662051/c001.2/1"), { - "count": 40, - "keyword": "2bb3a8f426383ea13f17ff5582f3070d096d30ac", + ("https://mangapark.net/title/114972-aria/6710214-en-ch.60.2", { + "count": 70, + "pattern": r"https://[\w-]+\.mpcdn\.org/comic/2002/e67" + r"/61e29278a583b9227964076e/\d+_\d+_\d+_\d+\.jpeg" + r"\?acc=[^&#]+&exp=\d+", + "keyword": { + "artist": [], + "author": ["Amano Kozue"], + "chapter": 60, + "chapter_id": 6710214, + "chapter_minor": ".2", + "count": 70, + "date": "dt:2022-01-15 09:25:03", + "extension": "jpeg", + "filename": str, + "genre": ["adventure", "comedy", "drama", "sci_fi", + "shounen", "slice_of_life"], + "lang": "en", + "language": "English", + "manga": "Aria", + "manga_id": 114972, + "page": int, + "source": "Koala", + "title": "Special Navigation - Aquaria Ii", + "volume": 12, + }, }), - (("https://mangapark.net/manga" - "/gekkan-shoujo-nozaki-kun/i2067426/v7/c70/1"), { - "count": 15, - "keyword": "edc14993c4752cee3a76e09b2f024d40d854bfd1", - }), - ("https://mangapark.me/manga/gosu/i811615/c55/1"), - ("https://mangapark.com/manga/gosu/i811615/c55/1"), + ("https://mangapark.com/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.org/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.io/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.me/title/114972-aria/6710214-en-ch.60.2"), ) def __init__(self, match): - tld, self.path = match.groups() - self.root = self.root_fmt.format(tld) - url = "{}/manga/{}?zoom=2".format(self.root, self.path) + self.root = text.root_from_url(match.group(0)) + url = "{}/title/_/{}".format(self.root, match.group(1)) ChapterExtractor.__init__(self, match, url) def metadata(self, page): - data = text.extract_all(page, ( - ("manga_id" , "var _manga_id = '", "'"), - ("chapter_id", "var _book_id = '", "'"), - ("stream" , "var _stream = '", "'"), - ("path" , "var _book_link = '", "'"), - ("manga" , "<h2>", "</h2>"), - ("title" , "</a>", "<"), - ), values={"lang": "en", "language": "English"})[0] - - if not data["path"]: - raise exception.NotFoundError("chapter") - - self.parse_chapter_path(data["path"], data) - if "chapter" not in data: - self.parse_chapter_title(data["title"], data) - - data["manga"], _, data["type"] = data["manga"].rpartition(" ") - data["manga"] = text.unescape(data["manga"]) - data["title"] = data["title"].partition(": ")[2] - for key in ("manga_id", "chapter_id", "stream"): - data[key] = text.parse_int(data[key]) - - return data + data = util.json_loads(text.extr( + page, 'id="__NEXT_DATA__" type="application/json">', '<')) + chapter = (data["props"]["pageProps"]["dehydratedState"] + ["queries"][0]["state"]["data"]["data"]) + manga = chapter["comicNode"]["data"] + source = chapter["sourceNode"]["data"] + + self._urls = chapter["imageSet"]["httpLis"] + self._params = chapter["imageSet"]["wordLis"] + vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + + return { + "manga" : manga["name"], + "manga_id" : manga["id"], + "artist" : source["artists"], + "author" : source["authors"], + "genre" : source["genres"], + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(ch), + "chapter_minor": minor, + "chapter_id": chapter["id"], + "title" : chapter["title"] or title or "", + "lang" : chapter["lang"], + "language" : util.code_to_language(chapter["lang"]), + "source" : source["srcTitle"], + "source_id" : source["id"], + "date" : text.parse_timestamp(chapter["dateCreate"] // 1000), + } def images(self, page): - data = util.json_loads(text.extr(page, "var _load_pages =", ";")) return [ - (text.urljoin(self.root, item["u"]), { - "width": text.parse_int(item["w"]), - "height": text.parse_int(item["h"]), - }) - for item in data + (url + "?" + params, None) + for url, params in zip(self._urls, self._params) ] -class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): +class MangaparkMangaExtractor(MangaparkBase, Extractor): """Extractor for manga from mangapark.net""" - chapterclass = MangaparkChapterExtractor - pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" - r"(/manga/[^/?#]+)/?$") + subcategory = "manga" + pattern = BASE_PATTERN + r"/title/(\d+)(?:-[^/?#]*)?/?$" test = ( - ("https://mangapark.net/manga/aria", { - "url": "51c6d82aed5c3c78e0d3f980b09a998e6a2a83ee", - "keyword": "cabc60cf2efa82749d27ac92c495945961e4b73c", + ("https://mangapark.net/title/114972-aria", { + "count": 141, + "pattern": MangaparkChapterExtractor.pattern, + "keyword": { + "chapter": int, + "chapter_id": int, + "chapter_minor": str, + "date": "type:datetime", + "lang": "en", + "language": "English", + "manga_id": 114972, + "source": "re:Horse|Koala", + "source_id": int, + "title": str, + "volume": int, + }, + }), + # 'source' option + ("https://mangapark.net/title/114972-aria", { + "options": (("source", "koala"),), + "count": 70, + "pattern": MangaparkChapterExtractor.pattern, + "keyword": { + "source": "Koala", + "source_id": 15150116, + }, }), - ("https://mangapark.me/manga/aria"), - ("https://mangapark.com/manga/aria"), + ("https://mangapark.com/title/114972-"), + ("https://mangapark.com/title/114972"), + ("https://mangapark.com/title/114972-aria"), + ("https://mangapark.org/title/114972-aria"), + ("https://mangapark.io/title/114972-aria"), + ("https://mangapark.me/title/114972-aria"), ) def __init__(self, match): - self.root = self.root_fmt.format(match.group(1)) - MangaExtractor.__init__(self, match, self.root + match.group(2)) - - def chapters(self, page): - results = [] - data = {"lang": "en", "language": "English"} - data["manga"] = text.unescape( - text.extr(page, '<title>', ' Manga - ')) - - for stream in page.split('<div id="stream_')[1:]: - data["stream"] = text.parse_int(text.extr(stream, '', '"')) - - for chapter in text.extract_iter(stream, '<li ', '</li>'): - path , pos = text.extract(chapter, 'href="', '"') - title1, pos = text.extract(chapter, '>', '<', pos) - title2, pos = text.extract(chapter, '>: </span>', '<', pos) - count , pos = text.extract(chapter, ' of ', ' ', pos) - - self.parse_chapter_path(path[8:], data) - if "chapter" not in data: - self.parse_chapter_title(title1, data) - - if title2: - data["title"] = title2.strip() - else: - data["title"] = title1.partition(":")[2].strip() - - data["count"] = text.parse_int(count) - results.append((self.root + path, data.copy())) - data.pop("chapter", None) - - return results + self.root = text.root_from_url(match.group(0)) + self.manga_id = int(match.group(1)) + Extractor.__init__(self, match) + + def items(self): + for chapter in self.chapters(): + chapter = chapter["data"] + url = self.root + chapter["urlPath"] + + vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + data = { + "manga_id" : self.manga_id, + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(ch), + "chapter_minor": minor, + "chapter_id": chapter["id"], + "title" : chapter["title"] or title or "", + "lang" : chapter["lang"], + "language" : util.code_to_language(chapter["lang"]), + "source" : chapter["srcTitle"], + "source_id" : chapter["sourceId"], + "date" : text.parse_timestamp( + chapter["dateCreate"] // 1000), + "_extractor": MangaparkChapterExtractor, + } + yield Message.Queue, url, data + + def chapters(self): + source = self.config("source") + if not source: + return self.chapters_all() + + source_id = self._select_source(source) + self.log.debug("Requesting chapters for source_id %s", source_id) + return self.chapters_source(source_id) + + def chapters_all(self): + pnum = 0 + variables = { + "select": { + "comicId": self.manga_id, + "range" : None, + "isAsc" : not self.config("chapter-reverse"), + } + } + + while True: + data = self._request_graphql( + "get_content_comicChapterRangeList", variables) + + for item in data["items"]: + yield from item["chapterNodes"] + + if not pnum: + pager = data["pager"] + pnum += 1 + + try: + variables["select"]["range"] = pager[pnum] + except IndexError: + return + + def chapters_source(self, source_id): + variables = { + "sourceId": source_id, + } + chapters = self._request_graphql( + "get_content_source_chapterList", variables) + + if self.config("chapter-reverse"): + chapters.reverse() + return chapters + + def _select_source(self, source): + if isinstance(source, int): + return source + + group, _, lang = source.partition(":") + group = group.lower() + + variables = { + "comicId" : self.manga_id, + "dbStatuss" : ["normal"], + "haveChapter": True, + } + for item in self._request_graphql( + "get_content_comic_sources", variables): + data = item["data"] + if (not group or data["srcTitle"].lower() == group) and ( + not lang or data["lang"] == lang): + return data["id"] + + raise exception.StopExtraction( + "'%s' does not match any available source", source) + + def _request_graphql(self, opname, variables): + url = self.root + "/apo/" + data = { + "query" : QUERIES[opname], + "variables" : util.json_dumps(variables), + "operationName": opname, + } + return self.request( + url, method="POST", json=data).json()["data"][opname] + + +QUERIES = { + "get_content_comicChapterRangeList": """ + query get_content_comicChapterRangeList($select: Content_ComicChapterRangeList_Select) { + get_content_comicChapterRangeList( + select: $select + ) { + reqRange{x y} + missing + pager {x y} + items{ + serial + chapterNodes { + + id + data { + + + id + sourceId + + dbStatus + isNormal + isHidden + isDeleted + isFinal + + dateCreate + datePublic + dateModify + lang + volume + serial + dname + title + urlPath + + srcTitle srcColor + + count_images + + stat_count_post_child + stat_count_post_reply + stat_count_views_login + stat_count_views_guest + + userId + userNode { + + id + data { + +id +name +uniq +avatarUrl +urlPath + +verified +deleted +banned + +dateCreate +dateOnline + +stat_count_chapters_normal +stat_count_chapters_others + +is_adm is_mod is_vip is_upr + + } + + } + + disqusId + + + } + + sser_read + } + } + + } + } +""", + + "get_content_source_chapterList": """ + query get_content_source_chapterList($sourceId: Int!) { + get_content_source_chapterList( + sourceId: $sourceId + ) { + + id + data { + + + id + sourceId + + dbStatus + isNormal + isHidden + isDeleted + isFinal + + dateCreate + datePublic + dateModify + lang + volume + serial + dname + title + urlPath + + srcTitle srcColor + + count_images + + stat_count_post_child + stat_count_post_reply + stat_count_views_login + stat_count_views_guest + + userId + userNode { + + id + data { + +id +name +uniq +avatarUrl +urlPath + +verified +deleted +banned + +dateCreate +dateOnline + +stat_count_chapters_normal +stat_count_chapters_others + +is_adm is_mod is_vip is_upr + + } + + } + + disqusId + + + } + + } + } +""", + + "get_content_comic_sources": """ + query get_content_comic_sources($comicId: Int!, $dbStatuss: [String] = [], $userId: Int, $haveChapter: Boolean, $sortFor: String) { + get_content_comic_sources( + comicId: $comicId + dbStatuss: $dbStatuss + userId: $userId + haveChapter: $haveChapter + sortFor: $sortFor + ) { + +id +data{ + + id + + dbStatus + isNormal + isHidden + isDeleted + + lang name altNames authors artists + + release + genres summary{code} extraInfo{code} + + urlCover600 + urlCover300 + urlCoverOri + + srcTitle srcColor + + chapterCount + chapterNode_last { + id + data { + dateCreate datePublic dateModify + volume serial + dname title + urlPath + userNode { + id data {uniq name} + } + } + } +} + + } + } +""", +} |
