diff options
| author | 2020-08-15 17:48:11 -0400 | |
|---|---|---|
| committer | 2020-08-15 17:48:11 -0400 | |
| commit | 7cf59dc17c3607e096292462ed15d391be4e3dfd (patch) | |
| tree | 50d2750e958f43271dc6cc5310211cf8f8bbd9d0 /gallery_dl/extractor/mangareader.py | |
| parent | ba039cfb2e1ba2522ee0a0fa2a84a1a6579e4877 (diff) | |
New upstream version 1.14.4. (upstream/1.14.4)
Diffstat (limited to 'gallery_dl/extractor/mangareader.py')
| -rw-r--r-- | gallery_dl/extractor/mangareader.py | 122 |
1 file changed, 49 insertions, 73 deletions
diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py index 31083dc..fd9c7ac 100644 --- a/gallery_dl/extractor/mangareader.py +++ b/gallery_dl/extractor/mangareader.py @@ -6,10 +6,12 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract manga-chapters and entire manga from https://www.mangareader.net/""" +"""Extractors for https://www.mangareader.net/""" from .common import ChapterExtractor, MangaExtractor from .. import text +from ..cache import memcache +import json class MangareaderBase(): @@ -17,19 +19,35 @@ class MangareaderBase(): category = "mangareader" root = "https://www.mangareader.net" - @staticmethod - def parse_page(page, data): - """Parse metadata on 'page' and add it to 'data'""" - text.extract_all(page, ( - ("manga" , '<h2 class="aname">', '</h2>'), - ("release", '>Year of Release:</td>\n<td>', '</td>'), - ('author' , '>Author:</td>\n<td>', '</td>'), - ('artist' , '>Artist:</td>\n<td>', '</td>'), - ), values=data) - data["manga"] = data["manga"].strip() - data["author"] = text.unescape(data["author"]) - data["artist"] = text.unescape(data["artist"]) - return data + @memcache(keyarg=1) + def _manga_info(self, path, page=None): + if not page: + page = self.request(self.root + path).text + extr = text.extract_from(page) + data = { + "manga" : text.unescape(extr('class="name">', '<')), + "release" : text.unescape(extr('Year of Release :</td><td>', '<')), + "author" : text.unescape(text.unescape(extr( + 'Author :</td><td>', '<'))), + "artist" : text.unescape(text.unescape(extr( + 'Artist :</td><td>', '<'))), + "lang" : "en", + "language": "English", + } + + extr('<table', '>') + chapters = [] + while True: + url = extr('</i> <a href="', '"') + if not url: + return chapters + chapter = { + "chapter": text.parse_int(url.rpartition("/")[2]), + "title" : text.unescape(extr("</a> : ", "<")), + "date" : extr("<td>", "<"), + } + chapter.update(data) + 
chapters.append((self.root + url, chapter)) class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): @@ -38,59 +56,28 @@ class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))" test = (("https://www.mangareader.net" "/karate-shoukoushi-kohinata-minoru/11"), { - "url": "3d8a5b900856d59b8d8e83908d0df392be92c0f4", + "url": "45ece5668d1e9f65cf2225237d78de58660b54e4", "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6", }) def __init__(self, match): - path, self.url_title, self.chapter = match.groups() - ChapterExtractor.__init__(self, match, self.root + path) + ChapterExtractor.__init__(self, match) + _, self.path, self.chapter = match.groups() - def metadata(self, chapter_page): - page = self.request(self.root + self.url_title).text - data = self.parse_page(page, { - "chapter": text.parse_int(self.chapter), - "lang": "en", - "language": "English", - }) - text.extract_all(page, ( - ('title', ' ' + self.chapter + '</a> : ', '</td>'), - ('date', '<td>', '</td>'), - ), page.index('<div id="chapterlist">'), data) - data["count"] = text.parse_int(text.extract( - chapter_page, '</select> of ', '<')[0] - ) - return data + def metadata(self, page): + chapter = text.parse_int(self.chapter) + return self._manga_info(self.path)[chapter-1][1] def images(self, page): - while True: - next_url, image_url, image_data = self.get_image_metadata(page) - yield image_url, image_data - - if not next_url: - return - page = self.request(next_url).text - - def get_image_metadata(self, page): - """Collect next url, image-url and metadata for one manga-page""" - extr = text.extract - width = None - test , pos = extr(page, "document['pu']", '') - if test is None: - return None, None, None - if page.find("document['imgwidth']", pos, pos+200) != -1: - width , pos = extr(page, "document['imgwidth'] = ", ";", pos) - height, pos = extr(page, "document['imgheight'] = ", ";", pos) - _ , pos = extr(page, 
'<div id="imgholder">', '') - url, pos = extr(page, ' href="', '"', pos) - if width is None: - width , pos = extr(page, '<img id="img" width="', '"', pos) - height, pos = extr(page, ' height="', '"', pos) - image, pos = extr(page, ' src="', '"', pos) - return self.root + url, image, { - "width": text.parse_int(width), - "height": text.parse_int(height), - } + data = json.loads(text.extract( + page, 'document["mj"]=', '</script>')[0]) + return [ + (text.ensure_http_scheme(img["u"]), { + "width" : text.parse_int(img["w"]), + "height": text.parse_int(img["h"]), + }) + for img in data["im"] + ] class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): @@ -104,16 +91,5 @@ class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): }) def chapters(self, page): - results = [] - data = self.parse_page(page, {"lang": "en", "language": "English"}) - - needle = '<div class="chico_manga"></div>\n<a href="' - pos = page.index('<div id="chapterlist">') - while True: - url, pos = text.extract(page, needle, '"', pos) - if not url: - return results - data["title"], pos = text.extract(page, '</a> : ', '</td>', pos) - data["date"] , pos = text.extract(page, '<td>', '</td>', pos) - data["chapter"] = text.parse_int(url.rpartition("/")[2]) - results.append((self.root + url, data.copy())) + path = self.manga_url[len(self.root):] + return self._manga_info(path, page) |
