diff options
Diffstat (limited to 'gallery_dl/extractor/nijie.py')
| -rw-r--r-- | gallery_dl/extractor/nijie.py | 171 |
1 files changed, 98 insertions, 73 deletions
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py index 4c48d73..fdfad87 100644 --- a/gallery_dl/extractor/nijie.py +++ b/gallery_dl/extractor/nijie.py @@ -13,12 +13,15 @@ from .. import text, exception from ..cache import cache +BASE_PATTERN = r"(?:https?://)?(?:www\.)?nijie\.info" + + class NijieExtractor(AsynchronousMixin, Extractor): """Base class for nijie extractors""" category = "nijie" directory_fmt = ("{category}", "{user_id}") - filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}" - archive_fmt = "{image_id}_{index}" + filename_fmt = "{category}_{artist_id}_{image_id}_p{num:>02}.{extension}" + archive_fmt = "{image_id}_{num}" cookiedomain = "nijie.info" cookienames = ("nemail", "nlogin") root = "https://nijie.info" @@ -27,61 +30,66 @@ class NijieExtractor(AsynchronousMixin, Extractor): def __init__(self, match): Extractor.__init__(self, match) - self.user_id = match.group(1) + self.user_id = text.parse_int(match.group(1)) + self.user_name = None self.session.headers["Referer"] = self.root + "/" def items(self): self.login() - data = self.get_job_metadata() - yield Message.Version, 1 - yield Message.Directory, data - for image_id in self.get_image_ids(): - for image_url, image_data in self.get_image_data(image_id): - image_data.update(data) - if not image_data["extension"]: - image_data["extension"] = "jpg" - yield Message.Url, image_url, image_data + for image_id in self.image_ids(): + + response = self.request(self.view_url + image_id, fatal=False) + if response.status_code >= 400: + continue + page = response.text + + data = self._extract_data(page) + data["image_id"] = text.parse_int(image_id) + yield Message.Directory, data - def get_job_metadata(self): - """Collect metadata for extractor-job""" - return {"user_id": text.parse_int(self.user_id)} + for image in self._extract_images(page): + image.update(data) + if not image["extension"]: + image["extension"] = "jpg" + yield Message.Url, image["url"], image - def get_image_ids(self): + def image_ids(self): """Collect all relevant image-ids""" - def get_image_data(self, image_id): - """Get URL and metadata for images specified by 'image_id'""" - page = self.request(self.view_url + image_id).text - return self.extract_image_data(page, image_id) - - def extract_image_data(self, page, image_id): - """Get URL and metadata for images from 'page'""" - title, pos = text.extract( - page, '<meta property="og:title" content="', '"') - description, pos = text.extract( - page, '<meta property="og:description" content="', '"', pos) - artist_id, pos = text.extract( - page, '"sameAs": "https://nijie.info/members.php?id=', '"', pos) - images = list(text.extract_iter( - page, '<a href="./view_popup.php', '</a>', pos)) - - title = title.rpartition("|")[0].strip() - image_id = text.parse_int(image_id) - artist_id = text.parse_int(artist_id) - - for index, image in enumerate(images): + @staticmethod + def _extract_data(page): + """Extract image metadata from 'page'""" + extr = text.extract_from(page) + keywords = text.unescape(extr( + 'name="keywords" content="', '" />')).split(",") + data = { + "title" : keywords[0].strip(), + "description": text.unescape(extr( + '"description": "', '"').replace("&", "&")), + "date" : text.parse_datetime(extr( + '"datePublished": "', '"')[:-4] + "+0900", + "%a %d %b %Y %I:%M:%S %p%z"), + "artist_id" : text.parse_int(extr( + '"sameAs": "https://nijie.info/members.php?id=', '"')), + "artist_name": keywords[1], + "tags" : keywords[2:-1], + } + data["user_id"] = data["artist_id"] + data["user_name"] = data["artist_name"] + return data + + @staticmethod + def _extract_images(page): + """Extract image URLs from 'page'""" + images = text.extract_iter(page, '<a href="./view_popup.php', '</a>') + for num, image in enumerate(images): url = "https:" + text.extract(image, 'src="', '"')[0] - url = url.replace("/__rs_l120x120/", "/", 1) - - yield url, text.nameext_from_url(url, { - "index": index, - "count": len(images), - "title": title, - "description": description, - "image_id": image_id, - "artist_id": artist_id, + url = url.replace("/__rs_l120x120/", "/") + yield text.nameext_from_url(url, { + "num": num, + "url": url, }) def login(self): @@ -107,6 +115,10 @@ class NijieExtractor(AsynchronousMixin, Extractor): while True: page = self.request(url, params=params, notfound="artist").text + + if not self.user_name: + self.user_name = text.unescape(text.extract( + page, '<br />', '<')[0] or "") yield from text.extract_iter(page, 'illust_id="', '"') if '<a rel="next"' not in page: @@ -117,12 +129,25 @@ class NijieExtractor(AsynchronousMixin, Extractor): class NijieUserExtractor(NijieExtractor): """Extractor for works of a nijie-user""" subcategory = "user" - pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" - r"/members(?:_illust)?\.php\?id=(\d+)") + pattern = BASE_PATTERN + r"/members(?:_illust)?\.php\?id=(\d+)" test = ( ("https://nijie.info/members_illust.php?id=44", { "url": "66c4ff94c6e77c0765dd88f2d8c663055fda573e", - "keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a", + "keyword": { + "artist_id": 44, + "artist_name": "ED", + "date": "type:datetime", + "description": str, + "extension": "jpg", + "filename": str, + "image_id": int, + "num": int, + "tags": list, + "title": str, + "url": r"re:https://pic.nijie.net/\d+/nijie_picture/.*jpg$", + "user_id": 44, + "user_name": "ED", + }, }), ("https://nijie.info/members_illust.php?id=43", { "exception": exception.NotFoundError, @@ -130,20 +155,23 @@ class NijieUserExtractor(NijieExtractor): ("https://nijie.info/members.php?id=44"), ) - def get_image_ids(self): + def image_ids(self): return self._pagination("members_illust") class NijieDoujinExtractor(NijieExtractor): """Extractor for doujin entries of a nijie-user""" subcategory = "doujin" - pattern = (r"(?:https?://)?(?:www\.)?nijie\.info/" - r"members_dojin\.php\?id=(\d+)") + pattern = BASE_PATTERN + r"/members_dojin\.php\?id=(\d+)" test = ("https://nijie.info/members_dojin.php?id=6782", { "count": ">= 18", + "keyword": { + "user_id" : 6782, + "user_name": "ジョニー@アビオン村", + }, }) - def get_image_ids(self): + def image_ids(self): return self._pagination("members_dojin") @@ -151,30 +179,38 @@ class NijieFavoriteExtractor(NijieExtractor): """Extractor for all favorites/bookmarks of a nijie-user""" subcategory = "favorite" directory_fmt = ("{category}", "bookmarks", "{user_id}") - archive_fmt = "f_{user_id}_{image_id}_{index}" - pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" - r"/user_like_illust_view\.php\?id=(\d+)") + archive_fmt = "f_{user_id}_{image_id}_{num}" + pattern = BASE_PATTERN + r"/user_like_illust_view\.php\?id=(\d+)" test = ("https://nijie.info/user_like_illust_view.php?id=44", { "count": ">= 16", + "keyword": { + "user_id" : 44, + "user_name": "ED", + }, }) - def get_image_ids(self): + def image_ids(self): return self._pagination("user_like_illust_view") + def _extract_data(self, page): + data = NijieExtractor._extract_data(page) + data["user_id"] = self.user_id + data["user_name"] = self.user_name + return data + class NijieImageExtractor(NijieExtractor): """Extractor for a work/image from nijie.info""" subcategory = "image" - pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" - r"/view(?:_popup)?\.php\?id=(\d+)") + pattern = BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)" test = ( ("https://nijie.info/view.php?id=70720", { "url": "5497f897311397dafa188521258624346a0af2a3", - "keyword": "408393d010307c76d52cbd0a4368d6d357805aea", + "keyword": "fd12bca6f4402a0c996315d28c65f7914ad70c51", "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6", }), ("https://nijie.info/view.php?id=70724", { - "exception": exception.NotFoundError, + "count": 0, }), ("https://nijie.info/view_popup.php?id=70720"), ) @@ -182,17 +218,6 @@ class NijieImageExtractor(NijieExtractor): def __init__(self, match): NijieExtractor.__init__(self, match) self.image_id = match.group(1) - self.page = "" - def get_job_metadata(self): - self.page = self.request( - self.view_url + self.image_id, notfound="image").text - self.user_id = text.extract( - self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0] - return NijieExtractor.get_job_metadata(self) - - def get_image_ids(self): + def image_ids(self): return (self.image_id,) - - def get_image_data(self, _): - return self.extract_image_data(self.page, self.image_id) |
