New upstream version 1.10.5 (upstream/1.10.5)

author: Unit 193 <unit193@ubuntu.com> 2019-10-01 19:12:47 -0400
committer: Unit 193 <unit193@ubuntu.com> 2019-10-01 19:12:47 -0400
commit: 639d9ea4a667733aadc3ff83a1df2cc9f0add3a9 (patch)
tree: 5761b58d6fc3e8bbb99b39b8e4417673bccb0b86 /gallery_dl/extractor/nijie.py
parent: c09a9f00dd83017d486cd77650347bc2a397ad55 (diff)
1 file changed, 98 insertions, 73 deletions
diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py
index 4c48d73..fdfad87 100644
--- a/gallery_dl/extractor/nijie.py
+++ b/gallery_dl/extractor/nijie.py
@@ -13,12 +13,15 @@ from .. import text, exception
 from ..cache import cache
 
 
+BASE_PATTERN = r"(?:https?://)?(?:www\.)?nijie\.info"
+
+
 class NijieExtractor(AsynchronousMixin, Extractor):
     """Base class for nijie extractors"""
     category = "nijie"
     directory_fmt = ("{category}", "{user_id}")
-    filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}"
-    archive_fmt = "{image_id}_{index}"
+    filename_fmt = "{category}_{artist_id}_{image_id}_p{num:>02}.{extension}"
+    archive_fmt = "{image_id}_{num}"
     cookiedomain = "nijie.info"
     cookienames = ("nemail", "nlogin")
     root = "https://nijie.info"
@@ -27,61 +30,66 @@ class NijieExtractor(AsynchronousMixin, Extractor):
 
     def __init__(self, match):
         Extractor.__init__(self, match)
-        self.user_id = match.group(1)
+        self.user_id = text.parse_int(match.group(1))
+        self.user_name = None
         self.session.headers["Referer"] = self.root + "/"
 
     def items(self):
         self.login()
-        data = self.get_job_metadata()
-
         yield Message.Version, 1
-        yield Message.Directory, data
 
-        for image_id in self.get_image_ids():
-            for image_url, image_data in self.get_image_data(image_id):
-                image_data.update(data)
-                if not image_data["extension"]:
-                    image_data["extension"] = "jpg"
-                yield Message.Url, image_url, image_data
+        for image_id in self.image_ids():
+
+            response = self.request(self.view_url + image_id, fatal=False)
+            if response.status_code >= 400:
+                continue
+            page = response.text
+
+            data = self._extract_data(page)
+            data["image_id"] = text.parse_int(image_id)
+            yield Message.Directory, data
 
-    def get_job_metadata(self):
-        """Collect metadata for extractor-job"""
-        return {"user_id": text.parse_int(self.user_id)}
+            for image in self._extract_images(page):
+                image.update(data)
+                if not image["extension"]:
+                    image["extension"] = "jpg"
+                yield Message.Url, image["url"], image
 
-    def get_image_ids(self):
+    def image_ids(self):
         """Collect all relevant image-ids"""
 
-    def get_image_data(self, image_id):
-        """Get URL and metadata for images specified by 'image_id'"""
-        page = self.request(self.view_url + image_id).text
-        return self.extract_image_data(page, image_id)
-
-    def extract_image_data(self, page, image_id):
-        """Get URL and metadata for images from 'page'"""
-        title, pos = text.extract(
-            page, '<meta property="og:title" content="', '"')
-        description, pos = text.extract(
-            page, '<meta property="og:description" content="', '"', pos)
-        artist_id, pos = text.extract(
-            page, '"sameAs": "https://nijie.info/members.php?id=', '"', pos)
-        images = list(text.extract_iter(
-            page, '<a href="./view_popup.php', '</a>', pos))
-
-        title = title.rpartition("|")[0].strip()
-        image_id = text.parse_int(image_id)
-        artist_id = text.parse_int(artist_id)
-
-        for index, image in enumerate(images):
+    @staticmethod
+    def _extract_data(page):
+        """Extract image metadata from 'page'"""
+        extr = text.extract_from(page)
+        keywords = text.unescape(extr(
+            'name="keywords" content="', '" />')).split(",")
+        data = {
+            "title"      : keywords[0].strip(),
+            "description": text.unescape(extr(
+                '"description": "', '"').replace("&amp;", "&")),
+            "date"       : text.parse_datetime(extr(
+                '"datePublished": "', '"')[:-4] + "+0900",
+                "%a %d %b %Y %I:%M:%S %p%z"),
+            "artist_id"  : text.parse_int(extr(
+                '"sameAs": "https://nijie.info/members.php?id=', '"')),
+            "artist_name": keywords[1],
+            "tags"       : keywords[2:-1],
+        }
+        data["user_id"] = data["artist_id"]
+        data["user_name"] = data["artist_name"]
+        return data
+
+    @staticmethod
+    def _extract_images(page):
+        """Extract image URLs from 'page'"""
+        images = text.extract_iter(page, '<a href="./view_popup.php', '</a>')
+        for num, image in enumerate(images):
             url = "https:" + text.extract(image, 'src="', '"')[0]
-            url = url.replace("/__rs_l120x120/", "/", 1)
-
-            yield url, text.nameext_from_url(url, {
-                "index": index,
-                "count": len(images),
-                "title": title,
-                "description": description,
-                "image_id": image_id,
-                "artist_id": artist_id,
+            url = url.replace("/__rs_l120x120/", "/")
+            yield text.nameext_from_url(url, {
+                "num": num,
+                "url": url,
             })
 
     def login(self):
@@ -107,6 +115,10 @@ class NijieExtractor(AsynchronousMixin, Extractor):
 
         while True:
             page = self.request(url, params=params, notfound="artist").text
+
+            if not self.user_name:
+                self.user_name = text.unescape(text.extract(
+                    page, '<br />', '<')[0] or "")
             yield from text.extract_iter(page, 'illust_id="', '"')
 
             if '<a rel="next"' not in page:
@@ -117,12 +129,25 @@ class NijieExtractor(AsynchronousMixin, Extractor):
 class NijieUserExtractor(NijieExtractor):
     """Extractor for works of a nijie-user"""
     subcategory = "user"
-    pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
-               r"/members(?:_illust)?\.php\?id=(\d+)")
+    pattern = BASE_PATTERN + r"/members(?:_illust)?\.php\?id=(\d+)"
     test = (
         ("https://nijie.info/members_illust.php?id=44", {
             "url": "66c4ff94c6e77c0765dd88f2d8c663055fda573e",
-            "keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a",
+            "keyword": {
+                "artist_id": 44,
+                "artist_name": "ED",
+                "date": "type:datetime",
+                "description": str,
+                "extension": "jpg",
+                "filename": str,
+                "image_id": int,
+                "num": int,
+                "tags": list,
+                "title": str,
+                "url": r"re:https://pic.nijie.net/\d+/nijie_picture/.*jpg$",
+                "user_id": 44,
+                "user_name": "ED",
+            },
         }),
         ("https://nijie.info/members_illust.php?id=43", {
             "exception": exception.NotFoundError,
@@ -130,20 +155,23 @@ class NijieUserExtractor(NijieExtractor):
         ("https://nijie.info/members.php?id=44"),
     )
 
-    def get_image_ids(self):
+    def image_ids(self):
         return self._pagination("members_illust")
 
 
 class NijieDoujinExtractor(NijieExtractor):
     """Extractor for doujin entries of a nijie-user"""
     subcategory = "doujin"
-    pattern = (r"(?:https?://)?(?:www\.)?nijie\.info/"
-               r"members_dojin\.php\?id=(\d+)")
+    pattern = BASE_PATTERN + r"/members_dojin\.php\?id=(\d+)"
     test = ("https://nijie.info/members_dojin.php?id=6782", {
         "count": ">= 18",
+        "keyword": {
+            "user_id"  : 6782,
+            "user_name": "ジョニー＠アビオン村",
+        },
     })
 
-    def get_image_ids(self):
+    def image_ids(self):
         return self._pagination("members_dojin")
 
 
@@ -151,30 +179,38 @@ class NijieFavoriteExtractor(NijieExtractor):
     """Extractor for all favorites/bookmarks of a nijie-user"""
     subcategory = "favorite"
     directory_fmt = ("{category}", "bookmarks", "{user_id}")
-    archive_fmt = "f_{user_id}_{image_id}_{index}"
-    pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
-               r"/user_like_illust_view\.php\?id=(\d+)")
+    archive_fmt = "f_{user_id}_{image_id}_{num}"
+    pattern = BASE_PATTERN + r"/user_like_illust_view\.php\?id=(\d+)"
     test = ("https://nijie.info/user_like_illust_view.php?id=44", {
         "count": ">= 16",
+        "keyword": {
+            "user_id"  : 44,
+            "user_name": "ED",
+        },
     })
 
-    def get_image_ids(self):
+    def image_ids(self):
         return self._pagination("user_like_illust_view")
 
+    def _extract_data(self, page):
+        data = NijieExtractor._extract_data(page)
+        data["user_id"] = self.user_id
+        data["user_name"] = self.user_name
+        return data
+
 
 class NijieImageExtractor(NijieExtractor):
     """Extractor for a work/image from nijie.info"""
     subcategory = "image"
-    pattern = (r"(?:https?://)?(?:www\.)?nijie\.info"
-               r"/view(?:_popup)?\.php\?id=(\d+)")
+    pattern = BASE_PATTERN + r"/view(?:_popup)?\.php\?id=(\d+)"
     test = (
         ("https://nijie.info/view.php?id=70720", {
             "url": "5497f897311397dafa188521258624346a0af2a3",
-            "keyword": "408393d010307c76d52cbd0a4368d6d357805aea",
+            "keyword": "fd12bca6f4402a0c996315d28c65f7914ad70c51",
             "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6",
         }),
         ("https://nijie.info/view.php?id=70724", {
-            "exception": exception.NotFoundError,
+            "count": 0,
         }),
         ("https://nijie.info/view_popup.php?id=70720"),
     )
@@ -182,17 +218,6 @@ class NijieImageExtractor(NijieExtractor):
     def __init__(self, match):
         NijieExtractor.__init__(self, match)
         self.image_id = match.group(1)
-        self.page = ""
 
-    def get_job_metadata(self):
-        self.page = self.request(
-            self.view_url + self.image_id, notfound="image").text
-        self.user_id = text.extract(
-            self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0]
-        return NijieExtractor.get_job_metadata(self)
-
-    def get_image_ids(self):
+    def image_ids(self):
         return (self.image_id,)
-
-    def get_image_data(self, _):
-        return self.extract_image_data(self.page, self.image_id)
author	Unit 193 <unit193@ubuntu.com>	2019-10-01 19:12:47 -0400
committer	Unit 193 <unit193@ubuntu.com>	2019-10-01 19:12:47 -0400
commit	639d9ea4a667733aadc3ff83a1df2cc9f0add3a9 (patch)
tree	5761b58d6fc3e8bbb99b39b8e4417673bccb0b86 /gallery_dl/extractor/nijie.py
parent	c09a9f00dd83017d486cd77650347bc2a397ad55 (diff)