diff options
| author | 2024-08-03 20:27:44 -0400 | |
|---|---|---|
| committer | 2024-08-03 20:27:44 -0400 | |
| commit | 032e5bed275a253e122ed9ac86dac7b8c4204172 (patch) | |
| tree | b4eda52ebfe00c4d22e9d633b1ab2d158a9f0573 /gallery_dl/extractor/cien.py | |
| parent | 80e39a8fc7de105510cbbdca8507f2a4b8c9e01d (diff) | |
New upstream version 1.27.2.upstream/1.27.2
Diffstat (limited to 'gallery_dl/extractor/cien.py')
| -rw-r--r-- | gallery_dl/extractor/cien.py | 199 |
1 files changed, 199 insertions, 0 deletions
diff --git a/gallery_dl/extractor/cien.py b/gallery_dl/extractor/cien.py new file mode 100644 index 0000000..bae86d0 --- /dev/null +++ b/gallery_dl/extractor/cien.py @@ -0,0 +1,199 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://ci-en.net/""" + +from .common import Extractor, Message +from .. import text, util + +BASE_PATTERN = r"(?:https?://)?ci-en\.(?:net|dlsite\.com)" + + +class CienExtractor(Extractor): + category = "cien" + root = "https://ci-en.net" + request_interval = (1.0, 2.0) + + def __init__(self, match): + self.root = text.root_from_url(match.group(0)) + Extractor.__init__(self, match) + + def _init(self): + self.cookies.set("accepted_rating", "r18g", domain="ci-en.dlsite.com") + + def _pagination_articles(self, url, params): + data = {"_extractor": CienArticleExtractor} + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + page = self.request(url, params=params).text + + for card in text.extract_iter( + page, ' class="c-cardCase-item', '</div>'): + article_url = text.extr(card, ' href="', '"') + yield Message.Queue, article_url, data + + if ' rel="next"' not in page: + return + params["page"] += 1 + + +class CienArticleExtractor(CienExtractor): + subcategory = "article" + filename_fmt = "{num:>02} {filename}.{extension}" + directory_fmt = ("{category}", "{author[name]}", "{post_id} {name}") + archive_fmt = "{post_id}_{num}" + pattern = BASE_PATTERN + r"/creator/(\d+)/article/(\d+)" + example = "https://ci-en.net/creator/123/article/12345" + + def items(self): + url = "{}/creator/{}/article/{}".format( + self.root, self.groups[0], self.groups[1]) + page = self.request(url, notfound="article").text + + post = util.json_loads(text.extr( + page, '<script type="application/ld+json">', '</script>'))[0] + + files = self._extract_files(post.get("articleBody") or page) + + post["post_url"] = url + post["post_id"] = text.parse_int(self.groups[1]) + post["count"] = len(files) + post["date"] = text.parse_datetime(post["datePublished"]) + + try: + del post["publisher"] + del post["sameAs"] + except Exception: + pass + + yield Message.Directory, post + for post["num"], file in enumerate(files, 1): + post.update(file) + if "extension" not in file: + text.nameext_from_url(file["url"], post) + yield Message.Url, file["url"], post + + def _extract_files(self, page): + files = [] + + filetypes = self.config("files") + if filetypes is None: + self._extract_files_image(page, files) + self._extract_files_video(page, files) + self._extract_files_download(page, files) + self._extract_files_gallery(page, files) + else: + generators = { + "image" : self._extract_files_image, + "video" : self._extract_files_video, + "download": self._extract_files_download, + "gallery" : self._extract_files_gallery, + "gallerie": self._extract_files_gallery, + } + if isinstance(filetypes, str): + filetypes = filetypes.split(",") + for ft in filetypes: + generators[ft.rstrip("s")](page, files) + + return files + + def _extract_files_image(self, page, files): + for image in text.extract_iter( + page, 'class="file-player-image"', "</figure>"): + size = text.extr(image, ' data-size="', '"') + w, _, h = size.partition("x") + + files.append({ + "url" : text.extr(image, ' data-raw="', '"'), + "width" : text.parse_int(w), + "height": text.parse_int(h), + "type" : "image", + }) + + def _extract_files_video(self, page, files): + for video in text.extract_iter( + page, "<vue-file-player", "</vue-file-player>"): + path = text.extr(video, ' base-path="', '"') + name = text.extr(video, ' file-name="', '"') + auth = text.extr(video, ' auth-key="', '"') + + file = text.nameext_from_url(name) + file["url"] = "{}video-web.mp4?{}".format(path, auth) + file["type"] = "video" + files.append(file) + + def _extract_files_download(self, page, files): + for download in text.extract_iter( + page, 'class="downloadBlock', "</div>"): + name = text.extr(download, "<p>", "<") + + file = text.nameext_from_url(name.rpartition(" ")[0]) + file["url"] = text.extr(download, ' href="', '"') + file["type"] = "download" + files.append(file) + + def _extract_files_gallery(self, page, files): + for gallery in text.extract_iter( + page, "<vue-image-gallery", "</vue-image-gallery>"): + + url = self.root + "/api/creator/gallery/images" + params = { + "hash" : text.extr(gallery, ' hash="', '"'), + "gallery_id": text.extr(gallery, ' gallery-id="', '"'), + "time" : text.extr(gallery, ' time="', '"'), + } + data = self.request(url, params=params).json() + url = self.root + "/api/creator/gallery/imagePath" + + for params["page"], params["file_id"] in enumerate( + data["imgList"]): + path = self.request(url, params=params).json()["path"] + + file = params.copy() + file["url"] = path + files.append(file) + + +class CienCreatorExtractor(CienExtractor): + subcategory = "creator" + pattern = BASE_PATTERN + r"/creator/(\d+)(?:/article(?:\?([^#]+))?)?/?$" + example = "https://ci-en.net/creator/123" + + def items(self): + url = "{}/creator/{}/article".format(self.root, self.groups[0]) + params = text.parse_query(self.groups[1]) + params["mode"] = "list" + return self._pagination_articles(url, params) + + +class CienRecentExtractor(CienExtractor): + subcategory = "recent" + pattern = BASE_PATTERN + r"/mypage/recent(?:\?([^#]+))?" + example = "https://ci-en.net/mypage/recent" + + def items(self): + url = self.root + "/mypage/recent" + params = text.parse_query(self.groups[0]) + return self._pagination_articles(url, params) + + +class CienFollowingExtractor(CienExtractor): + subcategory = "following" + pattern = BASE_PATTERN + r"/mypage/subscription(/following)?" + example = "https://ci-en.net/mypage/subscription" + + def items(self): + url = self.root + "/mypage/subscription" + (self.groups[0] or "") + page = self.request(url).text + data = {"_extractor": CienCreatorExtractor} + + for subscription in text.extract_iter( + page, 'class="c-grid-subscriptionInfo', '</figure>'): + url = text.extr(subscription, ' href="', '"') + yield Message.Queue, url, data |
