Diffstat (limited to 'gallery_dl/extractor')
107 files changed, 17849 insertions, 0 deletions
diff --git a/gallery_dl/extractor/2chan.py b/gallery_dl/extractor/2chan.py new file mode 100644 index 0000000..8df8645 --- /dev/null +++ b/gallery_dl/extractor/2chan.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.2chan.net/""" + +from .common import Extractor, Message +from .. import text + + +class FutabaThreadExtractor(Extractor): + """Extractor for images from threads on www.2chan.net""" + category = "2chan" + subcategory = "thread" + directory_fmt = ("{category}", "{board_name}", "{thread}") + filename_fmt = "{tim}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + url_fmt = "https://{server}.2chan.net/{board}/src/{filename}" + pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)" + test = ("http://dec.2chan.net/70/res/947.htm", { + "url": "c5c12b80b290e224b6758507b3bb952044f4595b", + "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.server, self.board, self.thread = match.groups() + + def items(self): + url = "https://{}.2chan.net/{}/res/{}.htm".format( + self.server, self.board, self.thread) + page = self.request(url).text + data = self.metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(page): + if "filename" not in post: + continue + post.update(data) + url = self.url_fmt.format_map(post) + yield Message.Url, url, post + + def metadata(self, page): + """Collect metadata for extractor-job""" + title = text.extract(page, "<title>", "</title>")[0] + title, _, boardname = title.rpartition(" - ") + return { + "server": self.server, + "title": title, + "board": self.board, + "board_name": boardname[:-4], + "thread": self.thread, + } + + def posts(self, page): + """Build a list of all post-objects""" + page = text.extract( + page, '<div class="thre"', '<div style="clear:left"></div>')[0] + return [ + self.parse(post) + for post in page.split('<table border=0>') + ] + + def parse(self, post): + """Build post-object by extracting data from an HTML post""" + data = self._extract_post(post) + if '<a href="/' in post: + self._extract_image(post, data) + data["tim"], _, data["extension"] = data["filename"].partition(".") + data["time"] = data["tim"][:-3] + data["ext"] = "." + data["extension"] + return data + + @staticmethod + def _extract_post(post): + return text.extract_all(post, ( + ("no" , 'name="', '"'), + ("post", '<b>', '</b>'), + ("name", '<b>', ' </b>'), + ("now" , '</font> ', ' '), + (None , '<blockquote', ''), + ("com" , '>', '</blockquote>'), + ))[0] + + @staticmethod + def _extract_image(post, data): + text.extract_all(post, ( + (None , '_blank', ''), + ("filename", '>', '<'), + ("fsize" , '(', ' '), + ), 0, data) diff --git a/gallery_dl/extractor/35photo.py b/gallery_dl/extractor/35photo.py new file mode 100644 index 0000000..50dbfe8 --- /dev/null +++ b/gallery_dl/extractor/35photo.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://35photo.pro/""" + +from .common import Extractor, Message +from .. 
import text + + +class _35photoExtractor(Extractor): + category = "35photo" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{id}{title:?_//}_{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + root = "https://35photo.pro" + + def items(self): + first = True + data = self.metadata() + yield Message.Version, 1 + + for photo_id in self.photos(): + for photo in self._photo_data(photo_id): + photo.update(data) + url = photo["url"] + if first: + first = False + yield Message.Directory, photo + yield Message.Url, url, text.nameext_from_url(url, photo) + + def metadata(self): + """Returns general metadata""" + return {} + + def photos(self): + """Returns an iterable containing all relevant photo IDs""" + + def _pagination(self, params, extra_ids=None): + url = "https://35photo.pro/show_block.php" + headers = {"Referer": self.root, "X-Requested-With": "XMLHttpRequest"} + params["type"] = "getNextPageData" + + if "lastId" not in params: + params["lastId"] = "999999999" + if extra_ids: + yield from extra_ids + while params["lastId"]: + data = self.request(url, headers=headers, params=params).json() + yield from self._photo_ids(data["data"]) + params["lastId"] = data["lastId"] + + def _photo_data(self, photo_id): + params = {"method": "photo.getData", "photoId": photo_id} + data = self.request( + "https://api.35photo.pro/", params=params).json()["data"][photo_id] + info = { + "url" : data["src"], + "id" : data["photo_id"], + "title" : data["photo_name"], + "description": data["photo_desc"], + "tags" : data["tags"] or [], + "views" : data["photo_see"], + "favorites" : data["photo_fav"], + "score" : data["photo_rating"], + "type" : data["photo_type"], + "date" : data["timeAdd"], + "user" : data["user_login"], + "user_id" : data["user_id"], + "user_name" : data["user_name"], + "other" : data["otherData"], + } + + if "series" in data: + for info["num"], photo in enumerate(data["series"], 1): + info["url"] = photo["src"] + info["id_series"] = text.parse_int(photo["id"]) + info["title_series"] = photo["title"] or "" + yield info.copy() + else: + info["num"] = 1 + yield info + + @staticmethod + def _photo_ids(page): + """Extract unique photo IDs and return them as sorted list""" + # searching for photo-id="..." 
doesn't always work (see unit tests) + return sorted( + set(text.extract_iter(page, "/photo_", "/")), + key=text.parse_int, + reverse=True, + ) + + +class _35photoUserExtractor(_35photoExtractor): + """Extractor for all images of a user on 35photo.pro""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro" + r"/(?!photo_|genre_)([^/?&#]+)") + test = ( + ("https://35photo.pro/liya", { + "pattern": r"https://m\d+.35photo.pro/photos_(main|series)/.*.jpg", + "count": 9, + }), + ("https://35photo.pro/suhoveev", { + # last photo ID (1267028) isn't given as 'photo-id="<id>" + # there are only 23 photos without the last one + "count": ">= 33", + }), + ("https://en.35photo.pro/liya"), + ("https://ru.35photo.pro/liya"), + ) + + def __init__(self, match): + _35photoExtractor.__init__(self, match) + self.user = match.group(1) + self.user_id = 0 + + def metadata(self): + url = "{}/{}/".format(self.root, self.user) + page = self.request(url).text + self.user_id = text.parse_int(text.extract(page, "/user_", ".xml")[0]) + return { + "user": self.user, + "user_id": self.user_id, + } + + def photos(self): + return self._pagination({ + "page": "photoUser", + "user_id": self.user_id, + }) + + +class _35photoGenreExtractor(_35photoExtractor): + """Extractor for images of a specific genre on 35photo.pro""" + subcategory = "genre" + directory_fmt = ("{category}", "Genre", "{genre}") + archive_fmt = "g{genre_id}_{id}_{num}" + pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/genre_(\d+)(/new/)?" + test = ( + ("https://35photo.pro/genre_109/", { + "range": "1-30", + }), + ("https://35photo.pro/genre_109/new/"), + ) + + def __init__(self, match): + _35photoExtractor.__init__(self, match) + self.genre_id, self.new = match.groups() + self.photo_ids = None + + def metadata(self): + url = "{}/genre_{}{}".format(self.root, self.genre_id, self.new or "/") + page = self.request(url).text + self.photo_ids = self._photo_ids(text.extract( + page, ' class="photo', '\n')[0]) + return { + "genre": text.extract(page, " genre - ", ". ")[0], + "genre_id": text.parse_int(self.genre_id), + } + + def photos(self): + return self._pagination({ + "page": "genre", + "community_id": self.genre_id, + "photo_rating": "0" if self.new else "50", + "lastId": self.photo_ids[-1], + }, self.photo_ids) + + +class _35photoImageExtractor(_35photoExtractor): + """Extractor for individual images from 35photo.pro""" + subcategory = "image" + pattern = r"(?:https?://)?(?:[a-z]+\.)?35photo\.pro/photo_(\d+)" + test = ("https://35photo.pro/photo_753340/", { + "count": 1, + "keyword": { + "url" : r"re:https://m\d+.35photo.pro/photos_main/.*.jpg", + "id" : 753340, + "title" : "Winter walk", + "description": str, + "tags" : list, + "views" : int, + "favorites" : int, + "score" : int, + "type" : 0, + "date" : "15 авг, 2014", + "user" : "liya", + "user_id" : 20415, + "user_name" : "Liya Mirzaeva", + "other" : str, + }, + }) + + def __init__(self, match): + _35photoExtractor.__init__(self, match) + self.photo_id = match.group(1) + + def photos(self): + return (self.photo_id,) diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py new file mode 100644 index 0000000..d0e59ad --- /dev/null +++ b/gallery_dl/extractor/3dbooru.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extract images from http://behoimi.org/""" + +from . import booru + + +class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): + """Base class for 3dbooru extractors""" + category = "3dbooru" + api_url = "http://behoimi.org/post/index.json" + post_url = "http://behoimi.org/post/show/{}" + page_limit = 1000 + + def __init__(self, match): + super().__init__(match) + self.session.headers.update({ + "Referer": "http://behoimi.org/post/show/", + "Accept-Encoding": "identity", + }) + + +class ThreedeebooruTagExtractor(booru.TagMixin, + ThreedeebooruExtractor): + """Extractor for images from behoimi.org based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org/post" + r"(?:/(?:index)?)?\?tags=(?P<tags>[^&#]+)") + test = ("http://behoimi.org/post?tags=himekawa_azuru+dress", { + "url": "ecb30c6aaaf8a6ff8f55255737a9840832a483c1", + "content": "11cbda40c287e026c1ce4ca430810f761f2d0b2a", + }) + + +class ThreedeebooruPoolExtractor(booru.PoolMixin, + ThreedeebooruExtractor): + """Extractor for image-pools from behoimi.org""" + pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P<pool>\d+)" + test = ("http://behoimi.org/pool/show/27", { + "url": "da75d2d1475449d5ef0c266cb612683b110a30f2", + "content": "fd5b37c5c6c2de4b4d6f1facffdefa1e28176554", + }) + + +class ThreedeebooruPostExtractor(booru.PostMixin, + ThreedeebooruExtractor): + """Extractor for single images from behoimi.org""" + pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P<post>\d+)" + test = ("http://behoimi.org/post/show/140852", { + "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6", + "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4", + "options": (("tags", True),), + "keyword": { + "tags_character": "furude_rika", + "tags_copyright": "higurashi_no_naku_koro_ni", + "tags_model": "himekawa_azuru", + "tags_general": str, + }, + }) + + +class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin, + ThreedeebooruExtractor): + """Extractor for popular images from behoimi.org""" + pattern = (r"(?:https?://)?(?:www\.)?behoimi\.org" + r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" + r"(?:\?(?P<query>[^#]*))?") + test = ("http://behoimi.org/post/popular_by_month?month=2&year=2013", { + "url": "c70268dce441a9ccc3383c244ec15edb059f494f", + "count": 20, + }) + + def __init__(self, match): + super().__init__(match) + self.api_url = "http://behoimi.org/post/popular_{scale}.json".format( + scale=self.scale) diff --git a/gallery_dl/extractor/4chan.py b/gallery_dl/extractor/4chan.py new file mode 100644 index 0000000..e387b33 --- /dev/null +++ b/gallery_dl/extractor/4chan.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images and videos from https://www.4chan.org/""" + +from . import chan +from .. 
import text + + +class FourchanThreadExtractor(chan.ChanThreadExtractor): + """Extractor for images from threads from 4chan.org""" + category = "4chan" + pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org" + r"/([^/]+)/thread/(\d+)") + test = ( + ("https://boards.4chan.org/tg/thread/15396072/", { + "url": "39082ad166161966d7ba8e37f2173a824eb540f0", + "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a", + "content": "20b7b51afa51c9c31a0020a0737b889532c8d7ec", + }), + ("https://boards.4channel.org/tg/thread/15396072/", { + "url": "39082ad166161966d7ba8e37f2173a824eb540f0", + "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a", + }), + ) + api_url = "https://a.4cdn.org/{board}/thread/{thread}.json" + file_url = "https://i.4cdn.org/{board}/{tim}{ext}" + + def update(self, post, data=None): + chan.ChanThreadExtractor.update(self, post, data) + post["filename"] = text.unescape(post["filename"]) diff --git a/gallery_dl/extractor/500px.py b/gallery_dl/extractor/500px.py new file mode 100644 index 0000000..00b8ab5 --- /dev/null +++ b/gallery_dl/extractor/500px.py @@ -0,0 +1,238 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://500px.com/""" + +from .common import Extractor, Message +from .. import text + + +class _500pxExtractor(Extractor): + """Base class for 500px extractors""" + category = "500px" + directory_fmt = ("{category}", "{user[username]}") + filename_fmt = "{id}_{name}.{extension}" + archive_fmt = "{id}" + root = "https://500px.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.session.headers["Referer"] = self.root + "/" + + def items(self): + first = True + data = self.metadata() + yield Message.Version, 1 + + for photo in self.photos(): + url = photo["images"][-1]["url"] + fmt = photo["image_format"] + photo["extension"] = "jpg" if fmt == "jpeg" else fmt + if data: + photo.update(data) + if first: + first = False + yield Message.Directory, photo + yield Message.Url, url, photo + + def metadata(self): + """Returns general metadata""" + + def photos(self): + """Returns an iterable containing all relevant photo IDs""" + + def _extend(self, photos): + """Extend photos with additional metadata and higher resolution URLs""" + url = "https://api.500px.com/v1/photos" + params = { + "expanded_user_info" : "true", + "include_tags" : "true", + "include_geo" : "true", + "include_equipment_info": "true", + "vendor_photos" : "true", + "include_licensing" : "true", + "include_releases" : "true", + "liked_by" : "1", + "following_sample" : "100", + "image_size" : "32768", + "ids" : ",".join(str(p["id"]) for p in photos), + } + + data = self._api_call(url, params)["photos"] + for photo in photos: + pid = str(photo["id"]) + photo.update(data[pid]) + return photos + + def _api_call(self, url, params, csrf_token=None): + headers = {"Origin": self.root, "X-CSRF-Token": csrf_token} + return self.request(url, headers=headers, params=params).json() + + def _pagination(self, url, params, csrf): + params["page"] = 1 + while True: + data = self._api_call(url, params, csrf) + yield from self._extend(data["photos"]) + + if params["page"] >= data["total_pages"]: + return + params["page"] += 1 + + +class _500pxUserExtractor(_500pxExtractor): + """Extractor for photos from a user's photostream on 500px.com""" + subcategory = "user" + pattern = 
(r"(?:https?://)?500px\.com" + r"/(?!photo/)([^/?&#]+)/?(?:$|\?|#)") + test = ("https://500px.com/light_expression_photography", { + "pattern": r"https?://drscdn.500px.org/photo/\d+/m%3D5000/v2", + "range": "1-99", + "count": 99, + }) + + def __init__(self, match): + _500pxExtractor.__init__(self, match) + self.user = match.group(1) + + def photos(self): + # get csrf token and user id from webpage + url = "{}/{}".format(self.root, self.user) + page = self.request(url).text + csrf_token, pos = text.extract(page, 'csrf-token" content="', '"') + user_id , pos = text.extract(page, '/user/', '"', pos) + + # get user photos + url = "https://api.500px.com/v1/photos" + params = { + "feature" : "user", + "stream" : "photos", + "rpp" : "50", + "user_id" : user_id, + } + return self._pagination(url, params, csrf_token) + + +class _500pxGalleryExtractor(_500pxExtractor): + """Extractor for photo galleries on 500px.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user[username]}", "{gallery[name]}") + pattern = (r"(?:https?://)?500px\.com" + r"/(?!photo/)([^/?&#]+)/galleries/([^/?&#]+)") + test = ("https://500px.com/fashvamp/galleries/lera", { + "url": "8a520272ece83278166b4f8556f9c9da43c43c45", + "count": 3, + "keyword": { + "gallery": dict, + "user": dict, + }, + }) + + def __init__(self, match): + _500pxExtractor.__init__(self, match) + self.user_name, self.gallery_name = match.groups() + self.user_id = self.gallery_id = self.csrf_token = None + + def metadata(self): + # get csrf token and user id from webpage + url = "{}/{}/galleries/{}".format( + self.root, self.user_name, self.gallery_name) + page = self.request(url).text + self.csrf_token, pos = text.extract(page, 'csrf-token" content="', '"') + self.user_id , pos = text.extract(page, 'App.CuratorId =', '\n', pos) + self.user_id = self.user_id.strip() + + # get gallery metadata; transform gallery name into id + url = "https://api.500px.com/v1/users/{}/galleries/{}".format( + self.user_id, self.gallery_name) + params = { + # "include_user": "true", + "include_cover": "1", + "cover_size": "2048", + } + data = self._api_call(url, params, self.csrf_token) + self.gallery_id = data["gallery"]["id"] + return data + + def photos(self): + url = "https://api.500px.com/v1/users/{}/galleries/{}/items".format( + self.user_id, self.gallery_id) + params = { + "sort" : "position", + "sort_direction" : "asc", + "rpp" : "50", + } + return self._pagination(url, params, self.csrf_token) + + +class _500pxImageExtractor(_500pxExtractor): + """Extractor for individual images from 500px.com""" + subcategory = "image" + pattern = r"(?:https?://)?500px\.com/photo/(\d+)" + test = ("https://500px.com/photo/222049255/queen-of-coasts", { + "url": "d1eda7afeaa589f71f05b9bb5c0694e3ffb357cd", + "count": 1, + "keyword": { + "camera": "Canon EOS 600D", + "camera_info": dict, + "collections_count": int, + "comments": list, + "comments_count": int, + "converted": False, + "converted_bits": int, + "created_at": "2017-08-01T04:40:05-04:00", + "crop_version": 0, + "description": str, + "editored_by": dict, + "editors_choice": False, + "extension": "jpg", + "favorites_count": int, + "feature": "popular", + "feature_date": "2017-08-01T09:58:28+00:00", + "focal_length": "208", + "height": 3111, + "id": 222049255, + "image_format": "jpeg", + "image_url": str, + "images": list, + "iso": "100", + "lens": "EF-S55-250mm f/4-5.6 IS II", + "lens_info": dict, + "license_type": 0, + "licensed_at": None, + "liked": False, + "location": None, + "location_details": dict, + "name": 
"Queen Of Coasts", + "nsfw": False, + "privacy": False, + "profile": True, + "rating": float, + "sales_count": int, + "status": 1, + "store_download": False, + "store_height": 3111, + "store_width": 4637, + "tags": list, + "taken_at": "2017-05-04T13:36:51-04:00", + "times_viewed": int, + "url": "/photo/222049255/queen-of-coasts-by-olesya-nabieva", + "user": dict, + "user_id": 12847235, + "votes_count": int, + "watermark": True, + "width": 4637, + }, + }) + + def __init__(self, match): + _500pxExtractor.__init__(self, match) + self.photo_id = match.group(1) + + def photos(self): + photos = ({"id": self.photo_id},) + return self._extend(photos) diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py new file mode 100644 index 0000000..e526da3 --- /dev/null +++ b/gallery_dl/extractor/8chan.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images and videos from https://8ch.net/""" + +from . import chan + + +class InfinitychanThreadExtractor(chan.ChanThreadExtractor): + """Extractor for images from threads from 8ch.net""" + category = "8chan" + filename_fmt = "{time}-{filename}{ext}" + pattern = r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)" + test = ("https://8ch.net/builders/res/3.html", { + "url": "5d85c0509f907f217aea379f862b41bf3d01f645", + "keyword": "0c497190c0c0f826925fde09815351d01869c783", + }) + api_url = "https://8ch.net/{board}/res/{thread}.json" + file_url = "https://media.8ch.net/{board}/src/{tim}{ext}" + file_url_v2 = "https://media.8ch.net/file_store/{tim}{ext}" + + def build_url(self, post): + fmt = self.file_url if len(post["tim"]) < 64 else self.file_url_v2 + return fmt.format_map(post) diff --git a/gallery_dl/extractor/8muses.py b/gallery_dl/extractor/8muses.py new file mode 100644 index 0000000..6fbf6b5 --- /dev/null +++ b/gallery_dl/extractor/8muses.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.8muses.com/""" + +from .common import Extractor, Message +from .. 
import text +import json + + +class _8musesAlbumExtractor(Extractor): + """Extractor for image albums on www.8muses.com""" + category = "8muses" + subcategory = "album" + directory_fmt = ("{category}", "{album[path]}") + filename_fmt = "{page:>03}.{extension}" + archive_fmt = "{hash}" + root = "https://www.8muses.com" + pattern = (r"(?:https?://)?(?:www\.)?8muses\.com" + r"(/comics/album/[^?&#]+)(\?[^#]+)?") + test = ( + ("https://www.8muses.com/comics/album/Fakku-Comics/santa/Im-Sorry", { + "url": "82449d6a26a29204695cba5d52c3ec60170bc159", + "keyword": { + "url" : str, + "hash" : str, + "page" : int, + "count": 16, + "album": { + "id" : 10457, + "title" : "Im Sorry", + "path" : "Fakku Comics/santa/Im Sorry", + "private": False, + "url" : str, + "parent" : 10454, + "views" : int, + "likes" : int, + "date" : "type:datetime", + }, + }, + }), + ("https://www.8muses.com/comics/album/Fakku-Comics/santa", { + "count": ">= 3", + "pattern": pattern, + "keyword": { + "url" : str, + "name" : str, + "private": False, + }, + }), + ("https://www.8muses.com/comics/album/Fakku-Comics/6?sort=az", { + "count": ">= 70", + "keyword": {"name": r"re:^[S-Zs-z]"}, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + self.params = match.group(2) or "" + + def items(self): + url = self.root + self.path + self.params + + while True: + data = self._unobfuscate(text.extract( + self.request(url).text, + 'id="ractive-public" type="text/plain">', '</script>')[0]) + + images = data.get("pictures") + if images: + count = len(images) + album = self._make_album(data["album"]) + yield Message.Directory, {"album": album, "count": count} + for num, image in enumerate(images, 1): + url = self.root + "/image/fl/" + image["publicUri"] + img = { + "url" : url, + "page" : num, + "hash" : image["publicUri"], + "count" : count, + "album" : album, + "extension": "jpg", + } + yield Message.Url, url, img + + albums = data.get("albums") + if albums: + for album in albums: + url = self.root + "/comics/album/" + album["permalink"] + album = { + "url" : url, + "name" : album["name"], + "private": album["isPrivate"], + } + yield Message.Queue, url, album + + if data["page"] >= data["pages"]: + return + path, _, num = self.path.rstrip("/").rpartition("/") + path = path if num.isdecimal() else self.path + url = "{}{}/{}{}".format( + self.root, path, data["page"] + 1, self.params) + + def _make_album(self, album): + return { + "id" : album["id"], + "path" : album["path"], + "title" : album["name"], + "private": album["isPrivate"], + "url" : self.root + album["permalink"], + "parent" : text.parse_int(album["parentId"]), + "views" : text.parse_int(album["numberViews"]), + "likes" : text.parse_int(album["numberLikes"]), + "date" : text.parse_datetime( + album["updatedAt"], "%Y-%m-%dT%H:%M:%S.%fZ"), + } + + @staticmethod + def _unobfuscate(data): + return json.loads("".join([ + chr(33 + (ord(c) + 14) % 94) if c != " " else c + for c in text.unescape(data.strip("\t\n\r !")) + ])) diff --git a/gallery_dl/extractor/__init__.py b/gallery_dl/extractor/__init__.py new file mode 100644 index 0000000..81d480e --- /dev/null +++ b/gallery_dl/extractor/__init__.py @@ -0,0 +1,189 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +import re +import importlib + +modules = [ + "2chan", + "35photo", + "3dbooru", + "4chan", + "500px", + "8chan", + "8muses", + "artstation", + "behance", + "bobx", + "danbooru", + "deviantart", + "dynastyscans", + "e621", + "exhentai", + "fallenangels", + "flickr", + "gelbooru", + "gfycat", + "hbrowse", + "hentai2read", + "hentaicafe", + "hentaifoundry", + "hentaifox", + "hentaihere", + "hentainexus", + "hitomi", + "hypnohub", + "idolcomplex", + "imagebam", + "imagefap", + "imgbox", + "imgth", + "imgur", + "instagram", + "keenspot", + "khinsider", + "kissmanga", + "komikcast", + "konachan", + "livedoor", + "luscious", + "mangadex", + "mangafox", + "mangahere", + "mangapanda", + "mangapark", + "mangareader", + "mangastream", + "mangoxo", + "myportfolio", + "newgrounds", + "ngomik", + "nhentai", + "nijie", + "nsfwalbum", + "paheal", + "patreon", + "photobucket", + "piczel", + "pinterest", + "pixiv", + "pixnet", + "plurk", + "pornhub", + "pururin", + "reactor", + "readcomiconline", + "reddit", + "rule34", + "safebooru", + "sankaku", + "sankakucomplex", + "seiga", + "senmanga", + "sexcom", + "simplyhentai", + "slickpic", + "slideshare", + "smugmug", + "tsumino", + "tumblr", + "twitter", + "vanillarock", + "wallhaven", + "warosu", + "weibo", + "wikiart", + "xhamster", + "xvideos", + "yandere", + "yaplog", + "yuki", + "foolfuuka", + "foolslide", + "mastodon", + "shopify", + "imagehosts", + "directlink", + "recursive", + "oauth", + "test", +] + + +def find(url): + """Find a suitable extractor for the given URL""" + for cls in _list_classes(): + match = cls.pattern.match(url) + if match and cls not in _blacklist: + return cls(match) + return None + + +def add(cls): + """Add 'cls' to the list of available extractors""" + cls.pattern = re.compile(cls.pattern) + _cache.append(cls) + return cls + + +def add_module(module): + """Add all extractors in 'module' to the list of available extractors""" + classes = _get_classes(module) + for cls in classes: + cls.pattern = re.compile(cls.pattern) + _cache.extend(classes) + return classes + + +def extractors(): + """Yield all available extractor classes""" + return sorted( + _list_classes(), + key=lambda x: x.__name__ + ) + + +class blacklist(): + """Context Manager to blacklist extractor modules""" + def __init__(self, categories, extractors=None): + self.extractors = extractors or [] + for cls in _list_classes(): + if cls.category in categories: + self.extractors.append(cls) + + def __enter__(self): + _blacklist.update(self.extractors) + + def __exit__(self, etype, value, traceback): + _blacklist.clear() + + +# -------------------------------------------------------------------- +# internals + +_cache = [] +_blacklist = set() +_module_iter = iter(modules) + + +def _list_classes(): + """Yield all available extractor classes""" + yield from _cache + + for module_name in _module_iter: + module = importlib.import_module("."+module_name, __package__) + yield from add_module(module) + + +def _get_classes(module): + """Return a list of all extractor classes in a module""" + return [ + cls for cls in module.__dict__.values() if ( + hasattr(cls, "pattern") and cls.__module__ == module.__name__ + ) + ] diff --git a/gallery_dl/extractor/artstation.py b/gallery_dl/extractor/artstation.py new file mode 100644 index 0000000..24197ad --- /dev/null +++ b/gallery_dl/extractor/artstation.py @@ -0,0 +1,369 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of 
the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.artstation.com/""" + +from .common import Extractor, Message +from .. import text, util, exception +import random +import string + + +class ArtstationExtractor(Extractor): + """Base class for artstation extractors""" + category = "artstation" + filename_fmt = "{category}_{id}_{asset[id]}_{title}.{extension}" + directory_fmt = ("{category}", "{userinfo[username]}") + archive_fmt = "{asset[id]}" + root = "https://www.artstation.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) or match.group(2) + self.external = self.config("external", False) + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + + for project in self.projects(): + for asset in self.get_project_assets(project["hash_id"]): + asset.update(data) + adict = asset["asset"] + + if adict["has_embedded_player"] and self.external: + player = adict["player_embedded"] + url = text.extract(player, 'src="', '"')[0] + if not url.startswith(self.root): + yield Message.Url, "ytdl:" + url, asset + continue + + if adict["has_image"]: + url = adict["image_url"] + text.nameext_from_url(url, asset) + yield Message.Url, self._no_cache(url), asset + + def metadata(self): + """Return general metadata""" + return {"userinfo": self.get_user_info(self.user)} + + def projects(self): + """Return an iterable containing all relevant project IDs""" + + def get_project_assets(self, project_id): + """Return all assets associated with 'project_id'""" + url = "{}/projects/{}.json".format(self.root, project_id) + data = self.request(url).json() + + data["title"] = text.unescape(data["title"]) + data["description"] = text.unescape(text.remove_html( + data["description"])) + + assets = data["assets"] + del data["assets"] + + if len(assets) == 1: + data["asset"] = assets[0] + yield data + else: + for asset in assets: + data["asset"] = asset + yield data.copy() + + def get_user_info(self, username): + """Return metadata for a specific user""" + url = "{}/users/{}/quick.json".format(self.root, username.lower()) + response = self.request(url, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError("user") + return response.json() + + def _pagination(self, url, params=None): + if not params: + params = {} + params["page"] = 1 + total = 0 + + while True: + data = self.request(url, params=params).json() + yield from data["data"] + + total += len(data["data"]) + if total >= data["total_count"]: + return + + params["page"] += 1 + + @staticmethod + def _no_cache(url, alphabet=(string.digits + string.ascii_letters)): + """Cause a cache miss to prevent Cloudflare 'optimizations' + + Cloudflare's 'Polish' optimization strips image metadata and may even + recompress an image as lossy JPEG. This can be prevented by causing + a cache miss when requesting an image by adding a random dummy query + parameter. + + Ref: + https://github.com/r888888888/danbooru/issues/3528 + https://danbooru.donmai.us/forum_topics/14952 + """ + param = "gallerydl_no_cache=" + util.bencode( + random.getrandbits(64), alphabet) + sep = "&" if "?" in url else "?" + return url + sep + param + + +class ArtstationUserExtractor(ArtstationExtractor): + """Extractor for all projects of an artstation user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)(?:/albums/all)?" 
+ r"|((?!www)\w+)\.artstation\.com(?:/projects)?)/?$") + test = ( + ("https://www.artstation.com/gaerikim/", { + "pattern": r"https://\w+\.artstation\.com/p/assets" + r"/images/images/\d+/\d+/\d+/large/[^/]+", + "count": ">= 6", + }), + ("https://www.artstation.com/gaerikim/albums/all/"), + ("https://gaerikim.artstation.com/"), + ("https://gaerikim.artstation.com/projects/"), + ) + + def projects(self): + url = "{}/users/{}/projects.json".format(self.root, self.user) + return self._pagination(url) + + +class ArtstationAlbumExtractor(ArtstationExtractor): + """Extractor for all projects in an artstation album""" + subcategory = "album" + directory_fmt = ("{category}", "{userinfo[username]}", "Albums", + "{album[id]} - {album[title]}") + archive_fmt = "a_{album[id]}_{asset[id]}" + pattern = (r"(?:https?://)?(?:(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)" + r"|((?!www)\w+)\.artstation\.com)/albums/(\d+)") + test = ( + ("https://www.artstation.com/huimeiye/albums/770899", { + "count": 2, + }), + ("https://www.artstation.com/huimeiye/albums/770898", { + "exception": exception.NotFoundError, + }), + ("https://huimeiye.artstation.com/albums/770899"), + ) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.album_id = text.parse_int(match.group(3)) + + def metadata(self): + userinfo = self.get_user_info(self.user) + album = None + + for album in userinfo["albums_with_community_projects"]: + if album["id"] == self.album_id: + break + else: + raise exception.NotFoundError("album") + + return { + "userinfo": userinfo, + "album": album + } + + def projects(self): + url = "{}/users/{}/projects.json".format(self.root, self.user) + params = {"album_id": self.album_id} + return self._pagination(url, params) + + +class ArtstationLikesExtractor(ArtstationExtractor): + """Extractor for liked projects of an artstation user""" + subcategory = "likes" + directory_fmt = ("{category}", "{userinfo[username]}", "Likes") + archive_fmt = "f_{userinfo[id]}_{asset[id]}" + pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" + r"/(?!artwork|projects|search)([^/?&#]+)/likes/?") + test = ( + ("https://www.artstation.com/mikf/likes", { + "pattern": r"https://\w+\.artstation\.com/p/assets" + r"/images/images/\d+/\d+/\d+/large/[^/]+", + "count": 6, + }), + # no likes + ("https://www.artstation.com/sungchoi/likes", { + "count": 0, + }), + ) + + def projects(self): + url = "{}/users/{}/likes.json".format(self.root, self.user) + return self._pagination(url) + + +class ArtstationChallengeExtractor(ArtstationExtractor): + """Extractor for submissions of artstation challenges""" + subcategory = "challenge" + filename_fmt = "{submission_id}_{asset_id}_{filename}.{extension}" + directory_fmt = ("{category}", "Challenges", + "{challenge[id]} - {challenge[title]}") + archive_fmt = "c_{challenge[id]}_{asset_id}" + pattern = (r"(?:https?://)?(?:www\.)?artstation\.com" + r"/contests/[^/?&#]+/challenges/(\d+)" + r"/?(?:\?sorting=([a-z]+))?") + test = ( + ("https://www.artstation.com/contests/thu-2017/challenges/20"), + (("https://www.artstation.com/contests/beyond-human" + "/challenges/23?sorting=winners"), { + "range": "1-30", + "count": 30, + }), + ) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.challenge_id = match.group(1) + self.sorting = match.group(2) or "popular" + + def items(self): + challenge_url = "{}/contests/_/challenges/{}.json".format( + self.root, self.challenge_id) + submission_url = 
"{}/contests/_/challenges/{}/submissions.json".format( + self.root, self.challenge_id) + update_url = "{}/contests/submission_updates.json".format( + self.root) + + challenge = self.request(challenge_url).json() + yield Message.Version, 1 + yield Message.Directory, {"challenge": challenge} + + params = {"sorting": self.sorting} + for submission in self._pagination(submission_url, params): + + params = {"submission_id": submission["id"]} + for update in self._pagination(update_url, params=params): + + del update["replies"] + update["challenge"] = challenge + for url in text.extract_iter( + update["body_presentation_html"], ' href="', '"'): + update["asset_id"] = self._id_from_url(url) + text.nameext_from_url(url, update) + yield Message.Url, self._no_cache(url), update + + @staticmethod + def _id_from_url(url): + """Get an image's submission ID from its URL""" + parts = url.split("/") + return text.parse_int("".join(parts[7:10])) + + +class ArtstationSearchExtractor(ArtstationExtractor): + """Extractor for artstation search results""" + subcategory = "search" + directory_fmt = ("{category}", "Searches", "{search[searchterm]}") + archive_fmt = "s_{search[searchterm]}_{asset[id]}" + pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com" + r"/search/?\?([^#]+)") + test = ("https://www.artstation.com/search?sorting=recent&q=ancient",) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + query = text.parse_query(match.group(1)) + self.searchterm = query.get("q", "") + self.order = query.get("sorting", "recent").lower() + + def metadata(self): + return {"search": { + "searchterm": self.searchterm, + "order": self.order, + }} + + def projects(self): + order = "likes_count" if self.order == "likes" else "published_at" + url = "{}/search/projects.json".format(self.root) + params = { + "direction": "desc", + "order": order, + "q": self.searchterm, + # "show_pro_first": "true", + } + return self._pagination(url, params) + + +class ArtstationArtworkExtractor(ArtstationExtractor): + """Extractor for projects on artstation's artwork page""" + subcategory = "artwork" + directory_fmt = ("{category}", "Artworks", "{artwork[sorting]!c}") + archive_fmt = "A_{asset[id]}" + pattern = (r"(?:https?://)?(?:\w+\.)?artstation\.com" + r"/artwork/?\?([^#]+)") + test = ("https://www.artstation.com/artwork?sorting=latest",) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.query = text.parse_query(match.group(1)) + + def metadata(self): + return {"artwork": self.query} + + def projects(self): + url = "{}/projects.json".format(self.root) + params = self.query.copy() + params["page"] = 1 + return self._pagination(url, params) + + +class ArtstationImageExtractor(ArtstationExtractor): + """Extractor for images from a single artstation project""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:" + r"(?:\w+\.)?artstation\.com/(?:artwork|projects|search)" + r"|artstn\.co/p)/(\w+)") + test = ( + ("https://www.artstation.com/artwork/LQVJr", { + "pattern": r"https?://\w+\.artstation\.com/p/assets" + r"/images/images/008/760/279/large/.+", + "content": "1f645ce7634e44675ebde8f6b634d36db0617d3c", + # SHA1 hash without _no_cache() + # "content": "2e8aaf6400aeff2345274f45e90b6ed3f2a0d946", + }), + # multiple images per project + ("https://www.artstation.com/artwork/Db3dy", { + "count": 4, + }), + # embedded youtube video + ("https://www.artstation.com/artwork/g4WPK", { + "range": "2", + "options": (("external", True),), + "pattern": 
"ytdl:https://www.youtube.com/embed/JNFfJtwwrU0", + }), + # alternate URL patterns + ("https://sungchoi.artstation.com/projects/LQVJr"), + ("https://artstn.co/p/LQVJr"), + ) + + def __init__(self, match): + ArtstationExtractor.__init__(self, match) + self.project_id = match.group(1) + self.assets = None + + def metadata(self): + self.assets = list(ArtstationExtractor.get_project_assets( + self, self.project_id)) + self.user = self.assets[0]["user"]["username"] + return ArtstationExtractor.metadata(self) + + def projects(self): + return ({"hash_id": self.project_id},) + + def get_project_assets(self, project_id): + return self.assets diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py new file mode 100644 index 0000000..111d560 --- /dev/null +++ b/gallery_dl/extractor/behance.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.behance.net/""" + +from .common import Extractor, Message +from .. import text +import json + + +class BehanceExtractor(Extractor): + """Base class for behance extractors""" + category = "behance" + root = "https://www.behance.net" + + def items(self): + yield Message.Version, 1 + for gallery in self.galleries(): + gallery["_extractor"] = BehanceGalleryExtractor + yield Message.Queue, gallery["url"], self._update(gallery) + + def galleries(self): + """Return all relevant gallery URLs""" + + @staticmethod + def _update(data): + # compress data to simple lists + data["fields"] = [field["name"] for field in data["fields"]] + data["owners"] = [owner["display_name"] for owner in data["owners"]] + if "tags" in data: + data["tags"] = [tag["title"] for tag in data["tags"]] + + # backwards compatibility + data["gallery_id"] = data["id"] + data["title"] = data["name"] + data["user"] = ", ".join(data["owners"]) + + return data + + +class BehanceGalleryExtractor(BehanceExtractor): + """Extractor for image galleries from www.behance.net""" + subcategory = "gallery" + directory_fmt = ("{category}", "{owners:J, }", "{id} {name}") + filename_fmt = "{category}_{id}_{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + pattern = r"(?:https?://)?(?:www\.)?behance\.net/gallery/(\d+)" + test = ( + ("https://www.behance.net/gallery/17386197/A-Short-Story", { + "count": 2, + "url": "ab79bd3bef8d3ae48e6ac74fd995c1dfaec1b7d2", + "keyword": { + "id": 17386197, + "name": 're:"Hi". 
A short story about the important things ', + "owners": ["Place Studio", "Julio César Velazquez"], + "fields": ["Animation", "Character Design", "Directing"], + "tags": list, + "module": dict, + }, + }), + ("https://www.behance.net/gallery/21324767/Nevada-City", { + "count": 6, + "url": "0258fe194fe7d828d6f2c7f6086a9a0a4140db1d", + "keyword": {"owners": ["Alex Strohl"]}, + }), + ) + + def __init__(self, match): + BehanceExtractor.__init__(self, match) + self.gallery_id = match.group(1) + + def items(self): + data = self.get_gallery_data() + imgs = self.get_images(data) + data["count"] = len(imgs) + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], (url, module) in enumerate(imgs, 1): + data["module"] = module + data["extension"] = text.ext_from_url(url) + yield Message.Url, url, data + + def get_gallery_data(self): + """Collect gallery info dict""" + url = "{}/gallery/{}/a".format(self.root, self.gallery_id) + cookies = { + "_evidon_consent_cookie": + '{"consent_date":"2019-01-31T09:41:15.132Z"}', + "bcp": "815b5eee-8bdf-4898-ac79-33c2bcc0ed19", + "gk_suid": "66981391", + "gki": '{"feature_project_view":false,' + '"feature_discover_login_prompt":false,' + '"feature_project_login_prompt":false}', + "ilo0": "true", + } + page = self.request(url, cookies=cookies).text + + data = json.loads(text.extract( + page, 'id="beconfig-store_state">', '</script>')[0]) + return self._update(data["project"]["project"]) + + @staticmethod + def get_images(data): + """Extract image results from an API response""" + results = [] + + for module in data["modules"]: + + if module["type"] == "image": + url = module["sizes"]["original"] + results.append((url, module)) + + elif module["type"] == "embed": + embed = module.get("original_embed") or module.get("embed") + url = "ytdl:" + text.extract(embed, 'src="', '"')[0] + results.append((url, module)) + + return results + + +class BehanceUserExtractor(BehanceExtractor): + """Extractor for a user's galleries from www.behance.net""" + subcategory = "user" + categorytransfer = True + pattern = r"(?:https?://)?(?:www\.)?behance\.net/([^/?&#]+)/?$" + test = ("https://www.behance.net/alexstrohl", { + "count": ">= 8", + "pattern": BehanceGalleryExtractor.pattern, + }) + + def __init__(self, match): + BehanceExtractor.__init__(self, match) + self.user = match.group(1) + + def galleries(self): + url = "{}/{}/projects".format(self.root, self.user) + headers = {"X-Requested-With": "XMLHttpRequest"} + params = {"offset": 0} + + while True: + data = self.request(url, headers=headers, params=params).json() + work = data["profile"]["activeSection"]["work"] + yield from work["projects"] + if not work["hasMore"]: + return + params["offset"] += len(work["projects"]) + + +class BehanceCollectionExtractor(BehanceExtractor): + """Extractor for a collection's galleries from www.behance.net""" + subcategory = "collection" + categorytransfer = True + pattern = r"(?:https?://)?(?:www\.)?behance\.net/collection/(\d+)" + test = ("https://www.behance.net/collection/170615607/Sky", { + "count": ">= 13", + "pattern": BehanceGalleryExtractor.pattern, + }) + + def __init__(self, match): + BehanceExtractor.__init__(self, match) + self.collection_id = match.group(1) + + def galleries(self): + url = "{}/collection/{}/a".format(self.root, self.collection_id) + headers = {"X-Requested-With": "XMLHttpRequest"} + params = {} + + while True: + data = self.request(url, headers=headers, params=params).json() + yield from data["output"] + if not data.get("offset"): + return + 
params["offset"] = data["offset"] diff --git a/gallery_dl/extractor/bobx.py b/gallery_dl/extractor/bobx.py new file mode 100644 index 0000000..67427a7 --- /dev/null +++ b/gallery_dl/extractor/bobx.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from http://www.bobx.com/dark/""" + +from .common import Extractor, Message +from .. import text + + +class BobxExtractor(Extractor): + """Base class for bobx extractors""" + category = "bobx" + root = "http://www.bobx.com" + per_page = 80 + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + + +class BobxGalleryExtractor(BobxExtractor): + """Extractor for individual image galleries on bobx.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{model}", "{title}") + filename_fmt = "{model}_{image_id}_{num:>03}.{extension}" + archive_fmt = "{image_id}" + pattern = (r"(?:https?://)?(?:www\.)?bobx\.com" + r"/([^/]+/[^/]+/photoset/[\w-]+)-\d+-\d+-\d+\.html") + test = ( + (("http://www.bobx.com/idol/mikoto-hibi" + "/photoset/wpb-2018-_11-0-2-8.html"), { + "url": "93972d6a661f6627e963d62c9d15531e6b36a389", + "keyword": "6c620862db494ed05e69356ba30e604b167b0670", + "content": "3f176b7fe752524cec21a763aa55567e41181e07", + }), + (("http://www.bobx.com/idol/nashiko-momotsuki" + "/photoset/wpb-net-_221---2018-08---magic-of-summer-0-10-10.html"), { + "url": "f5d6c0cd0881ae6f504c21a90d86e3464dc54e8e", + "keyword": "f4819c75f494044348889ecd27771508464c0f5f", + }), + ) + + def items(self): + num = 0 + while True: + url = "{}/{}-{}-10-8.html".format(self.root, self.path, num) + page = self.request(url, encoding="utf-8").text + + if num == 0: + data = self.metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + data["num"] = 0 + + for url in self.images(page): + url = text.urljoin(self.root, url.replace("-preview-", "-")) + data = text.nameext_from_url(url, data) + data["image_id"] = text.parse_int( + data["filename"].rpartition("-")[2]) + data["num"] += 1 + yield Message.Url, url, data + + num += self.per_page + if num >= data["count"]: + return + + @staticmethod + def metadata(page): + """Collect metadata for extractor-job""" + info = text.extract(page, "<title>", "</title>")[0] + model, _, info = info.partition(" in ") + info, _, count = info.rpartition(" of ") + title = info.rpartition(" - @")[0] + return { + "title": text.unquote(title), + "model": text.unquote(model), + "count": text.parse_int(count), + } + + @staticmethod + def images(page): + """Extract all image-urls""" + page = text.extract(page, "<table CELLPADDING=", "<script ")[0] + return text.extract_iter(page, '<img src="/thumbnail', '"') + + +class BobxIdolExtractor(BobxExtractor): + """Extractor for an idol's image galleries on bobx.com""" + subcategory = "idol" + pattern = r"(?:https?://)?(?:www\.)?bobx\.com/([^/]+/[^/?&#]+)/?$" + test = ("http://www.bobx.com/idol/rin-okabe/", { + "url": "74d80bfcd53b738b31909bb42e5cc97c41b475b8", + }) + + def items(self): + url = "{}/{}/".format(self.root, self.path) + data = {"_extractor": BobxGalleryExtractor} + page = self.request(url).text + skip = True + + yield Message.Version, 1 + for part in text.extract_iter(page, '="photoset/', '"'): + # skip every other entry + skip = not skip + if skip: + continue + yield Message.Queue, 
"{}photoset/{}".format(url, part), data diff --git a/gallery_dl/extractor/booru.py b/gallery_dl/extractor/booru.py new file mode 100644 index 0000000..c63085a --- /dev/null +++ b/gallery_dl/extractor/booru.py @@ -0,0 +1,265 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Base classes for extractors for danbooru and co""" + +from .common import Extractor, Message, SharedConfigMixin +from .. import text, exception +from xml.etree import ElementTree +import collections +import datetime +import operator +import re + + +class BooruExtractor(SharedConfigMixin, Extractor): + """Base class for all booru extractors""" + basecategory = "booru" + filename_fmt = "{category}_{id}_{md5}.{extension}" + api_url = "" + post_url = "" + per_page = 50 + page_start = 1 + page_limit = None + sort = False + + def __init__(self, match): + super().__init__(match) + self.params = {} + self.extags = self.post_url and self.config("tags", False) + + def skip(self, num): + pages = num // self.per_page + if self.page_limit and pages + self.page_start > self.page_limit: + pages = self.page_limit - self.page_start + self.page_start += pages + return pages * self.per_page + + def items(self): + data = self.get_metadata() + + yield Message.Version, 1 + yield Message.Directory, data + + self.reset_page() + while True: + images = self.parse_response( + self.request(self.api_url, params=self.params)) + + for image in images: + try: + url = image["file_url"] + except KeyError: + continue + if url.startswith("/"): + url = text.urljoin(self.api_url, url) + image.update(data) + if self.extags: + self.extended_tags(image) + yield Message.Url, url, text.nameext_from_url(url, image) + + if len(images) < self.per_page: + return + self.update_page(image) + + def reset_page(self): + """Initialize params to point to the first page""" + self.params["page"] = self.page_start + + def update_page(self, data): + """Update params to point to the next page""" + + def parse_response(self, response): + """Parse JSON API response""" + images = response.json() + if self.sort: + images.sort(key=operator.itemgetter("score", "id"), + reverse=True) + return images + + def get_metadata(self): + """Collect metadata for extractor-job""" + return {} + + def extended_tags(self, image, page=None): + """Retrieve extended tag information""" + if not page: + url = self.post_url.format(image["id"]) + page = self.request(url).text + tags = collections.defaultdict(list) + tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0] + pattern = re.compile(r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)", re.S) + for tag_type, tag_name in pattern.findall(tags_html or ""): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + image["tags_" + key] = " ".join(value) + + +class XmlParserMixin(): + """Mixin for XML based API responses""" + def parse_response(self, response): + root = ElementTree.fromstring(response.text) + return [post.attrib for post in root] + + +class DanbooruPageMixin(): + """Pagination for Danbooru v2""" + def update_page(self, data): + self.params["page"] = "b{}".format(data["id"]) + + +class MoebooruPageMixin(): + """Pagination for Moebooru and Danbooru v1""" + def update_page(self, data): + if self.page_limit: + self.params["page"] = None + self.params["before_id"] = data["id"] + else: + self.params["page"] += 1 
+ + +class GelbooruPageMixin(): + """Pagination for Gelbooru-like sites""" + page_start = 0 + + def reset_page(self): + self.params["pid"] = self.page_start + + def update_page(self, data): + self.params["pid"] += 1 + + +class TagMixin(): + """Extraction of images based on search-tags""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + + def __init__(self, match): + super().__init__(match) + self.tags = text.unquote(match.group("tags").replace("+", " ")) + self.params["tags"] = self.tags + self.params["limit"] = self.per_page + + def get_metadata(self): + return {"search_tags": self.tags} + + +class PoolMixin(): + """Extraction of image-pools""" + subcategory = "pool" + directory_fmt = ("{category}", "pool", "{pool}") + archive_fmt = "p_{pool}_{id}" + + def __init__(self, match): + super().__init__(match) + self.pool = match.group("pool") + self.params["tags"] = "pool:" + self.pool + self.params["limit"] = self.per_page + + def get_metadata(self): + return {"pool": text.parse_int(self.pool)} + + +class GelbooruPoolMixin(PoolMixin): + """Image-pool extraction for Gelbooru-like sites""" + per_page = 1 + + def get_metadata(self): + page = self.request(self.pool_url.format(self.pool)).text + name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>") + if not name: + name, pos = text.extract(page, "<h4>Pool: ", "</h4>") + if not name: + raise exception.NotFoundError("pool") + self.posts = list(text.extract_iter(page, 'id="p', '"', pos)) + + return { + "pool": text.parse_int(self.pool), + "pool_name": text.unescape(name), + "count": len(self.posts), + } + + def reset_page(self): + self.index = self.page_start + self.update_page(None) + + def update_page(self, data): + try: + post = self.posts[self.index] + self.index += 1 + except IndexError: + post = "0" + self.params["tags"] = "id:" + post + + +class PostMixin(): + """Extraction of a single image-post""" + subcategory = "post" + archive_fmt = "{id}" + + def __init__(self, match): + super().__init__(match) + self.post = match.group("post") + self.params["tags"] = "id:" + self.post + + +class PopularMixin(): + """Extraction and metadata handling for Danbooru v2""" + subcategory = "popular" + directory_fmt = ("{category}", "popular", "{scale}", "{date}") + archive_fmt = "P_{scale[0]}_{date}_{id}" + page_start = None + sort = True + + def __init__(self, match): + super().__init__(match) + self.params.update(text.parse_query(match.group("query"))) + + def get_metadata(self, fmt="%Y-%m-%d"): + date = self.get_date() or datetime.datetime.utcnow().strftime(fmt) + scale = self.get_scale() or "day" + + if scale == "week": + dt = datetime.datetime.strptime(date, fmt) + dt -= datetime.timedelta(days=dt.weekday()) + date = dt.strftime(fmt) + elif scale == "month": + date = date[:-3] + + return {"date": date, "scale": scale} + + def get_scale(self): + if "scale" in self.params: + return self.params["scale"] + return None + + def get_date(self): + if "date" in self.params: + return self.params["date"][:10] + return None + + +class MoebooruPopularMixin(PopularMixin): + """Extraction and metadata handling for Moebooru and Danbooru v1""" + def __init__(self, match): + super().__init__(match) + self.scale = match.group("scale") + + def get_date(self): + if "year" in self.params: + return "{:>04}-{:>02}-{:>02}".format( + self.params["year"], + self.params.get("month", "01"), + self.params.get("day", "01")) + return None + + def get_scale(self): + if self.scale and self.scale.startswith("by_"): + 
return self.scale[3:] + return self.scale diff --git a/gallery_dl/extractor/chan.py b/gallery_dl/extractor/chan.py new file mode 100644 index 0000000..5e44fd9 --- /dev/null +++ b/gallery_dl/extractor/chan.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Base classes for extractors for different Futaba Channel-like boards""" + +from .common import Extractor, Message +from .. import text + + +class ChanThreadExtractor(Extractor): + """Base class for extractors for Futaba Channel-like boards""" + category = "chan" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} - {title}") + filename_fmt = "{tim}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + api_url = "" + file_url = "" + + def __init__(self, match): + Extractor.__init__(self, match) + self.metadata = { + "board": match.group(1), + "thread": match.group(2), + } + + def items(self): + yield Message.Version, 1 + url = self.api_url.format_map(self.metadata) + posts = self.request(url).json()["posts"] + self.metadata["title"] = self.get_thread_title(posts[0]) + yield Message.Directory, self.metadata + for post in posts: + if "filename" not in post: + continue + self.update(post) + yield Message.Url, self.build_url(post), post + if "extra_files" in post: + for file in post["extra_files"]: + self.update(post, file) + yield Message.Url, self.build_url(post), post + + def update(self, post, data=None): + """Update keyword dictionary""" + post.update(data or self.metadata) + post["extension"] = post["ext"][1:] + + def build_url(self, post): + """Construct an image url out of a post object""" + return self.file_url.format_map(post) + + @staticmethod + def get_thread_title(post): + """Return thread title from first post""" + title = post["sub"] if "sub" in post else text.remove_html(post["com"]) + return text.unescape(title)[:50] diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py new file mode 100644 index 0000000..175af63 --- /dev/null +++ b/gallery_dl/extractor/common.py @@ -0,0 +1,432 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Common classes and constants used by extractor modules.""" + +import re +import time +import netrc +import queue +import logging +import requests +import threading +import http.cookiejar +from .message import Message +from .. 
import config, text, exception, cloudflare + + +class Extractor(): + + category = "" + subcategory = "" + categorytransfer = False + directory_fmt = ("{category}",) + filename_fmt = "{filename}.{extension}" + archive_fmt = "" + cookiedomain = "" + root = "" + test = None + + def __init__(self, match): + self.session = requests.Session() + self.log = logging.getLogger(self.category) + self.url = match.string + self._init_headers() + self._init_cookies() + self._init_proxies() + self._retries = self.config("retries", 5) + self._timeout = self.config("timeout", 30) + self._verify = self.config("verify", True) + + @classmethod + def from_url(cls, url): + if isinstance(cls.pattern, str): + cls.pattern = re.compile(cls.pattern) + match = cls.pattern.match(url) + return cls(match) if match else None + + def __iter__(self): + return self.items() + + def items(self): + yield Message.Version, 1 + + def skip(self, num): + return 0 + + def config(self, key, default=None): + return config.interpolate( + ("extractor", self.category, self.subcategory, key), default) + + def request(self, url, method="GET", *, session=None, + encoding=None, expect=(), retries=None, **kwargs): + tries = 0 + retries = retries or self._retries + session = session or self.session + kwargs.setdefault("timeout", self._timeout) + kwargs.setdefault("verify", self._verify) + + while True: + try: + response = session.request(method, url, **kwargs) + except (requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + requests.exceptions.ChunkedEncodingError, + requests.exceptions.ContentDecodingError) as exc: + msg = exc + except (requests.exceptions.RequestException) as exc: + raise exception.HttpError(exc) + else: + code = response.status_code + if 200 <= code < 400 or code in expect: + if encoding: + response.encoding = encoding + return response + if cloudflare.is_challenge(response): + self.log.info("Solving Cloudflare challenge") + url, domain, cookies = cloudflare.solve_challenge( + session, response, kwargs) + cloudflare.cookies.update(self.category, (domain, cookies)) + continue + + msg = "{}: {} for url: {}".format(code, response.reason, url) + if code < 500 and code != 429: + break + + tries += 1 + self.log.debug("%s (%d/%d)", msg, tries, retries) + if tries >= retries: + break + time.sleep(2 ** tries) + + raise exception.HttpError(msg) + + def _get_auth_info(self): + """Return authentication information as (username, password) tuple""" + username = self.config("username") + password = None + + if username: + password = self.config("password") + elif self.config("netrc", False): + try: + info = netrc.netrc().authenticators(self.category) + username, _, password = info + except (OSError, netrc.NetrcParseError) as exc: + self.log.error("netrc: %s", exc) + except TypeError: + self.log.warning("netrc: No authentication info") + + return username, password + + def _init_headers(self): + """Set additional headers for the 'session' object""" + headers = self.session.headers + headers.clear() + + headers["User-Agent"] = self.config( + "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:62.0) " + "Gecko/20100101 Firefox/62.0")) + headers["Accept"] = "*/*" + headers["Accept-Language"] = "en-US,en;q=0.5" + headers["Accept-Encoding"] = "gzip, deflate" + headers["Connection"] = "keep-alive" + headers["Upgrade-Insecure-Requests"] = "1" + + def _init_proxies(self): + """Update the session's proxy map""" + proxies = self.config("proxy") + if proxies: + if isinstance(proxies, str): + proxies = {"http": proxies, "https": proxies} 
+ if isinstance(proxies, dict): + for scheme, proxy in proxies.items(): + if "://" not in proxy: + proxies[scheme] = "http://" + proxy.lstrip("/") + self.session.proxies = proxies + else: + self.log.warning("invalid proxy specifier: %s", proxies) + + def _init_cookies(self): + """Populate the session's cookiejar""" + cookies = self.config("cookies") + if cookies: + if isinstance(cookies, dict): + self._update_cookies_dict(cookies, self.cookiedomain) + else: + cookiejar = http.cookiejar.MozillaCookieJar() + try: + cookiejar.load(cookies) + except OSError as exc: + self.log.warning("cookies: %s", exc) + else: + self.session.cookies.update(cookiejar) + + cookies = cloudflare.cookies(self.category) + if cookies: + domain, cookies = cookies + self._update_cookies_dict(cookies, domain) + + def _update_cookies(self, cookies, *, domain=""): + """Update the session's cookiejar with 'cookies'""" + if isinstance(cookies, dict): + self._update_cookies_dict(cookies, domain or self.cookiedomain) + else: + setcookie = self.session.cookies.set_cookie + try: + cookies = iter(cookies) + except TypeError: + setcookie(cookies) + else: + for cookie in cookies: + setcookie(cookie) + + def _update_cookies_dict(self, cookiedict, domain): + """Update cookiejar with name-value pairs from a dict""" + setcookie = self.session.cookies.set + for name, value in cookiedict.items(): + setcookie(name, value, domain=domain) + + def _check_cookies(self, cookienames, *, domain=""): + """Check if all 'cookienames' are in the session's cookiejar""" + if not domain: + domain = self.cookiedomain + try: + for name in cookienames: + self.session.cookies._find(name, domain) + except KeyError: + return False + return True + + @classmethod + def _get_tests(cls): + """Yield an extractor's test cases as (URL, RESULTS) tuples""" + tests = cls.test + if not tests: + return + + if len(tests) == 2 and (not tests[1] or isinstance(tests[1], dict)): + tests = (tests,) + + for test in tests: + if isinstance(test, str): + test = (test, None) + yield test + + +class ChapterExtractor(Extractor): + + subcategory = "chapter" + directory_fmt = ( + "{category}", "{manga}", + "{volume:?v/ />02}c{chapter:>03}{chapter_minor:?//}{title:?: //}") + filename_fmt = ( + "{manga}_c{chapter:>03}{chapter_minor:?//}_{page:>03}.{extension}") + archive_fmt = ( + "{manga}_{chapter}{chapter_minor}_{page}") + + def __init__(self, match, url=None): + Extractor.__init__(self, match) + self.chapter_url = url or self.root + match.group(1) + + def items(self): + self.login() + page = self.request(self.chapter_url).text + data = self.metadata(page) + imgs = self.images(page) + + if "count" in data: + images = zip( + range(1, data["count"]+1), + imgs, + ) + else: + try: + data["count"] = len(imgs) + except TypeError: + pass + images = enumerate(imgs, 1) + + yield Message.Version, 1 + yield Message.Directory, data + for data["page"], (url, imgdata) in images: + if imgdata: + data.update(imgdata) + yield Message.Url, url, text.nameext_from_url(url, data) + + def login(self): + """Login and set necessary cookies""" + + def metadata(self, page): + """Return a dict with general metadata""" + + def images(self, page): + """Return a list of all (image-url, metadata)-tuples""" + + +class MangaExtractor(Extractor): + + subcategory = "manga" + categorytransfer = True + chapterclass = None + reverse = True + + def __init__(self, match, url=None): + Extractor.__init__(self, match) + self.manga_url = url or self.root + match.group(1) + + if self.config("chapter-reverse", False): + 
self.reverse = not self.reverse + + def items(self): + self.login() + page = self.request(self.manga_url).text + + chapters = self.chapters(page) + if self.reverse: + chapters.reverse() + + yield Message.Version, 1 + for chapter, data in chapters: + data["_extractor"] = self.chapterclass + yield Message.Queue, chapter, data + + def login(self): + """Login and set necessary cookies""" + + def chapters(self, page): + """Return a list of all (chapter-url, metadata)-tuples""" + + +class GalleryExtractor(ChapterExtractor): + + subcategory = "gallery" + filename_fmt = "{category}_{gallery_id}_{page:>03}.{extension}" + directory_fmt = ("{category}", "{gallery_id} {title}") + archive_fmt = "{gallery_id}_{page}" + + +class AsynchronousMixin(): + """Run info extraction in a separate thread""" + + def __iter__(self): + messages = queue.Queue(5) + thread = threading.Thread( + target=self.async_items, + args=(messages,), + daemon=True, + ) + + thread.start() + while True: + msg = messages.get() + if msg is None: + thread.join() + return + if isinstance(msg, Exception): + thread.join() + raise msg + yield msg + messages.task_done() + + def async_items(self, messages): + try: + for msg in self.items(): + messages.put(msg) + except Exception as exc: + messages.put(exc) + messages.put(None) + + +class SharedConfigMixin(): + """Enable sharing of config settings based on 'basecategory'""" + basecategory = "" + + def config(self, key, default=None, *, sentinel=object()): + value = Extractor.config(self, key, sentinel) + if value is sentinel: + cat, self.category = self.category, self.basecategory + value = Extractor.config(self, key, default) + self.category = cat + return value + + +def generate_extractors(extractor_data, symtable, classes): + """Dynamically generate Extractor classes""" + extractors = config.get(("extractor", classes[0].basecategory)) + ckey = extractor_data.get("_ckey") + prev = None + + if extractors: + extractor_data.update(extractors) + + for category, info in extractor_data.items(): + + if not isinstance(info, dict): + continue + + root = info["root"] + domain = root[root.index(":") + 3:] + pattern = info.get("pattern") or re.escape(domain) + name = (info.get("name") or category).capitalize() + + for cls in classes: + + class Extr(cls): + pass + Extr.__module__ = cls.__module__ + Extr.__name__ = Extr.__qualname__ = \ + name + cls.subcategory.capitalize() + "Extractor" + Extr.__doc__ = \ + "Extractor for " + cls.subcategory + "s from " + domain + Extr.category = category + Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt + Extr.test = info.get("test-" + cls.subcategory) + Extr.root = root + + if "extra" in info: + for key, value in info["extra"].items(): + setattr(Extr, key, value) + if prev and ckey: + setattr(Extr, ckey, prev) + + symtable[Extr.__name__] = prev = Extr + + +# Reduce strictness of the expected magic string in cookiejar files. +# (This allows the use of Wget-generated cookiejars without modification) +http.cookiejar.MozillaCookieJar.magic_re = re.compile( + "#( Netscape)? 
HTTP Cookie File", re.IGNORECASE) + +# Update default cipher list of urllib3 +# to fix issues with Cloudflare and, by extension, Artstation (#227) +from requests.packages.urllib3.util import ssl_ # noqa +logging.getLogger("gallery-dl").debug("updating default urllib3 ciphers") + +# cipher list taken from urllib3 1.25 +# https://github.com/urllib3/urllib3/blob/1.25/src/urllib3/util/ssl_.py +# with additions from +# https://github.com/Anorov/cloudflare-scrape/pull/242 +ssl_.DEFAULT_CIPHERS = ( + "ECDHE+AESGCM:" + "ECDHE+CHACHA20:" + "DHE+AESGCM:" + "DHE+CHACHA20:" + "ECDH+AESGCM:" + "DH+AESGCM:" + "ECDH+AES:" + "DH+AES:" + "RSA+AESGCM:" + "RSA+AES:" + "!ECDHE+SHA:" + "!AES128-SHA:" + "!aNULL:" + "!eNULL:" + "!MD5:" + "!DSS" +) diff --git a/gallery_dl/extractor/danbooru.py b/gallery_dl/extractor/danbooru.py new file mode 100644 index 0000000..211c340 --- /dev/null +++ b/gallery_dl/extractor/danbooru.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://danbooru.donmai.us/""" + +from . import booru + + +BASE_PATTERN = ( + r"(?:https?://)?" + r"(?P<subdomain>danbooru|hijiribe|sonohara|safebooru)" + r"\.donmai\.us") + + +class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor): + """Base class for danbooru extractors""" + category = "danbooru" + page_limit = 1000 + + def __init__(self, match): + super().__init__(match) + self.subdomain = match.group("subdomain") + self.scheme = "https" if self.subdomain == "danbooru" else "http" + self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format( + scheme=self.scheme, subdomain=self.subdomain) + + username, api_key = self._get_auth_info() + if username: + self.log.debug("Using HTTP Basic Auth for user '%s'", username) + self.session.auth = (username, api_key) + + +class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor): + """Extractor for images from danbooru based on search-tags""" + pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)" + test = ( + ("https://danbooru.donmai.us/posts?tags=bonocho", { + "content": "b196fb9f1668109d7774a0a82efea3ffdda07746", + }), + # test page transitions + ("https://danbooru.donmai.us/posts?tags=canvas_%28cocktail_soft%29", { + "count": ">= 50", + }), + ("https://hijiribe.donmai.us/posts?tags=bonocho"), + ("https://sonohara.donmai.us/posts?tags=bonocho"), + ("https://safebooru.donmai.us/posts?tags=bonocho"), + ) + + +class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor): + """Extractor for image-pools from danbooru""" + pattern = BASE_PATTERN + r"/pools/(?P<pool>\d+)" + test = ("https://danbooru.donmai.us/pools/7659", { + "content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99", + }) + + +class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor): + """Extractor for single images from danbooru""" + pattern = BASE_PATTERN + r"/posts/(?P<post>\d+)" + test = ("https://danbooru.donmai.us/posts/294929", { + "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + }) + + +class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor): + """Extractor for popular images from danbooru""" + pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?" 
+ test = ( + ("https://danbooru.donmai.us/explore/posts/popular"), + (("https://danbooru.donmai.us/explore/posts/popular" + "?date=2013-06-06+03%3A34%3A22+-0400&scale=week"), { + "count": ">= 1", + }), + ) + + def __init__(self, match): + super().__init__(match) + urlfmt = "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json" + self.api_url = urlfmt.format( + scheme=self.scheme, subdomain=self.subdomain) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py new file mode 100644 index 0000000..ebab040 --- /dev/null +++ b/gallery_dl/extractor/deviantart.py @@ -0,0 +1,992 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.deviantart.com/""" + +from .common import Extractor, Message +from .. import text, exception +from ..cache import cache, memcache +import collections +import itertools +import mimetypes +import math +import time +import re + + +BASE_PATTERN = ( + r"(?:https?://)?(?:" + r"(?:www\.)?deviantart\.com/([\w-]+)|" + r"(?!www\.)([\w-]+)\.deviantart\.com)" +) + + +class DeviantartExtractor(Extractor): + """Base class for deviantart extractors""" + category = "deviantart" + directory_fmt = ("{category}", "{author[username]!l}") + filename_fmt = "{category}_{index}_{title}.{extension}" + root = "https://www.deviantart.com" + + def __init__(self, match=None): + Extractor.__init__(self, match) + self.offset = 0 + self.flat = self.config("flat", True) + self.extra = self.config("extra", False) + self.original = self.config("original", True) + self.user = match.group(1) or match.group(2) + self.group = False + self.api = DeviantartAPI(self) + + if self.original != "image": + self._update_content = self._update_content_default + else: + self._update_content = self._update_content_image + self.original = True + + self.commit_journal = { + "html": self._commit_journal_html, + "text": self._commit_journal_text, + }.get(self.config("journals", "html")) + + def skip(self, num): + self.offset += num + return num + + def items(self): + if self.user: + self.group = not self.api.user_profile(self.user) + if self.group: + self.subcategory = "group-" + self.subcategory + + yield Message.Version, 1 + for deviation in self.deviations(): + if isinstance(deviation, tuple): + url, data = deviation + yield Message.Queue, url, data + continue + + self.prepare(deviation) + yield Message.Directory, deviation + + if "content" in deviation: + content = deviation["content"] + + if self.original and deviation["is_downloadable"] and \ + text.ext_from_url(content["src"]) != "gif": + self._update_content(deviation, content) + + if deviation["index"] <= 790677560 and \ + content["src"].startswith("https://images-wixmp-"): + # https://github.com/r888888888/danbooru/issues/4069 + content["src"] = re.sub( + r"(/f/[^/]+/[^/]+)/v\d+/.*", + r"/intermediary\1", content["src"]) + + yield self.commit(deviation, content) + + elif deviation["is_downloadable"]: + content = self.api.deviation_download(deviation["deviationid"]) + yield self.commit(deviation, content) + + if "videos" in deviation: + video = max(deviation["videos"], + key=lambda x: text.parse_int(x["quality"][:-1])) + yield self.commit(deviation, video) + + if "flash" in deviation: + yield self.commit(deviation, deviation["flash"]) + + if "excerpt" in deviation and self.commit_journal: + 
journal = self.api.deviation_content(deviation["deviationid"]) + yield self.commit_journal(deviation, journal) + + if self.extra: + for match in DeviantartStashExtractor.pattern.finditer( + deviation.get("description", "")): + deviation["_extractor"] = DeviantartStashExtractor + yield Message.Queue, match.group(0), deviation + + def deviations(self): + """Return an iterable containing all relevant Deviation-objects""" + + def prepare(self, deviation): + """Adjust the contents of a Deviation-object""" + try: + deviation["index"] = text.parse_int( + deviation["url"].rpartition("-")[2]) + except KeyError: + deviation["index"] = 0 + if self.user: + deviation["username"] = self.user + deviation["da_category"] = deviation["category"] + deviation["published_time"] = text.parse_int( + deviation["published_time"]) + deviation["date"] = text.parse_timestamp( + deviation["published_time"]) + + @staticmethod + def commit(deviation, target): + url = target["src"] + deviation["target"] = text.nameext_from_url(url, target.copy()) + deviation["extension"] = deviation["target"]["extension"] + return Message.Url, url, deviation + + def _commit_journal_html(self, deviation, journal): + title = text.escape(deviation["title"]) + url = deviation["url"] + thumbs = deviation["thumbs"] + html = journal["html"] + shadow = SHADOW_TEMPLATE.format_map(thumbs[0]) if thumbs else "" + + if "css" in journal: + css, cls = journal["css"], "withskin" + else: + css, cls = "", "journal-green" + + if html.find('<div class="boxtop journaltop">', 0, 250) != -1: + needle = '<div class="boxtop journaltop">' + header = HEADER_CUSTOM_TEMPLATE.format( + title=title, url=url, date=deviation["date"], + ) + else: + needle = '<div usr class="gr">' + catlist = deviation["category_path"].split("/") + categories = " / ".join( + ('<span class="crumb"><a href="{}/{}/"><span>{}</span></a>' + '</span>').format(self.root, cpath, cat.capitalize()) + for cat, cpath in zip( + catlist, + itertools.accumulate(catlist, lambda t, c: t + "/" + c) + ) + ) + username = deviation["author"]["username"] + urlname = deviation.get("username") or username.lower() + header = HEADER_TEMPLATE.format( + title=title, + url=url, + userurl="{}/{}/".format(self.root, urlname), + username=username, + date=deviation["date"], + categories=categories, + ) + + html = JOURNAL_TEMPLATE_HTML.format( + title=title, + html=html.replace(needle, header, 1), + shadow=shadow, + css=css, + cls=cls, + ) + + deviation["extension"] = "htm" + return Message.Url, html, deviation + + @staticmethod + def _commit_journal_text(deviation, journal): + content = "\n".join( + text.unescape(text.remove_html(txt)) + for txt in journal["html"].rpartition("<script")[0].split("<br />") + ) + txt = JOURNAL_TEMPLATE_TEXT.format( + title=deviation["title"], + username=deviation["author"]["username"], + date=deviation["date"], + content=content, + ) + + deviation["extension"] = "txt" + return Message.Url, txt, deviation + + @staticmethod + def _find_folder(folders, name): + pattern = re.compile( + r"[^\w]*" + name.replace("-", r"[^\w]+") + r"[^\w]*$") + for folder in folders: + if pattern.match(folder["name"]): + return folder + raise exception.NotFoundError("folder") + + def _folder_urls(self, folders, category): + url = "{}/{}/{}/0/".format(self.root, self.user, category) + return [(url + folder["name"], folder) for folder in folders] + + def _update_content_default(self, deviation, content): + content.update(self.api.deviation_download(deviation["deviationid"])) + + def _update_content_image(self, 
deviation, content): + data = self.api.deviation_download(deviation["deviationid"]) + url = data["src"].partition("?")[0] + mtype = mimetypes.guess_type(url, False)[0] + if mtype and mtype.startswith("image/"): + content.update(data) + + def _html_request(self, url, **kwargs): + cookies = {"userinfo": ( + '__167217c8e6aac1a3331f;{"username":"","uniqueid":"ab2e8b184471bf0' + 'e3f8ed3ee7a3220aa","vd":"Bc7vEx,BdC7Fy,A,J,A,,B,A,B,BdC7Fy,BdC7XU' + ',J,J,A,BdC7XU,13,A,B,A,,A,A,B,A,A,,A","attr":56}' + )} + return self.request(url, cookies=cookies, **kwargs) + + +class DeviantartGalleryExtractor(DeviantartExtractor): + """Extractor for all deviations from an artist's gallery""" + subcategory = "gallery" + archive_fmt = "g_{username}_{index}.{extension}" + pattern = BASE_PATTERN + r"(?:/(?:gallery/?(?:\?catpath=/)?)?)?$" + test = ( + ("https://www.deviantart.com/shimoda7/gallery/", { + "pattern": r"https://(s3.amazonaws.com/origin-(img|orig)" + r".deviantart.net/|images-wixmp-\w+.wixmp.com/)", + "count": ">= 30", + "keyword": { + "allows_comments": bool, + "author": { + "type": "regular", + "usericon": str, + "userid": "9AE51FC7-0278-806C-3FFF-F4961ABF9E2B", + "username": "shimoda7", + }, + "category_path": str, + "content": { + "filesize": int, + "height": int, + "src": str, + "transparency": bool, + "width": int, + }, + "da_category": str, + "date": "type:datetime", + "deviationid": str, + "?download_filesize": int, + "extension": str, + "index": int, + "is_deleted": bool, + "is_downloadable": bool, + "is_favourited": bool, + "is_mature": bool, + "preview": { + "height": int, + "src": str, + "transparency": bool, + "width": int, + }, + "published_time": int, + "stats": { + "comments": int, + "favourites": int, + }, + "target": dict, + "thumbs": list, + "title": str, + "url": r"re:https://www.deviantart.com/shimoda7/art/[^/]+-\d+", + "username": "shimoda7", + }, + }), + # group + ("https://www.deviantart.com/yakuzafc", { + "pattern": r"https://www.deviantart.com/yakuzafc/gallery/0/", + "count": ">= 15", + }), + # 'folders' option (#276) + ("https://www.deviantart.com/justatest235723", { + "count": 2, + "options": (("metadata", 1), ("folders", 1), ("original", 0)), + "keyword": { + "description": str, + "folders": list, + "is_watching": bool, + "license": str, + "tags": list, + }, + }), + ("https://www.deviantart.com/shimoda8/gallery/", { + "exception": exception.NotFoundError, + }), + # old-style URLs + ("https://www.deviantart.com/shimoda7/gallery/?catpath=/"), + ("https://shimoda7.deviantart.com/gallery/"), + ("https://yakuzafc.deviantart.com/"), + ("https://shimoda7.deviantart.com/gallery/?catpath=/"), + ) + + def deviations(self): + if self.flat and not self.group: + return self.api.gallery_all(self.user, self.offset) + folders = self.api.gallery_folders(self.user) + return self._folder_urls(folders, "gallery") + + +class DeviantartFolderExtractor(DeviantartExtractor): + """Extractor for deviations inside an artist's gallery folder""" + subcategory = "folder" + directory_fmt = ("{category}", "{folder[owner]}", "{folder[title]}") + archive_fmt = "F_{folder[uuid]}_{index}.{extension}" + pattern = BASE_PATTERN + r"/gallery/(\d+)/([^/?&#]+)" + test = ( + # user + ("https://www.deviantart.com/shimoda7/gallery/722019/Miscellaneous", { + "count": 5, + "options": (("original", False),), + }), + # group + ("https://www.deviantart.com/yakuzafc/gallery/37412168/Crafts", { + "count": ">= 4", + "options": (("original", False),), + }), + ("https://shimoda7.deviantart.com/gallery/722019/Miscellaneous"), + 
("https://yakuzafc.deviantart.com/gallery/37412168/Crafts"), + ) + + def __init__(self, match): + DeviantartExtractor.__init__(self, match) + self.fname = match.group(4) + self.folder = {"owner": self.user, "index": match.group(3)} + + def deviations(self): + folders = self.api.gallery_folders(self.user) + folder = self._find_folder(folders, self.fname) + self.folder["title"] = folder["name"] + self.folder["uuid"] = folder["folderid"] + return self.api.gallery(self.user, folder["folderid"], self.offset) + + def prepare(self, deviation): + DeviantartExtractor.prepare(self, deviation) + deviation["folder"] = self.folder + + +class DeviantartDeviationExtractor(DeviantartExtractor): + """Extractor for single deviations""" + subcategory = "deviation" + archive_fmt = "{index}.{extension}" + pattern = BASE_PATTERN + r"/((?:art|journal)/[^/?&#]+-\d+)" + test = ( + (("https://www.deviantart.com/shimoda7/art/" + "For-the-sake-of-a-memory-10073852"), { + "options": (("original", 0),), + "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e", + }), + ("https://www.deviantart.com/zzz/art/zzz-1234567890", { + "exception": exception.NotFoundError, + }), + (("https://www.deviantart.com/myria-moon/art/" + "Aime-Moi-part-en-vadrouille-261986576"), { + "pattern": (r"https?://s3\.amazonaws\.com/origin-orig\." + r"deviantart\.net/a383/f/2013/135/e/7/[^.]+\.jpg\?"), + }), + # wixmp URL rewrite + (("https://www.deviantart.com/citizenfresh/art/" + "Hverarond-14-the-beauty-of-the-earth-789295466"), { + "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" + r"/intermediary/f/[^/]+/[^.]+\.jpg$") + }), + # non-download URL for GIFs (#242) + (("https://www.deviantart.com/skatergators/art/" + "COM-Monique-Model-781571783"), { + "pattern": (r"https://images-wixmp-\w+\.wixmp\.com" + r"/f/[^/]+/[^.]+\.gif\?token="), + }), + # external URLs from description (#302) + (("https://www.deviantart.com/uotapo/art/" + "INANAKI-Memorial-Humane7-590297498"), { + "options": (("extra", 1), ("original", 0)), + "pattern": r"https?://sta\.sh/\w+$", + "range": "2-", + "count": 4, + }), + # old-style URLs + ("https://shimoda7.deviantart.com" + "/art/For-the-sake-of-a-memory-10073852"), + ("https://myria-moon.deviantart.com" + "/art/Aime-Moi-part-en-vadrouille-261986576"), + ("https://zzz.deviantart.com/art/zzz-1234567890"), + ) + + skip = Extractor.skip + + def __init__(self, match): + DeviantartExtractor.__init__(self, match) + self.path = match.group(3) + + def deviations(self): + url = "{}/{}/{}".format(self.root, self.user, self.path) + response = self._html_request(url, expect=range(400, 500)) + deviation_id = text.extract(response.text, '//deviation/', '"')[0] + if response.status_code >= 400 or not deviation_id: + raise exception.NotFoundError("image") + return (self.api.deviation(deviation_id),) + + +class DeviantartStashExtractor(DeviantartExtractor): + """Extractor for sta.sh-ed deviations""" + subcategory = "stash" + archive_fmt = "{index}.{extension}" + pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)" + test = ( + ("https://sta.sh/022c83odnaxc", { + "pattern": r"https://s3.amazonaws.com/origin-orig.deviantart.net", + "count": 1, + }), + # multiple stash items + ("https://sta.sh/21jf51j7pzl2", { + "pattern": pattern, + "count": 4, + }), + # downloadable, but no "content" field (#307) + ("https://sta.sh/024t4coz16mi", { + "count": 1, + }), + ("https://sta.sh/abcdefghijkl", { + "exception": exception.HttpError, + }), + ) + + skip = Extractor.skip + + def __init__(self, match): + DeviantartExtractor.__init__(self, match) + self.user = 
None + self.stash_id = match.group(1) + + def deviations(self): + url = "https://sta.sh/" + self.stash_id + page = self.request(url).text + deviation_id = text.extract(page, '//deviation/', '"')[0] + + if deviation_id: + yield self.api.deviation(deviation_id) + else: + data = {"_extractor": DeviantartStashExtractor} + page = text.extract( + page, '<div id="stash-body"', '<div class="footer"')[0] + for url in text.extract_iter(page, '<a href="', '"'): + yield url, data + + +class DeviantartFavoriteExtractor(DeviantartExtractor): + """Extractor for an artist's favorites""" + subcategory = "favorite" + directory_fmt = ("{category}", "{username}", "Favourites") + archive_fmt = "f_{username}_{index}.{extension}" + pattern = BASE_PATTERN + r"/favourites/?(?:\?catpath=/)?$" + test = ( + ("https://www.deviantart.com/h3813067/favourites/", { + "options": (("metadata", True), ("flat", False)), # issue #271 + "count": 1, + }), + ("https://www.deviantart.com/h3813067/favourites/", { + "content": "6a7c74dc823ebbd457bdd9b3c2838a6ee728091e", + }), + ("https://www.deviantart.com/h3813067/favourites/?catpath=/"), + ("https://h3813067.deviantart.com/favourites/"), + ("https://h3813067.deviantart.com/favourites/?catpath=/"), + ) + + def deviations(self): + folders = self.api.collections_folders(self.user) + if self.flat: + return itertools.chain.from_iterable( + self.api.collections(self.user, folder["folderid"]) + for folder in folders + ) + return self._folder_urls(folders, "favourites") + + +class DeviantartCollectionExtractor(DeviantartExtractor): + """Extractor for a single favorite collection""" + subcategory = "collection" + directory_fmt = ("{category}", "{collection[owner]}", + "Favourites", "{collection[title]}") + archive_fmt = "C_{collection[uuid]}_{index}.{extension}" + pattern = BASE_PATTERN + r"/favourites/(\d+)/([^/?&#]+)" + test = ( + (("https://www.deviantart.com/pencilshadings" + "/favourites/70595441/3D-Favorites"), { + "count": ">= 20", + "options": (("original", False),), + }), + ("https://pencilshadings.deviantart.com" + "/favourites/70595441/3D-Favorites"), + ) + + def __init__(self, match): + DeviantartExtractor.__init__(self, match) + _, _, cid, self.cname = match.groups() + self.collection = {"owner": self.user, "index": cid} + + def deviations(self): + folders = self.api.collections_folders(self.user) + folder = self._find_folder(folders, self.cname) + self.collection["title"] = folder["name"] + self.collection["uuid"] = folder["folderid"] + return self.api.collections(self.user, folder["folderid"], self.offset) + + def prepare(self, deviation): + DeviantartExtractor.prepare(self, deviation) + deviation["collection"] = self.collection + + +class DeviantartJournalExtractor(DeviantartExtractor): + """Extractor for an artist's journals""" + subcategory = "journal" + directory_fmt = ("{category}", "{username}", "Journal") + archive_fmt = "j_{username}_{index}.{extension}" + pattern = BASE_PATTERN + r"/(?:journal|blog)/?(?:\?catpath=/)?$" + test = ( + ("https://www.deviantart.com/angrywhitewanker/journal/", { + "url": "38db2a0d3a587a7e0f9dba7ff7d274610ebefe44", + }), + ("https://www.deviantart.com/angrywhitewanker/journal/", { + "url": "b2a8e74d275664b1a4acee0fca0a6fd33298571e", + "options": (("journals", "text"),), + }), + ("https://www.deviantart.com/angrywhitewanker/journal/", { + "count": 0, + "options": (("journals", "none"),), + }), + ("https://www.deviantart.com/shimoda7/journal/?catpath=/"), + ("https://shimoda7.deviantart.com/journal/"), + 
("https://shimoda7.deviantart.com/journal/?catpath=/"), + ) + + def deviations(self): + return self.api.browse_user_journals(self.user, self.offset) + + +class DeviantartScrapsExtractor(DeviantartExtractor): + """Extractor for an artist's scraps""" + subcategory = "scraps" + directory_fmt = ("{category}", "{username}", "Scraps") + archive_fmt = "s_{username}_{index}.{extension}" + pattern = BASE_PATTERN + r"/gallery/\?catpath=scraps\b" + test = ( + ("https://www.deviantart.com/shimoda7/gallery/?catpath=scraps", { + "count": 12, + "options": (("original", False),), + }), + ("https://shimoda7.deviantart.com/gallery/?catpath=scraps"), + ) + + def deviations(self): + url = "{}/{}/gallery/?catpath=scraps".format(self.root, self.user) + page = self._html_request(url).text + csrf, pos = text.extract(page, '"csrf":"', '"') + iid , pos = text.extract(page, '"requestid":"', '"', pos) + + url = "https://www.deviantart.com/dapi/v1/gallery/0" + data = { + "username": self.user, + "offset": self.offset, + "limit": "24", + "catpath": "scraps", + "_csrf": csrf, + "dapiIid": iid + "-jsok7403-1.1" + } + + while True: + content = self.request( + url, method="POST", data=data).json()["content"] + + for item in content["results"]: + if item["html"].startswith('<div class="ad-container'): + continue + deviation_url = text.extract(item["html"], 'href="', '"')[0] + page = self._html_request(deviation_url).text + deviation_id = text.extract(page, '//deviation/', '"')[0] + if deviation_id: + yield self.api.deviation(deviation_id) + + if not content["has_more"]: + return + data["offset"] = content["next_offset"] + + +class DeviantartPopularExtractor(DeviantartExtractor): + """Extractor for popular deviations""" + subcategory = "popular" + directory_fmt = ("{category}", "Popular", + "{popular[range]}", "{popular[search]}") + archive_fmt = "P_{popular[range]}_{popular[search]}_{index}.{extension}" + pattern = (r"(?:https?://)?www\.deviantart\.com" + r"((?:/\w+)*)/(?:popular-([^/?&#]+))/?(?:\?([^#]*))?") + test = ( + ("https://www.deviantart.com/popular-24-hours/?q=tree+house", { + "options": (("original", False),), + }), + ("https://www.deviantart.com/artisan/popular-all-time/?q=tree"), + ) + + def __init__(self, match): + DeviantartExtractor.__init__(self, match) + self.search_term = self.time_range = self.category_path = None + self.user = "" + + path, trange, query = match.groups() + if path: + self.category_path = path.lstrip("/") + if trange: + self.time_range = trange.replace("-", "").replace("hours", "hr") + if query: + self.search_term = text.parse_query(query).get("q") + + self.popular = { + "search": self.search_term or "", + "range": trange or "24-hours", + "path": self.category_path, + } + + def deviations(self): + return self.api.browse_popular( + self.search_term, self.time_range, self.category_path, self.offset) + + def prepare(self, deviation): + DeviantartExtractor.prepare(self, deviation) + deviation["popular"] = self.popular + + +class DeviantartAPI(): + """Minimal interface for the DeviantArt API + + Ref: https://www.deviantart.com/developers/http/v1/20160316 + """ + CLIENT_ID = "5388" + CLIENT_SECRET = "76b08c69cfb27f26d6161f9ab6d061a1" + + def __init__(self, extractor): + self.extractor = extractor + self.log = extractor.log + self.headers = {} + + delay = extractor.config("wait-min", 0) + self.delay = math.ceil(math.log2(delay)) if delay >= 1 else -1 + self.delay_min = max(2, self.delay) + + self.mature = extractor.config("mature", "true") + if not isinstance(self.mature, str): + self.mature = 
"true" if self.mature else "false" + + self.folders = extractor.config("folders", False) + self.metadata = extractor.extra or extractor.config("metadata", False) + + self.refresh_token = extractor.config("refresh-token") + self.client_id = extractor.config("client-id", self.CLIENT_ID) + self.client_secret = extractor.config( + "client-secret", self.CLIENT_SECRET) + + def browse_popular(self, query=None, timerange=None, + category_path=None, offset=0): + """Yield popular deviations""" + endpoint = "browse/popular" + params = {"q": query, "offset": offset, "limit": 120, + "timerange": timerange, "category_path": category_path, + "mature_content": self.mature} + return self._pagination(endpoint, params) + + def browse_user_journals(self, username, offset=0): + """Yield all journal entries of a specific user""" + endpoint = "browse/user/journals" + params = {"username": username, "offset": offset, "limit": 50, + "mature_content": self.mature, "featured": "false"} + return self._pagination(endpoint, params) + + def collections(self, username, folder_id, offset=0): + """Yield all Deviation-objects contained in a collection folder""" + endpoint = "collections/" + folder_id + params = {"username": username, "offset": offset, "limit": 24, + "mature_content": self.mature} + return self._pagination(endpoint, params) + + @memcache(keyarg=1) + def collections_folders(self, username, offset=0): + """Yield all collection folders of a specific user""" + endpoint = "collections/folders" + params = {"username": username, "offset": offset, "limit": 50, + "mature_content": self.mature} + return self._pagination_folders(endpoint, params) + + def deviation(self, deviation_id): + """Query and return info about a single Deviation""" + endpoint = "deviation/" + deviation_id + deviation = self._call(endpoint) + if self.metadata: + self._metadata((deviation,)) + if self.folders: + self._folders((deviation,)) + return deviation + + def deviation_content(self, deviation_id): + """Get extended content of a single Deviation""" + endpoint = "deviation/content" + params = {"deviationid": deviation_id} + return self._call(endpoint, params) + + def deviation_download(self, deviation_id): + """Get the original file download (if allowed)""" + endpoint = "deviation/download/" + deviation_id + params = {"mature_content": self.mature} + return self._call(endpoint, params) + + def deviation_metadata(self, deviations): + """ Fetch deviation metadata for a set of deviations""" + endpoint = "deviation/metadata?" 
+ "&".join( + "deviationids[{}]={}".format(num, deviation["deviationid"]) + for num, deviation in enumerate(deviations) + ) + params = {"mature_content": self.mature} + return self._call(endpoint, params)["metadata"] + + def gallery(self, username, folder_id="", offset=0, extend=True): + """Yield all Deviation-objects contained in a gallery folder""" + endpoint = "gallery/" + folder_id + params = {"username": username, "offset": offset, "limit": 24, + "mature_content": self.mature, "mode": "newest"} + return self._pagination(endpoint, params, extend) + + def gallery_all(self, username, offset=0): + """Yield all Deviation-objects of a specific user""" + endpoint = "gallery/all" + params = {"username": username, "offset": offset, "limit": 24, + "mature_content": self.mature} + return self._pagination(endpoint, params) + + @memcache(keyarg=1) + def gallery_folders(self, username, offset=0): + """Yield all gallery folders of a specific user""" + endpoint = "gallery/folders" + params = {"username": username, "offset": offset, "limit": 50, + "mature_content": self.mature} + return self._pagination_folders(endpoint, params) + + @memcache(keyarg=1) + def user_profile(self, username): + """Get user profile information""" + endpoint = "user/profile/" + username + return self._call(endpoint, expect_error=True) + + def authenticate(self, refresh_token): + """Authenticate the application by requesting an access token""" + self.headers["Authorization"] = self._authenticate_impl(refresh_token) + + @cache(maxage=3600, keyarg=1) + def _authenticate_impl(self, refresh_token): + """Actual authenticate implementation""" + url = "https://www.deviantart.com/oauth2/token" + if refresh_token: + self.log.info("Refreshing private access token") + data = {"grant_type": "refresh_token", + "refresh_token": _refresh_token_cache(refresh_token)} + else: + self.log.info("Requesting public access token") + data = {"grant_type": "client_credentials"} + + auth = (self.client_id, self.client_secret) + response = self.extractor.request( + url, method="POST", data=data, auth=auth) + data = response.json() + + if response.status_code != 200: + raise exception.AuthenticationError('"{} ({})"'.format( + data.get("error_description"), data.get("error"))) + if refresh_token: + _refresh_token_cache.update(refresh_token, data["refresh_token"]) + return "Bearer " + data["access_token"] + + def _call(self, endpoint, params=None, expect_error=False, public=True): + """Call an API endpoint""" + url = "https://www.deviantart.com/api/v1/oauth2/" + endpoint + while True: + if self.delay >= 0: + time.sleep(2 ** self.delay) + + self.authenticate(None if public else self.refresh_token) + response = self.extractor.request( + url, + params=params, + headers=self.headers, + expect=range(400, 500), + ) + data = response.json() + status = response.status_code + + if 200 <= status < 400: + if self.delay > self.delay_min: + self.delay -= 1 + return data + if expect_error: + return None + if data.get("error_description") == "User not found.": + raise exception.NotFoundError("user or group") + + self.log.debug(response.text) + msg = "API responded with {} {}".format( + status, response.reason) + if status == 429: + self.delay += 1 + self.log.warning("%s. 
Using %ds delay.", msg, 2 ** self.delay) + else: + self.log.error(msg) + return data + + def _pagination(self, endpoint, params, extend=True): + public = True + while True: + data = self._call(endpoint, params, public=public) + if "results" not in data: + self.log.error("Unexpected API response: %s", data) + return + if (public and self.refresh_token and + len(data["results"]) < params["limit"]): + self.log.debug("Switching to private access token") + public = False + continue + + if extend: + if self.metadata: + self._metadata(data["results"]) + if self.folders: + self._folders(data["results"]) + yield from data["results"] + + if not data["has_more"]: + return + params["offset"] = data["next_offset"] + + def _pagination_folders(self, endpoint, params): + result = [] + result.extend(self._pagination(endpoint, params, False)) + return result + + def _metadata(self, deviations): + """Add extended metadata to each deviation object""" + for deviation, metadata in zip( + deviations, self.deviation_metadata(deviations)): + deviation.update(metadata) + deviation["tags"] = [t["tag_name"] for t in deviation["tags"]] + return deviations + + def _folders(self, deviations): + """Add a list of all containing folders to each deviation object""" + for deviation in deviations: + deviation["folders"] = self._folders_map( + deviation["author"]["username"])[deviation["deviationid"]] + + @memcache(keyarg=1) + def _folders_map(self, username): + """Generate a deviation_id -> folders mapping for 'username'""" + self.log.info("Collecting folder information for '%s'", username) + folders = self.gallery_folders(username) + + # add parent names to folders, but ignore "Featured" as parent + fmap = {} + featured = folders[0]["folderid"] + for folder in folders: + if folder["parent"] and folder["parent"] != featured: + folder["name"] = fmap[folder["parent"]] + "/" + folder["name"] + fmap[folder["folderid"]] = folder["name"] + + # map deviationids to folder names + dmap = collections.defaultdict(list) + for folder in folders: + for deviation in self.gallery( + username, folder["folderid"], 0, False): + dmap[deviation["deviationid"]].append(folder["name"]) + return dmap + + +@cache(maxage=10*365*24*3600, keyarg=0) +def _refresh_token_cache(original_token, new_token=None): + return new_token or original_token + + +SHADOW_TEMPLATE = """ +<span class="shadow"> + <img src="{src}" class="smshadow" width="{width}" height="{height}"> +</span> +<br><br> +""" + +HEADER_TEMPLATE = """<div usr class="gr"> +<div class="metadata"> + <h2><a href="{url}">{title}</a></h2> + <ul> + <li class="author"> + by <span class="name"><span class="username-with-symbol u"> + <a class="u regular username" href="{userurl}">{username}</a>\ +<span class="user-symbol regular"></span></span></span>, + <span>{date}</span> + </li> + <li class="category"> + {categories} + </li> + </ul> +</div> +""" + +HEADER_CUSTOM_TEMPLATE = """<div class='boxtop journaltop'> +<h2> + <img src="https://st.deviantart.net/minish/gruzecontrol/icons/journal.gif\ +?2" style="vertical-align:middle" alt=""/> + <a href="{url}">{title}</a> +</h2> +Journal Entry: <span>{date}</span> +""" + +JOURNAL_TEMPLATE_HTML = """text:<!DOCTYPE html> +<html> +<head> + <meta charset="utf-8"> + <title>{title}</title> + <link rel="stylesheet" href="https://st.deviantart.net/\ +css/deviantart-network_lc.css?3843780832"> + <link rel="stylesheet" href="https://st.deviantart.net/\ +css/group_secrets_lc.css?3250492874"> + <link rel="stylesheet" href="https://st.deviantart.net/\ 
+css/v6core_lc.css?4246581581"> + <link rel="stylesheet" href="https://st.deviantart.net/\ +css/sidebar_lc.css?1490570941"> + <link rel="stylesheet" href="https://st.deviantart.net/\ +css/writer_lc.css?3090682151"> + <link rel="stylesheet" href="https://st.deviantart.net/\ +css/v6loggedin_lc.css?3001430805"> + <style>{css}</style> + <link rel="stylesheet" href="https://st.deviantart.net/\ +roses/cssmin/core.css?1488405371919" > + <link rel="stylesheet" href="https://st.deviantart.net/\ +roses/cssmin/peeky.css?1487067424177" > + <link rel="stylesheet" href="https://st.deviantart.net/\ +roses/cssmin/desktop.css?1491362542749" > +</head> +<body id="deviantART-v7" class="bubble no-apps loggedout w960 deviantart"> + <div id="output"> + <div class="dev-page-container bubbleview"> + <div class="dev-page-view view-mode-normal"> + <div class="dev-view-main-content"> + <div class="dev-view-deviation"> + {shadow} + <div class="journal-wrapper tt-a"> + <div class="journal-wrapper2"> + <div class="journal {cls} journalcontrol"> + {html} + </div> + </div> + </div> + </div> + </div> + </div> + </div> + </div> +</body> +</html> +""" + +JOURNAL_TEMPLATE_TEXT = """text:{title} +by {username}, {date} + +{content} +""" diff --git a/gallery_dl/extractor/directlink.py b/gallery_dl/extractor/directlink.py new file mode 100644 index 0000000..77a19f6 --- /dev/null +++ b/gallery_dl/extractor/directlink.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Direct link handling""" + +from .common import Extractor, Message +from .. import text + + +class DirectlinkExtractor(Extractor): + """Extractor for direct links to images and other media files""" + category = "directlink" + filename_fmt = "{domain}/{path}" + archive_fmt = "{domain}/{path}" + pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\." 
+ r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))" + r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$") + test = ( + (("https://en.wikipedia.org/static/images/project-logos/enwiki.png"), { + "url": "18c5d00077332e98e53be9fed2ee4be66154b88d", + "keyword": "e81b9fe3022e971365dd859f38e4ef717a6c69ed", + }), + # more complex example + ("https://example.org/path/file.webm?que=1&ry=2#fragment", { + "url": "fd4aec8a32842343394e6078a06c3e6b647bf671", + "keyword": "ff75764b1ae66615b723a6357b8193fa2de84678", + }), + # percent-encoded characters + ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", { + "url": "2627e8140727fdf743f86fe18f69f99a052c9718", + "keyword": "4d19dc12e41ffcb4cbec2013e335cf482377c35e", + }), + # upper case file extension (#296) + ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw" + ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP" + "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.data = match.groupdict() + + def items(self): + text.nameext_from_url(self.url, self.data) + for key, value in self.data.items(): + if value: + self.data[key] = text.unquote(value) + + yield Message.Version, 1 + yield Message.Directory, self.data + yield Message.Url, self.url, self.data diff --git a/gallery_dl/extractor/dynastyscans.py b/gallery_dl/extractor/dynastyscans.py new file mode 100644 index 0000000..b10bd35 --- /dev/null +++ b/gallery_dl/extractor/dynastyscans.py @@ -0,0 +1,145 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters from https://dynasty-scans.com/""" + +from .common import ChapterExtractor, Extractor, Message +from .. import text +import json +import re + + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?dynasty-scans\.com" + + +class DynastyscansBase(): + """Base class for dynastyscans extractors""" + category = "dynastyscans" + root = "https://dynasty-scans.com" + + def _parse_image_page(self, image_id): + url = "{}/images/{}".format(self.root, image_id) + extr = text.extract_from(self.request(url).text) + + date = extr("class='create_at'>", "</span>") + tags = extr("class='tags'>", "</span>") + src = extr("class='btn-group'>", "</div>") + url = extr(' src="', '"') + + src = text.extract(src, 'href="', '"')[0] if "Source<" in src else "" + + return { + "url" : self.root + url, + "image_id": text.parse_int(image_id), + "tags" : text.split_html(text.unescape(tags)), + "date" : text.remove_html(date), + "source" : text.unescape(src), + } + + +class DynastyscansChapterExtractor(DynastyscansBase, ChapterExtractor): + """Extractor for manga-chapters from dynasty-scans.com""" + pattern = BASE_PATTERN + r"(/chapters/[^/?&#]+)" + test = ( + (("http://dynasty-scans.com/chapters/" + "hitoribocchi_no_oo_seikatsu_ch33"), { + "url": "dce64e8c504118f1ab4135c00245ea12413896cb", + "keyword": "1564965671ac69bb7fbc340538397f6bd0aa269b", + }), + (("http://dynasty-scans.com/chapters/" + "new_game_the_spinoff_special_13"), { + "url": "dbe5bbb74da2edcfb1832895a484e2a40bc8b538", + "keyword": "22b35029bc65d6d95db2e2c147b0a37f2d290f29", + }), + ) + + def metadata(self, page): + extr = text.extract_from(page) + match = re.match( + (r"(?:<a[^>]*>)?([^<]+)(?:</a>)?" # manga name + r"(?: ch(\d+)([^:<]*))?" 
# chapter info
+             r"(?:: (.+))?"),                  # title
+            extr("<h3 id='chapter-title'><b>", "</b>"),
+        )
+        author = extr(" by ", "</a>")
+        group = extr('"icon-print"></i> ', '</span>')
+
+        return {
+            "manga"   : text.unescape(match.group(1)),
+            "chapter" : text.parse_int(match.group(2)),
+            "chapter_minor": match.group(3) or "",
+            "title"   : text.unescape(match.group(4) or ""),
+            "author"  : text.remove_html(author),
+            "group"   : (text.remove_html(group) or
+                         text.extract(group, ' alt="', '"')[0] or ""),
+            "date"    : extr('"icon-calendar"></i> ', '<'),
+            "lang"    : "en",
+            "language": "English",
+        }
+
+    def images(self, page):
+        data = text.extract(page, "var pages = ", ";\n")[0]
+        return [
+            (self.root + img["image"], None)
+            for img in json.loads(data)
+        ]
+
+
+class DynastyscansSearchExtractor(DynastyscansBase, Extractor):
+    """Extractor for image search results on dynasty-scans.com"""
+    subcategory = "search"
+    directory_fmt = ("{category}", "Images")
+    filename_fmt = "{image_id}.{extension}"
+    archive_fmt = "i_{image_id}"
+    pattern = BASE_PATTERN + r"/images/?(?:\?([^#]+))?$"
+    test = (
+        ("https://dynasty-scans.com/images?with[]=4930&with[]=5211", {
+            "url": "6b570eedd8a741c2cd34fb98b22a49d772f84191",
+            "keyword": "a1e2d05c1406a08b02f347389616a6babb1b50bf",
+        }),
+        ("https://dynasty-scans.com/images", {
+            "range": "1",
+            "count": 1,
+        }),
+    )
+
+    def __init__(self, match):
+        Extractor.__init__(self, match)
+        self.query = match.group(1) or ""
+
+    def items(self):
+        yield Message.Version, 1
+        yield Message.Directory, {}
+        for image_id in self.images():
+            image = self._parse_image_page(image_id)
+            url = image["url"]
+            yield Message.Url, url, text.nameext_from_url(url, image)
+
+    def images(self):
+        url = self.root + "/images?" + self.query.replace("[]", "%5B%5D")
+        params = {"page": 1}
+
+        while True:
+            page = self.request(url, params=params).text
+            yield from text.extract_iter(page, '"/images/', '"')
+            if 'rel="next"' not in page:
+                return
+            params["page"] += 1
+
+
+class DynastyscansImageExtractor(DynastyscansSearchExtractor):
+    """Extractor for individual images on dynasty-scans.com"""
+    subcategory = "image"
+    pattern = BASE_PATTERN + r"/images/(\d+)"
+    test = ("https://dynasty-scans.com/images/1245", {
+        "url": "15e54bd94148a07ed037f387d046c27befa043b2",
+        "keyword": "3b630c6139e5ff06e141541d57960f8a2957efbb",
+    })
+
+    def images(self):
+        return (self.query,)
diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py
new file mode 100644
index 0000000..f245ddf
--- /dev/null
+++ b/gallery_dl/extractor/e621.py
@@ -0,0 +1,71 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from https://e621.net/"""
+
+from . 
import booru
+
+
+class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
+    """Base class for e621 extractors"""
+    category = "e621"
+    api_url = "https://e621.net/post/index.json"
+    post_url = "https://e621.net/post/show/{}"
+    page_limit = 750
+
+
+class E621TagExtractor(booru.TagMixin, E621Extractor):
+    """Extractor for images from e621.net based on search-tags"""
+    pattern = (r"(?:https?://)?(?:www\.)?e621\.net/post"
+               r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)")
+    test = (
+        ("https://e621.net/post/index/1/anry", {
+            "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
+            "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
+        }),
+        ("https://e621.net/post?tags=anry"),
+    )
+
+
+class E621PoolExtractor(booru.PoolMixin, E621Extractor):
+    """Extractor for image-pools from e621.net"""
+    pattern = r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(?P<pool>\d+)"
+    test = ("https://e621.net/pool/show/73", {
+        "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
+        "content": "c2c87b7a9150509496cddc75ccab08109922876a",
+    })
+
+
+class E621PostExtractor(booru.PostMixin, E621Extractor):
+    """Extractor for single images from e621.net"""
+    pattern = r"(?:https?://)?(?:www\.)?e621\.net/post/show/(?P<post>\d+)"
+    test = ("https://e621.net/post/show/535", {
+        "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
+        "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
+        "options": (("tags", True),),
+        "keyword": {
+            "tags_artist": "anry",
+            "tags_general": str,
+            "tags_species": str,
+        },
+    })
+
+
+class E621PopularExtractor(booru.MoebooruPopularMixin, E621Extractor):
+    """Extractor for popular images from e621.net"""
+    pattern = (r"(?:https?://)?(?:www\.)?e621\.net"
+               r"/post/popular_by_(?P<scale>day|week|month)"
+               r"(?:\?(?P<query>[^#]*))?")
+    test = ("https://e621.net/post/popular_by_month?month=6&year=2013", {
+        "count": 32,
+    })
+
+    def __init__(self, match):
+        super().__init__(match)
+        self.api_url = "https://e621.net/post/popular_by_{scale}.json".format(
+            scale=self.scale)
diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py
new file mode 100644
index 0000000..d67c58a
--- /dev/null
+++ b/gallery_dl/extractor/exhentai.py
@@ -0,0 +1,382 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2014-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extract images from galleries at https://exhentai.org/"""
+
+from .common import Extractor, Message
+from .. 
import text, util, exception +from ..cache import cache +import itertools +import random +import time +import math + + +BASE_PATTERN = r"(?:https?://)?(e[x-]|g\.e-)hentai\.org" + + +class ExhentaiExtractor(Extractor): + """Base class for exhentai extractors""" + category = "exhentai" + directory_fmt = ("{category}", "{gallery_id}") + filename_fmt = ( + "{gallery_id}_{num:>04}_{image_token}_{filename}.{extension}") + archive_fmt = "{gallery_id}_{num}" + cookiedomain = ".exhentai.org" + cookienames = ("ipb_member_id", "ipb_pass_hash") + root = "https://exhentai.org" + + def __init__(self, match): + if match.group(1) != "ex": + self.root = "https://e-hentai.org" + self.cookiedomain = ".e-hentai.org" + Extractor.__init__(self, match) + self.limits = self.config("limits", True) + self.original = self.config("original", True) + self.wait_min = self.config("wait-min", 3) + self.wait_max = self.config("wait-max", 6) + + self._remaining = 0 + if self.wait_max < self.wait_min: + self.wait_max = self.wait_min + self.session.headers["Referer"] = self.root + "/" + + def request(self, *args, **kwargs): + response = Extractor.request(self, *args, **kwargs) + if self._is_sadpanda(response): + self.log.info("sadpanda.jpg") + raise exception.AuthorizationError() + return response + + def wait(self, waittime=None): + """Wait for a randomly chosen amount of seconds""" + if not waittime: + waittime = random.uniform(self.wait_min, self.wait_max) + else: + waittime = random.uniform(waittime * 0.66, waittime * 1.33) + time.sleep(waittime) + + def login(self): + """Login and set necessary cookies""" + if self._check_cookies(self.cookienames): + return + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + else: + self.log.info("no username given; using e-hentai.org") + self.root = "https://e-hentai.org" + self.original = False + self.limits = False + self.session.cookies["nw"] = "1" + + @cache(maxage=90*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + url = "https://forums.e-hentai.org/index.php?act=Login&CODE=01" + headers = { + "Referer": "https://e-hentai.org/bounce_login.php?b=d&bt=1-1", + } + data = { + "CookieDate": "1", + "b": "d", + "bt": "1-1", + "UserName": username, + "PassWord": password, + "ipb_login_submit": "Login!", + } + + response = self.request(url, method="POST", headers=headers, data=data) + if "You are now logged in as:" not in response.text: + raise exception.AuthenticationError() + return {c: response.cookies[c] for c in self.cookienames} + + @staticmethod + def _is_sadpanda(response): + """Return True if the response object contains a sad panda""" + return ( + response.headers.get("Content-Length") == "9615" and + "sadpanda.jpg" in response.headers.get("Content-Disposition", "") + ) + + +class ExhentaiGalleryExtractor(ExhentaiExtractor): + """Extractor for image galleries from exhentai.org""" + subcategory = "gallery" + pattern = (BASE_PATTERN + + r"(?:/g/(\d+)/([\da-f]{10})" + r"|/s/([\da-f]{10})/(\d+)-(\d+))") + test = ( + ("https://exhentai.org/g/960460/4f0e369d82/", { + "keyword": "1532ca4d0e4e0738dc994ca725a228af04a4e480", + "content": "493d759de534355c9f55f8e365565b62411de146", + }), + ("https://exhentai.org/g/960461/4f0e369d82/", { + "exception": exception.NotFoundError, + }), + ("http://exhentai.org/g/962698/7f02358e00/", { + "exception": exception.AuthorizationError, + }), + ("https://exhentai.org/s/3957343c3b/960460-5", { + "count": 2, + }), + 
("https://e-hentai.org/s/3957343c3b/960460-5", { + "count": 2, + }), + ("https://g.e-hentai.org/g/960460/4f0e369d82/"), + ) + + def __init__(self, match): + ExhentaiExtractor.__init__(self, match) + self.key = {} + self.count = 0 + self.gallery_id = text.parse_int(match.group(2) or match.group(5)) + self.gallery_token = match.group(3) + self.image_token = match.group(4) + self.image_num = text.parse_int(match.group(6), 1) + + def items(self): + self.login() + + if self.gallery_token: + gpage = self._gallery_page() + self.image_token = text.extract(gpage, 'hentai.org/s/', '"')[0] + self.wait() + ipage = self._image_page() + else: + ipage = self._image_page() + part = text.extract(ipage, 'hentai.org/g/', '"')[0] + self.gallery_token = part.split("/")[1] + self.wait() + gpage = self._gallery_page() + + data = self.get_metadata(gpage) + self.count = data["count"] + + yield Message.Version, 1 + yield Message.Directory, data + + images = itertools.chain( + (self.image_from_page(ipage),), self.images_from_api()) + for url, image in images: + data.update(image) + if self.limits: + self._check_limits(data) + if "/fullimg.php" in url: + data["extension"] = "" + self.wait(1.5) + yield Message.Url, url, data + + def get_metadata(self, page): + """Extract gallery metadata""" + extr = text.extract_from(page) + data = { + "gallery_id" : self.gallery_id, + "gallery_token": self.gallery_token, + "title" : text.unescape(extr('<h1 id="gn">', '</h1>')), + "title_jp" : text.unescape(extr('<h1 id="gj">', '</h1>')), + "date" : text.parse_datetime(extr( + '>Posted:</td><td class="gdt2">', '</td>'), "%Y-%m-%d %H:%M"), + "parent" : extr( + '>Parent:</td><td class="gdt2"><a href="', '"'), + "visible" : extr( + '>Visible:</td><td class="gdt2">', '<'), + "language" : extr( + '>Language:</td><td class="gdt2">', ' '), + "gallery_size" : text.parse_bytes(extr( + '>File Size:</td><td class="gdt2">', '<').rstrip("Bb")), + "count" : text.parse_int(extr( + '>Length:</td><td class="gdt2">', ' ')), + } + + data["lang"] = util.language_to_code(data["language"]) + data["tags"] = [ + text.unquote(tag) + for tag in text.extract_iter(page, 'hentai.org/tag/', '"') + ] + + return data + + def image_from_page(self, page): + """Get image url and data from webpage""" + pos = page.index('<div id="i3"><a onclick="return load_image(') + 26 + extr = text.extract_from(page, pos) + + self.key["next"] = extr("'", "'") + iurl = extr('<img id="img" src="', '"') + orig = extr('hentai.org/fullimg.php', '"') + + if self.original and orig: + url = self.root + "/fullimg.php" + text.unescape(orig) + data = self._parse_original_info(extr('ownload original', '<')) + else: + url = iurl + data = self._parse_image_info(url) + + data["num"] = self.image_num + data["image_token"] = self.key["start"] = extr('var startkey="', '";') + self.key["show"] = extr('var showkey="', '";') + + return url, text.nameext_from_url(iurl, data) + + def images_from_api(self): + """Get image url and data from api calls""" + api_url = self.root + "/api.php" + nextkey = self.key["next"] + request = { + "method" : "showpage", + "gid" : self.gallery_id, + "imgkey" : nextkey, + "showkey": self.key["show"], + } + for request["page"] in range(self.image_num + 1, self.count + 1): + self.wait() + page = self.request(api_url, method="POST", json=request).json() + imgkey = nextkey + nextkey, pos = text.extract(page["i3"], "'", "'") + imgurl , pos = text.extract(page["i3"], 'id="img" src="', '"', pos) + origurl, pos = text.extract(page["i7"], '<a href="', '"') + + if self.original and 
origurl: + url = text.unescape(origurl) + data = self._parse_original_info( + text.extract(page["i7"], "ownload original", "<", pos)[0]) + else: + url = imgurl + data = self._parse_image_info(url) + + data["num"] = request["page"] + data["image_token"] = imgkey + yield url, text.nameext_from_url(imgurl, data) + + request["imgkey"] = nextkey + + def _gallery_page(self): + url = "{}/g/{}/{}/".format( + self.root, self.gallery_id, self.gallery_token) + response = self.request(url, expect=range(400, 500)) + page = response.text + + if response.status_code == 404 and "Gallery Not Available" in page: + raise exception.AuthorizationError() + if page.startswith(("Key missing", "Gallery not found")): + raise exception.NotFoundError("gallery") + return page + + def _image_page(self): + url = "{}/s/{}/{}-{}".format( + self.root, self.image_token, self.gallery_id, self.image_num) + page = self.request(url, expect=range(400, 500)).text + + if page.startswith(("Invalid page", "Keep trying")): + raise exception.NotFoundError("image page") + return page + + def _check_limits(self, data): + if not self._remaining or data["num"] % 20 == 0: + self._update_limits() + self._remaining -= data["cost"] + + if self._remaining <= 0: + url = "{}/s/{}/{}-{}".format( + self.root, data["image_token"], self.gallery_id, data["num"]) + self.log.error( + "Image limit reached! Reset it and continue with " + "'%s' as URL.", url) + raise exception.StopExtraction() + + def _update_limits(self): + url = "https://e-hentai.org/home.php" + cookies = { + cookie.name: cookie.value + for cookie in self.session.cookies + if cookie.domain == self.cookiedomain and cookie.name != "igneous" + } + + page = self.request(url, cookies=cookies).text + current, pos = text.extract(page, "<strong>", "</strong>") + maximum, pos = text.extract(page, "<strong>", "</strong>", pos) + self._remaining = text.parse_int(maximum) - text.parse_int(current) + + @staticmethod + def _parse_image_info(url): + parts = url.split("/")[4].split("-") + return { + "width": text.parse_int(parts[2]), + "height": text.parse_int(parts[3]), + "size": text.parse_int(parts[1]), + "cost": 1, + } + + @staticmethod + def _parse_original_info(info): + parts = info.lstrip().split(" ") + size = text.parse_bytes(parts[3] + parts[4][0]) + return { + "width": text.parse_int(parts[0]), + "height": text.parse_int(parts[2]), + "size": size, + "cost": 1 + math.ceil(size * 5 / 1024 / 1024) + } + + +class ExhentaiSearchExtractor(ExhentaiExtractor): + """Extractor for exhentai search results""" + subcategory = "search" + pattern = BASE_PATTERN + r"/?\?(.*)$" + test = ( + ("https://exhentai.org/?f_search=touhou"), + (("https://exhentai.org/?f_doujinshi=0&f_manga=0&f_artistcg=0" + "&f_gamecg=0&f_western=0&f_non-h=1&f_imageset=0&f_cosplay=0" + "&f_asianporn=0&f_misc=0&f_search=touhou&f_apply=Apply+Filter"), { + "pattern": ExhentaiGalleryExtractor.pattern, + "range": "1-30", + "count": 30, + }), + ) + + def __init__(self, match): + ExhentaiExtractor.__init__(self, match) + self.params = text.parse_query(match.group(2)) + self.params["page"] = text.parse_int(self.params.get("page")) + self.search_url = self.root + + def items(self): + self.login() + yield Message.Version, 1 + + while True: + last = None + page = self.request(self.search_url, params=self.params).text + + for gallery in ExhentaiGalleryExtractor.pattern.finditer(page): + url = gallery.group(0) + if url == last: + continue + last = url + yield Message.Queue, url, {} + + if 'class="ptdd">><' in page or ">No hits found</p>" in page: 
+ return + self.params["page"] += 1 + self.wait() + + +class ExhentaiFavoriteExtractor(ExhentaiSearchExtractor): + """Extractor for favorited exhentai galleries""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favorites\.php(?:\?(.*))?" + test = ( + ("https://exhentai.org/favorites.php"), + ("https://exhentai.org/favorites.php?favcat=1&f_search=touhou" + "&f_apply=Search+Favorites"), + ) + + def __init__(self, match): + ExhentaiSearchExtractor.__init__(self, match) + self.search_url = self.root + "/favorites.php" diff --git a/gallery_dl/extractor/fallenangels.py b/gallery_dl/extractor/fallenangels.py new file mode 100644 index 0000000..a2d8c04 --- /dev/null +++ b/gallery_dl/extractor/fallenangels.py @@ -0,0 +1,105 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters from https://www.fascans.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text, util +import json + + +class FallenangelsChapterExtractor(ChapterExtractor): + """Extractor for manga-chapters from fascans.com""" + category = "fallenangels" + pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com" + r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?") + test = ( + ("https://manga.fascans.com/manga/chronos-ruler/20/1", { + "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3", + "keyword": "2dfcc50020e32cd207be88e2a8fac0933e36bdfb", + }), + ("http://truyen.fascans.com/manga/hungry-marie/8", { + "url": "1f923d9cb337d5e7bbf4323719881794a951c6ae", + "keyword": "2bdb7334c0e3eceb9946ffd3132df679b4a94f6a", + }), + ("http://manga.fascans.com/manga/rakudai-kishi-no-eiyuutan/19.5", { + "keyword": "9fcca4c1a90d11f00764f62477ebe10bd408021c", + }), + ) + + def __init__(self, match): + self.version, self.manga, self.chapter, self.minor = match.groups() + url = "https://{}.fascans.com/manga/{}/{}/1".format( + self.version, self.manga, self.chapter) + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + lang = "vi" if self.version == "truyen" else "en" + return { + "manga" : extr('name="description" content="', ' Chapter '), + "title" : extr(': ', ' - Page 1'), + "chapter" : self.chapter, + "chapter_minor": self.minor or "", + "lang" : lang, + "language": util.code_to_language(lang), + } + + @staticmethod + def images(page): + return [ + (img["page_image"], None) + for img in json.loads( + text.extract(page, "var pages = ", ";")[0] + ) + ] + + +class FallenangelsMangaExtractor(MangaExtractor): + """Extractor for manga from fascans.com""" + chapterclass = FallenangelsChapterExtractor + category = "fallenangels" + pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$" + test = ( + ("http://manga.fascans.com/manga/trinity-seven", { + "url": "293057f264de6c438b979bd1c3de4719568db452", + "keyword": "50e0374dba60734230e4284b5ffdadef5104ae62", + }), + ("https://truyen.fascans.com/manga/rakudai-kishi-no-eiyuutan", { + "url": "51a731a6b82d5eb7a335fbae6b02d06aeb2ab07b", + "keyword": "2d2a2a5d9ea5925eb9a47bb13d848967f3af086c", + }), + ) + + def __init__(self, match): + url = "https://" + match.group(1) + self.lang = "vi" if match.group(2) == "truyen" else "en" + MangaExtractor.__init__(self, match, url) + + def chapters(self, page): + extr = text.extract_from(page) + results = [] + language = util.code_to_language(self.lang) + 
while extr('<li style="', '"'): + vol = extr('class="volume-', '"') + url = extr('href="', '"') + cha = extr('>', '<') + title = extr('<em>', '</em>') + + manga, _, chapter = cha.rpartition(" ") + chapter, dot, minor = chapter.partition(".") + results.append((url, { + "manga" : manga, + "title" : text.unescape(title), + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(chapter), + "chapter_minor": dot + minor, + "lang" : self.lang, + "language": language, + })) + return results diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py new file mode 100644 index 0000000..d941d76 --- /dev/null +++ b/gallery_dl/extractor/flickr.py @@ -0,0 +1,503 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.flickr.com/""" + +from .common import Extractor, Message +from .. import text, oauth, util, exception + + +class FlickrExtractor(Extractor): + """Base class for flickr extractors""" + category = "flickr" + filename_fmt = "{category}_{id}.{extension}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = FlickrAPI(self) + self.item_id = match.group(1) + self.user = None + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for photo in self.photos(): + photo.update(data) + url = photo["url"] + yield Message.Url, url, text.nameext_from_url(url, photo) + + def metadata(self): + """Return general metadata""" + self.user = self.api.urls_lookupUser(self.item_id) + return {"user": self.user} + + def photos(self): + """Return an iterable with all relevant photo objects""" + + +class FlickrImageExtractor(FlickrExtractor): + """Extractor for individual images from flickr.com""" + subcategory = "image" + archive_fmt = "{id}" + pattern = (r"(?:https?://)?(?:" + r"(?:(?:www\.|m\.)?flickr\.com/photos/[^/]+/" + r"|[^.]+\.static\.?flickr\.com/(?:\d+/)+)(\d+)" + r"|flic\.kr/p/([A-Za-z1-9]+))") + test = ( + ("https://www.flickr.com/photos/departingyyz/16089302239", { + "pattern": pattern, + "content": "0821a28ee46386e85b02b67cf2720063440a228c", + "keyword": { + "comments": int, + "description": str, + "extension": "jpg", + "filename": "16089302239_de18cd8017_b", + "id": 16089302239, + "height": 683, + "label": "Large", + "media": "photo", + "url": str, + "views": int, + "width": 1024, + }, + }), + ("https://www.flickr.com/photos/145617051@N08/46733161535", { + "count": 1, + "keyword": {"media": "video"}, + }), + ("http://c2.staticflickr.com/2/1475/24531000464_9a7503ae68_b.jpg", { + "pattern": pattern}), + ("https://farm2.static.flickr.com/1035/1188352415_cb139831d0.jpg", { + "pattern": pattern}), + ("https://flic.kr/p/FPVo9U", { + "pattern": pattern}), + ("https://www.flickr.com/photos/zzz/16089302238", { + "exception": exception.NotFoundError}), + ) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + if not self.item_id: + alphabet = ("123456789abcdefghijkmnopqrstu" + "vwxyzABCDEFGHJKLMNPQRSTUVWXYZ") + self.item_id = util.bdecode(match.group(2), alphabet) + + def items(self): + photo = self.api.photos_getInfo(self.item_id) + + if photo["media"] == "video" and self.api.videos: + self.api._extract_video(photo) + else: + self.api._extract_photo(photo) + + photo["title"] = photo["title"]["_content"] + photo["comments"] = 
text.parse_int(photo["comments"]["_content"]) + photo["description"] = photo["description"]["_content"] + photo["tags"] = [t["raw"] for t in photo["tags"]["tag"]] + photo["date"] = text.parse_timestamp(photo["dateuploaded"]) + photo["views"] = text.parse_int(photo["views"]) + photo["id"] = text.parse_int(photo["id"]) + + if "location" in photo: + location = photo["location"] + for key, value in location.items(): + if isinstance(value, dict): + location[key] = value["_content"] + + url = photo["url"] + yield Message.Version, 1 + yield Message.Directory, photo + yield Message.Url, url, text.nameext_from_url(url, photo) + + +class FlickrAlbumExtractor(FlickrExtractor): + """Extractor for photo albums from flickr.com""" + subcategory = "album" + directory_fmt = ("{category}", "{subcategory}s", + "{album[id]} - {album[title]}") + archive_fmt = "a_{album[id]}_{id}" + pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/" + r"photos/([^/]+)/(?:album|set)s(?:/(\d+))?") + test = ( + (("https://www.flickr.com/photos/shona_s/albums/72157633471741607"), { + "pattern": FlickrImageExtractor.pattern, + "count": 6, + }), + ("https://www.flickr.com/photos/shona_s/albums", { + "pattern": pattern, + "count": 2, + }), + ) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + self.album_id = match.group(2) + + def items(self): + if self.album_id: + return FlickrExtractor.items(self) + return self._album_items() + + def _album_items(self): + yield Message.Version, 1 + data = FlickrExtractor.metadata(self) + data["_extractor"] = FlickrAlbumExtractor + + for album in self.api.photosets_getList(self.user["nsid"]): + self.api._clean_info(album).update(data) + url = "https://www.flickr.com/photos/{}/albums/{}".format( + self.user["path_alias"], album["id"]) + yield Message.Queue, url, album + + def metadata(self): + data = FlickrExtractor.metadata(self) + data["album"] = self.api.photosets_getInfo( + self.album_id, self.user["nsid"]) + return data + + def photos(self): + return self.api.photosets_getPhotos(self.album_id) + + +class FlickrGalleryExtractor(FlickrExtractor): + """Extractor for photo galleries from flickr.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "galleries", + "{user[username]} {gallery[id]}") + archive_fmt = "g_{gallery[id]}_{id}" + pattern = (r"(?:https?://)?(?:www\.)?flickr\.com/" + r"photos/([^/]+)/galleries/(\d+)") + test = (("https://www.flickr.com/photos/flickr/" + "galleries/72157681572514792/"), { + "pattern": FlickrImageExtractor.pattern, + "count": ">= 10", + }) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + self.gallery_id = match.group(2) + + def metadata(self): + data = FlickrExtractor.metadata(self) + data["gallery"] = self.api.galleries_getInfo(self.gallery_id) + return data + + def photos(self): + return self.api.galleries_getPhotos(self.gallery_id) + + +class FlickrGroupExtractor(FlickrExtractor): + """Extractor for group pools from flickr.com""" + subcategory = "group" + directory_fmt = ("{category}", "{subcategory}s", "{group[groupname]}") + archive_fmt = "G_{group[nsid]}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/groups/([^/]+)" + test = ("https://www.flickr.com/groups/bird_headshots/", { + "pattern": FlickrImageExtractor.pattern, + "count": "> 150", + }) + + def metadata(self): + self.group = self.api.urls_lookupGroup(self.item_id) + return {"group": self.group} + + def photos(self): + return self.api.groups_pools_getPhotos(self.group["nsid"]) + + +class FlickrUserExtractor(FlickrExtractor): + 
"""Extractor for the photostream of a flickr user""" + subcategory = "user" + directory_fmt = ("{category}", "{user[username]}") + archive_fmt = "u_{user[nsid]}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/?$" + test = ("https://www.flickr.com/photos/shona_s/", { + "pattern": FlickrImageExtractor.pattern, + "count": 28, + }) + + def photos(self): + return self.api.people_getPhotos(self.user["nsid"]) + + +class FlickrFavoriteExtractor(FlickrExtractor): + """Extractor for favorite photos of a flickr user""" + subcategory = "favorite" + directory_fmt = ("{category}", "{subcategory}s", "{user[username]}") + archive_fmt = "f_{user[nsid]}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/photos/([^/]+)/favorites" + test = ("https://www.flickr.com/photos/shona_s/favorites", { + "pattern": FlickrImageExtractor.pattern, + "count": 4, + }) + + def photos(self): + return self.api.favorites_getList(self.user["nsid"]) + + +class FlickrSearchExtractor(FlickrExtractor): + """Extractor for flickr photos based on search results""" + subcategory = "search" + directory_fmt = ("{category}", "{subcategory}", "{search[text]}") + archive_fmt = "s_{search}_{id}" + pattern = r"(?:https?://)?(?:www\.)?flickr\.com/search/?\?([^#]+)" + test = ( + ("https://flickr.com/search/?text=mountain"), + ("https://flickr.com/search/?text=tree%20cloud%20house" + "&color_codes=4&styles=minimalism"), + ) + + def __init__(self, match): + FlickrExtractor.__init__(self, match) + self.search = text.parse_query(match.group(1)) + if "text" not in self.search: + self.search["text"] = "" + + def metadata(self): + return {"search": self.search} + + def photos(self): + return self.api.photos_search(self.search) + + +class FlickrAPI(oauth.OAuth1API): + """Minimal interface for the flickr API""" + API_URL = "https://api.flickr.com/services/rest/" + API_KEY = "ac4fd7aa98585b9eee1ba761c209de68" + API_SECRET = "3adb0f568dc68393" + FORMATS = [ + ("o", "Original" , None), + ("k", "Large 2048" , 2048), + ("h", "Large 1600" , 1600), + ("l", "Large" , 1024), + ("c", "Medium 800" , 800), + ("z", "Medium 640" , 640), + ("m", "Medium" , 500), + ("n", "Small 320" , 320), + ("s", "Small" , 240), + ("q", "Large Square", 150), + ("t", "Thumbnail" , 100), + ("s", "Square" , 75), + ] + VIDEO_FORMATS = { + "orig" : 9, + "1080p" : 8, + "720p" : 7, + "360p" : 6, + "288p" : 5, + "700" : 4, + "300" : 3, + "100" : 2, + "appletv" : 1, + "iphone_wifi": 0, + } + + def __init__(self, extractor): + oauth.OAuth1API.__init__(self, extractor) + + self.videos = extractor.config("videos", True) + self.maxsize = extractor.config("size-max") + if isinstance(self.maxsize, str): + for fmt, fmtname, fmtwidth in self.FORMATS: + if self.maxsize == fmt or self.maxsize == fmtname: + self.maxsize = fmtwidth + break + else: + self.maxsize = None + extractor.log.warning( + "Could not match '%s' to any format", self.maxsize) + if self.maxsize: + self.formats = [fmt for fmt in self.FORMATS + if not fmt[2] or fmt[2] <= self.maxsize] + else: + self.formats = self.FORMATS + self.formats = self.formats[:4] + + def favorites_getList(self, user_id): + """Returns a list of the user's favorite photos.""" + params = {"user_id": user_id} + return self._pagination("favorites.getList", params) + + def galleries_getInfo(self, gallery_id): + """Gets information about a gallery.""" + params = {"gallery_id": gallery_id} + gallery = self._call("galleries.getInfo", params)["gallery"] + return self._clean_info(gallery) + + def galleries_getPhotos(self, gallery_id): + 
"""Return the list of photos for a gallery.""" + params = {"gallery_id": gallery_id} + return self._pagination("galleries.getPhotos", params) + + def groups_pools_getPhotos(self, group_id): + """Returns a list of pool photos for a given group.""" + params = {"group_id": group_id} + return self._pagination("groups.pools.getPhotos", params) + + def people_getPhotos(self, user_id): + """Return photos from the given user's photostream.""" + params = {"user_id": user_id} + return self._pagination("people.getPhotos", params) + + def photos_getInfo(self, photo_id): + """Get information about a photo.""" + params = {"photo_id": photo_id} + return self._call("photos.getInfo", params)["photo"] + + def photos_getSizes(self, photo_id): + """Returns the available sizes for a photo.""" + params = {"photo_id": photo_id} + sizes = self._call("photos.getSizes", params)["sizes"]["size"] + if self.maxsize: + for index, size in enumerate(sizes): + if index > 0 and (int(size["width"]) > self.maxsize or + int(size["height"]) > self.maxsize): + del sizes[index:] + break + return sizes + + def photos_search(self, params): + """Return a list of photos matching some criteria.""" + return self._pagination("photos.search", params.copy()) + + def photosets_getInfo(self, photoset_id, user_id): + """Gets information about a photoset.""" + params = {"photoset_id": photoset_id, "user_id": user_id} + photoset = self._call("photosets.getInfo", params)["photoset"] + return self._clean_info(photoset) + + def photosets_getList(self, user_id): + """Returns the photosets belonging to the specified user.""" + params = {"user_id": user_id} + return self._pagination_sets("photosets.getList", params) + + def photosets_getPhotos(self, photoset_id): + """Get the list of photos in a set.""" + params = {"photoset_id": photoset_id} + return self._pagination("photosets.getPhotos", params, "photoset") + + def urls_lookupGroup(self, groupname): + """Returns a group NSID, given the url to a group's page.""" + params = {"url": "https://www.flickr.com/groups/" + groupname} + group = self._call("urls.lookupGroup", params)["group"] + return {"nsid": group["id"], + "path_alias": groupname, + "groupname": group["groupname"]["_content"]} + + def urls_lookupUser(self, username): + """Returns a user NSID, given the url to a user's photos or profile.""" + params = {"url": "https://www.flickr.com/photos/" + username} + user = self._call("urls.lookupUser", params)["user"] + return {"nsid": user["id"], + "path_alias": username, + "username": user["username"]["_content"]} + + def video_getStreamInfo(self, video_id, secret=None): + """Returns all available video streams""" + params = {"photo_id": video_id} + if not secret: + secret = self._call("photos.getInfo", params)["photo"]["secret"] + params["secret"] = secret + stream = self._call("video.getStreamInfo", params)["streams"]["stream"] + return max(stream, key=lambda s: self.VIDEO_FORMATS.get(s["type"], 0)) + + def _call(self, method, params): + params["method"] = "flickr." 
+ method + params["format"] = "json" + params["nojsoncallback"] = "1" + if self.api_key: + params["api_key"] = self.api_key + data = self.request(self.API_URL, params=params).json() + if "code" in data: + if data["code"] == 1: + raise exception.NotFoundError(self.extractor.subcategory) + elif data["code"] == 98: + raise exception.AuthenticationError(data.get("message")) + elif data["code"] == 99: + raise exception.AuthorizationError() + self.log.error("API call failed: %s", data.get("message")) + raise exception.StopExtraction() + return data + + def _pagination(self, method, params, key="photos"): + params["extras"] = "description,date_upload,tags,views,media," + params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats) + params["page"] = 1 + + while True: + data = self._call(method, params)[key] + yield from map(self._extract_format, data["photo"]) + if params["page"] >= data["pages"]: + return + params["page"] += 1 + + def _pagination_sets(self, method, params): + params["page"] = 1 + + while True: + data = self._call(method, params)["photosets"] + yield from data["photoset"] + if params["page"] >= data["pages"]: + return + params["page"] += 1 + + def _extract_format(self, photo): + photo["description"] = photo["description"]["_content"].strip() + photo["views"] = text.parse_int(photo["views"]) + photo["date"] = text.parse_timestamp(photo["dateupload"]) + photo["tags"] = photo["tags"].split() + photo["id"] = text.parse_int(photo["id"]) + + if photo["media"] == "video" and self.videos: + return self._extract_video(photo) + + for fmt, fmtname, fmtwidth in self.formats: + key = "url_" + fmt + if key in photo: + photo["width"] = text.parse_int(photo["width_" + fmt]) + photo["height"] = text.parse_int(photo["height_" + fmt]) + if self.maxsize and (photo["width"] > self.maxsize or + photo["height"] > self.maxsize): + continue + photo["url"] = photo[key] + photo["label"] = fmtname + + # remove excess data + keys = [ + key for key in photo + if key.startswith(("url_", "width_", "height_")) + ] + for key in keys: + del photo[key] + break + else: + self._extract_photo(photo) + + return photo + + def _extract_photo(self, photo): + size = self.photos_getSizes(photo["id"])[-1] + photo["url"] = size["source"] + photo["label"] = size["label"] + photo["width"] = text.parse_int(size["width"]) + photo["height"] = text.parse_int(size["height"]) + return photo + + def _extract_video(self, photo): + stream = self.video_getStreamInfo(photo["id"], photo.get("secret")) + photo["url"] = stream["_content"] + photo["label"] = stream["type"] + photo["width"] = photo["height"] = 0 + return photo + + @staticmethod + def _clean_info(info): + info["title"] = info["title"]["_content"] + info["description"] = info["description"]["_content"] + return info diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py new file mode 100644 index 0000000..5f4c5b8 --- /dev/null +++ b/gallery_dl/extractor/foolfuuka.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for 4chan archives based on FoolFuuka""" + +from .common import Extractor, Message, SharedConfigMixin, generate_extractors +from .. 
import text +import itertools +import operator + + +class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor): + """Base extractor for FoolFuuka based boards/archives""" + basecategory = "foolfuuka" + subcategory = "thread" + directory_fmt = ("{category}", "{board[shortname]}", + "{thread_num}{title:? - //}") + filename_fmt = "{media[media]}" + archive_fmt = "{board[shortname]}_{num}_{timestamp}" + pattern_fmt = r"/([^/]+)/thread/(\d+)" + external = "default" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + self.session.headers["Referer"] = self.root + if self.external == "direct": + self.remote = self._remote_direct + + def items(self): + op = True + yield Message.Version, 1 + for post in self.posts(): + if op: + yield Message.Directory, post + op = False + if not post["media"]: + continue + + media = post["media"] + url = media["media_link"] + + if not url and "remote_media_link" in media: + url = self.remote(media) + if url.startswith("/"): + url = self.root + url + + post["extension"] = url.rpartition(".")[2] + yield Message.Url, url, post + + def posts(self): + """Return an iterable with all posts in this thread""" + url = self.root + "/_/api/chan/thread/" + params = {"board": self.board, "num": self.thread} + data = self.request(url, params=params).json()[self.thread] + + # sort post-objects by key + posts = sorted(data.get("posts", {}).items()) + posts = map(operator.itemgetter(1), posts) + + return itertools.chain((data["op"],), posts) + + def remote(self, media): + """Resolve a remote media link""" + needle = '<meta http-equiv="Refresh" content="0; url=' + page = self.request(media["remote_media_link"]).text + return text.extract(page, needle, '"')[0] + + @staticmethod + def _remote_direct(media): + return media["remote_media_link"] + + +EXTRACTORS = { + "4plebs": { + "name": "fourplebs", + "root": "https://archive.4plebs.org", + "pattern": r"(?:archive\.)?4plebs\.org", + "test-thread": ("https://archive.4plebs.org/tg/thread/54059290", { + "url": "07452944164b602502b02b24521f8cee5c484d2a", + }), + }, + "archivedmoe": { + "root": "https://archived.moe", + "test-thread": ( + ("https://archived.moe/gd/thread/309639/", { + "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8", + "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", + }), + ("https://archived.moe/a/thread/159767162/", { + "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87", + }), + ), + }, + "archiveofsins": { + "root": "https://archiveofsins.com", + "pattern": r"(?:www\.)?archiveofsins\.com", + "test-thread": ("https://archiveofsins.com/h/thread/4668813/", { + "url": "f612d287087e10a228ef69517cf811539db9a102", + "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4", + }), + }, + "b4k": { + "root": "https://arch.b4k.co", + "extra": {"external": "direct"}, + "test-thread": ("https://arch.b4k.co/meta/thread/196/", { + "url": "9b0ae01292133268fe9178b71332da1ee25b7704", + }), + }, + "desuarchive": { + "root": "https://desuarchive.org", + "test-thread": ("https://desuarchive.org/a/thread/159542679/", { + "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406", + }), + }, + "fireden": { + "root": "https://boards.fireden.net", + "test-thread": ("https://boards.fireden.net/a/thread/159803223/", { + "url": "01b7baacfb0656a68e566368290e3072b27f86c9", + }), + }, + "nyafuu": { + "root": "https://archive.nyafuu.org", + "pattern": r"(?:archive\.)?nyafuu\.org", + "test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", { + "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f", + 
}), + }, + "rbt": { + "root": "https://rbt.asia", + "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)", + "test-thread": ( + ("https://rbt.asia/g/thread/61487650/", { + "url": "61896d9d9a2edb556b619000a308a984307b6d30", + }), + ("https://archive.rebeccablacktech.com/g/thread/61487650/", { + "url": "61896d9d9a2edb556b619000a308a984307b6d30", + }), + ), + }, + "thebarchive": { + "root": "https://thebarchive.com", + "pattern": r"thebarchive\.com", + "test-thread": ("https://thebarchive.com/b/thread/739772332/", { + "url": "e8b18001307d130d67db31740ce57c8561b5d80c", + }), + }, +} + +generate_extractors(EXTRACTORS, globals(), ( + FoolfuukaThreadExtractor, +)) diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py new file mode 100644 index 0000000..14baa36 --- /dev/null +++ b/gallery_dl/extractor/foolslide.py @@ -0,0 +1,240 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for FoOlSlide based sites""" + +from .common import ( + Extractor, + ChapterExtractor, + MangaExtractor, + SharedConfigMixin, + Message, + generate_extractors, +) +from .. import text, util +import base64 +import json + + +class FoolslideBase(SharedConfigMixin): + """Base class for FoOlSlide extractors""" + basecategory = "foolslide" + + def request(self, url): + return Extractor.request( + self, url, encoding="utf-8", method="POST", data={"adult": "true"}) + + @staticmethod + def parse_chapter_url(url, data): + info = url.partition("/read/")[2].rstrip("/").split("/") + lang = info[1].partition("-")[0] + data["lang"] = lang + data["language"] = util.code_to_language(lang) + data["volume"] = text.parse_int(info[2]) + data["chapter"] = text.parse_int(info[3]) + data["chapter_minor"] = "." 
+ info[4] if len(info) >= 5 else "" + data["title"] = data["chapter_string"].partition(":")[2].strip() + return data + + +class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor): + """Base class for chapter extractors for FoOlSlide based sites""" + directory_fmt = ( + "{category}", "{manga}", "{chapter_string}") + archive_fmt = "{id}" + pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)" + decode = "default" + + def items(self): + page = self.request(self.chapter_url).text + data = self.metadata(page) + imgs = self.images(page) + + data["count"] = len(imgs) + data["chapter_id"] = text.parse_int(imgs[0]["chapter_id"]) + + yield Message.Version, 1 + yield Message.Directory, data + for data["page"], image in enumerate(imgs, 1): + try: + url = image["url"] + del image["url"] + del image["chapter_id"] + del image["thumb_url"] + except KeyError: + pass + for key in ("height", "id", "size", "width"): + image[key] = text.parse_int(image[key]) + data.update(image) + text.nameext_from_url(data["filename"], data) + yield Message.Url, url, data + + def metadata(self, page): + extr = text.extract_from(page) + extr('<h1 class="tbtitle dnone">', '') + return self.parse_chapter_url(self.chapter_url, { + "manga" : text.unescape(extr('title="', '"')).strip(), + "chapter_string": text.unescape(extr('title="', '"')), + }) + + def images(self, page): + if self.decode == "base64": + base64_data = text.extract(page, 'atob("', '"')[0].encode() + data = base64.b64decode(base64_data).decode() + elif self.decode == "double": + pos = page.find("[{") + data = text.extract(page, " = ", ";", pos)[0] + else: + data = text.extract(page, "var pages = ", ";")[0] + return json.loads(data) + + +class FoolslideMangaExtractor(FoolslideBase, MangaExtractor): + """Base class for manga extractors for FoOlSlide based sites""" + pattern_fmt = r"(/series/[^/?&#]+)" + + def chapters(self, page): + extr = text.extract_from(page) + manga = text.unescape(extr('<h1 class="title">', '</h1>')).strip() + author = extr('<b>Author</b>: ', '<br') + artist = extr('<b>Artist</b>: ', '<br') + + results = [] + while True: + url = extr('<div class="title"><a href="', '"') + if not url: + return results + results.append((url, self.parse_chapter_url(url, { + "manga": manga, "author": author, "artist": artist, + "chapter_string": extr('title="', '"'), + "group" : extr('title="', '"'), + }))) + + +EXTRACTORS = { + "dokireader": { + "root": "https://kobato.hologfx.com/reader", + "test-chapter": + (("https://kobato.hologfx.com/reader/read/" + "hitoribocchi_no_oo_seikatsu/en/3/34"), { + "keyword": "6e719ac86f0c6dab89390dd7e507e678459e0dbc", + }), + "test-manga": + (("https://kobato.hologfx.com/reader/series/" + "boku_ha_ohimesama_ni_narenai/"), { + "url": "1c1f5a7258ce4f631f5fc32be548d78a6a57990d", + "keyword": "614d89a6045b85c822cbd3e67578ea7577dfc995", + }), + }, + "jaiminisbox": { + "root": "https://jaiminisbox.com/reader", + "pattern": r"(?:www\.)?jaiminisbox\.com/reader", + "extra": {"decode": "base64"}, + "test-chapter": ( + ("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", { + "keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673", + }), + ("https://jaiminisbox.com/reader/read/dr-stone/en/0/16/", { + "keyword": "8607375c24b1d0db7f52d059ef5baff793aa458e", + }), + ), + "test-manga": + ("https://jaiminisbox.com/reader/series/sora_no_kian/", { + "url": "66612be177dc3b3fa1d1f537ef02f4f701b163ea", + "keyword": "0908a4145bb03acc4210f5d01169988969f5acd1", + }), + }, + "kireicake": { + "root": "https://reader.kireicake.com", + 
"test-chapter": + ("https://reader.kireicake.com/read/wonderland/en/1/1/", { + "url": "b2d36bc0bc67e4c461c3a4d6444a2fd339f5d07e", + "keyword": "9f80947920a325e33aea7f5cd69ea669171903b6", + }), + "test-manga": + ("https://reader.kireicake.com/series/wonderland/", { + "url": "d067b649af1cc88fa8c8b698fde04a10909fd169", + "keyword": "268f43772fb239888ca5c5f6a4f65f99ffb3eefb", + }), + }, + "powermanga": { + "root": "https://read.powermanga.org", + "pattern": r"read(?:er)?\.powermanga\.org", + "test-chapter": + (("https://read.powermanga.org" + "/read/one_piece_digital_colour_comics/en/0/75/"), { + "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384", + "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe", + }), + "test-manga": + (("https://read.powermanga.org" + "/series/one_piece_digital_colour_comics/"), { + "count": ">= 1", + "keyword": { + "chapter": int, + "chapter_minor": str, + "chapter_string": str, + "group": "PowerManga", + "lang": "en", + "language": "English", + "manga": "One Piece Digital Colour Comics", + "title": str, + "volume": int, + }, + }), + }, + "sensescans": { + "root": "http://sensescans.com/reader", + "pattern": r"(?:(?:www\.)?sensescans\.com/reader" + r"|reader\.sensescans\.com)", + "test-chapter": ( + (("http://sensescans.com/reader/read/" + "magi__labyrinth_of_magic/en/37/369/"), { + "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812", + "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988", + }), + (("http://reader.sensescans.com/read/" + "magi__labyrinth_of_magic/en/37/369/"), { + "url": "a399ef037cdfbc25b09d435cc2ea1e3e454a6812", + "keyword": "07acd84fb18a9f1fd6dff5befe711bcca0ff9988", + }), + ), + "test-manga": + ("http://sensescans.com/reader/series/hakkenden/", { + "url": "2360ccb0ead0ff2f5e27b7aef7eb17b9329de2f2", + "keyword": "4919f2bfed38e3a34dc984ec8d1dbd7a03044e23", + }), + }, + "worldthree": { + "root": "http://www.slide.world-three.org", + "pattern": r"(?:www\.)?slide\.world-three\.org", + "test-chapter": ( + (("http://www.slide.world-three.org" + "/read/black_bullet/en/2/7/page/1"), { + "url": "be2f04f6e2d311b35188094cfd3e768583271584", + "keyword": "967d536a65de4d52478d5b666a1760b181eddb6e", + }), + (("http://www.slide.world-three.org" + "/read/idolmster_cg_shuffle/en/0/4/2/"), { + "url": "6028ea5ca282744f925dfad92eeb98509f9cc78c", + "keyword": "f3cfe2ad3388991f1d045c85d0fa94795a7694dc", + }), + ), + "test-manga": + ("http://www.slide.world-three.org/series/black_bullet/", { + "url": "5743b93512d26e6b540d90a7a5d69208b6d4a738", + "keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120", + }), + }, + "_ckey": "chapterclass", +} + +generate_extractors(EXTRACTORS, globals(), ( + FoolslideChapterExtractor, + FoolslideMangaExtractor, +)) diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py new file mode 100644 index 0000000..15bd0a8 --- /dev/null +++ b/gallery_dl/extractor/gelbooru.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://gelbooru.com/""" + +from . import booru +from .common import Message +from .. 
import text, util + + +class GelbooruExtractor(booru.XmlParserMixin, + booru.GelbooruPageMixin, + booru.BooruExtractor): + """Base class for gelbooru extractors""" + category = "gelbooru" + api_url = "https://gelbooru.com/index.php" + post_url = "https://gelbooru.com/index.php?page=post&s=view&id={}" + pool_url = "https://gelbooru.com/index.php?page=pool&s=show&id={}" + + def __init__(self, match): + super().__init__(match) + + self.use_api = self.config("api", True) + if self.use_api: + self.params.update({"page": "dapi", "s": "post", "q": "index"}) + else: + self.items = self.items_noapi + + def items_noapi(self): + data = self.get_metadata() + + yield Message.Version, 1 + yield Message.Directory, data + + for post in self.get_posts(): + post = self.get_post_data(post) + url = post["file_url"] + post.update(data) + yield Message.Url, url, text.nameext_from_url(url, post) + + def get_posts(self): + """Return an iterable containing all relevant post objects""" + + def get_post_data(self, post_id): + """Extract metadata of a single post""" + page = self.request(self.post_url.format(post_id)).text + data = text.extract_all(page, ( + (None , '<meta name="keywords"', ''), + ("tags" , ' imageboard, ', '"'), + ("id" , '<li>Id: ', '<'), + ("created_at", '<li>Posted: ', '<'), + ("width" , '<li>Size: ', 'x'), + ("height" , '', '<'), + ("source" , '<li>Source: <a href="', '"'), + ("rating" , '<li>Rating: ', '<'), + (None , '<li>Score: ', ''), + ("score" , '>', '<'), + ("file_url" , '<li><a href="http', '"'), + ("change" , ' id="lupdated" value="', '"'), + ))[0] + data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1) + data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0] + data["rating"] = (data["rating"] or "?")[0].lower() + data["tags"] = " ".join( + [tag.replace(" ", "_") for tag in data["tags"].split(", ")]) + if self.extags: + self.extended_tags(data, page) + return data + + +class GelbooruTagExtractor(booru.TagMixin, GelbooruExtractor): + """Extractor for images from gelbooru.com based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" + r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") + test = ( + ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", { + "count": 5, + }), + ("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", { + "options": (("api", False),), + "count": 5, + }), + ) + + def __init__(self, match): + super().__init__(match) + if not self.use_api: + self.per_page = 42 + + def get_posts(self): + url = "https://gelbooru.com/index.php?page=post&s=list" + params = {"tags": self.tags, "pid": self.page_start * self.per_page} + + while True: + page = self.request(url, params=params).text + ids = list(text.extract_iter(page, '<a id="p', '"')) + yield from ids + if len(ids) < self.per_page: + return + params["pid"] += self.per_page + + +class GelbooruPoolExtractor(booru.GelbooruPoolMixin, GelbooruExtractor): + """Extractor for image-pools from gelbooru.com""" + pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" + r"\?page=pool&s=show&id=(?P<pool>\d+)") + test = ("https://gelbooru.com/index.php?page=pool&s=show&id=761", { + "count": 6, + }) + + def get_posts(self): + return util.advance(self.posts, self.page_start) + + +class GelbooruPostExtractor(booru.PostMixin, GelbooruExtractor): + """Extractor for single images from gelbooru.com""" + pattern = (r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?" 
+ r"\?page=post&s=view&id=(?P<post>\d+)") + test = ("https://gelbooru.com/index.php?page=post&s=view&id=313638", { + "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "count": 1, + }) + + def get_posts(self): + return (self.post,) diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py new file mode 100644 index 0000000..1dcb3c8 --- /dev/null +++ b/gallery_dl/extractor/gfycat.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://gfycat.com/""" + +from .common import Extractor, Message + + +class GfycatExtractor(Extractor): + """Base class for gfycat extractors""" + category = "gfycat" + filename_fmt = "{category}_{gfyName}.{extension}" + archive_fmt = "{gfyName}" + root = "https://gfycat.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.formats = (self.config("format", "mp4"), "mp4", "webm", "gif") + + def _select_format(self, gfyitem): + for fmt in self.formats: + key = fmt + "Url" + if key in gfyitem: + url = gfyitem[key] + gfyitem["extension"] = url.rpartition(".")[2] + return url + return "" + + def _get_info(self, gfycat_id): + url = "https://api.gfycat.com/v1/gfycats/" + gfycat_id + return self.request(url).json()["gfyItem"] + + +class GfycatImageExtractor(GfycatExtractor): + """Extractor for individual images from gfycat.com""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:\w+\.)?gfycat\.com" + r"/(?:gifs/detail/|\w+/)?([A-Za-z]+)") + test = ( + ("https://gfycat.com/GrayGenerousCowrie", { + "url": "e0b5e1d7223108249b15c3c7898dd358dbfae045", + "content": "5786028e04b155baa20b87c5f4f77453cd5edc37", + "keyword": { + "gfyId": "graygenerouscowrie", + "gfyName": "GrayGenerousCowrie", + "gfyNumber": "755075459", + "title": "Bottom's up", + "userName": "jackson3oh3", + "createDate": 1495884169, + "md5": "a4796e05b0db9ba9ce5140145cd318aa", + "width": 400, + "height": 224, + "frameRate": 23, + "numFrames": 158, + "views": int, + }, + }), + (("https://thumbs.gfycat.com/SillyLameIsabellinewheatear" + "-size_restricted.gif"), { + "url": "13b32e6cc169d086577d7dd3fd36ee6cdbc02726", + }), + ("https://gfycat.com/detail/UnequaledHastyAnkole?tagname=aww", { + "url": "e24c9f69897fd223343782425a429c5cab6a768e", + }), + ("https://gfycat.com/gifs/detail/UnequaledHastyAnkole"), + ("https://gfycat.com/ifr/UnequaledHastyAnkole"), + ("https://gfycat.com/ru/UnequaledHastyAnkole"), + ) + + def __init__(self, match): + GfycatExtractor.__init__(self, match) + self.gfycat_id = match.group(1) + + def items(self): + gfyitem = self._get_info(self.gfycat_id) + yield Message.Version, 1 + yield Message.Directory, gfyitem + yield Message.Url, self._select_format(gfyitem), gfyitem diff --git a/gallery_dl/extractor/hbrowse.py b/gallery_dl/extractor/hbrowse.py new file mode 100644 index 0000000..01793dc --- /dev/null +++ b/gallery_dl/extractor/hbrowse.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.hbrowse.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text, exception +import json + + +class HbrowseBase(): + """Base class for hbrowse extractors""" + category = "hbrowse" + root = "https://www.hbrowse.com" + + def parse_page(self, page, data): + """Parse metadata on 'page' and add it to 'data'""" + data, pos = text.extract_all(page, ( + ('manga' , '<td class="listLong">', '</td>'), + ('artist', '<td class="listLong">', '</td>'), + ('total' , '<td class="listLong">', ' '), + ('origin', '<td class="listLong">', '</td>'), + ), values=data) + + if not data["manga"] and "<b>Warning</b>" in page: + msg = page.rpartition(">")[2].strip() + self.log.error("Site is not accessible: '%s'", msg) + raise exception.StopExtraction() + + tags = text.extract(page, 'class="listTable"', '</table>', pos)[0] + + data["manga"] = text.unescape(data["manga"]) + data["total"] = text.parse_int(data["total"]) + data["artist"] = text.remove_html(data["artist"]) + data["origin"] = text.remove_html(data["origin"]) + data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"')) + return data + + +class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor): + """Extractor for manga-chapters from hbrowse.com""" + directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}") + filename_fmt = ("{category}_{manga_id}_{chapter:>05}_" + "{page:>03}.{extension}") + archive_fmt = "{manga_id}_{chapter}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))" + test = ("https://www.hbrowse.com/10363/c00000", { + "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6", + "keyword": "274996f6c809e5250b6ff3abbc5147e29f89d9a5", + "content": "44578ebbe176c2c27434966aef22945787e2781e", + }) + + def __init__(self, match): + self.path, self.gid, self.chapter = match.groups() + self.path += "/" + ChapterExtractor.__init__(self, match) + + def metadata(self, page): + return self.parse_page(page, { + "manga_id": text.parse_int(self.gid), + "chapter": text.parse_int(self.chapter) + }) + + def images(self, page): + base = self.root + "/data" + self.path + json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]" + return [(base + name, None) for name in json.loads(json_data)] + + +class HbrowseMangaExtractor(HbrowseBase, MangaExtractor): + """Extractor for manga from hbrowse.com""" + chapterclass = HbrowseChapterExtractor + reverse = False + pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$" + test = ("https://www.hbrowse.com/10363", { + "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6", + "keyword": "4b15fda1858a69de1fbf5afddfe47dd893397312", + }) + + def chapters(self, page): + results = [] + data = self.parse_page(page, { + "manga_id": text.parse_int( + self.manga_url.rstrip("/").rpartition("/")[2]) + }) + + pos = 0 + needle = '<td class="listMiddle">\n<a class="listLink" href="' + while True: + url, pos = text.extract(page, needle, '"', pos) + if not url: + return results + title, pos = text.extract(page, '>View ', '<', pos) + data["chapter"] = text.parse_int(url.rpartition("/")[2][1:]) + data["title"] = title + results.append((text.urljoin(self.root, url), data.copy())) diff --git a/gallery_dl/extractor/hentai2read.py b/gallery_dl/extractor/hentai2read.py new file mode 100644 index 0000000..354acbf --- /dev/null +++ b/gallery_dl/extractor/hentai2read.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
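Note on the extraction helpers used above: hbrowse's parse_page() and chapters(), like most extractors in this patch, walk the HTML with gallery_dl's cursor-based text.extract()/text.extract_all() helpers instead of a full HTML parser. Each call looks for the text between two delimiters starting at a given position and returns the match together with the position just past it, so successive calls move forward through the page. The following is only a rough illustrative sketch of that behaviour (the real helpers live in gallery_dl.text and handle more edge cases); the sample page string is made up.

    def extract(txt, begin, end, pos=0):
        # Return (text between 'begin' and 'end', position after 'end'),
        # or (None, unchanged position) if the delimiters are not found.
        try:
            first = txt.index(begin, pos) + len(begin)
            last = txt.index(end, first)
            return txt[first:last], last + len(end)
        except ValueError:
            return None, pos

    page = ('<td class="listLong">Example Manga</td>'
            '<td class="listLong">Example Artist</td>')
    manga, pos = extract(page, '<td class="listLong">', '</td>')        # 'Example Manga'
    artist, pos = extract(page, '<td class="listLong">', '</td>', pos)  # 'Example Artist'

Passing the returned position back into the next call is what lets a single pass over the page collect several fields in document order.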
+ +"""Extract hentai-manga from https://hentai2read.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +import json +import re + + +class Hentai2readBase(): + """Base class for hentai2read extractors""" + category = "hentai2read" + root = "https://hentai2read.com" + + +class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor): + """Extractor for a single manga chapter from hentai2read.com""" + archive_fmt = "{chapter_id}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+/(\d+))" + test = ("https://hentai2read.com/amazon_elixir/1/", { + "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", + "keyword": "ff84b8f751f0e4ee37717efc4332ff1db71951d9", + }) + + def __init__(self, match): + self.chapter = match.group(2) + ChapterExtractor.__init__(self, match) + + def metadata(self, page): + title, pos = text.extract(page, "<title>", "</title>") + manga_id, pos = text.extract(page, 'data-mid="', '"', pos) + chapter_id, pos = text.extract(page, 'data-cid="', '"', pos) + match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - " + r"(\d+): (.+) . Page 1 ", title) + return { + "manga": match.group(1), + "manga_id": text.parse_int(manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), + "type": match.group(2), + "author": match.group(3), + "title": match.group(5), + "lang": "en", + "language": "English", + } + + @staticmethod + def images(page): + images = text.extract(page, "'images' : ", ",\n")[0] + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] + + +class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor): + """Extractor for hmanga from hentai2read.com""" + chapterclass = Hentai2readChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+)/?$" + test = ( + ("https://hentai2read.com/amazon_elixir/", { + "url": "273073752d418ec887d7f7211e42b832e8c403ba", + "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac", + }), + ("https://hentai2read.com/oshikage_riot/", { + "url": "6595f920a3088a15c2819c502862d45f8eb6bea6", + "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36", + }), + ) + + def chapters(self, page): + results = [] + manga, pos = text.extract( + page, '<span itemprop="name">', '</span>') + mtype, pos = text.extract( + page, '<small class="text-danger">[', ']</small>', pos) + manga_id = text.parse_int(text.extract( + page, 'data-mid="', '"', pos)[0]) + + while True: + chapter_id, pos = text.extract(page, ' data-cid="', '"', pos) + if not chapter_id: + return results + _ , pos = text.extract(page, ' href="', '"', pos) + url, pos = text.extract(page, ' href="', '"', pos) + chapter, pos = text.extract(page, '>', '<', pos) + + chapter, _, title = text.unescape(chapter).strip().partition(" - ") + results.append((url, { + "manga_id": manga_id, "manga": manga, "type": mtype, + "chapter_id": text.parse_int(chapter_id), + "chapter": text.parse_int(chapter), + "title": title, "lang": "en", "language": "English", + })) diff --git a/gallery_dl/extractor/hentaicafe.py b/gallery_dl/extractor/hentaicafe.py new file mode 100644 index 0000000..e95467b --- /dev/null +++ b/gallery_dl/extractor/hentaicafe.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extractors for https://hentai.cafe/""" + +from . import foolslide +from .. import text +from ..cache import memcache +import re + + +class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor): + """Extractor for manga-chapters from hentai.cafe""" + category = "hentaicafe" + directory_fmt = ("{category}", "{manga}") + pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe" + r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)") + test = ("https://hentai.cafe/manga/read/saitom-box/en/0/1/", { + "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2", + "keyword": "6913608267d883c82b887303b9ced13821188329", + }) + root = "https://hentai.cafe" + + def metadata(self, page): + info = text.unescape(text.extract(page, '<title>', '</title>')[0]) + manga, _, chapter_string = info.partition(" :: ") + + data = self._data(self.chapter_url.split("/")[5]) + data["manga"] = manga + data["chapter_string"] = chapter_string.rstrip(" :") + return self.parse_chapter_url(self.chapter_url, data) + + @memcache(keyarg=1) + def _data(self, manga): + return {"artist": [], "tags": []} + + +class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor): + """Extractor for manga from hentai.cafe""" + category = "hentaicafe" + pattern = (r"(?:https?://)?" + r"(?:www\.)?hentai\.cafe" + r"((?:/manga/series)?/[^/?&#]+)/?$") + test = ( + # single chapter + ("https://hentai.cafe/hazuki-yuuto-summer-blues/", { + "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b", + "keyword": "eb9f98544098c961bd8cf5dbe69e6da51c4fb2f6", + }), + # multi-chapter + ("https://hentai.cafe/saitom-saitom-box/", { + "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", + "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb", + }), + # foolslide URL + ("https://hentai.cafe/manga/series/saitom-box/", { + "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076", + "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c", + }), + ) + root = "https://hentai.cafe" + reverse = False + chapterclass = HentaicafeChapterExtractor + + def chapters(self, page): + if "/manga/series/" in self.manga_url: + chapters = foolslide.FoolslideMangaExtractor.chapters(self, page) + chapters.reverse() + return chapters + + tags , pos = text.extract(page, "<p>Tags: ", "</br>") + artist, pos = text.extract(page, "\nArtists: ", "</br>", pos) + manga , pos = text.extract(page, "/manga/read/", "/", pos) + data = { + "tags" : text.split_html(tags)[::2], + "artist": text.split_html(artist), + } + HentaicafeChapterExtractor._data(manga).update(data) + + return [ + (url, data) + for url in re.findall( + r'<a +class="x-btn[^"]*" +href="([^"]+)"', page) + ] diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py new file mode 100644 index 0000000..d31f66f --- /dev/null +++ b/gallery_dl/extractor/hentaifoundry.py @@ -0,0 +1,264 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.hentai-foundry.com/""" + +from .common import Extractor, Message +from .. 
import text, util, exception + + +class HentaifoundryExtractor(Extractor): + """Base class for hentaifoundry extractors""" + category = "hentaifoundry" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{category}_{index}_{title}.{extension}" + archive_fmt = "{index}" + root = "https://www.hentai-foundry.com" + per_page = 25 + + def __init__(self, match, user="", page=1): + Extractor.__init__(self, match) + self.page_url = "" + self.user = user + self.start_post = 0 + self.start_page = text.parse_int(page, 1) + + def items(self): + data = self.get_job_metadata() + yield Message.Version, 1 + yield Message.Directory, data + + self.set_filters() + for page_url in util.advance(self.get_image_pages(), self.start_post): + url, image = self.get_image_metadata(page_url) + image.update(data) + yield Message.Url, url, image + + def skip(self, num): + pages, posts = divmod(num, self.per_page) + self.start_page += pages + self.start_post += posts + return num + + def get_job_metadata(self): + """Collect metadata for extractor-job""" + self.request(self.root + "/?enterAgree=1") + return {"user": self.user} + + def get_image_pages(self): + """Yield urls of all relevant image pages""" + num = self.start_page + + while True: + page = self.request("{}/page/{}".format(self.page_url, num)).text + yield from text.extract_iter(page, 'thumbTitle"><a href="', '"') + + if 'class="pager"' not in page or 'class="last hidden"' in page: + return + num += 1 + + def get_image_metadata(self, page_url): + """Collect url and metadata from an image page""" + page = self.request(text.urljoin(self.root, page_url)).text + index = page_url.rsplit("/", 2)[1] + title , pos = text.extract(page, '<title>', '</title>') + _ , pos = text.extract(page, 'id="picBox"', '', pos) + width , pos = text.extract(page, 'width="', '"', pos) + height, pos = text.extract(page, 'height="', '"', pos) + url , pos = text.extract(page, 'src="', '"', pos) + + title, _, artist = title.rpartition(" - ")[0].rpartition(" by ") + + data = text.nameext_from_url(url, { + "title": text.unescape(title), + "artist": text.unescape(artist), + "index": text.parse_int(index), + "width": text.parse_int(width), + "height": text.parse_int(height), + }) + if not data["extension"]: + data["extension"] = "jpg" + return text.urljoin(self.root, url), data + + def set_filters(self): + """Set site-internal filters to show all images""" + token = text.unquote(text.extract( + self.session.cookies["YII_CSRF_TOKEN"], "%22", "%22")[0]) + data = { + "YII_CSRF_TOKEN": token, + "rating_nudity": 3, + "rating_violence": 3, + "rating_profanity": 3, + "rating_racism": 3, + "rating_sex": 3, + "rating_spoilers": 3, + "rating_yaoi": 1, + "rating_yuri": 1, + "rating_teen": 1, + "rating_guro": 1, + "rating_furry": 1, + "rating_beast": 1, + "rating_male": 1, + "rating_female": 1, + "rating_futa": 1, + "rating_other": 1, + "rating_scat": 1, + "rating_incest": 1, + "rating_rape": 1, + "filter_media": "A", + "filter_order": "date_new", + "filter_type": 0, + } + url = self.root + "/site/filters" + self.request(url, method="POST", data=data) + + +class HentaifoundryUserExtractor(HentaifoundryExtractor): + """Extractor for all images of a hentai-foundry-user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/(?:pictures/user/([^/]+)(?:/page/(\d+))?/?$" + r"|user/([^/]+)/profile)") + test = ( + ("https://www.hentai-foundry.com/pictures/user/Tenpura", { + "url": "ebbc981a85073745e3ca64a0f2ab31fab967fc28", + "keyword": 
"63ad576f87f82fa166ca4676761762f7f8496cf5", + }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/page/3"), + ("https://www.hentai-foundry.com/user/Tenpura/profile"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__( + self, match, match.group(1) or match.group(3), match.group(2)) + self.page_url = "{}/pictures/user/{}".format(self.root, self.user) + + def get_job_metadata(self): + page = self.request(self.page_url + "?enterAgree=1").text + count = text.extract(page, ">Pictures (", ")")[0] + return {"user": self.user, "count": text.parse_int(count)} + + +class HentaifoundryScrapsExtractor(HentaifoundryExtractor): + """Extractor for scrap images of a hentai-foundry-user""" + subcategory = "scraps" + directory_fmt = ("{category}", "{user}", "Scraps") + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/pictures/user/([^/]+)/scraps(?:/page/(\d+))?") + test = ( + ("https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", { + "url": "00a11e30b73ff2b00a1fba0014f08d49da0a68ec", + "keyword": "410c6c900cfd23a8dd1e53dfcc97a79ea68c3359", + }), + ("https://www.hentai-foundry.com" + "/pictures/user/Evulchibi/scraps/page/3"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__( + self, match, match.group(1), match.group(2)) + self.page_url = "{}/pictures/user/{}/scraps".format( + self.root, self.user) + + def get_job_metadata(self): + page = self.request(self.page_url + "?enterAgree=1").text + count = text.extract(page, ">Scraps (", ")")[0] + return {"user": self.user, "count": text.parse_int(count)} + + +class HentaifoundryFavoriteExtractor(HentaifoundryExtractor): + """Extractor for favorite images of a hentai-foundry-user""" + subcategory = "favorite" + directory_fmt = ("{category}", "{user}", "Favorites") + archive_fmt = "f_{user}_{index}" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/user/([^/]+)/faves/pictures(?:/page/(\d+))?") + test = ( + ("https://www.hentai-foundry.com/user/Tenpura/faves/pictures", { + "url": "56f9ae2e89fe855e9fe1da9b81e5ec6212b0320b", + "keyword": "2b9478725e66d46ea043fa87476bbd28546958e7", + }), + ("https://www.hentai-foundry.com" + "/user/Tenpura/faves/pictures/page/3"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__( + self, match, match.group(1), match.group(2)) + self.page_url = "{}/user/{}/faves/pictures".format( + self.root, self.user) + + +class HentaifoundryRecentExtractor(HentaifoundryExtractor): + """Extractor for 'Recent Pictures' on hentaifoundry.com""" + subcategory = "recent" + directory_fmt = ("{category}", "Recent Pictures", "{date}") + archive_fmt = "r_{index}" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/pictures/recent/(\d+-\d+-\d+)(?:/page/(\d+))?") + test = ("http://www.hentai-foundry.com/pictures/recent/2018-09-20",) + + def __init__(self, match): + HentaifoundryExtractor.__init__(self, match, "", match.group(2)) + self.date = match.group(1) + self.page_url = "{}/pictures/recent/{}".format(self.root, self.date) + + def get_job_metadata(self): + self.request(self.root + "/?enterAgree=1") + return {"date": self.date} + + +class HentaifoundryPopularExtractor(HentaifoundryExtractor): + """Extractor for popular images on hentaifoundry.com""" + subcategory = "popular" + directory_fmt = ("{category}", "Popular Pictures") + archive_fmt = "p_{index}" + pattern = (r"(?:https?://)?(?:www\.)?hentai-foundry\.com" + r"/pictures/popular(?:/page/(\d+))?") + test = ("http://www.hentai-foundry.com/pictures/popular",) + + def __init__(self, match): 
+ HentaifoundryExtractor.__init__(self, match, "", match.group(1)) + self.page_url = self.root + "/pictures/popular" + + +class HentaifoundryImageExtractor(HentaifoundryExtractor): + """Extractor for a single image from hentaifoundry.com""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:www\.|pictures\.)?hentai-foundry\.com" + r"/(?:pictures/user|[^/])/([^/]+)/(\d+)") + test = ( + (("https://www.hentai-foundry.com" + "/pictures/user/Tenpura/407501/shimakaze"), { + "url": "fbf2fd74906738094e2575d2728e8dc3de18a8a3", + "keyword": "cbb9381e6c2acce58db4adf4efc0ad7d138bddc4", + "content": "91bf01497c39254b6dfb234a18e8f01629c77fd1", + }), + ("https://www.hentai-foundry.com/pictures/user/Tenpura/340853/", { + "exception": exception.HttpError, + }), + ("https://pictures.hentai-foundry.com" + "/t/Tenpura/407501/Tenpura-407501-shimakaze.png"), + ) + + def __init__(self, match): + HentaifoundryExtractor.__init__(self, match, match.group(1)) + self.index = match.group(2) + + def items(self): + post_url = "{}/pictures/user/{}/{}/?enterAgree=1".format( + self.root, self.user, self.index) + url, data = self.get_image_metadata(post_url) + data["user"] = self.user + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, url, data + + def skip(self, _): + return 0 diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py new file mode 100644 index 0000000..cf4871f --- /dev/null +++ b/gallery_dl/extractor/hentaifox.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hentaifox.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text + + +class HentaifoxBase(): + """Base class for hentaifox extractors""" + category = "hentaifox" + root = "https://hentaifox.com" + + +class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): + """Extractor for image galleries on hentaifox.com""" + pattern = r"(?:https?://)?(?:www\.)?hentaifox\.com(/gallery/(\d+))" + test = ("https://hentaifox.com/gallery/56622/", { + "pattern": r"https://i\d*\.hentaifox\.com/\d+/\d+/\d+\.jpg", + "count": 24, + "keyword": "38f8517605feb6854d48833297da6b05c6541b69", + }) + + def __init__(self, match): + GalleryExtractor.__init__(self, match) + self.gallery_id = match.group(2) + + def metadata(self, page, split=text.split_html): + extr = text.extract_from(page) + + return { + "gallery_id": text.parse_int(self.gallery_id), + "title" : text.unescape(extr("<h1>", "</h1>")), + "parody" : split(extr(">Parodies:" , "</a></span>"))[::2], + "characters": split(extr(">Characters:", "</a></span>"))[::2], + "tags" : split(extr(">Tags:" , "</a></span>"))[::2], + "artist" : split(extr(">Artists:" , "</a></span>"))[::2], + "group" : split(extr(">Groups:" , "</a></span>"))[::2], + "type" : text.remove_html(extr(">Category:", "</a></span>")), + "language" : "English", + "lang" : "en", + } + + def images(self, page): + return [ + (text.urljoin(self.root, url.replace("t.", ".")), None) + for url in text.extract_iter(page, 'data-src="', '"') + ] + + +class HentaifoxSearchExtractor(HentaifoxBase, Extractor): + """Extractor for search results and listings on hentaifox.com""" + subcategory = "search" + pattern = (r"(?:https?://)?(?:www\.)?hentaifox\.com" + r"(/(?:parody|tag|artist|character|search)/[^/?%#]+)") + test = ( + ("https://hentaifox.com/parody/touhou-project/"), + ("https://hentaifox.com/character/reimu-hakurei/"), + ("https://hentaifox.com/artist/distance/"), + ("https://hentaifox.com/search/touhou/"), + ("https://hentaifox.com/tag/full-colour/", { + "pattern": HentaifoxGalleryExtractor.pattern, + "count": ">= 40", + "keyword": { + "url": str, + "gallery_id": int, + "thumbnail": r"re:https://i\d*.hentaifox.com/\d+/\d+/thumb\.", + "title": str, + "tags": list, + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + + def items(self): + yield Message.Version, 1 + for gallery in self.galleries(): + yield Message.Queue, gallery["url"], gallery + + def galleries(self): + url = "{}/{}/".format(self.root, self.path) + + while True: + page = self.request(url).text + info, gpos = text.extract( + page, 'class="galleries_overview">', 'class="clear">') + + for ginfo in text.extract_iter(info, '<div class="item', '</a>'): + tags , pos = text.extract(ginfo, '', '"') + url , pos = text.extract(ginfo, 'href="', '"', pos) + title, pos = text.extract(ginfo, 'alt="', '"', pos) + thumb, pos = text.extract(ginfo, 'src="', '"', pos) + + yield { + "url": text.urljoin(self.root, url), + "gallery_id": text.parse_int( + url.strip("/").rpartition("/")[2]), + "thumbnail": text.urljoin(self.root, thumb), + "title": text.unescape(title), + "tags": tags.split(), + "_extractor": HentaifoxGalleryExtractor, + } + + pos = page.find('class="current"', gpos) + url = text.extract(page, 'href="', '"', pos)[0] + if pos == -1 or "/pag" not in url: + return + url = text.urljoin(self.root, url) diff --git a/gallery_dl/extractor/hentaihere.py b/gallery_dl/extractor/hentaihere.py new file mode 100644 index 0000000..8083a9b --- /dev/null +++ b/gallery_dl/extractor/hentaihere.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Copyright 
2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract hentai-manga from https://hentaihere.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text +import json +import re + + +class HentaihereBase(): + """Base class for hentaihere extractors""" + category = "hentaihere" + root = "https://hentaihere.com" + + +class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor): + """Extractor for a single manga chapter from hentaihere.com""" + archive_fmt = "{chapter_id}_{page}" + pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)" + test = ("https://hentaihere.com/m/S13812/1/1/", { + "url": "964b942cf492b3a129d2fe2608abfc475bc99e71", + "keyword": "cbcee0c0eb178c4b87f06a834085784f8dddad24", + }) + + def __init__(self, match): + self.manga_id, self.chapter = match.groups() + url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter) + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + title = text.extract(page, "<title>", "</title>")[0] + chapter_id = text.extract(page, 'report/C', '"')[0] + pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at " + match = re.match(pattern, title) + return { + "manga": match.group(1), + "manga_id": text.parse_int(self.manga_id), + "chapter": text.parse_int(self.chapter), + "chapter_id": text.parse_int(chapter_id), + "type": match.group(2), + "title": match.group(3), + "author": match.group(4), + "lang": "en", + "language": "English", + } + + @staticmethod + def images(page): + images = text.extract(page, "var rff_imageList = ", ";")[0] + return [ + ("https://hentaicdn.com/hentai" + part, None) + for part in json.loads(images) + ] + + +class HentaihereMangaExtractor(HentaihereBase, MangaExtractor): + """Extractor for hmanga from hentaihere.com""" + chapterclass = HentaihereChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com(/m/S\d+)/?$" + test = ( + ("https://hentaihere.com/m/S13812", { + "url": "d1ba6e28bb2162e844f8559c2b2725ba0a093559", + "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac", + }), + ("https://hentaihere.com/m/S7608", { + "url": "6c5239758dc93f6b1b4175922836c10391b174f7", + "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36", + }), + ) + + def chapters(self, page): + results = [] + manga_id = text.parse_int( + self.manga_url.rstrip("/").rpartition("/")[2][1:]) + manga, pos = text.extract( + page, '<span itemprop="name">', '</span>') + mtype, pos = text.extract( + page, '<span class="mngType text-danger">[', ']</span>', pos) + + while True: + marker, pos = text.extract( + page, '<li class="sub-chp clearfix">', '', pos) + if marker is None: + return results + url, pos = text.extract(page, '<a href="', '"', pos) + chapter, pos = text.extract(page, 'title="Tagged: -">\n', '<', pos) + chapter_id, pos = text.extract(page, '/C', '"', pos) + chapter, _, title = text.unescape(chapter).strip().partition(" - ") + results.append((url, { + "manga_id": manga_id, "manga": manga, "type": mtype, + "chapter_id": text.parse_int(chapter_id), + "chapter": text.parse_int(chapter), + "title": title, "lang": "en", "language": "English", + })) diff --git a/gallery_dl/extractor/hentainexus.py b/gallery_dl/extractor/hentainexus.py new file mode 100644 index 0000000..d875817 --- /dev/null +++ b/gallery_dl/extractor/hentainexus.py @@ -0,0 +1,96 @@ +# -*- coding: utf-8 -*- + +# 
Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hentainexus.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, util +import json + + +class HentainexusGalleryExtractor(GalleryExtractor): + """Extractor for image galleries on hentainexus.com""" + category = "hentainexus" + root = "https://hentainexus.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" + r"/(?:view|read)/(\d+)") + test = ( + ("https://hentainexus.com/view/5688", { + "url": "746d0043e20030f1171aae5ea113176607302517", + "keyword": "b05986369fbaf29cfa08b118960d92c49e59524b", + }), + ("https://hentainexus.com/read/5688"), + ) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/view/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + rmve = text.remove_html + extr = text.extract_from(page) + data = { + "gallery_id" : text.parse_int(self.gallery_id), + "tags" : extr('"og:description" content="', '"').split(", "), + "thumbnail" : extr('"og:image" content="', '"'), + "title" : extr('<h1 class="title">', '</h1>'), + "artist" : rmve(extr('viewcolumn">Artist</td>' , '</td>')), + "book" : rmve(extr('viewcolumn">Book</td>' , '</td>')), + "language" : rmve(extr('viewcolumn">Language</td>' , '</td>')), + "magazine" : rmve(extr('viewcolumn">Magazine</td>' , '</td>')), + "parody" : rmve(extr('viewcolumn">Parody</td>' , '</td>')), + "publisher" : rmve(extr('viewcolumn">Publisher</td>' , '</td>')), + "description": rmve(extr('viewcolumn">Description</td>', '</td>')), + } + data["lang"] = util.language_to_code(data["language"]) + return data + + def images(self, page): + url = "{}/read/{}".format(self.root, self.gallery_id) + extr = text.extract_from(self.request(url).text) + urls = extr("initReader(", "]") + "]" + return [(url, None) for url in json.loads(urls)] + + +class HentainexusSearchExtractor(Extractor): + """Extractor for search results on hentainexus.com""" + category = "hentainexus" + subcategory = "search" + root = "https://hentainexus.com" + pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com" + r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$") + test = ( + ("https://hentainexus.com/?q=tag:%22heart+pupils%22%20tag:group", { + "pattern": HentainexusGalleryExtractor.pattern, + "count": ">= 50", + }), + ("https://hentainexus.com/page/3?q=tag:%22heart+pupils%22"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.params = text.parse_query(match.group(1)) + + def items(self): + yield Message.Version, 1 + params = self.params + path = "/" + + while path: + page = self.request(self.root + path, params=params).text + extr = text.extract_from(page) + data = {"_extractor": HentainexusGalleryExtractor} + + while True: + gallery_id = extr('<a href="/view/', '"') + if not gallery_id: + break + yield Message.Queue, self.root + "/view/" + gallery_id, data + + path = extr('class="pagination-next" href="', '"') diff --git a/gallery_dl/extractor/hitomi.py b/gallery_dl/extractor/hitomi.py new file mode 100644 index 0000000..c112465 --- /dev/null +++ b/gallery_dl/extractor/hitomi.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License 
version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://hitomi.la/""" + +from .common import GalleryExtractor +from .. import text, util +import string + + +class HitomiGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from hitomi.la""" + category = "hitomi" + root = "https://hitomi.la" + pattern = r"(?:https?://)?hitomi\.la/(?:galleries|reader)/(\d+)" + test = ( + ("https://hitomi.la/galleries/867789.html", { + "url": "cb759868d090fe0e2655c3e29ebf146054322b6d", + "keyword": "067b5d9b9c0f98530cd5dd2444e0f5a5b4b00d38", + }), + ("https://hitomi.la/galleries/1036181.html", { + # "aa" subdomain for gallery-id ending in 1 (#142) + "pattern": r"https://aa\.hitomi\.la/", + }), + ("https://hitomi.la/galleries/1401410.html", { + # download test + "range": "1", + "content": "b3ca8c6c8cc5826cf8b4ceb7252943abad7b8b4c", + }), + ("https://hitomi.la/galleries/733697.html", { + # Game CG with scenes (#321) + "url": "c2a84185f467450b8b9b72fbe40c0649029ce007", + "count": 210, + }), + ("https://hitomi.la/reader/867789.html"), + ) + + def __init__(self, match): + self.gallery_id = text.parse_int(match.group(1)) + url = "{}/galleries/{}.html".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page, page.index('<h1><a href="/reader/')) + data = { + "gallery_id": self.gallery_id, + "title" : text.unescape(extr('.html">', '<').strip()), + "artist" : self._prep(extr('<h2>', '</h2>')), + "group" : self._prep(extr('<td>Group</td><td>', '</td>')), + "type" : self._prep_1(extr('<td>Type</td><td>', '</td>')), + "language" : self._prep_1(extr('<td>Language</td><td>', '</td>')), + "parody" : self._prep(extr('<td>Series</td><td>', '</td>')), + "characters": self._prep(extr('<td>Characters</td><td>', '</td>')), + "tags" : self._prep(extr('<td>Tags</td><td>', '</td>')), + "date" : self._date(extr('<span class="date">', '</span>')), + } + if data["language"] == "N/a": + data["language"] = None + data["lang"] = util.language_to_code(data["language"]) + return data + + def images(self, page): + # see https://ltn.hitomi.la/common.js + offset = self.gallery_id % 2 if self.gallery_id % 10 != 1 else 0 + subdomain = chr(97 + offset) + "a" + base = "https://" + subdomain + ".hitomi.la/galleries/" + + # set Referer header before image downloads (#239) + self.session.headers["Referer"] = self.chapter_url + + # handle Game CG galleries with scenes (#321) + scenes = text.extract(page, "var scene_indexes = [", "]")[0] + if scenes and scenes.strip(): + url = "{}/reader/{}.html".format(self.root, self.gallery_id) + page = self.request(url).text + begin, end = ">//g.hitomi.la/galleries/", "</div>" + else: + begin, end = "'//tn.hitomi.la/smalltn/", ".jpg'," + + return [ + (base + urlpart, None) + for urlpart in text.extract_iter(page, begin, end) + ] + + @staticmethod + def _prep(value): + return [ + text.unescape(string.capwords(v)) + for v in text.extract_iter(value or "", '.html">', '<') + ] + + @staticmethod + def _prep_1(value): + return text.remove_html(value).capitalize() + + @staticmethod + def _date(value): + return text.parse_datetime(value + ":00", "%Y-%m-%d %H:%M:%S%z") diff --git a/gallery_dl/extractor/hypnohub.py b/gallery_dl/extractor/hypnohub.py new file mode 100644 index 0000000..bf2db96 --- /dev/null +++ b/gallery_dl/extractor/hypnohub.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or 
modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://hypnohub.net/""" + +from . import booru + + +class HypnohubExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): + """Base class for hypnohub extractors""" + category = "hypnohub" + api_url = "https://hypnohub.net/post.json" + post_url = "https://hypnohub.net/post/show/{}" + + +class HypnohubTagExtractor(booru.TagMixin, HypnohubExtractor): + """Extractor for images from hypnohub.net based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net" + r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)") + test = ("https://hypnohub.net/post?tags=gonoike_biwa", { + "url": "6bebc4318489ee37e0c3b814352acd6783ba95d6", + }) + + +class HypnohubPoolExtractor(booru.PoolMixin, HypnohubExtractor): + """Extractor for image-pools from hypnohub.net""" + pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/pool/show/(?P<pool>\d+)" + test = ("https://hypnohub.net/pool/show/61", { + "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf", + }) + + +class HypnohubPostExtractor(booru.PostMixin, HypnohubExtractor): + """Extractor for single images from hypnohub.net""" + pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/post/show/(?P<post>\d+)" + test = ("https://hypnohub.net/post/show/73964", { + "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee", + "options": (("tags", True),), + "keyword": { + "tags_artist": "gonoike_biwa icontrol_(manipper)", + "tags_character": "komaru_naegi", + "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode", + "tags_general": str, + }, + }) + + +class HypnohubPopularExtractor(booru.MoebooruPopularMixin, HypnohubExtractor): + """Extractor for popular images from hypnohub.net""" + pattern = (r"(?:https?://)?(?:www\.)?hypnohub\.net" + r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" + r"(?:\?(?P<query>[^#]*))?") + test = ( + ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", { + "count": 20, + }), + ("https://hypnohub.net/post/popular_recent"), + ) + + def __init__(self, match): + super().__init__(match) + self.api_url = "https://hypnohub.net/post/popular_{scale}.json".format( + scale=self.scale) diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py new file mode 100644 index 0000000..dcb4a54 --- /dev/null +++ b/gallery_dl/extractor/idolcomplex.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://idol.sankakucomplex.com/""" + +from . 
import sankaku + + +class IdolcomplexExtractor(sankaku.SankakuExtractor): + """Base class for idolcomplex extractors""" + category = "idolcomplex" + cookiedomain = "idol.sankakucomplex.com" + subdomain = "idol" + + +class IdolcomplexTagExtractor(IdolcomplexExtractor, + sankaku.SankakuTagExtractor): + """Extractor for images from idol.sankakucomplex.com by search-tags""" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)" + test = ( + ("https://idol.sankakucomplex.com/?tags=lyumos+wreath", { + "count": ">= 6", + "pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" + r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", + }), + ("https://idol.sankakucomplex.com" + "/?tags=lyumos+wreath&page=3&next=694215"), + ) + + +class IdolcomplexPoolExtractor(IdolcomplexExtractor, + sankaku.SankakuPoolExtractor): + """Extractor for image-pools from idol.sankakucomplex.com""" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)" + test = ("https://idol.sankakucomplex.com/pool/show/145", { + "count": 3, + }) + + +class IdolcomplexPostExtractor(IdolcomplexExtractor, + sankaku.SankakuPostExtractor): + """Extractor for single images from idol.sankakucomplex.com""" + pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)" + test = ("https://idol.sankakucomplex.com/post/show/694215", { + "content": "694ec2491240787d75bf5d0c75d0082b53a85afd", + "options": (("tags", True),), + "keyword": { + "tags_character": "shani_(the_witcher)", + "tags_copyright": "the_witcher", + "tags_idol": str, + "tags_medium": str, + "tags_general": str, + }, + }) diff --git a/gallery_dl/extractor/imagebam.py b/gallery_dl/extractor/imagebam.py new file mode 100644 index 0000000..6980185 --- /dev/null +++ b/gallery_dl/extractor/imagebam.py @@ -0,0 +1,128 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from http://www.imagebam.com/""" + +from .common import Extractor, Message +from .. 
import text, exception + + +class ImagebamExtractor(Extractor): + """Base class for imagebam extractors""" + category = "imagebam" + root = "http://www.imagebam.com" + + def get_image_data(self, page_url, data): + """Fill 'data' and return image URL""" + page = self.request(page_url).text + image_url = text.extract(page, 'property="og:image" content="', '"')[0] + data["extension"] = image_url.rpartition(".")[2] + data["image_key"] = page_url.rpartition("/")[2] + data["image_id"] = data["image_key"][6:] + return image_url + + def request_page(self, url): + """Retrive the main part of a gallery page""" + page = self.request(text.urljoin(self.root, url)).text + return text.extract(page, "<fieldset>", "</fieldset>")[0] + + +class ImagebamGalleryExtractor(ImagebamExtractor): + """Extractor for image galleries from imagebam.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{title} - {gallery_key}") + filename_fmt = "{num:>03}-{image_key}.{extension}" + archive_fmt = "{gallery_key}_{image_key}" + pattern = r"(?:https?://)?(?:www\.)?imagebam\.com/gallery/([0-9a-z]+)" + test = ( + ("http://www.imagebam.com/gallery/adz2y0f9574bjpmonaismyrhtjgvey4o", { + "url": "fb01925129a1ff1941762eaa3a2783a66de6847f", + "keyword": "9e25b8827474ac93c54855e798d60aa3cbecbd7a", + "content": "596e6bfa157f2c7169805d50075c2986549973a8", + }), + ("http://www.imagebam.com/gallery/op9dwcklwdrrguibnkoe7jxgvig30o5p", { + # more than 100 images; see issue #219 + "count": 107, + "url": "f92ce5b17676b6ea69288f0aef26f4cdbea7fd8d", + }), + ("http://www.imagebam.com/gallery/gsl8teckymt4vbvx1stjkyk37j70va2c", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + ImagebamExtractor.__init__(self, match) + self.gallery_key = match.group(1) + + def items(self): + url = "{}/gallery/{}".format(self.root, self.gallery_key) + page = self.request_page(url) + if not page or ">Error<" in page: + raise exception.NotFoundError("gallery") + + data = self.get_metadata(page) + imgs = self.get_image_pages(page) + data["count"] = len(imgs) + data["gallery_key"] = self.gallery_key + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], page_url in enumerate(imgs, 1): + image_url = self.get_image_data(page_url, data) + yield Message.Url, image_url, data + + @staticmethod + def get_metadata(page): + """Return gallery metadata""" + return text.extract_all(page, ( + ("title" , "'> ", " <span "), + (None , "'>", "</span>"), + ("description", ":#FCFCFC;'>", "</div>"), + ))[0] + + def get_image_pages(self, page): + """Return a list of all image pages""" + pages = [] + while True: + pages.extend(text.extract_iter(page, "\n<a href='", "'")) + pos = page.find('"pagination_current"') + if pos > 0: + url = text.extract(page, "<a href='", "'", pos)[0] + if url: + page = self.request_page(url) + continue + return pages + + +class ImagebamImageExtractor(ImagebamExtractor): + """Extractor for single images from imagebam.com""" + subcategory = "image" + filename_fmt = "{image_key}.{extension}" + archive_fmt = "{image_key}" + pattern = (r"(?:https?://)?(?:\w+\.)?imagebam\.com" + r"/(?:image/|(?:[0-9a-f]{2}/){3})([0-9a-f]+)") + test = ( + ("http://www.imagebam.com/image/94d56c502511890", { + "url": "b384893c35a01a09c58018db71ddc4cf2480be95", + "keyword": "4263d4840007524129792b8587a562b5d20c2687", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ("http://images3.imagebam.com/1d/8c/44/94d56c502511890.png"), + ) + + def __init__(self, match): + ImagebamExtractor.__init__(self, match) + 
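        # group(1) is the image key, shared by viewer-page URLs and direct
        # image links alike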
self.image_key = match.group(1) + + def items(self): + page_url = "{}/image/{}".format(self.root, self.image_key) + data = {} + image_url = self.get_image_data(page_url, data) + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, image_url, data diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py new file mode 100644 index 0000000..152b631 --- /dev/null +++ b/gallery_dl/extractor/imagefap.py @@ -0,0 +1,195 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://imagefap.com/""" + +from .common import Extractor, Message +from .. import text +import json + + +class ImagefapExtractor(Extractor): + """Base class for imagefap extractors""" + category = "imagefap" + directory_fmt = ("{category}", "{gallery_id} {title}") + filename_fmt = "{category}_{gallery_id}_{filename}.{extension}" + archive_fmt = "{gallery_id}_{image_id}" + root = "https://www.imagefap.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.session.headers["Referer"] = self.root + + +class ImagefapGalleryExtractor(ImagefapExtractor): + """Extractor for image galleries from imagefap.com""" + subcategory = "gallery" + pattern = (r"(?:https?://)?(?:www\.)?imagefap\.com/" + r"(?:gallery\.php\?gid=|gallery/|pictures/)(\d+)") + test = ( + ("https://www.imagefap.com/pictures/7102714", { + "url": "268995eac5d01ddecd0fe58cfa9828390dc85a84", + "keyword": "b5bd65ab2ff574ed1639db9a43c7b1b8583c85ef", + "content": "694a0a57385980a6f90fbc296cadcd6c11ba2dab", + }), + ("https://www.imagefap.com/gallery/5486966", { + "url": "14906b4f0b8053d1d69bc730a325acb793cbc898", + "keyword": "ab90972f3527a2011478fabc621a2c99a541f752", + }), + ("https://www.imagefap.com/gallery.php?gid=7102714"), + ) + + def __init__(self, match): + ImagefapExtractor.__init__(self, match) + self.gid = match.group(1) + self.image_id = "" + + def items(self): + url = "{}/pictures/{}/".format(self.root, self.gid) + page = self.request(url).text + data = self.get_job_metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + for url, image in self.get_images(): + data.update(image) + yield Message.Url, url, data + + def get_job_metadata(self, page): + """Collect metadata for extractor-job""" + descr, pos = text.extract( + page, '<meta name="description" content="Browse ', '"') + count, pos = text.extract(page, ' 1 of ', ' pics"', pos) + self.image_id = text.extract(page, 'id="img_ed_', '"', pos)[0] + + title, _, descr = descr.partition(" porn picture gallery by ") + uploader, _, tags = descr.partition(" to see hottest ") + return { + "gallery_id": text.parse_int(self.gid), + "title": text.unescape(title), + "uploader": uploader, + "tags": tags[:-11].split(", "), + "count": text.parse_int(count), + } + + def get_images(self): + """Collect image-urls and -metadata""" + num = 0 + url = "{}/photo/{}/".format(self.root, self.image_id) + params = {"gid": self.gid, "idx": 0, "partial": "true"} + while True: + pos = 0 + page = self.request(url, params=params).text + for _ in range(24): + imgurl, pos = text.extract(page, '<a href="', '"', pos) + if not imgurl: + return + num += 1 + _, imgid, name = imgurl.rsplit("/", 2) + data = {"image_id": text.parse_int(imgid), "num": num} + yield imgurl, text.nameext_from_url(name, data) + params["idx"] += 24 + + +class 
ImagefapImageExtractor(ImagefapExtractor): + """Extractor for single images from imagefap.com""" + subcategory = "image" + pattern = r"(?:https?://)?(?:www\.)?imagefap\.com/photo/(\d+)" + test = ("https://www.imagefap.com/photo/1369341772/", { + "url": "b31ee405b61ff0450020a1bf11c0581ca9adb471", + "keyword": "eadaa8f8012298384996efd21cf1f9e9e0dddb9b", + }) + + def __init__(self, match): + ImagefapExtractor.__init__(self, match) + self.image_id = match.group(1) + + def items(self): + data = self.get_job_metadata() + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, data["url"], data + + def get_job_metadata(self): + """Collect metadata for extractor-job""" + url = "{}/photo/{}/".format(self.root, self.image_id) + page = self.request(url).text + info = json.loads(text.extract( + page, '<script type="application/ld+json">', '</script>')[0]) + parts = info["contentUrl"].rsplit("/", 3) + return text.nameext_from_url(parts[3], { + "url": info["contentUrl"], + "title": text.unescape(info["name"]), + "uploader": info["author"], + "date": info["datePublished"], + "width": text.parse_int(info["width"]), + "height": text.parse_int(info["height"]), + "gallery_id": text.parse_int(parts[1]), + "image_id": text.parse_int(parts[2]), + }) + + +class ImagefapUserExtractor(ImagefapExtractor): + """Extractor for all galleries from a user at imagefap.com""" + subcategory = "user" + categorytransfer = True + pattern = (r"(?:https?://)?(?:www\.)?imagefap\.com/" + r"(?:profile(?:\.php\?user=|/)([^/?&#]+)" + r"|usergallery\.php\?userid=(\d+))") + test = ( + ("https://www.imagefap.com/profile/LucyRae/galleries", { + "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd", + }), + ("https://www.imagefap.com/usergallery.php?userid=1862791", { + "url": "d941aa906f56a75972a7a5283030eb9a8d27a4fd", + }), + ("https://www.imagefap.com/profile.php?user=LucyRae"), + ) + + def __init__(self, match): + ImagefapExtractor.__init__(self, match) + self.user, self.user_id = match.groups() + + def items(self): + yield Message.Version, 1 + for gid, name in self.get_gallery_data(): + url = "{}/gallery/{}".format(self.root, gid) + data = { + "gallery_id": text.parse_int(gid), + "title": text.unescape(name), + "_extractor": ImagefapGalleryExtractor, + } + yield Message.Queue, url, data + + def get_gallery_data(self): + """Yield all gallery_ids of a specific user""" + folders = self.get_gallery_folders() + url = "{}/ajax_usergallery_folder.php".format(self.root) + params = {"userid": self.user_id} + for folder_id in folders: + params["id"] = folder_id + page = self.request(url, params=params).text + + pos = 0 + while True: + gid, pos = text.extract(page, '<a href="/gallery/', '"', pos) + if not gid: + break + name, pos = text.extract(page, "<b>", "<", pos) + yield gid, name + + def get_gallery_folders(self): + """Create a list of all folder_ids of a specific user""" + if self.user: + url = "{}/profile/{}/galleries".format(self.root, self.user) + else: + url = "{}/usergallery.php?userid={}".format( + self.root, self.user_id) + page = self.request(url).text + self.user_id, pos = text.extract(page, '?userid=', '"') + folders, pos = text.extract(page, ' id="tgl_all" value="', '"', pos) + return folders.split("|")[:-1] diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py new file mode 100644 index 0000000..954c1f0 --- /dev/null +++ b/gallery_dl/extractor/imagehosts.py @@ -0,0 +1,251 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you 
can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Collection of extractors for various imagehosts""" + +from .common import Extractor, Message, SharedConfigMixin +from .. import text, exception +from ..cache import memcache +from os.path import splitext + + +class ImagehostImageExtractor(SharedConfigMixin, Extractor): + """Base class for single-image extractors for various imagehosts""" + basecategory = "imagehost" + subcategory = "image" + archive_fmt = "{token}" + https = False + method = "post" + params = "simple" + cookies = None + encoding = None + + def __init__(self, match): + Extractor.__init__(self, match) + self.page_url = "http{}://{}".format( + "s" if self.https else "", match.group(1)) + self.token = match.group(2) + if self.params == "simple": + self.params = { + "imgContinue": "Continue+to+image+...+", + } + elif self.params == "complex": + self.params = { + "op": "view", + "id": self.token, + "pre": "1", + "adb": "1", + "next": "Continue+to+image+...+", + } + else: + self.params = {} + self.method = "get" + + def items(self): + page = self.request( + self.page_url, + method=self.method, + data=self.params, + cookies=self.cookies, + encoding=self.encoding, + ).text + + url, filename = self.get_info(page) + data = text.nameext_from_url(filename, {"token": self.token}) + if self.https and url.startswith("http:"): + url = "https:" + url[5:] + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, url, data + + def get_info(self, page): + """Find image-url and string to get filename from""" + + +class ImxtoImageExtractor(ImagehostImageExtractor): + """Extractor for single images from imx.to""" + category = "imxto" + pattern = (r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)" + r"/(?:i/|img-)(\w+)(\.html)?)") + test = ( + ("https://imx.to/i/1qdeva", { # new-style URL + "url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130", + "keyword": "1153a986c939d7aed599905588f5c940048bc517", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ("https://imx.to/img-57a2050547b97.html", { # old-style URL + "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", + "keyword": "fd2240aee77a21b8252d5b829a1f7e542f927f09", + "content": "54592f2635674c25677c6872db3709d343cdf92f", + }), + ("https://img.yt/img-57a2050547b97.html", { # img.yt domain + "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204", + }), + ("https://imx.to/img-57a2050547b98.html", { + "exception": exception.NotFoundError, + }), + ) + https = True + encoding = "utf-8" + + def __init__(self, match): + ImagehostImageExtractor.__init__(self, match) + if "/img-" in self.page_url: + self.page_url = self.page_url.replace("img.yt", "imx.to") + self.url_ext = True + else: + self.url_ext = False + + def get_info(self, page): + url, pos = text.extract( + page, '<div style="text-align:center;"><a href="', '"') + if not url: + raise exception.NotFoundError("image") + filename, pos = text.extract(page, ' title="', '"', pos) + if self.url_ext and filename: + filename += splitext(url)[1] + return url, filename or url + + +class AcidimgImageExtractor(ImagehostImageExtractor): + """Extractor for single images from acidimg.cc""" + category = "acidimg" + pattern = r"(?:https?://)?((?:www\.)?acidimg\.cc/img-([a-z0-9]+)\.html)" + test = ("https://acidimg.cc/img-5acb6b9de4640.html", { + "url": "f132a630006e8d84f52d59555191ed82b3b64c04", + "keyword": "a8bb9ab8b2f6844071945d31f8c6e04724051f37", + "content": 
"0c8768055e4e20e7c7259608b67799171b691140", + }) + https = True + encoding = "utf-8" + + def get_info(self, page): + url, pos = text.extract(page, "<img class='centred' src='", "'") + if not url: + raise exception.NotFoundError("image") + filename, pos = text.extract(page, " alt='", "'", pos) + return url, (filename + splitext(url)[1]) if filename else url + + +class ImagevenueImageExtractor(ImagehostImageExtractor): + """Extractor for single images from imagevenue.com""" + category = "imagevenue" + pattern = (r"(?:https?://)?(img\d+\.imagevenue\.com" + r"/img\.php\?image=(?:[a-z]+_)?(\d+)_[^&#]+)") + test = (("http://img28116.imagevenue.com/img.php" + "?image=th_52709_test_122_64lo.jpg"), { + "url": "46812995d557f2c6adf0ebd0e631e6e4e45facde", + "content": "59ec819cbd972dd9a71f25866fbfc416f2f215b3", + }) + params = None + + def get_info(self, page): + url = text.extract(page, "SRC='", "'")[0] + return text.urljoin(self.page_url, url), url + + +class ImagetwistImageExtractor(ImagehostImageExtractor): + """Extractor for single images from imagetwist.com""" + category = "imagetwist" + pattern = r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))" + test = ("https://imagetwist.com/4e46hv31tu0q/test.jpg", { + "url": "c999dc1a5dec0525ac9eb8c092f173dfe6dba0b0", + "keyword": "a9f2e01757ec96d4ee4752cbd8446ede80f7935e", + "content": "96b1fd099b06faad5879fce23a7e4eb8290d8810", + }) + https = True + params = None + + @property + @memcache(maxage=3*3600) + def cookies(self): + return self.request(self.page_url).cookies + + def get_info(self, page): + url , pos = text.extract(page, 'center;"><img src="', '"') + filename, pos = text.extract(page, ' alt="', '"', pos) + return url, filename + + +class ImgspiceImageExtractor(ImagehostImageExtractor): + """Extractor for single images from imgspice.com""" + category = "imgspice" + pattern = r"(?:https?://)?((?:www\.)?imgspice\.com/([^/?&#]+))" + test = ("https://imgspice.com/nwfwtpyog50y/test.png.html", { + "url": "b8c30a8f51ee1012959a4cfd46197fabf14de984", + "keyword": "100e310a19a2fa22d87e1bbc427ecb9f6501e0c0", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }) + https = True + params = None + + def get_info(self, page): + pos = page.find('id="imgpreview"') + if pos < 0: + raise exception.NotFoundError("image") + url , pos = text.extract(page, 'src="', '"', pos) + name, pos = text.extract(page, 'alt="', '"', pos) + return url, text.unescape(name) + + +class PixhostImageExtractor(ImagehostImageExtractor): + """Extractor for single images from pixhost.to""" + category = "pixhost" + pattern = (r"(?:https?://)?((?:www\.)?pixhost\.(?:to|org)" + r"/show/\d+/(\d+)_[^/?&#]+)") + test = ("https://pixhost.to/show/224/96246707_test-.png", { + "url": "8f3d41fdd2dbec4c844e5ee45bf49961fbd79c67", + "keyword": "ecefe2d5814286f9d1dff3d88d9bdc78dd456c5d", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }) + https = True + params = None + cookies = {"pixhostads": "1", "pixhosttest": "1"} + + def get_info(self, page): + url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"") + filename, pos = text.extract(page, "alt=\"", "\"", pos) + return url, filename + + +class PostimgImageExtractor(ImagehostImageExtractor): + """Extractor for single images from postimages.org""" + category = "postimg" + pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)" + r"/(?:image/)?([^/?&#]+)/?)") + test = ("https://postimg.cc/Wtn2b3hC", { + "url": "0794cfda9b8951a8ac3aa692472484200254ab86", + "keyword": 
"2d05808d04e4e83e33200db83521af06e3147a84", + "content": "cfaa8def53ed1a575e0c665c9d6d8cf2aac7a0ee", + }) + https = True + params = None + + def get_info(self, page): + url , pos = text.extract(page, 'id="main-image" src="', '"') + filename, pos = text.extract(page, 'class="imagename">', '<', pos) + return url, text.unescape(filename) + + +class TurboimagehostImageExtractor(ImagehostImageExtractor): + """Extractor for single images from www.turboimagehost.com""" + category = "turboimagehost" + pattern = (r"(?:https?://)?((?:www\.)?turboimagehost\.com" + r"/p/(\d+)/[^/?&#]+\.html)") + test = ("https://www.turboimagehost.com/p/39078423/test--.png.html", { + "url": "b94de43612318771ced924cb5085976f13b3b90e", + "keyword": "704757ca8825f51cec516ec44c1e627c1f2058ca", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }) + https = True + params = None + + def get_info(self, page): + url = text.extract(page, 'src="', '"', page.index("<img "))[0] + return url, url diff --git a/gallery_dl/extractor/imgbox.py b/gallery_dl/extractor/imgbox.py new file mode 100644 index 0000000..516ef18 --- /dev/null +++ b/gallery_dl/extractor/imgbox.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from galleries at https://imgbox.com/""" + +from .common import Extractor, Message, AsynchronousMixin +from .. import text, exception +import re + + +class ImgboxExtractor(Extractor): + """Base class for imgbox extractors""" + category = "imgbox" + root = "https://imgbox.com" + + def items(self): + data = self.get_job_metadata() + yield Message.Version, 1 + yield Message.Directory, data + + for image_key in self.get_image_keys(): + imgpage = self.request(self.root + "/" + image_key).text + imgdata = self.get_image_metadata(imgpage) + if imgdata["filename"]: + imgdata.update(data) + imgdata["image_key"] = image_key + text.nameext_from_url(imgdata["filename"], imgdata) + yield Message.Url, self.get_image_url(imgpage), imgdata + + @staticmethod + def get_job_metadata(): + """Collect metadata for extractor-job""" + return {} + + @staticmethod + def get_image_keys(): + """Return an iterable containing all image-keys""" + return [] + + @staticmethod + def get_image_metadata(page): + """Collect metadata for a downloadable file""" + return text.extract_all(page, ( + ("num" , '</a> ', ' of '), + (None , 'class="image-container"', ''), + ("filename" , ' title="', '"'), + ))[0] + + @staticmethod + def get_image_url(page): + """Extract download-url""" + pos = page.index(">Image</a>") + return text.extract(page, '<a href="', '"', pos)[0] + + +class ImgboxGalleryExtractor(AsynchronousMixin, ImgboxExtractor): + """Extractor for image galleries from imgbox.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{title} - {gallery_key}") + filename_fmt = "{num:>03}-{filename}.{extension}" + archive_fmt = "{gallery_key}_{image_key}" + pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/g/([A-Za-z0-9]{10})" + test = ( + ("https://imgbox.com/g/JaX5V5HX7g", { + "url": "678f0bca1251d810372326ea4f16582cafa800e4", + "keyword": "4b1e62820ac2c6205b7ad0b6322cc8e00dbe1b0c", + "content": "d20307dc8511ac24d688859c55abf2e2cc2dd3cc", + }), + ("https://imgbox.com/g/cUGEkRbdZZ", { + "url": "d839d47cbbbeb121f83c520072512f7e51f52107", + "keyword": "fb0427b87983197849fb2887905e758f3e50cb6e", + }), + 
("https://imgbox.com/g/JaX5V5HX7h", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + ImgboxExtractor.__init__(self, match) + self.gallery_key = match.group(1) + self.image_keys = [] + + def get_job_metadata(self): + page = self.request(self.root + "/g/" + self.gallery_key).text + if "The specified gallery could not be found." in page: + raise exception.NotFoundError("gallery") + self.image_keys = re.findall(r'<a href="/([^"]+)"><img alt="', page) + + title = text.extract(page, "<h1>", "</h1>")[0] + title, _, count = title.rpartition(" - ") + return { + "gallery_key": self.gallery_key, + "title": text.unescape(title), + "count": count[:-7], + } + + def get_image_keys(self): + return self.image_keys + + +class ImgboxImageExtractor(ImgboxExtractor): + """Extractor for single images from imgbox.com""" + subcategory = "image" + archive_fmt = "{image_key}" + pattern = r"(?:https?://)?(?:www\.)?imgbox\.com/([A-Za-z0-9]{8})" + test = ( + ("https://imgbox.com/qHhw7lpG", { + "url": "d931f675a9b848fa7cb9077d6c2b14eb07bdb80f", + "keyword": "dfc72310026b45f3feb4f9cada20c79b2575e1af", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + }), + ("https://imgbox.com/qHhw7lpH", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + ImgboxExtractor.__init__(self, match) + self.image_key = match.group(1) + + def get_image_keys(self): + return (self.image_key,) + + @staticmethod + def get_image_metadata(page): + data = ImgboxExtractor.get_image_metadata(page) + if not data["filename"]: + raise exception.NotFoundError("image") + return data diff --git a/gallery_dl/extractor/imgth.py b/gallery_dl/extractor/imgth.py new file mode 100644 index 0000000..a97f2e0 --- /dev/null +++ b/gallery_dl/extractor/imgth.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://imgth.com/""" + +from .common import Extractor, Message +from .. 
import text + + +class ImgthGalleryExtractor(Extractor): + """Extractor for image galleries from imgth.com""" + category = "imgth" + subcategory = "gallery" + directory_fmt = ("{category}", "{gallery_id} {title}") + filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{num}" + pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)" + test = ("http://imgth.com/gallery/37/wallpaper-anime", { + "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748", + "keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.gid = match.group(1) + self.url_base = "https://imgth.com/gallery/" + self.gid + "/g/page/" + + def items(self): + page = self.request(self.url_base + "0").text + data = self.metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], url in enumerate(self.images(page), 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + def images(self, page): + """Yield all image urls for this gallery""" + pnum = 0 + while True: + thumbs = text.extract(page, '<ul class="thumbnails">', '</ul>')[0] + for url in text.extract_iter(thumbs, '<img src="', '"'): + yield "https://imgth.com/images/" + url[24:] + if '<li class="next">' not in page: + return + pnum += 1 + page = self.request(self.url_base + str(pnum)).text + + def metadata(self, page): + """Collect metadata for extractor-job""" + return text.extract_all(page, ( + ("title", '<h1>', '</h1>'), + ("count", 'total of images in this gallery: ', ' '), + ("date" , 'created on ', ' by <'), + (None , 'href="/users/', ''), + ("user" , '>', '<'), + ), values={"gallery_id": self.gid})[0] diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py new file mode 100644 index 0000000..0468c0b --- /dev/null +++ b/gallery_dl/extractor/imgur.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://imgur.com/""" + +from .common import Extractor, Message +from .. 
import text, exception +import json + + +class ImgurExtractor(Extractor): + """Base class for imgur extractors""" + category = "imgur" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item_id = match.group(1) + self.mp4 = self.config("mp4", True) + + def _get_data(self, urlpart): + response = self.request("https://imgur.com/" + urlpart, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError(self.subcategory) + data = text.extract(response.text, "image : ", ",\n")[0] + return self._clean(json.loads(data)) + + def _prepare(self, image): + image["ext"] = image["ext"].partition("?")[0] + if image["ext"] == ".gif" and ( + (self.mp4 and image["prefer_video"]) or self.mp4 == "always"): + image["ext"] = ".mp4" + url = "https://i.imgur.com/" + image["hash"] + image["ext"] + image["extension"] = image["ext"][1:] + return url + + @staticmethod + def _clean(data): + try: + del data["adConfig"] + del data["isAd"] + except KeyError: + pass + return data + + +class ImgurImageExtractor(ImgurExtractor): + """Extractor for individual images from imgur.com""" + subcategory = "image" + filename_fmt = "{category}_{hash}{title:?_//}.{extension}" + archive_fmt = "{hash}" + pattern = (r"(?:https?://)?(?:www\.|[im]\.|)?imgur\.com" + r"/(?!gallery)(\w{7}|\w{5})[sbtmlh]?\.?") + test = ( + ("https://imgur.com/21yMxCS", { + "url": "6f2dcfb86815bdd72808c313e5f715610bc7b9b2", + "content": "0c8768055e4e20e7c7259608b67799171b691140", + "keyword": { + "animated": False, + "datetime": "2016-11-10 14:24:35", + "description": str, + "ext": ".png", + "extension": "png", + "hash": "21yMxCS", + "height": "32", + "is_moderated": False, + "is_safe": False, + "is_viral": 0, + "looping": False, + "mimetype": "image/png", + "name": None, + "prefer_video": False, + "size": 182, + "source": "", + "title": "Test", + "video_host": None, + "video_source": None, + "width": "64", + }, + }), + ("http://imgur.com/0gybAXR", { # gifv/mp4 video + "url": "a2220eb265a55b0c95e0d3d721ec7665460e3fd7", + "content": "a3c080e43f58f55243ab830569ba02309d59abfc", + }), + ("https://imgur.com/HjoXJAd", { # url ends with '.jpg?1' + "url": "73f361b50753ab25da64160aa50bc5d139480d45", + }), + ("https://imgur.com/zzzzzzz", { # not found + "exception": exception.NotFoundError, + }), + ("https://www.imgur.com/21yMxCS"), # www + ("https://m.imgur.com/21yMxCS"), # mobile + ("https://imgur.com/zxaY6"), # 5 character key + ("https://i.imgur.com/21yMxCS.png"), # direct link + ("https://i.imgur.com/21yMxCSh.png"), # direct link thumbnail + ("https://i.imgur.com/zxaY6.gif"), # direct link (short) + ("https://i.imgur.com/zxaY6s.gif"), # direct link (short; thumb) + ) + + def items(self): + image = self._get_data(self.item_id) + url = self._prepare(image) + + yield Message.Version, 1 + yield Message.Directory, image + yield Message.Url, url, image + + +class ImgurAlbumExtractor(ImgurExtractor): + """Extractor for image albums from imgur.com""" + subcategory = "album" + directory_fmt = ("{category}", "{album[hash]}{album[title]:? 
- //}") + filename_fmt = "{category}_{album[hash]}_{num:>03}_{hash}.{extension}" + archive_fmt = "{album[hash]}_{hash}" + pattern = (r"(?:https?://)?(?:www\.|m\.)?imgur\.com" + r"/(?:a|gallery|t/unmuted)/(\w{7}|\w{5})") + test = ( + ("https://imgur.com/a/TcBmP", { + "url": "ce3552f550a5b5316bd9c7ae02e21e39f30c0563", + "keyword": { + "album": { + "album_cover": "693j2Kr", + "album_description": None, + "cover": "693j2Kr", + "datetime": "2015-10-09 10:37:50", + "description": None, + "hash": "TcBmP", + "id": "TcBmP", + "is_album": True, + "num_images": "19", + "title": "138", + "title_clean": "TcBmP", + "views": str, + }, + "animated": bool, + "datetime": str, + "extension": str, + "hash": str, + "height": int, + "num": int, + "prefer_video": bool, + "size": int, + "title": str, + "width": int, + }, + }), + ("https://imgur.com/gallery/eD9CT", { # large album + "url": "4ee94de31ff26be416271bc0b1ea27b9349c9937", + }), + ("https://imgur.com/a/RhJXhVT/all", { # 7 character album hash + "url": "695ef0c950023362a0163ee5041796300db76674", + }), + ("https://imgur.com/t/unmuted/YMqBcua", { # unmuted URL + "url": "86b4747f8147cec7602f0214e267309af73a8655", + }), + ("https://imgur.com/a/TcBmQ", { + "exception": exception.NotFoundError, + }), + ("https://www.imgur.com/a/TcBmP"), # www + ("https://m.imgur.com/a/TcBmP"), # mobile + ) + + def items(self): + album = self._get_data("a/" + self.item_id + "/all") + images = album["album_images"]["images"] + del album["album_images"] + + if int(album["num_images"]) > len(images): + url = ("https://imgur.com/ajaxalbums/getimages/" + + self.item_id + "/hit.json") + images = self.request(url).json()["data"]["images"] + + yield Message.Version, 1 + yield Message.Directory, {"album": album, "count": len(images)} + for num, image in enumerate(images, 1): + url = self._prepare(image) + image["num"] = num + image["album"] = album + yield Message.Url, url, image diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py new file mode 100644 index 0000000..871236b --- /dev/null +++ b/gallery_dl/extractor/instagram.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Leonardo Taccari, Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.instagram.com/""" + +import hashlib +import json +from .common import Extractor, Message +from .. 
import text + + +class InstagramExtractor(Extractor): + """Base class for instagram extractors""" + category = "instagram" + directory_fmt = ("{category}", "{username}") + filename_fmt = "{sidecar_media_id:?/_/}{media_id}.{extension}" + archive_fmt = "{media_id}" + root = "https://www.instagram.com" + + def get_metadata(self): + return {} + + def items(self): + yield Message.Version, 1 + + metadata = self.get_metadata() + for data in self.instagrams(): + data.update(metadata) + yield Message.Directory, data + + if data['typename'] == 'GraphImage': + yield Message.Url, data['display_url'], \ + text.nameext_from_url(data['display_url'], data) + elif data['typename'] == 'GraphVideo': + yield Message.Url, \ + 'ytdl:{}/p/{}/'.format(self.root, data['shortcode']), data + + def _extract_shared_data(self, page): + return json.loads(text.extract(page, + 'window._sharedData = ', ';</script>')[0]) + + def _extract_postpage(self, url): + page = self.request(url).text + shared_data = self._extract_shared_data(page) + media = shared_data['entry_data']['PostPage'][0]['graphql']['shortcode_media'] + + common = { + 'date': text.parse_timestamp(media['taken_at_timestamp']), + 'likes': text.parse_int(media['edge_media_preview_like']['count']), + 'owner_id': media['owner']['id'], + 'username': media['owner']['username'], + 'fullname': media['owner']['full_name'], + 'description': text.parse_unicode_escapes('\n'.join( + edge['node']['text'] + for edge in media['edge_media_to_caption']['edges'] + )), + } + + medias = [] + if media['__typename'] == 'GraphSidecar': + yi = 0 + for n in media['edge_sidecar_to_children']['edges']: + children = n['node'] + media_data = { + 'media_id': children['id'], + 'shortcode': children['shortcode'], + 'typename': children['__typename'], + 'display_url': children['display_url'], + 'height': text.parse_int(children['dimensions']['height']), + 'width': text.parse_int(children['dimensions']['width']), + 'sidecar_media_id': media['id'], + 'sidecar_shortcode': media['shortcode'], + } + if children['__typename'] == 'GraphVideo': + media_data["_ytdl_index"] = yi + yi += 1 + media_data.update(common) + medias.append(media_data) + + else: + media_data = { + 'media_id': media['id'], + 'shortcode': media['shortcode'], + 'typename': media['__typename'], + 'display_url': media['display_url'], + 'height': text.parse_int(media['dimensions']['height']), + 'width': text.parse_int(media['dimensions']['width']), + } + media_data.update(common) + medias.append(media_data) + + return medias + + def _extract_page(self, url, page_type): + shared_data_fields = { + 'ProfilePage': { + 'node': 'user', + 'node_id': 'id', + 'edge_to_medias': 'edge_owner_to_timeline_media', + 'variables_id': 'id', + 'query_hash': '66eb9403e44cc12e5b5ecda48b667d41', + }, + 'TagPage': { + 'node': 'hashtag', + 'node_id': 'name', + 'edge_to_medias': 'edge_hashtag_to_media', + 'variables_id': 'tag_name', + 'query_hash': 'f92f56d47dc7a55b606908374b43a314', + }, + } + + page = self.request(url).text + shared_data = self._extract_shared_data(page) + psdf = shared_data_fields[page_type] + + while True: + # Deal with different structure of pages: the first page + # has interesting data in `entry_data', next pages in `data'. 
+ if 'entry_data' in shared_data: + base_shared_data = shared_data['entry_data'][page_type][0]['graphql'] + + # variables_id is available only in the first page + variables_id = base_shared_data[psdf['node']][psdf['node_id']] + else: + base_shared_data = shared_data['data'] + + medias = base_shared_data[psdf['node']][psdf['edge_to_medias']] + has_next_page = medias['page_info']['has_next_page'] + shortcodes = [n['node']['shortcode'] for n in medias['edges']] + + for s in shortcodes: + url = '{}/p/{}/'.format(self.root, s) + yield from self._extract_postpage(url) + + if not has_next_page: + break + + end_cursor = medias['page_info']['end_cursor'] + variables = '{{"{}":"{}","first":12,"after":"{}"}}'.format( + psdf['variables_id'], + variables_id, + end_cursor, + ) + headers = { + "X-Requested-With": "XMLHttpRequest", + "X-Instagram-GIS": hashlib.md5(variables.encode()).hexdigest(), + } + url = '{}/graphql/query/?query_hash={}&variables={}'.format( + self.root, + psdf['query_hash'], + variables, + ) + shared_data = self.request(url, headers=headers).json() + + def _extract_profilepage(self, url): + yield from self._extract_page(url, 'ProfilePage') + + def _extract_tagpage(self, url): + yield from self._extract_page(url, 'TagPage') + + +class InstagramImageExtractor(InstagramExtractor): + """Extractor for PostPage""" + subcategory = "image" + pattern = r"(?:https?://)?(?:www\.)?instagram\.com/p/([^/?&#]+)" + test = ( + # GraphImage + ("https://www.instagram.com/p/BqvsDleB3lV/", { + "pattern": r"https://[^/]+\.(cdninstagram\.com|fbcdn\.net)" + r"/vp/[0-9a-f]+/[0-9A-F]+/t51.2885-15/e35" + r"/44877605_725955034447492_3123079845831750529_n.jpg", + "keyword": { + "date": "type:datetime", + "description": str, + "height": int, + "likes": int, + "media_id": "1922949326347663701", + "shortcode": "BqvsDleB3lV", + "typename": "GraphImage", + "username": "instagram", + "width": int, + } + }), + + # GraphSidecar + ("https://www.instagram.com/p/BoHk1haB5tM/", { + "count": 5, + "keyword": { + "sidecar_media_id": "1875629777499953996", + "sidecar_shortcode": "BoHk1haB5tM", + "likes": int, + "username": "instagram", + } + }), + + # GraphVideo + ("https://www.instagram.com/p/Bqxp0VSBgJg/", { + "url": "8f38c1cf460c9804842f7306c487410f33f82e7e", + "keyword": { + "date": "type:datetime", + "description": str, + "height": int, + "likes": int, + "media_id": "1923502432034620000", + "shortcode": "Bqxp0VSBgJg", + "typename": "GraphVideo", + "username": "instagram", + "width": int, + } + }), + + # GraphSidecar with 2 embedded GraphVideo objects + ("https://www.instagram.com/p/BtOvDOfhvRr/", { + "count": 2, + "url": "e290d4180a58ae50c910d51d3b04d5f5c4622cd7", + "keyword": { + "sidecar_media_id": "1967717017113261163", + "sidecar_shortcode": "BtOvDOfhvRr", + "_ytdl_index": int, + } + }) + ) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.shortcode = match.group(1) + + def instagrams(self): + url = '{}/p/{}/'.format(self.root, self.shortcode) + return self._extract_postpage(url) + + +class InstagramUserExtractor(InstagramExtractor): + """Extractor for ProfilePage""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/(?!p/|explore/|directory/|accounts/)([^/?&#]+)") + test = ("https://www.instagram.com/instagram/", { + "range": "1-12", + "count": ">= 12", + }) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.username = match.group(1) + + def instagrams(self): + url = '{}/{}/'.format(self.root, self.username) + return 
self._extract_profilepage(url) + + +class InstagramTagExtractor(InstagramExtractor): + """Extractor for TagPage""" + subcategory = "tag" + directory_fmt = ("{category}", "{subcategory}", "{tag}") + pattern = (r"(?:https?://)?(?:www\.)?instagram\.com" + r"/explore/tags/([^/?&#]+)") + test = ("https://www.instagram.com/explore/tags/instagram/", { + "range": "1-12", + "count": ">= 12", + }) + + def __init__(self, match): + InstagramExtractor.__init__(self, match) + self.tag = match.group(1) + + def get_metadata(self): + return {"tag": self.tag} + + def instagrams(self): + url = '{}/explore/tags/{}/'.format(self.root, self.tag) + return self._extract_tagpage(url) diff --git a/gallery_dl/extractor/keenspot.py b/gallery_dl/extractor/keenspot.py new file mode 100644 index 0000000..5902333 --- /dev/null +++ b/gallery_dl/extractor/keenspot.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for http://www.keenspot.com/""" + +from .common import Extractor, Message +from .. import text + + +class KeenspotComicExtractor(Extractor): + """Extractor for webcomics from keenspot.com""" + category = "keenspot" + subcategory = "comic" + directory_fmt = ("{category}", "{comic}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{comic}_{filename}" + pattern = r"(?:https?://)?(?!www\.|forums\.)([^.]+)\.keenspot\.com(/.+)?" + test = ( + ("http://marksmen.keenspot.com/", { # link + "range": "1-3", + "url": "83bcf029103bf8bc865a1988afa4aaeb23709ba6", + }), + ("http://barkercomic.keenspot.com/", { # id + "range": "1-3", + "url": "c4080926db18d00bac641fdd708393b7d61379e6", + }), + ("http://crowscare.keenspot.com/", { # id v2 + "range": "1-3", + "url": "a00e66a133dd39005777317da90cef921466fcaa" + }), + ("http://supernovas.keenspot.com/", { # ks + "range": "1-3", + "url": "de21b12887ef31ff82edccbc09d112e3885c3aab" + }), + ("http://twokinds.keenspot.com/comic/1066/", { # "random" access + "range": "1-3", + "url": "97e2a6ed8ba1709314f2449f84b6b1ce5db21c04", + }) + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.comic = match.group(1).lower() + self.path = match.group(2) + self.root = "http://" + self.comic + ".keenspot.com" + + self._needle = "" + self._image = 'class="ksc"' + self._next = self._next_needle + + def items(self): + data = {"comic": self.comic} + yield Message.Version, 1 + yield Message.Directory, data + + url = self._first(self.request(self.root + "/").text) + if self.path: + url = self.root + self.path + + prev = None + ilen = len(self._image) + while url and url != prev: + prev = url + page = self.request(text.urljoin(self.root, url)).text + + pos = 0 + while True: + pos = page.find(self._image, pos) + if pos < 0: + break + img, pos = text.extract(page, 'src="', '"', pos + ilen) + if img.endswith(".js"): + continue + if img[0] == "/": + img = self.root + img + elif "youtube.com/" in img: + img = "ytdl:" + img + yield Message.Url, img, text.nameext_from_url(img, data) + + url = self._next(page) + + def _first(self, page): + if self.comic == "brawlinthefamily": + self._next = self._next_brawl + self._image = '<div id="comic">' + return "http://brawlinthefamily.keenspot.com/comic/theshowdown/" + + url = text.extract(page, '<link rel="first" href="', '"')[0] + if url: + if self.comic == "porcelain": + self._needle = 'id="porArchivetop_"' + 
else: + self._next = self._next_link + return url + + pos = page.find('id="first_day1"') + if pos >= 0: + self._next = self._next_id + return text.rextract(page, 'href="', '"', pos)[0] + + pos = page.find('>FIRST PAGE<') + if pos >= 0: + if self.comic == "lastblood": + self._next = self._next_lastblood + self._image = '<div id="comic">' + else: + self._next = self._next_id + return text.rextract(page, 'href="', '"', pos)[0] + + pos = page.find('<div id="kscomicpart"') + if pos >= 0: + self._needle = '<a href="/archive.html' + return text.extract(page, 'href="', '"', pos)[0] + + pos = page.find('>First Comic<') # twokinds + if pos >= 0: + self._image = '</header>' + self._needle = 'class="navarchive"' + return text.rextract(page, 'href="', '"', pos)[0] + + pos = page.find('id="flip_FirstDay"') # flipside + if pos >= 0: + self._image = 'class="flip_Pages ksc"' + self._needle = 'id="flip_ArcButton"' + return text.rextract(page, 'href="', '"', pos)[0] + + self.log.error("Unrecognized page layout") + return None + + def _next_needle(self, page): + pos = page.index(self._needle) + len(self._needle) + return text.extract(page, 'href="', '"', pos)[0] + + @staticmethod + def _next_link(page): + return text.extract(page, '<link rel="next" href="', '"')[0] + + @staticmethod + def _next_id(page): + pos = page.find('id="next_') + return text.rextract(page, 'href="', '"', pos)[0] if pos >= 0 else None + + @staticmethod + def _next_lastblood(page): + pos = page.index("link rel='next'") + return text.extract(page, "href='", "'", pos)[0] + + @staticmethod + def _next_brawl(page): + pos = page.index("comic-nav-next") + url = text.rextract(page, 'href="', '"', pos)[0] + return None if "?random" in url else url diff --git a/gallery_dl/extractor/khinsider.py b/gallery_dl/extractor/khinsider.py new file mode 100644 index 0000000..c9e6959 --- /dev/null +++ b/gallery_dl/extractor/khinsider.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract soundtracks from https://downloads.khinsider.com/""" + +from .common import Extractor, Message, AsynchronousMixin +from .. 
import text, exception + + +class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor): + """Extractor for soundtracks from khinsider.com""" + category = "khinsider" + subcategory = "soundtrack" + directory_fmt = ("{category}", "{album}") + archive_fmt = "{album}_{filename}.{extension}" + pattern = (r"(?:https?://)?downloads\.khinsider\.com" + r"/game-soundtracks/album/([^/?&#]+)") + test = (("https://downloads.khinsider.com" + "/game-soundtracks/album/horizon-riders-wii"), { + "pattern": r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+" + r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3", + "count": 1, + "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679", + }) + root = "https://downloads.khinsider.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.album = match.group(1) + + def items(self): + url = (self.root + "/game-soundtracks/album/" + self.album) + page = self.request(url, encoding="utf-8").text + data = self.get_job_metadata(page) + yield Message.Version, 1 + yield Message.Directory, data + for url, track in self.get_album_tracks(page): + track.update(data) + yield Message.Url, url, track + + def get_job_metadata(self, page): + """Collect metadata for extractor-job""" + if "Download all songs at once:" not in page: + raise exception.NotFoundError("soundtrack") + data = text.extract_all(page, ( + ("album", "Album name: <b>", "</b>"), + ("count", "Number of Files: <b>", "</b>"), + ("size" , "Total Filesize: <b>", "</b>"), + ("date" , "Date added: <b>", "</b>"), + ("type" , "Album type: <b>", "</b>"), + ))[0] + data["album"] = text.unescape(data["album"]) + return data + + def get_album_tracks(self, page): + """Collect url and metadata for all tracks of a soundtrack""" + page = text.extract(page, '<table id="songlist">', '</table>')[0] + for num, url in enumerate(text.extract_iter( + page, '<td class="clickable-row"><a href="', '"'), 1): + url = text.urljoin(self.root, url) + page = self.request(url, encoding="utf-8").text + url = text.extract( + page, '<p><a style="color: #21363f;" href="', '"')[0] + yield url, text.nameext_from_url(url, {"num": num}) diff --git a/gallery_dl/extractor/kissmanga.py b/gallery_dl/extractor/kissmanga.py new file mode 100644 index 0000000..6314a94 --- /dev/null +++ b/gallery_dl/extractor/kissmanga.py @@ -0,0 +1,223 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://kissmanga.com/""" + +from .common import ChapterExtractor, MangaExtractor, Extractor +from .. 
import text, aes, exception +from ..cache import cache +import hashlib +import ast +import re + + +class RedirectMixin(): + """Detect and handle redirects to CAPTCHA pages""" + + def request(self, url): + while True: + response = Extractor.request(self, url) + if not response.history or "/AreYouHuman" not in response.url: + return response + if self.config("captcha", "stop") == "wait": + self.log.warning( + "Redirect to \n%s\nVisit this URL in your browser, solve " + "the CAPTCHA, and press ENTER to continue", response.url) + try: + input() + except (EOFError, OSError): + pass + else: + self.log.error( + "Redirect to \n%s\nVisit this URL in your browser and " + "solve the CAPTCHA to continue", response.url) + raise exception.StopExtraction() + + +class KissmangaBase(RedirectMixin): + """Base class for kissmanga extractors""" + category = "kissmanga" + archive_fmt = "{chapter_id}_{page}" + root = "https://kissmanga.com" + + @staticmethod + def parse_chapter_string(data): + """Parse 'chapter_string' value contained in 'data'""" + data["chapter_string"] = text.unescape(data["chapter_string"]) + + match = re.match(( + r"(?:[Vv]ol\.0*(\d+) )?" + r"(?:[Cc]h\.)?0*(\d+)" + r"(?:[.:]0*(\d+))?" + r"(?: *[:-]? *(.+))?" + ), data["chapter_string"]) + + if not match: + match = re.match(( + r".+?(?: -)? ()" + r"0*(\d+)(?:[Vv.]0*(\d+))?" + r"(?: *[:-]? *(.+))?" + ), data["chapter_string"]) + + if match: + volume, chapter, minor, title = match.groups() + else: + volume, chapter, minor, title = 0, 0, "", data["chapter_string"] + + data["volume"] = text.parse_int(volume) + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = "." + minor if minor else "" + data["title"] = title if title and title != "Read Online" else "" + return data + + +class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor): + """Extractor for manga-chapters from kissmanga.com""" + pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" + r"(/Manga/[^/?&#]+/[^/?&#]+\?id=(\d+))") + test = ( + ("https://kissmanga.com/Manga/Dropout/Ch-000---Oneshot-?id=145847", { + "url": "46e63fd63e9e16f19bc1e6c7a45dc060815642fd", + "keyword": "1cd0b5214ac7ae4d53e2fd8fec40ceec84cd09bf", + }), + ("https://kissmanga.com/Manga/Urban-Tales/a?id=256717", { + "url": "c26be8bf9c2abacee2076979d021634092cf38f1", + "keyword": "e1d16780df8e04076ed2b5f0637c5b710ec2f2ea", + }), + ("https://kissmanga.com/Manga/Monster/Monster-79?id=7608", { + "count": 23, + "keyword": "f433a7a8fae840e17dace316a243fa27faab86de", + }), + ("https://kissmanga.com/Manga/Houseki-no-Kuni/Oneshot?id=404189", { + "count": 49, + "keyword": "d44d1b21d08e4dbf888b0c450a3f1bc919588b4f", + }), + ("https://kissmanga.com/mAnGa/mOnStEr/Monster-79?id=7608"), + ) + + def __init__(self, match): + ChapterExtractor.__init__(self, match) + self.chapter_id = match.group(2) + self.session.headers["Referer"] = self.root + + def metadata(self, page): + title = text.extract(page, "<title>", "</title>")[0].strip() + manga, cinfo = title.split("\n")[1:3] + data = { + "manga": manga.strip(), + "chapter_string": cinfo.strip(), + "chapter_id": text.parse_int(self.chapter_id), + "lang": "en", + "language": "English", + } + return self.parse_chapter_string(data) + + def images(self, page): + self.session.headers["Referer"] = None + try: + key = self.build_aes_key(page) + iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0, + 0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3) + return [ + (aes.aes_cbc_decrypt_text( + data, key, iv).partition("&")[0], None) + for data in text.extract_iter( + page, 
'lstImages.push(wrapKA("', '"' + ) + ] + except UnicodeDecodeError: + self.log.error("Failed to decrypt image URLs") + except (ValueError, IndexError): + self.log.error("Failed to get AES key") + return [] + + def build_aes_key(self, page): + chko = self._chko_from_external_script() + + for script in self._scripts(page): + for stmt in [s.strip() for s in script.split(";")]: + + if stmt.startswith("var _"): + name, _, value = stmt[4:].partition(" = ") + name += "[0]" + value = ast.literal_eval(value)[0] + + elif stmt.startswith("chko = "): + stmt = stmt[7:] + if stmt == name: + chko = value + elif stmt == "chko + " + name: + chko = chko + value + elif stmt == name + " + chko": + chko = value + chko + else: + self.log.warning("unrecognized expression: '%s'", stmt) + + elif stmt.startswith("key = "): + pass + + else: + self.log.warning("unrecognized statement: '%s'", stmt) + + return list(hashlib.sha256(chko.encode("ascii")).digest()) + + @staticmethod + def _scripts(page): + end = 0 + while True: + pos = page.find("key = ", end) + if pos == -1: + return + beg = page.rindex('<script type="text/javascript">', 0, pos) + 31 + end = page.index('</script>', pos) + yield page[beg:end] + + @cache(maxage=3600) + def _chko_from_external_script(self): + script = self.request(self.root + "/Scripts/lo.js").text + + pos = script.index("var chko") + var = text.extract(script, "=", "[", pos)[0].lstrip() + idx = text.extract(script, "[", "]", pos)[0] + + pos = script.index(var) + lst = text.extract(script, "=", ";", pos)[0] + return ast.literal_eval(lst.strip())[int(idx)] + + +class KissmangaMangaExtractor(KissmangaBase, MangaExtractor): + """Extractor for manga from kissmanga.com""" + chapterclass = KissmangaChapterExtractor + pattern = (r"(?i)(?:https?://)?(?:www\.)?kissmanga\.com" + r"(/Manga/[^/?&#]+/?)$") + test = ( + ("https://kissmanga.com/Manga/Dropout", { + "url": "9e3a6f715b229aa3fafa42a1d5da5d65614cb532", + "keyword": "32b09711c28b481845acc32e3bb6054cfc90224d", + }), + ("https://kissmanga.com/manga/feng-shen-ji"), # lowercase + ) + + def chapters(self, page): + results = [] + manga, pos = text.extract(page, ' class="barTitle">', '\ninformation') + page , pos = text.extract(page, ' class="listing">', '</table>', pos) + manga = manga.strip() + needle = '" title="Read ' + manga + ' ' + manga = text.unescape(manga) + + for item in text.extract_iter(page, '<a href="', ' online">'): + url, _, chapter = item.partition(needle) + data = { + "manga": manga, "chapter_string": chapter, + "chapter_id": text.parse_int(url.rpartition("=")[2]), + "lang": "en", "language": "English", + } + self.parse_chapter_string(data) + results.append((self.root + url, data)) + return results diff --git a/gallery_dl/extractor/komikcast.py b/gallery_dl/extractor/komikcast.py new file mode 100644 index 0000000..8541e4f --- /dev/null +++ b/gallery_dl/extractor/komikcast.py @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://komikcast.com/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text +import re + + +class KomikcastBase(): + """Base class for komikcast extractors""" + category = "komikcast" + root = "https://komikcast.com" + + @staticmethod + def parse_chapter_string(chapter_string, data=None): + """Parse 'chapter_string' value and add its info to 'data'""" + if not data: + data = {} + + match = re.match( + r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?", + text.unescape(chapter_string), + ) + manga, chapter, data["chapter_minor"], title = match.groups() + + if manga: + data["manga"] = manga.partition(" Chapter ")[0] + if title and title.lower() != "bahasa indonesia": + data["title"] = title.strip() + else: + data["title"] = "" + data["chapter"] = text.parse_int(chapter) + data["lang"] = "id" + data["language"] = "Indonesian" + + return data + + +class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor): + """Extractor for manga-chapters from komikcast.com""" + pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?&#]+/)" + test = ( + (("https://komikcast.com/chapter/" + "apotheosis-chapter-02-2-bahasa-indonesia/"), { + "url": "f6b43fbc027697749b3ea1c14931c83f878d7936", + "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4", + }), + (("https://komikcast.com/chapter/" + "tonari-no-kashiwagi-san-chapter-18b/"), { + "url": "aff90dd21dbb945a726778b10bdef522af7c42fe", + "keyword": "19b5783864c4299913de436513b124b028b557c1", + }), + (("https://komikcast.com/chapter/090-eko-to-issho-chapter-1/"), { + "url": "cda104a32ea2b06b3d6b096726622f519ed1fa33", + }), + ) + + def metadata(self, page): + info = text.extract(page, '<b>', "</b>")[0] + return self.parse_chapter_string(info) + + @staticmethod + def images(page): + readerarea = text.extract( + page, '<div id="readerarea">', '<div class="navig">')[0] + return [ + (text.unescape(url), None) + for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea) + if "/Banner-" not in url + ] + + +class KomikcastMangaExtractor(KomikcastBase, MangaExtractor): + """Extractor for manga from komikcast.com""" + chapterclass = KomikcastChapterExtractor + pattern = (r"(?:https?://)?(?:www\.)?komikcast\.com" + r"(/(?:komik/)?[^/?&#]+)/?$") + test = ( + ("https://komikcast.com/komik/090-eko-to-issho/", { + "url": "dc798d107697d1f2309b14ca24ca9dba30c6600f", + "keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1", + }), + ("https://komikcast.com/tonari-no-kashiwagi-san/"), + ) + + def chapters(self, page): + results = [] + data = self.metadata(page) + + for item in text.extract_iter( + page, '<span class="leftoff"><a href="', '</a>'): + url, _, chapter_string = item.rpartition('">Chapter ') + self.parse_chapter_string(chapter_string, data) + results.append((url, data.copy())) + return results + + @staticmethod + def metadata(page): + """Return a dict with general metadata""" + manga , pos = text.extract(page, "<title>" , "</title>") + genres, pos = text.extract(page, ">Genres:", "</span>", pos) + author, pos = text.extract(page, ">Author:", "</span>", pos) + mtype , pos = text.extract(page, ">Type:" , "</span>", pos) + + return { + "manga": text.unescape(manga[:-12]), + "author": text.remove_html(author), + "genres": text.split_html(genres)[::2], + "type": text.remove_html(mtype), + } diff --git a/gallery_dl/extractor/konachan.py b/gallery_dl/extractor/konachan.py new file mode 100644 index 0000000..a9d8b3a --- /dev/null +++ b/gallery_dl/extractor/konachan.py @@ -0,0 +1,85 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or 
modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://konachan.com/""" + +from . import booru + + +class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): + """Base class for konachan extractors""" + category = "konachan" + + def __init__(self, match): + root = "https://konachan." + match.group("tld") + self.api_url = root + "/post.json" + self.post_url = root + "/post/show/{}" + super().__init__(match) + + +class KonachanTagExtractor(booru.TagMixin, KonachanExtractor): + """Extractor for images from konachan.com based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)" + r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)") + test = ( + ("https://konachan.com/post?tags=patata", { + "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d", + }), + ("https://konachan.net/post?tags=patata"), + ) + + +class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor): + """Extractor for image-pools from konachan.com""" + pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)" + r"/pool/show/(?P<pool>\d+)") + test = ( + ("https://konachan.com/pool/show/95", { + "content": "cf0546e38a93c2c510a478f8744e60687b7a8426", + }), + ("https://konachan.net/pool/show/95"), + ) + + +class KonachanPostExtractor(booru.PostMixin, KonachanExtractor): + """Extractor for single images from konachan.com""" + pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)" + r"/post/show/(?P<post>\d+)") + test = ( + ("https://konachan.com/post/show/205189", { + "content": "674e75a753df82f5ad80803f575818b8e46e4b65", + "options": (("tags", True),), + "keyword": { + "tags_artist": "patata", + "tags_character": "clownpiece", + "tags_copyright": "touhou", + "tags_general": str, + }, + }), + ("https://konachan.net/post/show/205189"), + ) + + +class KonachanPopularExtractor(booru.MoebooruPopularMixin, KonachanExtractor): + """Extractor for popular images from konachan.com""" + pattern = (r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)" + r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" + r"(?:\?(?P<query>[^#]*))?") + test = ( + ("https://konachan.com/post/popular_by_month?month=11&year=2010", { + "count": 20, + }), + ("https://konachan.com/post/popular_recent"), + ("https://konachan.net/post/popular_recent"), + ) + + def __init__(self, match): + super().__init__(match) + self.api_url = ( + "https://konachan.{tld}/post/popular_{scale}.json".format( + tld=match.group("tld"), scale=self.scale)) diff --git a/gallery_dl/extractor/livedoor.py b/gallery_dl/extractor/livedoor.py new file mode 100644 index 0000000..ed72f4c --- /dev/null +++ b/gallery_dl/extractor/livedoor.py @@ -0,0 +1,156 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for http://blog.livedoor.jp/""" + +from .common import Extractor, Message +from .. 
import text + + +class LivedoorExtractor(Extractor): + """Base class for livedoor extractors""" + category = "livedoor" + root = "http://blog.livedoor.jp" + filename_fmt = "{post[id]}_{post[title]}_{num:>02}.{extension}" + directory_fmt = ("{category}", "{post[user]}") + archive_fmt = "{post[id]}_{hash}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + yield Message.Version, 1 + for post in self.posts(): + images = self._images(post) + if images: + yield Message.Directory, {"post": post} + for image in images: + yield Message.Url, image["url"], image + + def posts(self): + """Return an iterable with post objects""" + + def _load(self, data, body): + extr = text.extract_from(data) + tags = text.extract(body, '</dt><dd>', '</dl>')[0] + + return { + "id" : text.parse_int(extr("id : '", "'")), + "title" : text.unescape(extr("title : '", "'")), + "categories": [extr("name:'", "'"), extr("name:'", "'")], + "date" : text.parse_datetime( + extr("date : '", "'"), "%Y-%m-%d %H:%M:%S"), + "tags" : text.split_html(tags), + "user" : self.user, + "body" : body, + } + + def _images(self, post): + imgs = [] + body = post.pop("body") + + for num, img in enumerate(text.extract_iter(body, "<img ", ">"), 1): + src = text.extract(img, 'src="', '"')[0] + alt = text.extract(img, 'alt="', '"')[0] + + if not src: + continue + if "://livedoor.blogimg.jp/" in src: + url = src.replace("-s.", ".") + else: + url = text.urljoin(self.root, src) + name, _, ext = url.rpartition("/")[2].rpartition(".") + + imgs.append({ + "url" : url, + "num" : num, + "hash" : name, + "filename" : alt or name, + "extension": ext, + "post" : post, + }) + + return imgs + + +class LivedoorBlogExtractor(LivedoorExtractor): + """Extractor for a user's blog on blog.livedoor.jp""" + subcategory = "blog" + pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/?(?:$|[?&#])" + test = ( + ("http://blog.livedoor.jp/zatsu_ke/", { + "range": "1-50", + "count": 50, + "pattern": r"https?://livedoor.blogimg.jp/\w+/imgs/\w/\w/\w+\.\w+", + "keyword": { + "post": { + "categories": list, + "date": "type:datetime", + "id": int, + "tags": list, + "title": str, + "user": "zatsu_ke" + }, + "filename": str, + "hash": r"re:\w{4,}", + "num": int, + }, + }), + ("http://blog.livedoor.jp/uotapo/", { + "range": "1-5", + "count": 5, + }), + ) + + def posts(self): + url = "{}/{}".format(self.root, self.user) + + while url: + extr = text.extract_from(self.request(url).text) + while True: + data = extr('.articles.push(', ');') + if not data: + break + body = extr('class="article-body-inner">', + 'class="article-footer">') + yield self._load(data, body) + url = extr('<a rel="next" href="', '"') + + +class LivedoorPostExtractor(LivedoorExtractor): + """Extractor for images from a blog post on blog.livedoor.jp""" + subcategory = "post" + pattern = r"(?:https?://)?blog\.livedoor\.jp/(\w+)/archives/(\d+)" + test = ( + ("http://blog.livedoor.jp/zatsu_ke/archives/51493859.html", { + "url": "8826fe623f19dc868e7538e8519bf8491e92a0a2", + "keyword": "52fcba9253a000c339bcd658572d252e282626af", + }), + ("http://blog.livedoor.jp/amaumauma/archives/7835811.html", { + "url": "fc1d6a9557245b5a27d3a10bf0fa9922ef377215", + "keyword": "0229072abb5cd8a221df72e0ffdfc13336c0e9ce", + }), + ("http://blog.livedoor.jp/uotapo/archives/1050616939.html", { + "url": "3f3581807ec4776e6a67ed7985a22494d4bc4904", + "keyword": "2eb3e383c68e909c4dd3d563c16d0b6e2fe6627b", + }), + ) + + def __init__(self, match): + LivedoorExtractor.__init__(self, 
match) + self.post_id = match.group(2) + + def posts(self): + url = "{}/{}/archives/{}.html".format( + self.root, self.user, self.post_id) + extr = text.extract_from(self.request(url).text) + data = extr('articles :', '</script>') + body = extr('class="article-body-inner">', + 'class="article-footer">') + return (self._load(data, body),) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py new file mode 100644 index 0000000..65ae843 --- /dev/null +++ b/gallery_dl/extractor/luscious.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://luscious.net/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, exception +from ..cache import cache + + +class LusciousBase(Extractor): + """Base class for luscious extractors""" + category = "luscious" + cookiedomain = ".luscious.net" + root = "https://members.luscious.net" + + def login(self): + """Login and set necessary cookies""" + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + + @cache(maxage=14*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + url = "https://members.luscious.net/accounts/login/" + headers = {"Referer": "https://members.luscious.net/login/"} + data = { + "login": username, + "password": password, + "remember": "on", + "next": "/", + } + + response = self.request(url, method="POST", headers=headers, data=data) + if "/accounts/login/" in response.url or not response.history: + raise exception.AuthenticationError() + for cookie in response.history[0].cookies: + if cookie.name.startswith("sessionid_"): + return {cookie.name: cookie.value} + raise exception.AuthenticationError() + + @staticmethod + def _parse_tags(tags): + return [ + text.unescape(tag.replace(":_", ":")) + for tag in text.extract_iter(tags or "", "/tags/", "/") + ] + + +class LusciousAlbumExtractor(LusciousBase, GalleryExtractor): + """Extractor for image albums from luscious.net""" + subcategory = "album" + archive_fmt = "{gallery_id}_{image_id}" + pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net" + r"/(?:albums|pictures/c/[^/?&#]+/album)/([^/?&#]+_(\d+))") + test = ( + ("https://luscious.net/albums/okinami-no-koigokoro_277031/", { + "url": "7e4984a271a1072ac6483e4228a045895aff86f3", + "keyword": "c597c132834f4990f90bf5dee5de2a9d4ba263a4", + "content": "b3a747a6464509440bd0ff6d1267e6959f8d6ff3", + }), + ("https://luscious.net/albums/virgin-killer-sweater_282582/", { + "url": "21cc68a7548f4d71dfd67d8caf96349dde7e791c", + "keyword": "e1202078b504adeccd521aa932f456a5a85479a0", + }), + ("https://luscious.net/albums/not-found_277035/", { + "exception": exception.NotFoundError, + }), + ("https://members.luscious.net/albums/login-required_323871/", { + "options": (("username", None),), + "exception": exception.AuthorizationError, + }), + ("https://www.luscious.net/albums/okinami_277031/"), + ("https://members.luscious.net/albums/okinami_277031/"), + ("https://luscious.net/pictures/c/video_game_manga/album" + "/okinami-no-koigokoro_277031/sorted/position/id/16528978/@_1"), + ) + + def __init__(self, match): + path, self.gallery_id = match.groups() + url = "{}/albums/{}/".format(self.root, path) + GalleryExtractor.__init__(self, 
match, url) + + def metadata(self, page): + pos = page.find("<h1>404 Not Found</h1>") + if pos >= 0: + msg = text.extract(page, '<div class="content">', '</div>', pos)[0] + if msg and "content is not available" in msg: + raise exception.AuthorizationError() + raise exception.NotFoundError("album") + + title, pos = text.extract(page, '"og:title" content="', '"') + info , pos = text.extract(page, '<li class="user_info">', "", pos) + if info is None: + count, pos = text.extract(page, '>Pages:', '<', pos) + else: + count, pos = text.extract(page, '<p>', ' ', pos) + genre, pos = text.extract(page, '<p>Genre:', '</p>', pos) + adnce, pos = text.extract(page, '<p>Audience:', '</p>', pos) + tags , pos = text.extract(page, '"tag_list static">', '</ol>', pos) + + return { + "gallery_id": text.parse_int(self.gallery_id), + "title" : text.unescape(title or ""), + "count" : text.parse_int(count), + "genre" : text.remove_html(genre), + "audience" : text.remove_html(adnce), + "tags" : self._parse_tags(tags), + } + + def images(self, page): + extr = text.extract + + url = "{}/pictures/album/x_{}/sorted/old/page/1/".format( + self.root, self.gallery_id) + page = self.request(url).text + pos = page.find('<div id="picture_page_') + url = extr(page, '<a href="', '"', pos)[0] + iurl = None + + while url and not url.endswith("/more_like_this/"): + page = self.request(self.root + url).text + + if not iurl: # first loop iteration + current = extr(page, '"pj_current_page" value="', '"')[0] + if current and current != "1": + url = "{}/albums/{}/jump_to_page/1/".format( + self.root, self.gallery_id) + page = self.request(url, method="POST").text + + iid , pos = extr(url , '/id/', '/') + url , pos = extr(page, '<link rel="next" href="', '"') + name, pos = extr(page, '<h1 id="picture_title">', '</h1>', pos) + _ , pos = extr(page, '<ul class="image_option_icons">', '', pos) + iurl, pos = extr(page, '<li><a href="', '"', pos+100) + + if iurl[0] == "/": + iurl = text.urljoin(self.root, iurl) + + yield iurl, { + "name": name, + "image_id": text.parse_int(iid), + } + + +class LusciousSearchExtractor(LusciousBase, Extractor): + """Extractor for album searches on luscious.net""" + subcategory = "search" + pattern = (r"(?:https?://)?(?:www\.|members\.)?luscious\.net" + r"/(albums(?:/(?![^/?&#]+_\d+)[^/?&#]+)+|manga|pictures)/?$") + test = ( + ("https://luscious.net/manga/"), + ("https://members.luscious.net/albums/sorted/updated/album_type/manga" + "/content_id/2/tagged/+full_color/page/1/", { + "pattern": LusciousAlbumExtractor.pattern, + "range": "20-40", + "count": 21, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1).partition("/page/")[0] + if not self.path.startswith("albums/"): + self.path = "albums/sorted/updated/album_type/" + self.path + + def items(self): + self.login() + yield Message.Version, 1 + for album in self.albums(): + url, data = self.parse_album(album) + yield Message.Queue, url, data + + def albums(self, pnum=1): + while True: + url = "{}/{}/page/{}/.json/".format(self.root, self.path, pnum) + data = self.request(url).json() + + yield from text.extract_iter( + data["html"], "<figcaption>", "</figcaption>") + + if data["paginator_complete"]: + return + pnum += 1 + + def parse_album(self, album): + url , pos = text.extract(album, 'href="', '"') + title, pos = text.extract(album, ">", "<", pos) + count, pos = text.extract(album, "# of pictures:", "<", pos) + date , pos = text.extract(album, "Updated: ", "<", pos) + desc , pos = text.extract(album, 
"class='desc'>", "<", pos) + tags , pos = text.extract(album, "<ol ", "</ol>", pos) + + return text.urljoin(self.root, url), { + "title": text.unescape(title or ""), + "description": text.unescape(desc or ""), + "gallery_id": text.parse_int(url.rpartition("_")[2].rstrip("/")), + "count": text.parse_int(count), + "date": date, + "tags": self._parse_tags(tags), + "_extractor": LusciousAlbumExtractor, + } diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py new file mode 100644 index 0000000..d0eb2a9 --- /dev/null +++ b/gallery_dl/extractor/mangadex.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://mangadex.org/""" + +from .common import Extractor, Message +from .. import text, util +from ..cache import memcache + + +class MangadexExtractor(Extractor): + """Base class for mangadex extractors""" + category = "mangadex" + root = "https://mangadex.org" + + # mangadex-to-iso639-1 codes + iso639_map = { + "br": "pt", + "ct": "ca", + "gb": "en", + "vn": "vi", + } + + def chapter_data(self, chapter_id): + """Request API results for 'chapter_id'""" + url = "{}/api/chapter/{}".format(self.root, chapter_id) + return self.request(url).json() + + @memcache(keyarg=1) + def manga_data(self, manga_id): + """Request API results for 'manga_id'""" + url = "{}/api/manga/{}".format(self.root, manga_id) + return self.request(url).json() + + +class MangadexChapterExtractor(MangadexExtractor): + """Extractor for manga-chapters from mangadex.org""" + subcategory = "chapter" + directory_fmt = ( + "{category}", "{manga}", + "{volume:?v/ />02}c{chapter:>03}{chapter_minor}{title:?: //}") + filename_fmt = ( + "{manga}_c{chapter:>03}{chapter_minor}_{page:>03}.{extension}") + archive_fmt = "{chapter_id}_{page}" + pattern = r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)/chapter/(\d+)" + test = ( + ("https://mangadex.org/chapter/122094", { + "keyword": "1c834dca33025f521e1874aee1f71c51e28ebf99", + "content": "7ab3bef5caccb62b881f8e6e70359d3c7be8137f", + }), + # oneshot + ("https://mangadex.org/chapter/138086", { + "count": 64, + "keyword": "178777bd0352fb19eb934cbee5630d16e3fb60ab", + }), + ) + + def __init__(self, match): + MangadexExtractor.__init__(self, match) + self.chapter_id = match.group(1) + self.data = None + + def items(self): + data = self.metadata() + imgs = self.images() + data["count"] = len(imgs) + + yield Message.Version, 1 + yield Message.Directory, data + for data["page"], url in enumerate(imgs, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + def metadata(self): + """Return a dict with general metadata""" + cdata = self.chapter_data(self.chapter_id) + mdata = self.manga_data(cdata["manga_id"]) + self.data = cdata + + chapter, sep, minor = cdata["chapter"].partition(".") + return { + "manga": mdata["manga"]["title"], + "manga_id": cdata["manga_id"], + "artist": mdata["manga"]["artist"], + "author": mdata["manga"]["author"], + "title": text.unescape(cdata["title"]), + "volume": text.parse_int(cdata["volume"]), + "chapter": text.parse_int(chapter), + "chapter_minor": sep + minor, + "chapter_id": cdata["id"], + "group": mdata["chapter"][self.chapter_id]["group_name"], + "date": cdata["timestamp"], + "lang": util.language_to_code(cdata["lang_name"]), + "language": cdata["lang_name"], + 
} + + def images(self): + """Return a list of all image URLs""" + base = self.data["server"] + self.data["hash"] + "/" + if base.startswith("/"): + base = text.urljoin(self.root, base) + return [base + page for page in self.data["page_array"]] + + +class MangadexMangaExtractor(MangadexExtractor): + """Extractor for manga from mangadex.org""" + subcategory = "manga" + categorytransfer = True + pattern = (r"(?:https?://)?(?:www\.)?mangadex\.(?:org|com)" + r"/(?:title|manga)/(\d+)") + test = ( + ("https://mangadex.org/manga/2946/souten-no-koumori", { + "pattern": r"https://mangadex.org/chapter/\d+", + "keyword": { + "manga": "Souten no Koumori", + "manga_id": 2946, + "title": "Oneshot", + "volume": 0, + "chapter": 0, + "chapter_minor": "", + "chapter_id": int, + "group": str, + "date": int, + "lang": str, + "language": str, + }, + }), + ("https://mangadex.org/manga/13318/dagashi-kashi/chapters/2/", { + "count": ">= 100", + }), + ("https://mangadex.org/title/13004/yorumori-no-kuni-no-sora-ni", { + "count": 0, + }), + ("https://mangadex.org/title/2946/souten-no-koumori"), + ) + + def __init__(self, match): + MangadexExtractor.__init__(self, match) + self.manga_id = text.parse_int(match.group(1)) + + def items(self): + yield Message.Version, 1 + for data in self.chapters(): + url = "{}/chapter/{}".format(self.root, data["chapter_id"]) + yield Message.Queue, url, data + + def chapters(self): + """Return a sorted list of chapter-metadata dicts""" + data = self.manga_data(self.manga_id) + if "chapter" not in data: + return () + manga = data["manga"] + + results = [] + for chid, info in data["chapter"].items(): + chapter, sep, minor = info["chapter"].partition(".") + lang = self.iso639_map.get(info["lang_code"], info["lang_code"]) + results.append({ + "manga": manga["title"], + "manga_id": self.manga_id, + "artist": manga["artist"], + "author": manga["author"], + "title": text.unescape(info["title"]), + "volume": text.parse_int(info["volume"]), + "chapter": text.parse_int(chapter), + "chapter_minor": sep + minor, + "chapter_id": text.parse_int(chid), + "group": text.unescape(info["group_name"]), + "date": info["timestamp"], + "lang": lang, + "language": util.code_to_language(lang), + "_extractor": MangadexChapterExtractor, + }) + + results.sort(key=lambda x: (x["chapter"], x["chapter_minor"])) + return results diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py new file mode 100644 index 0000000..1b8a4a6 --- /dev/null +++ b/gallery_dl/extractor/mangafox.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://fanfox.net/""" + +from .common import ChapterExtractor +from .. 
import text + + +class MangafoxChapterExtractor(ChapterExtractor): + """Extractor for manga-chapters from fanfox.net""" + category = "mangafox" + pattern = (r"(?:https?://)?(?:www\.|m\.)?(?:mangafox\.me|fanfox\.net)" + r"(/manga/[^/]+/((?:v(\d+)/)?c(\d+)([^/?&#]*)))") + test = ( + ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", { + "keyword": "5661dab258d42d09d98f194f7172fb9851a49766", + "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c", + }), + ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/"), + ) + root = "https://m.fanfox.net" + + def __init__(self, match): + base, self.cstr, self.volume, self.chapter, self.minor = match.groups() + self.urlbase = self.root + base + ChapterExtractor.__init__(self, match, self.urlbase + "/1.html") + + def metadata(self, page): + manga, pos = text.extract(page, "<title>", "</title>") + count, pos = text.extract( + page, ">", "<", page.find("</select>", pos) - 20) + sid , pos = text.extract(page, "var series_id =", ";", pos) + cid , pos = text.extract(page, "var chapter_id =", ";", pos) + + return { + "manga": text.unescape(manga), + "volume": text.parse_int(self.volume), + "chapter": text.parse_int(self.chapter), + "chapter_minor": self.minor or "", + "chapter_string": self.cstr, + "count": text.parse_int(count), + "sid": text.parse_int(sid), + "cid": text.parse_int(cid), + } + + def images(self, page): + pnum = 1 + while True: + url, pos = text.extract(page, '<img src="', '"') + yield url, None + url, pos = text.extract(page, ' src="', '"', pos) + yield url, None + + pnum += 2 + page = self.request("{}/{}.html".format(self.urlbase, pnum)).text diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py new file mode 100644 index 0000000..e15acbe --- /dev/null +++ b/gallery_dl/extractor/mangahere.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://www.mangahere.cc/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text +import re + + +class MangahereBase(): + """Base class for mangahere extractors""" + category = "mangahere" + root = "https://www.mangahere.cc" + mobile_root = "https://m.mangahere.cc" + url_fmt = mobile_root + "/manga/{}/{}.html" + + +class MangahereChapterExtractor(MangahereBase, ChapterExtractor): + """Extractor for manga-chapters from mangahere.cc""" + pattern = (r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]/manga/" + r"([^/]+(?:/v0*(\d+))?/c([^/?&#]+))") + test = ( + ("https://www.mangahere.cc/manga/dongguo_xiaojie/c004.2/", { + "keyword": "7c98d7b50a47e6757b089aa875a53aa970cac66f", + "content": "708d475f06893b88549cbd30df1e3f9428f2c884", + }), + ("http://www.mangahere.co/manga/dongguo_xiaojie/c003.2/"), + ("http://m.mangahere.co/manga/dongguo_xiaojie/c003.2/"), + ) + + def __init__(self, match): + self.part, self.volume, self.chapter = match.groups() + url = self.url_fmt.format(self.part, 1) + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + pos = page.index("</select>") + count , pos = text.extract(page, ">", "<", pos - 20) + manga_id , pos = text.extract(page, "series_id = ", ";", pos) + chapter_id, pos = text.extract(page, "chapter_id = ", ";", pos) + manga , pos = text.extract(page, '"name":"', '"', pos) + chapter, dot, minor = self.chapter.partition(".") + + return { + "manga": text.unescape(manga), + "manga_id": text.parse_int(manga_id), + "title": self._get_title(), + "volume": text.parse_int(self.volume), + "chapter": text.parse_int(chapter), + "chapter_minor": dot + minor, + "chapter_id": text.parse_int(chapter_id), + "count": text.parse_int(count), + "lang": "en", + "language": "English", + } + + def images(self, page): + pnum = 1 + + while True: + url, pos = text.extract(page, '<img src="', '"') + yield url, None + url, pos = text.extract(page, ' src="', '"', pos) + yield url, None + pnum += 2 + page = self.request(self.url_fmt.format(self.part, pnum)).text + + def _get_title(self): + url = "{}/manga/{}/".format(self.root, self.part) + page = self.request(url).text + + try: + pos = page.index(self.part) + len(self.part) + pos = page.index(self.part, pos) + len(self.part) + return text.extract(page, ' title="', '"', pos)[0] + except ValueError: + return "" + + +class MangahereMangaExtractor(MangahereBase, MangaExtractor): + """Extractor for manga from mangahere.cc""" + chapterclass = MangahereChapterExtractor + pattern = (r"(?:https?://)?(?:www\.|m\.)?mangahere\.c[co]" + r"(/manga/[^/]+)/?(?:#.*)?$") + test = ( + ("https://www.mangahere.cc/manga/aria/", { + "url": "23ad9256f7392de5973b79a36f6875e9fdcb7563", + "keyword": "79e326641e7d5d2fed43a1eb9949471b8162a9e0", + }), + ("https://www.mangahere.cc/manga/hiyokoi/#50", { + "url": "654850570aa03825cd57e2ae2904af489602c523", + "keyword": "c8084d89a9ea6cf40353093669f9601a39bf5ca2", + }), + ("https://www.mangahere.co/manga/aria/"), + ("https://m.mangahere.co/manga/aria/"), + ) + + def chapters(self, page): + results = [] + manga, pos = text.extract(page, '<meta name="og:title" content="', '"') + manga = text.unescape(manga) + + page = text.extract( + page, 'id="chapterlist"', 'class="detail-main-list-more"', pos)[0] + pos = 0 + while True: + url, pos = text.extract(page, ' href="', '"', pos) + if not url: + return results + info, pos = text.extract(page, 'class="title3">', '<', pos) + date, pos = text.extract(page, 'class="title2">', '<', pos) + + match = re.match( + r"(?:Vol\.0*(\d+) )?Ch\.0*(\d+)(\S*)(?: - (.*))?", info) + if match: + volume, chapter, minor, title = match.groups() + else: 
+ chapter, _, minor = url[:-1].rpartition("/c")[2].partition(".") + minor = "." + minor + volume = 0 + title = "" + + results.append((text.urljoin(self.root, url), { + "manga": manga, + "title": text.unescape(title) if title else "", + "volume": text.parse_int(volume), + "chapter": text.parse_int(chapter), + "chapter_minor": minor, + "date": date, + "lang": "en", + "language": "English", + })) diff --git a/gallery_dl/extractor/mangapanda.py b/gallery_dl/extractor/mangapanda.py new file mode 100644 index 0000000..18ef005 --- /dev/null +++ b/gallery_dl/extractor/mangapanda.py @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://www.mangapanda.com/""" + +from .mangareader import MangareaderMangaExtractor, MangareaderChapterExtractor + + +class MangapandaBase(): + """Base class for mangapanda extractors""" + category = "mangapanda" + root = "https://www.mangapanda.com" + + +class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor): + """Extractor for manga-chapters from mangapanda.com""" + pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))" + test = ("https://www.mangapanda.com/red-storm/2", { + "url": "1f633f776e950531ba9b1e81965316458e785261", + "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb", + }) + + +class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor): + """Extractor for manga from mangapanda.com""" + chapterclass = MangapandaChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?&#]+)/?$" + test = ("https://www.mangapanda.com/mushishi", { + "url": "357f965732371cac1990fee8b480f62e29141a42", + "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", + }) diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py new file mode 100644 index 0000000..ee11231 --- /dev/null +++ b/gallery_dl/extractor/mangapark.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://mangapark.me/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text, exception +import json + + +class MangaparkBase(): + """Base class for mangapark extractors""" + category = "mangapark" + root_fmt = "https://mangapark.{}" + + @staticmethod + def parse_chapter_path(path, data): + """Get volume/chapter information from url-path of a chapter""" + data["volume"], data["chapter_minor"] = 0, "" + for part in path.split("/")[1:]: + key, value = part[0], part[1:] + if key == "c": + chapter, dot, minor = value.partition(".") + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = dot + minor + elif key == "i": + data["chapter_id"] = text.parse_int(value) + elif key == "v": + data["volume"] = text.parse_int(value) + elif key == "s": + data["stream"] = text.parse_int(value) + elif key == "e": + data["chapter_minor"] = "v" + value + + +class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): + """Extractor for manga-chapters from mangapark.me""" + pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" + r"/manga/([^?&#]+/i\d+)") + test = ( + ("https://mangapark.me/manga/gosu/i811615/c55/1", { + "count": 50, + "keyword": "373d678048d29492f9763743ccaa9b6d840f17cf", + }), + (("https://mangapark.me/manga" + "/ad-astra-per-aspera-hata-kenjirou/i662054/c001.2/1"), { + "count": 40, + "keyword": "8e9cce4ed0e25d12a45e02f840d6f32ef838e257", + }), + ("https://mangapark.me/manga/gekkan-shoujo-nozaki-kun/i655476/c70/1", { + "count": 15, + "keyword": "19f730617074d65f91c0781f429de324890925bf", + }), + ("https://mangapark.net/manga/gosu/i811615/c55/1"), + ("https://mangapark.com/manga/gosu/i811615/c55/1"), + ) + + def __init__(self, match): + tld, self.path = match.groups() + self.root = self.root_fmt.format(tld) + url = "{}/manga/{}?zoom=2".format(self.root, self.path) + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + data = text.extract_all(page, ( + ("manga_id" , "var _manga_id = '", "'"), + ("chapter_id", "var _book_id = '", "'"), + ("stream" , "var _stream = '", "'"), + ("path" , "var _book_link = '", "'"), + ("manga" , "<h2>", "</h2>"), + ("title" , "</a>", "<"), + ), values={"lang": "en", "language": "English"})[0] + + if not data["path"]: + raise exception.NotFoundError("chapter") + self.parse_chapter_path(data["path"], data) + + data["manga"], _, data["type"] = data["manga"].rpartition(" ") + data["manga"] = text.unescape(data["manga"]) + data["title"] = data["title"].partition(": ")[2] + for key in ("manga_id", "chapter_id", "stream"): + data[key] = text.parse_int(data[key]) + + return data + + def images(self, page): + data = json.loads(text.extract( + page, "var _load_pages =", ";")[0] or "[]") + return [ + (text.urljoin(self.root, item["u"]), { + "width": text.parse_int(item["w"]), + "height": text.parse_int(item["h"]), + }) + for item in data + ] + + +class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): + """Extractor for manga from mangapark.me""" + chapterclass = MangaparkChapterExtractor + pattern = (r"(?:https?://)?(?:www\.)?mangapark\.(me|net|com)" + r"(/manga/[^/?&#]+)/?$") + test = ( + ("https://mangapark.me/manga/aria", { + "url": "a58be23ef3874fe9705b0b41dd462b67eaaafd9a", + "keyword": "b3b5a30aa2a326bc0ca8b74c65b5ecd4bf676ebf", + }), + ("https://mangapark.net/manga/aria"), + ("https://mangapark.com/manga/aria"), + ) + + def __init__(self, match): + self.root = self.root_fmt.format(match.group(1)) + MangaExtractor.__init__(self, match, self.root + match.group(2)) + + def chapters(self, page): + results = [] + data = {"lang": "en", "language": "English"} + data["manga"] = 
text.unescape( + text.extract(page, '<title>', ' Manga - ')[0]) + + for stream in page.split('<div id="stream_')[1:]: + data["stream"] = text.parse_int(text.extract(stream, '', '"')[0]) + + for chapter in text.extract_iter(stream, '<li ', '</li>'): + path , pos = text.extract(chapter, 'href="', '"') + title, pos = text.extract(chapter, '>: </span>', '<', pos) + count, pos = text.extract(chapter, ' of ', ' ', pos) + + self.parse_chapter_path(path[8:], data) + data["title"] = title.strip() if title else "" + data["count"] = text.parse_int(count) + results.append((self.root + path, data.copy())) + + return results diff --git a/gallery_dl/extractor/mangareader.py b/gallery_dl/extractor/mangareader.py new file mode 100644 index 0000000..d24d452 --- /dev/null +++ b/gallery_dl/extractor/mangareader.py @@ -0,0 +1,119 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from https://www.mangareader.net/""" + +from .common import ChapterExtractor, MangaExtractor +from .. import text + + +class MangareaderBase(): + """Base class for mangareader extractors""" + category = "mangareader" + root = "https://www.mangareader.net" + + @staticmethod + def parse_page(page, data): + """Parse metadata on 'page' and add it to 'data'""" + text.extract_all(page, ( + ("manga" , '<h2 class="aname">', '</h2>'), + ("release", '>Year of Release:</td>\n<td>', '</td>'), + ('author' , '>Author:</td>\n<td>', '</td>'), + ('artist' , '>Artist:</td>\n<td>', '</td>'), + ), values=data) + data["manga"] = data["manga"].strip() + data["author"] = text.unescape(data["author"]) + data["artist"] = text.unescape(data["artist"]) + return data + + +class MangareaderChapterExtractor(MangareaderBase, ChapterExtractor): + """Extractor for manga-chapters from mangareader.net""" + archive_fmt = "{manga}_{chapter}_{page}" + pattern = r"(?:https?://)?(?:www\.)?mangareader\.net((/[^/?&#]+)/(\d+))" + test = (("https://www.mangareader.net" + "/karate-shoukoushi-kohinata-minoru/11"), { + "url": "061cc92a07edf17bb991ce0821fa4c77a147a860", + "keyword": "133e3e2f7c0529a35bbb16149e34c40546f8dfd6", + }) + + def __init__(self, match): + path, self.url_title, self.chapter = match.groups() + ChapterExtractor.__init__(self, match, self.root + path) + + def metadata(self, chapter_page): + page = self.request(self.root + self.url_title).text + data = self.parse_page(page, { + "chapter": text.parse_int(self.chapter), + "lang": "en", + "language": "English", + }) + text.extract_all(page, ( + ('title', ' ' + self.chapter + '</a> : ', '</td>'), + ('date', '<td>', '</td>'), + ), page.index('<div id="chapterlist">'), data) + data["count"] = text.parse_int(text.extract( + chapter_page, '</select> of ', '<')[0] + ) + return data + + def images(self, page): + while True: + next_url, image_url, image_data = self.get_image_metadata(page) + yield image_url, image_data + + if not next_url: + return + page = self.request(next_url).text + + def get_image_metadata(self, page): + """Collect next url, image-url and metadata for one manga-page""" + extr = text.extract + width = None + test , pos = extr(page, "document['pu']", '') + if test is None: + return None, None, None + if page.find("document['imgwidth']", pos, pos+200) != -1: + width , pos = extr(page, "document['imgwidth'] = ", ";", pos) + height, pos = extr(page, 
"document['imgheight'] = ", ";", pos) + _ , pos = extr(page, '<div id="imgholder">', '') + url, pos = extr(page, ' href="', '"', pos) + if width is None: + width , pos = extr(page, '<img id="img" width="', '"', pos) + height, pos = extr(page, ' height="', '"', pos) + image, pos = extr(page, ' src="', '"', pos) + return self.root + url, image, { + "width": text.parse_int(width), + "height": text.parse_int(height), + } + + +class MangareaderMangaExtractor(MangareaderBase, MangaExtractor): + """Extractor for manga from mangareader.net""" + chapterclass = MangareaderChapterExtractor + reverse = False + pattern = r"(?:https?://)?(?:www\.)?mangareader\.net(/[^/?&#]+)/?$" + test = ("https://www.mangareader.net/mushishi", { + "url": "bc203b858b4ad76e5d77e39118a7be0350e357da", + "keyword": "031b3ea085921c552de017ecbb9b906e462229c9", + }) + + def chapters(self, page): + results = [] + data = self.parse_page(page, {"lang": "en", "language": "English"}) + + needle = '<div class="chico_manga"></div>\n<a href="' + pos = page.index('<div id="chapterlist">') + while True: + url, pos = text.extract(page, needle, '"', pos) + if not url: + return results + data["title"], pos = text.extract(page, '</a> : ', '</td>', pos) + data["date"] , pos = text.extract(page, '<td>', '</td>', pos) + data["chapter"] = text.parse_int(url.rpartition("/")[2]) + results.append((self.root + url, data.copy())) diff --git a/gallery_dl/extractor/mangastream.py b/gallery_dl/extractor/mangastream.py new file mode 100644 index 0000000..7ff0239 --- /dev/null +++ b/gallery_dl/extractor/mangastream.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters from https://readms.net/""" + +from .common import ChapterExtractor +from .. 
import text + + +class MangastreamChapterExtractor(ChapterExtractor): + """Extractor for manga-chapters from mangastream.com""" + category = "mangastream" + archive_fmt = "{chapter_id}_{page}" + pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)" + r"/r(?:ead)?/([^/]*/([^/]+)/(\d+))") + test = ( + ("https://readms.net/r/onepunch_man/087/4874/1"), + ("https://mangastream.com/r/onepunch_man/087/4874/1"), + ) + root = "https://readms.net" + + def __init__(self, match): + self.part, self.chapter, self.chapter_id = match.groups() + url = "{}/r/{}".format(self.root, self.part) + ChapterExtractor.__init__(self, match, url) + + def metadata(self, page): + manga, pos = text.extract( + page, '<span class="hidden-xs hidden-sm">', "<") + pos = page.find(self.part, pos) + title, pos = text.extract(page, ' - ', '<', pos) + count, pos = text.extract(page, 'Last Page (', ')', pos) + return { + "manga": manga, + "chapter": text.unquote(self.chapter), + "chapter_id": text.parse_int(self.chapter_id), + "title": title, + "count": text.parse_int(count, 1), + "lang": "en", + "language": "English", + } + + def images(self, page): + while True: + pos = page.index(' class="page"') + next_url = text.extract(page, ' href="', '"', pos)[0] + image_url = text.extract(page, ' src="', '"', pos)[0] + yield text.urljoin(self.root, image_url), None + page = self.request(text.urljoin(self.root, next_url)).text diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py new file mode 100644 index 0000000..4ad8da2 --- /dev/null +++ b/gallery_dl/extractor/mangoxo.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.mangoxo.com/""" + +from .common import Extractor, Message +from .. 
import text, exception +from ..cache import cache +import hashlib + + +class MangoxoExtractor(Extractor): + """Base class for mangoxo extractors""" + category = "mangoxo" + root = "https://www.mangoxo.com" + cookiedomain = "www.mangoxo.com" + cookienames = ("SESSION",) + _warning = True + + def login(self): + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + elif MangoxoExtractor._warning: + MangoxoExtractor._warning = False + self.log.warning("Unauthenticated users cannot see " + "more than 5 images per album") + + @cache(maxage=3*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + page = self.request(self.root + "/login/").text + token = text.extract(page, 'id="loginToken" value="', '"')[0] + if not token: + self.log.debug("failed to extract 'loginToken'") + + url = self.root + "/login/loginxmm" + headers = { + "X-Requested-With": "XMLHttpRequest", + "Referer": self.root + "/login", + } + data = { + "name": username, + "password": hashlib.md5(password.encode()).hexdigest(), + "loginToken": token, + } + response = self.request(url, method="POST", headers=headers, data=data) + + if response.json().get("result") != "1": + raise exception.AuthenticationError() + return {"SESSION": self.session.cookies.get("SESSION")} + + @staticmethod + def _total_pages(page): + return text.parse_int(text.extract(page, "total :", ",")[0]) + + +class MangoxoAlbumExtractor(MangoxoExtractor): + """Extractor for albums on mangoxo.com""" + subcategory = "album" + filename_fmt = "{album[id]}_{num:>03}.{extension}" + directory_fmt = ("{category}", "{channel[name]}", "{album[name]}") + archive_fmt = "{album[id]}_{num}" + pattern = r"(?:https?://)?(?:www\.)?mangoxo\.com/album/(\w+)" + test = ("https://www.mangoxo.com/album/lzVOv1Q9", { + "url": "ad921fe62663b06e7d73997f7d00646cab7bdd0d", + "keyword": { + "channel": { + "id": "QeYKRkO0", + "name": "美女图社", + "cover": str, + }, + "album": { + "id": "lzVOv1Q9", + "name": "池永康晟 Ikenaga Yasunari 透出古朴气息的日本美女人像画作", + "date": "2019.3.22 14:42", + "description": str, + }, + "num": int, + "count": 65, + }, + }) + + def __init__(self, match): + MangoxoExtractor.__init__(self, match) + self.album_id = match.group(1) + + def items(self): + self.login() + url = "{}/album/{}/".format(self.root, self.album_id) + page = self.request(url).text + data = self.metadata(page) + imgs = self.images(url, page) + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], image in enumerate(imgs, 1): + yield Message.Url, image, text.nameext_from_url(image, data) + + def metadata(self, page): + """Return general metadata""" + title, pos = text.extract(page, '<title>', '</title>') + count, pos = text.extract(page, 'id="pic-count">', '<', pos) + cover, pos = text.extract(page, ' src="', '"', pos) + cid , pos = text.extract(page, '//www.mangoxo.com/channel/', '"', pos) + cname, pos = text.extract(page, '>', '<', pos) + date , pos = text.extract(page, '</i>', '<', pos) + descr, pos = text.extract(page, '<pre>', '</pre>', pos) + + return { + "channel": { + "id": cid, + "name": text.unescape(cname), + "cover": cover, + }, + "album": { + "id": self.album_id, + "name": text.unescape(title), + "date": date.strip(), + "description": text.unescape(descr), + }, + "count": text.parse_int(count), + } + + def images(self, url, page): + """Generator; Yields all image URLs""" + total = self._total_pages(page) + num = 1 + + while True: + yield from text.extract_iter( 
+ page, 'class="lightgallery-item" href="', '"') + if num >= total: + return + num += 1 + page = self.request(url + str(num)).text + + +class MangoxoChannelExtractor(MangoxoExtractor): + """Extractor for all albums on a mangoxo channel""" + subcategory = "channel" + pattern = r"(?:https?://)?(?:www\.)?mangoxo\.com/channel/(\w+)" + test = ("https://www.mangoxo.com/channel/QeYKRkO0", { + "pattern": MangoxoAlbumExtractor.pattern, + "range": "1-30", + "count": "> 20", + }) + + def __init__(self, match): + MangoxoExtractor.__init__(self, match) + self.channel_id = match.group(1) + + def items(self): + self.login() + num = total = 1 + url = "{}/channel/{}/album/".format(self.root, self.channel_id) + yield Message.Version, 1 + + while True: + page = self.request(url + str(num)).text + + for album in text.extract_iter( + page, '<a class="link black" href="', '"'): + yield Message.Queue, album, {} + + if num == 1: + total = self._total_pages(page) + if num >= total: + return + num += 1 diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py new file mode 100644 index 0000000..28a2c2d --- /dev/null +++ b/gallery_dl/extractor/mastodon.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for mastodon instances""" + +from .common import Extractor, Message +from .. import text, config, exception +import re + + +class MastodonExtractor(Extractor): + """Base class for mastodon extractors""" + basecategory = "mastodon" + directory_fmt = ("mastodon", "{instance}", "{account[username]}") + filename_fmt = "{category}_{id}_{media[id]}.{extension}" + archive_fmt = "{media[id]}" + instance = None + root = None + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = MastodonAPI(self) + + def config(self, key, default=None, *, sentinel=object()): + value = Extractor.config(self, key, sentinel) + if value is not sentinel: + return value + return config.interpolate( + ("extractor", "mastodon", self.instance, self.subcategory, key), + default, + ) + + def items(self): + yield Message.Version, 1 + for status in self.statuses(): + attachments = self.prepare(status) + yield Message.Directory, status + for media in attachments: + status["media"] = media + url = media["url"] + yield Message.Url, url, text.nameext_from_url(url, status) + + def statuses(self): + """Return an iterable containing all relevant Status-objects""" + return () + + def prepare(self, status): + """Prepare a status object""" + status["instance"] = self.instance + status["tags"] = [tag["name"] for tag in status["tags"]] + attachments = status["media_attachments"] + del status["media_attachments"] + return attachments + + +class MastodonUserExtractor(MastodonExtractor): + """Extractor for all images of an account/user""" + subcategory = "user" + + def __init__(self, match): + MastodonExtractor.__init__(self, match) + self.account_name = match.group(1) + + def statuses(self): + results = self.api.account_search("@" + self.account_name, 1) + for account in results: + if account["username"] == self.account_name: + break + else: + raise exception.NotFoundError("account") + return self.api.account_statuses(account["id"]) + + +class MastodonStatusExtractor(MastodonExtractor): + """Extractor for images from a status""" + subcategory = "status" + + def __init__(self, match): + 
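+        # store the numeric status ID captured from the URL; statuses() passes it to the API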
MastodonExtractor.__init__(self, match) + self.status_id = match.group(1) + + def statuses(self): + return (self.api.status(self.status_id),) + + +class MastodonAPI(): + """Minimal interface for the Mastodon API + + https://github.com/tootsuite/mastodon + https://github.com/tootsuite/documentation/blob/master/Using-the-API/API.md + """ + + def __init__(self, extractor, access_token=None): + self.root = extractor.root + self.extractor = extractor + + if not access_token: + access_token = extractor.config( + "access-token", extractor.access_token) + self.headers = {"Authorization": "Bearer {}".format(access_token)} + + def account_search(self, query, limit=40): + """Search for content""" + params = {"q": query, "limit": limit} + return self._call("accounts/search", params) + + def account_statuses(self, account_id): + """Get an account's statuses""" + endpoint = "accounts/{}/statuses".format(account_id) + params = {"only_media": "1"} + return self._pagination(endpoint, params) + + def status(self, status_id): + """Fetch a Status""" + return self._call("statuses/" + status_id) + + def _call(self, endpoint, params=None): + url = "{}/api/v1/{}".format(self.root, endpoint) + response = self.extractor.request( + url, params=params, headers=self.headers) + return self._parse(response) + + def _pagination(self, endpoint, params): + url = "{}/api/v1/{}".format(self.root, endpoint) + while url: + response = self.extractor.request( + url, params=params, headers=self.headers) + yield from self._parse(response) + url = response.links.get("next", {}).get("url") + + @staticmethod + def _parse(response): + """Parse an API response""" + if response.status_code == 404: + raise exception.NotFoundError() + return response.json() + + +def generate_extractors(): + """Dynamically generate Extractor classes for Mastodon instances""" + + symtable = globals() + extractors = config.get(("extractor", "mastodon")) + if extractors: + EXTRACTORS.update(extractors) + config.set(("extractor", "mastodon"), EXTRACTORS) + + for instance, info in EXTRACTORS.items(): + + if not isinstance(info, dict): + continue + + category = info.get("category") or instance.replace(".", "") + root = info.get("root") or "https://" + instance + name = (info.get("name") or category).capitalize() + token = info.get("access-token") + pattern = info.get("pattern") or re.escape(instance) + + class Extr(MastodonUserExtractor): + pass + + Extr.__name__ = Extr.__qualname__ = name + "UserExtractor" + Extr.__doc__ = "Extractor for all images of a user on " + instance + Extr.category = category + Extr.instance = instance + Extr.pattern = (r"(?:https?://)?" + pattern + + r"/@([^/?&#]+)(?:/media)?/?$") + Extr.root = root + Extr.access_token = token + symtable[Extr.__name__] = Extr + + class Extr(MastodonStatusExtractor): + pass + + Extr.__name__ = Extr.__qualname__ = name + "StatusExtractor" + Extr.__doc__ = "Extractor for images from a status on " + instance + Extr.category = category + Extr.instance = instance + Extr.pattern = r"(?:https?://)?" 
+ pattern + r"/@[^/?&#]+/(\d+)" + Extr.root = root + Extr.access_token = token + symtable[Extr.__name__] = Extr + + +EXTRACTORS = { + "pawoo.net": { + "category" : "pawoo", + "access-token" : "286462927198d0cf3e24683e91c8259a" + "ac4367233064e0570ca18df2ac65b226", + "client-id" : "97b142b6904abf97a1068d51a7bc2f2f" + "cf9323cef81f13cb505415716dba7dac", + "client-secret": "e45bef4bad45b38abf7d9ef88a646b73" + "75e7fb2532c31a026327a93549236481", + }, +} + + +generate_extractors() diff --git a/gallery_dl/extractor/message.py b/gallery_dl/extractor/message.py new file mode 100644 index 0000000..1831620 --- /dev/null +++ b/gallery_dl/extractor/message.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2018 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + + +class Message(): + """Enum for message identifiers + + Extractors yield their results as message-tuples, where the first element + is one of the following identifiers. This message-identifier determines + the type and meaning of the other elements in such a tuple. + + - Message.Version: + - Message protocol version (currently always '1') + - 2nd element specifies the version of all following messages as integer + + - Message.Directory: + - Sets the target directory for all following images + - 2nd element is a dictionary containing general metadata + + - Message.Url: + - Image URL and its metadata + - 2nd element is the URL as a string + - 3rd element is a dictionary with image-specific metadata + + - Message.Headers: # obsolete + - HTTP headers to use while downloading + - 2nd element is a dictionary with header-name and -value pairs + + - Message.Cookies: # obsolete + - Cookies to use while downloading + - 2nd element is a dictionary with cookie-name and -value pairs + + - Message.Queue: + - (External) URL that should be handled by another extractor + - 2nd element is the (external) URL as a string + - 3rd element is a dictionary containing URL-specific metadata + + - Message.Urllist: + - Same as Message.Url, but its 2nd element is a list of multiple URLs + - The additional URLs serve as a fallback if the primary one fails + """ + + Version = 1 + Directory = 2 + Url = 3 + # Headers = 4 + # Cookies = 5 + Queue = 6 + Urllist = 7 diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py new file mode 100644 index 0000000..1515f53 --- /dev/null +++ b/gallery_dl/extractor/myportfolio.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.myportfolio.com/""" + +from .common import Extractor, Message +from .. 
import text + + +class MyportfolioGalleryExtractor(Extractor): + """Extractor for an image gallery on www.myportfolio.com""" + category = "myportfolio" + subcategory = "gallery" + directory_fmt = ("{category}", "{user}", "{title}") + filename_fmt = "{num:>02}.{extension}" + archive_fmt = "{user}_{filename}" + pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|" + r"(?:https?://)?([^.]+\.myportfolio\.com))" + r"(/[^/?&#]+)?") + test = ( + ("https://hannahcosgrove.myportfolio.com/robyn", { + "url": "93b5430e765e53564b13e7d9c64c30c286011a6b", + "keyword": "25cb3dbdad6b011242a133f30ec598318b7512e8", + }), + ("https://hannahcosgrove.myportfolio.com/lfw", { + "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$", + "count": ">= 8", + }), + ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", { + "count": 3, + }), + ("myportfolio:https://tooco.com.ar/", { + "count": ">= 40", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + domain1, domain2, self.path = match.groups() + self.domain = domain1 or domain2 + self.prefix = "myportfolio:" if domain1 else "" + + def items(self): + yield Message.Version, 1 + url = "https://" + self.domain + (self.path or "") + page = self.request(url).text + + projects = text.extract( + page, '<section class="project-covers', '</section>')[0] + + if projects: + data = {"_extractor": MyportfolioGalleryExtractor} + base = self.prefix + "https://" + self.domain + for path in text.extract_iter(projects, ' href="', '"'): + yield Message.Queue, base + path, data + else: + data = self.metadata(page) + imgs = self.images(page) + data["count"] = len(imgs) + yield Message.Directory, data + for data["num"], url in enumerate(imgs, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + @staticmethod + def metadata(page): + """Collect general image metadata""" + # og:title contains data as "<user> - <title>", but both + # <user> and <title> can contain a "-" as well, so we get the title + # from somewhere else and cut that amount from the og:title content + + user, pos = text.extract( + page, 'property=og:title content="', '"') + desc, pos = text.extract( + page, 'property=og:description content="', '"', pos) + title, pos = text.extract( + page, '<h1 ', '</h1>', pos) + + title = title.partition(">")[2] + user = user[:-len(title)-3] + + return { + "user": text.unescape(user), + "title": text.unescape(title), + "description": text.unescape(desc or ""), + } + + @staticmethod + def images(page): + """Extract and return a list of all image-urls""" + return list(text.extract_iter(page, 'js-lightbox" data-src="', '"')) diff --git a/gallery_dl/extractor/newgrounds.py b/gallery_dl/extractor/newgrounds.py new file mode 100644 index 0000000..9e0aaa3 --- /dev/null +++ b/gallery_dl/extractor/newgrounds.py @@ -0,0 +1,155 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.newgrounds.com/""" + +from .common import Extractor, Message +from .. 
import text +import json + + +class NewgroundsExtractor(Extractor): + """Base class for newgrounds extractors""" + category = "newgrounds" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{category}_{index}_{title}.{extension}" + archive_fmt = "{index}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + self.root = "https://{}.newgrounds.com".format(self.user) + + def items(self): + data = self.get_metadata() + yield Message.Version, 1 + yield Message.Directory, data + + for page_url in self.get_page_urls(): + image = self.parse_page_data(page_url) + image.update(data) + url = image["url"] + yield Message.Url, url, text.nameext_from_url(url, image) + + def get_metadata(self): + """Collect metadata for extractor-job""" + return {"user": self.user} + + def get_page_urls(self): + """Return urls of all relevant image pages""" + + def parse_page_data(self, page_url): + """Collect url and metadata from an image page""" + extr = text.extract_from(self.request(page_url).text) + full = text.extract_from(json.loads(extr('"full_image_text":', '});'))) + data = { + "description": text.unescape(extr(':description" content="', '"')), + "date" : extr('itemprop="datePublished" content="', '"'), + "rating" : extr('class="rated-', '"'), + "favorites" : text.parse_int(extr('id="faves_load">', '<')), + "score" : text.parse_float(extr('id="score_number">', '<')), + "url" : full('src="', '"'), + "title" : text.unescape(full('alt="', '"')), + "width" : text.parse_int(full('width="', '"')), + "height" : text.parse_int(full('height="', '"')), + } + + tags = text.split_html(extr('<dd class="tags momag">', '</dd>')) + tags.sort() + data["tags"] = tags + + data["date"] = text.parse_datetime(data["date"]) + data["index"] = text.parse_int( + data["url"].rpartition("/")[2].partition("_")[0]) + return data + + def _pagination(self, url): + headers = { + "Referer": self.root, + "X-Requested-With": "XMLHttpRequest", + "Accept": "application/json, text/javascript, */*; q=0.01", + } + + while True: + data = self.request(url, headers=headers).json() + + for year in data["sequence"]: + for item in data["years"][str(year)]["items"]: + page_url = text.extract(item, 'href="', '"')[0] + yield text.urljoin(self.root, page_url) + + if not data["more"]: + return + url = text.urljoin(self.root, data["more"]) + + +class NewgroundsUserExtractor(NewgroundsExtractor): + """Extractor for all images of a newgrounds user""" + subcategory = "user" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com(?:/art)?/?$" + test = ( + ("https://blitzwuff.newgrounds.com/art", { + "url": "24b19c4a135a09889fac7b46a74e427e4308d02b", + "keyword": "2aab0532a894ff3cf88dd01ce5c60f114011b268", + }), + ("https://blitzwuff.newgrounds.com/"), + ) + + def get_page_urls(self): + return self._pagination(self.root + "/art/page/1") + + +class NewgroundsImageExtractor(NewgroundsExtractor): + """Extractor for a single image from newgrounds.com""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:" + r"(?:www\.)?newgrounds\.com/art/view/([^/?&#]+)/[^/?&#]+" + r"|art\.ngfiles\.com/images/\d+/\d+_([^_]+)_([^.]+))") + test = ( + ("https://www.newgrounds.com/art/view/blitzwuff/ffx", { + "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818", + "keyword": "cbe90f8f32da4341938f59b08d70f76137028a7e", + "content": "cb067d6593598710292cdd340d350d14a26fe075", + }), + ("https://art.ngfiles.com/images/587000/587551_blitzwuff_ffx.png", { + "url": "e7778c4597a2fb74b46e5f04bb7fa1d80ca02818", + "keyword": 
"cbe90f8f32da4341938f59b08d70f76137028a7e", + }), + ) + + def __init__(self, match): + NewgroundsExtractor.__init__(self, match) + if match.group(2): + self.user = match.group(2) + self.page_url = "https://www.newgrounds.com/art/view/{}/{}".format( + self.user, match.group(3)) + else: + self.page_url = match.group(0) + + def get_page_urls(self): + return (self.page_url,) + + +class NewgroundsVideoExtractor(NewgroundsExtractor): + """Extractor for all videos of a newgrounds user""" + subcategory = "video" + filename_fmt = "{category}_{index}.{extension}" + pattern = r"(?:https?://)?([^.]+)\.newgrounds\.com/movies/?$" + test = ("https://twistedgrim.newgrounds.com/movies", { + "pattern": r"ytdl:https?://www\.newgrounds\.com/portal/view/\d+", + "count": ">= 29", + }) + + def get_page_urls(self): + return self._pagination(self.root + "/movies/page/1") + + def parse_page_data(self, page_url): + return { + "url" : "ytdl:" + page_url, + "index": text.parse_int(page_url.rpartition("/")[2]), + } diff --git a/gallery_dl/extractor/ngomik.py b/gallery_dl/extractor/ngomik.py new file mode 100644 index 0000000..8135a8a --- /dev/null +++ b/gallery_dl/extractor/ngomik.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters and entire manga from http://ngomik.in/""" + +from .common import ChapterExtractor +from .. import text +import re + + +class NgomikChapterExtractor(ChapterExtractor): + """Extractor for manga-chapters from ngomik.in""" + category = "ngomik" + root = "http://ngomik.in" + pattern = (r"(?:https?://)?(?:www\.)?ngomik\.in" + r"(/[^/?&#]+-chapter-[^/?&#]+)") + test = ( + ("https://www.ngomik.in/14-sai-no-koi-chapter-1-6/", { + "url": "8e67fdf751bbc79bc6f4dead7675008ddb8e32a4", + "keyword": "204d177f09d438fd50c9c28d98c73289194640d8", + }), + ("https://ngomik.in/break-blade-chapter-26/", { + "count": 34, + }), + ) + + def metadata(self, page): + info = text.extract(page, '<title>', "</title>")[0] + manga, _, chapter = info.partition(" Chapter ") + chapter, sep, minor = chapter.partition(" ")[0].partition(".") + + return { + "manga": text.unescape(manga), + "chapter": text.parse_int(chapter), + "chapter_minor": sep + minor, + "lang": "id", + "language": "Indonesian", + } + + @staticmethod + def images(page): + readerarea = text.extract(page, 'id=readerarea', 'class=chnav')[0] + return [ + (text.unescape(url), None) + for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea) + ] diff --git a/gallery_dl/extractor/nhentai.py b/gallery_dl/extractor/nhentai.py new file mode 100644 index 0000000..746144a --- /dev/null +++ b/gallery_dl/extractor/nhentai.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://nhentai.net/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text, util +import collections +import json + + +class NhentaiBase(): + """Base class for nhentai extractors""" + category = "nhentai" + root = "https://nhentai.net" + media_url = "https://i.nhentai.net" + + +class NhentaiGalleryExtractor(NhentaiBase, GalleryExtractor): + """Extractor for image galleries from nhentai.net""" + pattern = r"(?:https?://)?nhentai\.net(/g/(\d+))" + test = ("https://nhentai.net/g/147850/", { + "url": "5179dbf0f96af44005a0ff705a0ad64ac26547d0", + "keyword": { + "title" : r"re:\[Morris\] Amazon no Hiyaku \| Amazon Elixir", + "title_en" : str, + "title_ja" : str, + "gallery_id": 147850, + "media_id" : 867789, + "count" : 16, + "date" : 1446050915, + "scanlator" : "", + "artist" : ["morris"], + "group" : list, + "parody" : list, + "characters": list, + "tags" : list, + "type" : "manga", + "lang" : "en", + "language" : "English", + "width" : int, + "height" : int, + }, + }) + + def __init__(self, match): + GalleryExtractor.__init__(self, match) + self.gallery_id = match.group(2) + self.data = None + + def metadata(self, page): + data = json.loads(text.extract(page, "N.gallery(", ");")[0]) + self.data = data + + title_en = data["title"].get("english", "") + title_ja = data["title"].get("japanese", "") + + info = collections.defaultdict(list) + for tag in data["tags"]: + info[tag["type"]].append(tag["name"]) + + language = "" + for language in info["language"]: + if language != "translated": + language = language.capitalize() + break + + return { + "title" : title_en or title_ja, + "title_en" : title_en, + "title_ja" : title_ja, + "gallery_id": data["id"], + "media_id" : text.parse_int(data["media_id"]), + "date" : data["upload_date"], + "scanlator" : data["scanlator"], + "artist" : info["artist"], + "group" : info["group"], + "parody" : info["parody"], + "characters": info["character"], + "tags" : info["tag"], + "type" : info["category"][0] if info["category"] else "", + "lang" : util.language_to_code(language), + "language" : language, + } + + def images(self, _): + ufmt = "{}/galleries/{}/{{}}.{{}}".format( + self.media_url, self.data["media_id"]) + extdict = {"j": "jpg", "p": "png", "g": "gif"} + + return [ + (ufmt.format(num, extdict.get(img["t"], "jpg")), { + "width": img["w"], "height": img["h"], + }) + for num, img in enumerate(self.data["images"]["pages"], 1) + ] + + +class NhentaiSearchExtractor(NhentaiBase, Extractor): + """Extractor for nhentai search results""" + category = "nhentai" + subcategory = "search" + pattern = r"(?:https?://)?nhentai\.net/search/?\?([^#]+)" + test = ("https://nhentai.net/search/?q=touhou", { + "pattern": NhentaiGalleryExtractor.pattern, + "count": 30, + "range": "1-30", + }) + + def __init__(self, match): + Extractor.__init__(self, match) + self.params = text.parse_query(match.group(1)) + + def items(self): + yield Message.Version, 1 + data = {"_extractor": NhentaiGalleryExtractor} + for gallery_id in self._pagination(self.params): + url = "{}/g/{}/".format(self.root, gallery_id) + yield Message.Queue, url, data + + def _pagination(self, params): + url = "{}/search/".format(self.root) + params["page"] = text.parse_int(params.get("page"), 1) + + while True: + page = self.request(url, params=params).text + yield from text.extract_iter(page, 'href="/g/', '/') + if 'class="next"' not in page: + return + params["page"] += 1 diff --git a/gallery_dl/extractor/nijie.py b/gallery_dl/extractor/nijie.py new file mode 100644 index 0000000..abf1eaa --- /dev/null +++ b/gallery_dl/extractor/nijie.py @@ -0,0 +1,205 @@ +# -*- coding: utf-8 
-*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://nijie.info/""" + +from .common import Extractor, Message, AsynchronousMixin +from .. import text, exception +from ..cache import cache + + +class NijieExtractor(AsynchronousMixin, Extractor): + """Base class for nijie extractors""" + category = "nijie" + directory_fmt = ("{category}", "{user_id}") + filename_fmt = "{category}_{artist_id}_{image_id}_p{index:>02}.{extension}" + archive_fmt = "{image_id}_{index}" + cookiedomain = "nijie.info" + cookienames = ("nemail", "nlogin") + root = "https://nijie.info" + view_url = "https://nijie.info/view.php?id=" + popup_url = "https://nijie.info/view_popup.php?id=" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user_id = match.group(1) + self.session.headers["Referer"] = self.root + "/" + + def items(self): + self.login() + data = self.get_job_metadata() + + yield Message.Version, 1 + yield Message.Directory, data + + for image_id in self.get_image_ids(): + for image_url, image_data in self.get_image_data(image_id): + image_data.update(data) + if not image_data["extension"]: + image_data["extension"] = "jpg" + yield Message.Url, image_url, image_data + + def get_job_metadata(self): + """Collect metadata for extractor-job""" + return {"user_id": text.parse_int(self.user_id)} + + def get_image_ids(self): + """Collect all relevant image-ids""" + + def get_image_data(self, image_id): + """Get URL and metadata for images specified by 'image_id'""" + page = self.request(self.view_url + image_id).text + return self.extract_image_data(page, image_id) + + def extract_image_data(self, page, image_id): + """Get URL and metadata for images from 'page'""" + title, pos = text.extract( + page, '<meta property="og:title" content="', '"') + description, pos = text.extract( + page, '<meta property="og:description" content="', '"', pos) + artist_id, pos = text.extract( + page, '"sameAs": "https://nijie.info/members.php?id=', '"', pos) + images = list(text.extract_iter( + page, '<a href="./view_popup.php', '</a>', pos)) + + title = title.rpartition("|")[0].strip() + image_id = text.parse_int(image_id) + artist_id = text.parse_int(artist_id) + + for index, image in enumerate(images): + url = "https:" + text.extract(image, 'src="', '"')[0] + url = url.replace("/__rs_l120x120/", "/", 1) + + yield url, text.nameext_from_url(url, { + "index": index, + "count": len(images), + "title": title, + "description": description, + "image_id": image_id, + "artist_id": artist_id, + }) + + def login(self): + """Login and obtain session cookies""" + if not self._check_cookies(self.cookienames): + username, password = self._get_auth_info() + self._update_cookies(self._login_impl(username, password)) + + @cache(maxage=150*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + url = "{}/login_int.php".format(self.root) + data = {"email": username, "password": password, "save": "on"} + + response = self.request(url, method="POST", data=data) + if "//nijie.info/login.php" in response.text: + raise exception.AuthenticationError() + return self.session.cookies + + def _pagination(self, path): + url = "{}/{}.php".format(self.root, path) + params = {"id": self.user_id, "p": 1} + + while True: + response = self.request(url, params=params, expect=(404,)) + 
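+            # a 404 response (allowed through via 'expect') means the artist does not exist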
if response.status_code == 404: + raise exception.NotFoundError("artist") + + page = response.text + ids = list(text.extract_iter(page, ' illust_id="', '"')) + yield from ids + + if '<a rel="next"' not in page: + return + params["p"] += 1 + + +class NijieUserExtractor(NijieExtractor): + """Extractor for works of a nijie-user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" + r"/members(?:_illust)?\.php\?id=(\d+)") + test = ( + ("https://nijie.info/members_illust.php?id=44", { + "url": "585d821df4716b1098660a0be426d01db4b65f2a", + "keyword": "d629c69e3172db1d7e026145e8eb640ac31ac16a", + }), + ("https://nijie.info/members_illust.php?id=43", { + "exception": exception.NotFoundError, + }), + ("https://nijie.info/members.php?id=44"), + ) + + def get_image_ids(self): + return self._pagination("members_illust") + + +class NijieDoujinExtractor(NijieExtractor): + """Extractor for doujin entries of a nijie-user""" + subcategory = "doujin" + pattern = (r"(?:https?://)?(?:www\.)?nijie\.info/" + r"members_dojin\.php\?id=(\d+)") + test = ("https://nijie.info/members_dojin.php?id=6782", { + "count": ">= 18", + }) + + def get_image_ids(self): + return self._pagination("members_dojin") + + +class NijieFavoriteExtractor(NijieExtractor): + """Extractor for all favorites/bookmarks of a nijie-user""" + subcategory = "favorite" + directory_fmt = ("{category}", "bookmarks", "{user_id}") + archive_fmt = "f_{user_id}_{image_id}_{index}" + pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" + r"/user_like_illust_view\.php\?id=(\d+)") + test = ("https://nijie.info/user_like_illust_view.php?id=44", { + "count": ">= 16", + }) + + def get_image_ids(self): + return self._pagination("user_like_illust_view") + + +class NijieImageExtractor(NijieExtractor): + """Extractor for a work/image from nijie.info""" + subcategory = "image" + pattern = (r"(?:https?://)?(?:www\.)?nijie\.info" + r"/view(?:_popup)?\.php\?id=(\d+)") + test = ( + ("https://nijie.info/view.php?id=70720", { + "url": "a10d4995645b5f260821e32c60a35f73546c2699", + "keyword": "408393d010307c76d52cbd0a4368d6d357805aea", + "content": "d85e3ea896ed5e4da0bca2390ad310a4df716ca6", + }), + ("https://nijie.info/view.php?id=70724", { + "exception": exception.NotFoundError, + }), + ("https://nijie.info/view_popup.php?id=70720"), + ) + + def __init__(self, match): + NijieExtractor.__init__(self, match) + self.image_id = match.group(1) + self.page = "" + + def get_job_metadata(self): + response = self.request(self.view_url + self.image_id, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError("image") + self.page = response.text + self.user_id = text.extract( + self.page, '"sameAs": "https://nijie.info/members.php?id=', '"')[0] + return NijieExtractor.get_job_metadata(self) + + def get_image_ids(self): + return (self.image_id,) + + def get_image_data(self, _): + return self.extract_image_data(self.page, self.image_id) diff --git a/gallery_dl/extractor/nsfwalbum.py b/gallery_dl/extractor/nsfwalbum.py new file mode 100644 index 0000000..c55f80a --- /dev/null +++ b/gallery_dl/extractor/nsfwalbum.py @@ -0,0 +1,62 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://nsfwalbum.com/""" + +from .common import GalleryExtractor +from .. 
import text + + +class NsfwalbumAlbumExtractor(GalleryExtractor): + """Extractor for image albums on nsfwalbum.com""" + category = "nsfwalbum" + subcategory = "album" + root = "https://nsfwalbum.com" + filename_fmt = "{album_id}_{page:>03}_{id}.{extension}" + directory_fmt = ("{category}", "{album_id} {title}") + archive_fmt = "{id}" + pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))" + test = ("https://nsfwalbum.com/album/295201", { + "range": "1-5", + "url": "e60eced1873215f5deee1ca7226d60cb4dcc051c", + "keyword": "e0573ecb1966611e96d10172a3ca1db1078a7984", + }) + + def __init__(self, match): + self.album_id = match.group(2) + GalleryExtractor.__init__(self, match) + + def metadata(self, page): + extr = text.extract_from(page) + return { + "album_id": text.parse_int(self.album_id), + "title" : text.unescape(extr('<h6>', '</h6>')), + "models" : text.split_html(extr('"models"> Models:', '</div>')), + "studio" : text.remove_html(extr('"models"> Studio:', '</div>')), + } + + def images(self, page): + iframe = self.root + "/iframe_image.php?id=" + backend = self.root + "/backend.php" + for image_id in text.extract_iter(page, 'data-img-id="', '"'): + spirit = text.extract(self.request( + iframe + image_id).text, 'giraffe.annihilate("', '"')[0] + params = {"spirit": self._annihilate(spirit), "photo": image_id} + data = self.request(backend, params=params).json() + yield data[0], { + "id" : text.parse_int(image_id), + "width" : text.parse_int(data[1]), + "height": text.parse_int(data[2]), + } + + @staticmethod + def _annihilate(value, base=6): + return "".join( + chr(ord(char) ^ base) + for char in value + ) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py new file mode 100644 index 0000000..e26eae1 --- /dev/null +++ b/gallery_dl/extractor/oauth.py @@ -0,0 +1,375 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Utility classes to setup OAuth and link a users account to gallery-dl""" + +from .common import Extractor, Message +from . import deviantart, flickr, reddit, smugmug, tumblr +from .. import text, oauth, config, exception +from ..cache import cache +import os +import urllib.parse + + +class OAuthBase(Extractor): + """Base class for OAuth Helpers""" + category = "oauth" + redirect_uri = "http://localhost:6414/" + + def __init__(self, match): + Extractor.__init__(self, match) + self.client = None + + def oauth_config(self, key, default=None): + return config.interpolate( + ("extractor", self.subcategory, key), default) + + def recv(self): + """Open local HTTP server and recv callback parameters""" + import socket + print("Waiting for response. 
(Cancel with Ctrl+c)") + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + server.bind(("localhost", 6414)) + server.listen(1) + + # workaround for ctrl+c not working during server.accept on Windows + if os.name == "nt": + server.settimeout(1.0) + while True: + try: + self.client = server.accept()[0] + break + except socket.timeout: + pass + server.close() + + data = self.client.recv(1024).decode() + path = data.split(" ", 2)[1] + return text.parse_query(path.partition("?")[2]) + + def send(self, msg): + """Send 'msg' to the socket opened in 'recv()'""" + print(msg) + self.client.send(b"HTTP/1.1 200 OK\r\n\r\n" + msg.encode()) + self.client.close() + + def open(self, url, params): + """Open 'url' in browser amd return response parameters""" + import webbrowser + url += "?" + urllib.parse.urlencode(params) + if not self.config("browser", True) or not webbrowser.open(url): + print("Please open this URL in your browser:") + print(url, end="\n\n", flush=True) + return self.recv() + + def _oauth1_authorization_flow( + self, request_token_url, authorize_url, access_token_url): + """Perform the OAuth 1.0a authorization flow""" + # get a request token + params = {"oauth_callback": self.redirect_uri} + data = self.session.get(request_token_url, params=params).text + + data = text.parse_query(data) + self.session.auth.token_secret = data["oauth_token_secret"] + + # get the user's authorization + params = {"oauth_token": data["oauth_token"], "perms": "read"} + data = self.open(authorize_url, params) + + # exchange the request token for an access token + data = self.session.get(access_token_url, params=data).text + + data = text.parse_query(data) + self.send(OAUTH1_MSG_TEMPLATE.format( + category=self.subcategory, + token=data["oauth_token"], + token_secret=data["oauth_token_secret"], + )) + + def _oauth2_authorization_code_grant( + self, client_id, client_secret, auth_url, token_url, + scope="read", key="refresh_token", auth=True, + message_template=None): + """Perform an OAuth2 authorization code grant""" + + state = "gallery-dl_{}_{}".format( + self.subcategory, + oauth.nonce(8), + ) + + auth_params = { + "client_id": client_id, + "response_type": "code", + "state": state, + "redirect_uri": self.redirect_uri, + "duration": "permanent", + "scope": scope, + } + + # receive an authorization code + params = self.open(auth_url, auth_params) + + # check authorization response + if state != params.get("state"): + self.send("'state' mismatch: expected {}, got {}.".format( + state, params.get("state") + )) + return + if "error" in params: + self.send(params["error"]) + return + + # exchange the authorization code for a token + data = { + "grant_type": "authorization_code", + "code": params["code"], + "redirect_uri": self.redirect_uri, + } + + if auth: + auth = (client_id, client_secret) + else: + auth = None + data["client_id"] = client_id + data["client_secret"] = client_secret + + data = self.session.post(token_url, data=data, auth=auth).json() + + # check token response + if "error" in data: + self.send(data["error"]) + return + + # display token + part = key.partition("_")[0] + template = message_template or OAUTH2_MSG_TEMPLATE + self.send(template.format( + category=self.subcategory, + key=part, + Key=part.capitalize(), + token=data[key], + instance=getattr(self, "instance", ""), + client_id=client_id, + client_secret=client_secret, + )) + + +class OAuthDeviantart(OAuthBase): + subcategory = "deviantart" + pattern = 
"oauth:deviantart$" + redirect_uri = "https://mikf.github.io/gallery-dl/oauth-redirect.html" + + def items(self): + yield Message.Version, 1 + + self._oauth2_authorization_code_grant( + self.oauth_config( + "client-id", deviantart.DeviantartAPI.CLIENT_ID), + self.oauth_config( + "client-secret", deviantart.DeviantartAPI.CLIENT_SECRET), + "https://www.deviantart.com/oauth2/authorize", + "https://www.deviantart.com/oauth2/token", + scope="browse", + ) + + +class OAuthFlickr(OAuthBase): + subcategory = "flickr" + pattern = "oauth:flickr$" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.session = oauth.OAuth1Session( + self.oauth_config("api-key", flickr.FlickrAPI.API_KEY), + self.oauth_config("api-secret", flickr.FlickrAPI.API_SECRET), + ) + + def items(self): + yield Message.Version, 1 + + self._oauth1_authorization_flow( + "https://www.flickr.com/services/oauth/request_token", + "https://www.flickr.com/services/oauth/authorize", + "https://www.flickr.com/services/oauth/access_token", + ) + + +class OAuthReddit(OAuthBase): + subcategory = "reddit" + pattern = "oauth:reddit$" + + def items(self): + yield Message.Version, 1 + + self.session.headers["User-Agent"] = reddit.RedditAPI.USER_AGENT + self._oauth2_authorization_code_grant( + self.oauth_config("client-id", reddit.RedditAPI.CLIENT_ID), + "", + "https://www.reddit.com/api/v1/authorize", + "https://www.reddit.com/api/v1/access_token", + scope="read", + ) + + +class OAuthSmugmug(OAuthBase): + subcategory = "smugmug" + pattern = "oauth:smugmug$" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.session = oauth.OAuth1Session( + self.oauth_config("api-key", smugmug.SmugmugAPI.API_KEY), + self.oauth_config("api-secret", smugmug.SmugmugAPI.API_SECRET), + ) + + def items(self): + yield Message.Version, 1 + + self._oauth1_authorization_flow( + "https://api.smugmug.com/services/oauth/1.0a/getRequestToken", + "https://api.smugmug.com/services/oauth/1.0a/authorize", + "https://api.smugmug.com/services/oauth/1.0a/getAccessToken", + ) + + +class OAuthTumblr(OAuthBase): + subcategory = "tumblr" + pattern = "oauth:tumblr$" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.session = oauth.OAuth1Session( + self.oauth_config("api-key", tumblr.TumblrAPI.API_KEY), + self.oauth_config("api-secret", tumblr.TumblrAPI.API_SECRET), + ) + + def items(self): + yield Message.Version, 1 + + self._oauth1_authorization_flow( + "https://www.tumblr.com/oauth/request_token", + "https://www.tumblr.com/oauth/authorize", + "https://www.tumblr.com/oauth/access_token", + ) + + +class OAuthMastodon(OAuthBase): + subcategory = "mastodon" + pattern = "oauth:mastodon:(?:https?://)?([^/?&#]+)" + + def __init__(self, match): + OAuthBase.__init__(self, match) + self.instance = match.group(1) + + def items(self): + yield Message.Version, 1 + + application = self.oauth_config(self.instance) + if not application: + application = self._register(self.instance) + + self._oauth2_authorization_code_grant( + application["client-id"], + application["client-secret"], + "https://{}/oauth/authorize".format(self.instance), + "https://{}/oauth/token".format(self.instance), + key="access_token", + message_template=MASTODON_MSG_TEMPLATE, + ) + + @cache(maxage=10*365*24*3600, keyarg=1) + def _register(self, instance): + self.log.info("Registering application for '%s'", instance) + + url = "https://{}/api/v1/apps".format(instance) + data = { + "client_name": "gdl:" + oauth.nonce(8), + "redirect_uris": self.redirect_uri, + "scopes": 
"read", + } + data = self.session.post(url, data=data).json() + + if "client_id" not in data or "client_secret" not in data: + self.log.error("Failed to register new application: '%s'", data) + raise exception.StopExtraction() + + data["client-id"] = data.pop("client_id") + data["client-secret"] = data.pop("client_secret") + + self.log.info("client-id:\n%s", data["client-id"]) + self.log.info("client-secret:\n%s", data["client-secret"]) + + return data + + +OAUTH1_MSG_TEMPLATE = """ +Your Access Token and Access Token Secret are + +{token} +{token_secret} + +Put these values into your configuration file as +'extractor.{category}.access-token' and +'extractor.{category}.access-token-secret'. + +Example: +{{ + "extractor": {{ + "{category}": {{ + "access-token": "{token}", + "access-token-secret": "{token_secret}" + }} + }} +}} +""" + + +OAUTH2_MSG_TEMPLATE = """ +Your {Key} Token is + +{token} + +Put this value into your configuration file as +'extractor.{category}.{key}-token'. + +Example: +{{ + "extractor": {{ + "{category}": {{ + "{key}-token": "{token}" + }} + }} +}} +""" + + +MASTODON_MSG_TEMPLATE = """ +Your {Key} Token is + +{token} + +Put this value into your configuration file as +'extractor.mastodon.{instance}.{key}-token'. + +You can also add your 'client-id' and 'client-secret' values +if you want to register another account in the future. + +Example: +{{ + "extractor": {{ + "mastodon": {{ + "{instance}": {{ + "{key}-token": "{token}", + "client-id": "{client_id}", + "client-secret": "{client_secret}" + }} + }} + }} +}} +""" diff --git a/gallery_dl/extractor/paheal.py b/gallery_dl/extractor/paheal.py new file mode 100644 index 0000000..a4731d0 --- /dev/null +++ b/gallery_dl/extractor/paheal.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://rule34.paheal.net/""" + +from .common import Extractor, Message, SharedConfigMixin +from .. 
import text + + +class PahealExtractor(SharedConfigMixin, Extractor): + """Base class for paheal extractors""" + basecategory = "booru" + category = "paheal" + filename_fmt = "{category}_{id}_{md5}.{extension}" + archive_fmt = "{id}" + root = "https://rule34.paheal.net" + + def items(self): + yield Message.Version, 1 + yield Message.Directory, self.get_metadata() + + for data in self.get_posts(): + url = data["file_url"] + for key in ("id", "width", "height"): + data[key] = text.parse_int(data[key]) + data["tags"] = text.unquote(data["tags"]) + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_metadata(self): + """Return general metadata""" + return {} + + def get_posts(self): + """Return an iterable containing data of all relevant posts""" + + +class PahealTagExtractor(PahealExtractor): + """Extractor for images from rule34.paheal.net by search-tags""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" + r"/post/list/([^/?&#]+)") + test = ("https://rule34.paheal.net/post/list/k-on/1", { + "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", + "count": ">= 15" + }) + per_page = 70 + + def __init__(self, match): + PahealExtractor.__init__(self, match) + self.tags = text.unquote(match.group(1)) + + def get_metadata(self): + return {"search_tags": self.tags} + + def get_posts(self): + pnum = 1 + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + page = self.request(url).text + + for post in text.extract_iter( + page, '<img id="thumb_', '>Image Only<'): + yield self._extract_data(post) + + if ">Next<" not in page: + return + pnum += 1 + + @staticmethod + def _extract_data(post): + pid , pos = text.extract(post, '', '"') + data, pos = text.extract(post, 'title="', '"', pos) + md5 , pos = text.extract(post, '/_thumbs/', '/', pos) + url , pos = text.extract(post, '<a href="', '"', pos) + + tags, dimensions, size, _ = data.split(" // ") + width, _, height = dimensions.partition("x") + + return { + "id": pid, "md5": md5, "tags": tags, "file_url": url, + "width": width, "height": height, + "size": text.parse_bytes(size[:-1]), + } + + +class PahealPostExtractor(PahealExtractor): + """Extractor for single images from rule34.paheal.net""" + subcategory = "post" + pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" + r"/post/view/(\d+)") + test = ("https://rule34.paheal.net/post/view/481609", { + "url": "1142779378f655ec0497d4c301836aa667f788b1", + "keyword": "34e9e93d4fa6fa06fac1a56e78c9a52e8cd7b271", + "content": "7b924bcf150b352ac75c9d281d061e174c851a11", + }) + + def __init__(self, match): + PahealExtractor.__init__(self, match) + self.post_id = match.group(1) + + def get_posts(self): + url = "{}/post/view/{}".format(self.root, self.post_id) + page = self.request(url).text + + tags , pos = text.extract(page, ": ", "<") + md5 , pos = text.extract(page, "/_thumbs/", "/", pos) + url , pos = text.extract(page, "id='main_image' src='", "'", pos) + width , pos = text.extract(page, "data-width='", "'", pos) + height, pos = text.extract(page, "data-height='", "'", pos) + + return ({ + "id": self.post_id, "md5": md5, "tags": tags, "file_url": url, + "width": width, "height": height, "size": 0, + },) diff --git a/gallery_dl/extractor/patreon.py b/gallery_dl/extractor/patreon.py new file mode 100644 index 0000000..4884497 --- /dev/null +++ b/gallery_dl/extractor/patreon.py @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann 
+# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.patreon.com/""" + +from .common import Extractor, Message +from .. import text +from ..cache import memcache + + +class PatreonExtractor(Extractor): + """Base class for patreon extractors""" + category = "patreon" + root = "https://www.patreon.com" + directory_fmt = ("{category}", "{creator[full_name]}") + filename_fmt = "{id}_{title}_{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + _warning = True + + def items(self): + yield Message.Version, 1 + + if self._warning: + if "session_id" not in self.session.cookies: + self.log.warning("no 'session_id' cookie set") + PatreonExtractor._warning = False + + for post in self.posts(): + yield Message.Directory, post + + post["num"] = 0 + content = post.get("content") + postfile = post.get("post_file") + + for url in text.extract_iter(content or "", 'src="', '"'): + post["num"] += 1 + yield Message.Url, url, text.nameext_from_url(url, post) + + if postfile: + post["num"] += 1 + text.nameext_from_url(postfile["name"], post) + yield Message.Url, postfile["url"], post + + for attachment in post["attachments"]: + post["num"] += 1 + text.nameext_from_url(attachment["name"], post) + yield Message.Url, attachment["url"], post + + def posts(self): + """Return all relevant post objects""" + + def _pagination(self, url): + headers = {"Referer": self.root} + empty = [] + + while url: + posts = self.request(url, headers=headers).json() + + if "included" not in posts: + return + + # collect attachments + attachments = {} + for inc in posts["included"]: + if inc["type"] == "attachment": + attachments[inc["id"]] = inc["attributes"] + + # update posts + for post in posts["data"]: + attr = post["attributes"] + attr["id"] = text.parse_int(post["id"]) + attr["date"] = text.parse_datetime( + attr["published_at"], "%Y-%m-%dT%H:%M:%S.%f%z") + attr["creator"] = self._user( + post["relationships"]["user"]["links"]["related"]) + + # add attachments to post attributes + files = post["relationships"].get("attachments") + if files: + attr["attachments"] = [ + attachments[f["id"]] + for f in files["data"] + ] + else: + attr["attachments"] = empty + + yield attr + + if "links" not in posts: + return + url = posts["links"].get("next") + + @memcache(keyarg=1) + def _user(self, url): + user = self.request(url).json()["data"] + attr = user["attributes"] + attr["id"] = user["id"] + attr["date"] = text.parse_datetime( + attr["created"], "%Y-%m-%dT%H:%M:%S.%f%z") + return attr + + @staticmethod + def _build_url(endpoint, query): + return ( + "https://www.patreon.com/api/" + endpoint + + + "?include=user,attachments,user_defined_tags,campaign,poll.choices" + ",poll.current_user_responses.user,poll.current_user_responses.cho" + "ice,poll.current_user_responses.poll,access_rules.tier.null" + + "&fields[post]=change_visibility_at,comment_count,content,current_" + "user_can_delete,current_user_can_view,current_user_has_liked,embe" + "d,image,is_paid,like_count,min_cents_pledged_to_view,post_file,pu" + "blished_at,patron_count,patreon_url,post_type,pledge_url,thumbnai" + "l_url,teaser_text,title,upgrade_url,url,was_posted_by_campaign_ow" + "ner" + "&fields[user]=image_url,full_name,url" + "&fields[campaign]=avatar_photo_url,earnings_visibility,is_nsfw,is" + "_monthly,name,url" + "&fields[access_rule]=access_rule_type,amount_cents" + query + + + 
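+            # default includes are disabled below, so every field and relation is requested explicitly above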
"&json-api-use-default-includes=false" + "&json-api-version=1.0" + ) + + +class PatreonCreatorExtractor(PatreonExtractor): + """Extractor for a creator's works""" + subcategory = "creator" + pattern = (r"(?:https?://)?(?:www\.)?patreon\.com" + r"/(?!(?:home|join|login|signup)(?:$|[/?&#]))([^/?&#]+)/?") + test = ("https://www.patreon.com/koveliana", { + "range": "1-25", + "count": ">= 25", + "keyword": { + "attachments": list, + "comment_count": int, + "content": str, + "creator": dict, + "date": "type:datetime", + "id": int, + "like_count": int, + "post_type": str, + "published_at": str, + "title": str, + }, + }) + + def __init__(self, match): + PatreonExtractor.__init__(self, match) + self.creator = match.group(1).lower() + + def posts(self): + url = "{}/{}".format(self.root, self.creator) + page = self.request(url).text + campaign_id = text.extract(page, "/campaign/", "/")[0] + + url = self._build_url("posts", ( + "&sort=-published_at" + "&filter[is_draft]=false" + "&filter[contains_exclusive_posts]=true" + "&filter[campaign_id]=" + campaign_id + )) + return self._pagination(url) + + +class PatreonUserExtractor(PatreonExtractor): + """Extractor for media from creators supported by you""" + subcategory = "user" + pattern = r"(?:https?://)?(?:www\.)?patreon\.com/home$" + test = ("https://www.patreon.com/home",) + + def posts(self): + url = self._build_url("stream", ( + "&page[cursor]=null" + "&filter[is_following]=true" + )) + return self._pagination(url) diff --git a/gallery_dl/extractor/photobucket.py b/gallery_dl/extractor/photobucket.py new file mode 100644 index 0000000..83f75a3 --- /dev/null +++ b/gallery_dl/extractor/photobucket.py @@ -0,0 +1,178 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://photobucket.com/""" + +from .common import Extractor, Message +from .. 
import text, exception +import base64 +import json + + +class PhotobucketAlbumExtractor(Extractor): + """Extractor for albums on photobucket.com""" + category = "photobucket" + subcategory = "album" + directory_fmt = ("{category}", "{username}", "{location}") + filename_fmt = "{offset:>03}{pictureId:?_//}_{titleOrFilename}.{extension}" + archive_fmt = "{id}" + pattern = (r"(?:https?://)?((?:[^.]+\.)?photobucket\.com)" + r"/user/[^/?&#]+/library/[^?&#]*") + test = ( + ("https://s258.photobucket.com/user/focolandia/library/", { + "pattern": r"https?://[oi]+\d+.photobucket.com/albums/hh280/", + "count": ">= 39" + }), + # subalbums of main "directory" + ("https://s271.photobucket.com/user/lakerfanryan/library/", { + "options": (("image-filter", "False"),), + "pattern": pattern, + "count": 1, + }), + # subalbums of subalbum without images + ("https://s271.photobucket.com/user/lakerfanryan/library/Basketball", { + "pattern": pattern, + "count": ">= 9", + }), + # private (missing JSON data) + ("https://s1277.photobucket.com/user/sinisterkat44/library/", { + "count": 0, + }), + ("https://s1110.photobucket.com/user/chndrmhn100/library/" + "Chandu%20is%20the%20King?sort=3&page=1"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.album_path = "" + self.root = "https://" + match.group(1) + self.session.headers["Referer"] = self.url + + def items(self): + yield Message.Version, 1 + for image in self.images(): + image["titleOrFilename"] = text.unescape(image["titleOrFilename"]) + image["title"] = text.unescape(image["title"]) + image["extension"] = image["ext"] + yield Message.Directory, image + yield Message.Url, image["fullsizeUrl"], image + + if self.config("subalbums", True): + for album in self.subalbums(): + album["_extractor"] = PhotobucketAlbumExtractor + yield Message.Queue, album["url"], album + + def images(self): + """Yield all images of the current album""" + url = self.url + params = {"sort": "3", "page": 1} + + while True: + page = self.request(url, params=params).text + json_data = text.extract(page, "collectionData:", ",\n")[0] + if not json_data: + msg = text.extract(page, 'libraryPrivacyBlock">', "</div>")[0] + msg = ' ("{}")'.format(text.remove_html(msg)) if msg else "" + self.log.error("Unable to get JSON data%s", msg) + return + data = json.loads(json_data) + + yield from data["items"]["objects"] + + if data["total"] <= data["offset"] + data["pageSize"]: + self.album_path = data["currentAlbumPath"] + return + params["page"] += 1 + + def subalbums(self): + """Return all subalbum objects""" + url = self.root + "/component/Albums-SubalbumList" + params = { + "albumPath": self.album_path, + "fetchSubAlbumsOnly": "true", + "deferCollapsed": "true", + "json": "1", + } + + data = self.request(url, params=params).json() + return data["body"].get("subAlbums", ()) + + +class PhotobucketImageExtractor(Extractor): + """Extractor for individual images from photobucket.com""" + category = "photobucket" + subcategory = "image" + directory_fmt = ("{category}", "{username}") + filename_fmt = "{pictureId:?/_/}{titleOrFilename}.{extension}" + archive_fmt = "{username}_{id}" + pattern = (r"(?:https?://)?(?:[^.]+\.)?photobucket\.com" + r"(?:/gallery/user/([^/?&#]+)/media/([^/?&#]+)" + r"|/user/([^/?&#]+)/media/[^?&#]+\.html)") + test = ( + (("https://s271.photobucket.com/user/lakerfanryan" + "/media/Untitled-3-1.jpg.html"), { + "url": "3b647deeaffc184cc48c89945f67574559c9051f", + "keyword": "a2de4e60d584912537b8025c01bdd1d20bdea735", + }), + 
(("https://s271.photobucket.com/user/lakerfanryan" + "/media/IsotopeswBros.jpg.html?sort=3&o=2"), { + "url": "12c1890c09c9cdb8a88fba7eec13f324796a8d7b", + "keyword": "61200a223df6c06f45ac3d30c88b3f5b048ce9a8", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) or match.group(3) + self.media_id = match.group(2) + self.session.headers["Referer"] = self.url + + def items(self): + url = "https://photobucket.com/galleryd/search.php" + params = {"userName": self.user, "searchTerm": "", "ref": ""} + + if self.media_id: + params["mediaId"] = self.media_id + else: + params["url"] = self.url + + # retry API call up to 5 times, since it can randomly fail + tries = 0 + while tries < 5: + data = self.request(url, method="POST", params=params).json() + image = data["mediaDocuments"] + if "message" not in image: + break # success + tries += 1 + self.log.debug("'%s'", image["message"]) + else: + self.log.error("%s", image["message"]) + raise exception.StopExtraction() + + # adjust metadata entries to be at least somewhat similar + # to what the 'album' extractor provides + if "media" in image: + image = image["media"][image["mediaIndex"]] + image["albumView"] = data["mediaDocuments"]["albumView"] + image["username"] = image["ownerId"] + else: + image["fileUrl"] = image.pop("imageUrl") + + image.setdefault("title", "") + image.setdefault("description", "") + name, _, ext = image["fileUrl"].rpartition("/")[2].rpartition(".") + image["ext"] = image["extension"] = ext + image["titleOrFilename"] = image["title"] or name + image["tags"] = image.pop("clarifaiTagList", []) + + mtype, _, mid = base64.b64decode(image["id"]).partition(b":") + image["pictureId"] = mid.decode() if mtype == b"mediaId" else "" + + yield Message.Version, 1 + yield Message.Directory, image + yield Message.Url, image["fileUrl"], image diff --git a/gallery_dl/extractor/piczel.py b/gallery_dl/extractor/piczel.py new file mode 100644 index 0000000..6a5c41c --- /dev/null +++ b/gallery_dl/extractor/piczel.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://piczel.tv/""" + +from .common import Extractor, Message +from .. 
import text + + +class PiczelExtractor(Extractor): + """Base class for piczel extractors""" + category = "piczel" + directory_fmt = ("{category}", "{user[username]}") + filename_fmt = "{category}_{id}_{title}_{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + root = "https://piczel.tv" + api_root = "https://apollo.piczel.tv" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item_id = match.group(1) + + def items(self): + first = True + yield Message.Version, 1 + for image in self.unpack(self.get_images()): + if first: + yield Message.Directory, image + first = False + path = image["image"]["image"]["url"] + url = "{}/static/{}".format(self.api_root, path) + yield Message.Url, url, text.nameext_from_url(url, image) + + @staticmethod + def unpack(images): + """Unpack 'images' into individual image objects""" + for image in images: + if image["multi"]: + multi = image["images"] + del image["images"] + for image["num"], img in enumerate(multi): + image["image"] = img + yield image + else: + image["num"] = 0 + yield image + + def get_images(self): + """Return an iterable with all relevant image objects""" + + +class PiczelUserExtractor(PiczelExtractor): + """Extractor for all images from a user's gallery""" + subcategory = "user" + pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/([^/?&#]+)/?$" + test = ("https://piczel.tv/gallery/Lulena", { + "count": ">= 13", + }) + + def get_images(self): + url = "{}/api/users/{}/gallery".format(self.api_root, self.item_id) + return self.request(url).json() + + +class PiczelFolderExtractor(PiczelExtractor): + """Extractor for images inside a user's folder""" + subcategory = "folder" + directory_fmt = ("{category}", "{user[username]}", "{folder[name]}") + archive_fmt = "f{folder[id]}_{id}_{num}" + pattern = (r"(?:https?://)?(?:www\.)?piczel\.tv" + r"/gallery/(?!image)[^/?&#]+/(\d+)") + test = ("https://piczel.tv/gallery/Lulena/1114", { + "count": ">= 4", + }) + + def get_images(self): + url = "{}/api/gallery/folder/{}".format(self.api_root, self.item_id) + images = self.request(url).json() + images.reverse() + return images + + +class PiczelImageExtractor(PiczelExtractor): + """Extractor for individual images""" + subcategory = "image" + pattern = r"(?:https?://)?(?:www\.)?piczel\.tv/gallery/image/(\d+)" + test = ("https://piczel.tv/gallery/image/7807", { + "url": "9b9e416b6ab7e58676fab84453d5028f306ece34", + "content": "df9a053a24234474a19bce2b7e27e0dec23bff87", + "keyword": { + "created_at": "2018-07-22T05:13:58.000Z", + "description": None, + "extension": "png", + "favorites_count": int, + "folder": dict, + "folder_id": 1113, + "id": 7807, + "is_flash": False, + "is_video": False, + "multi": False, + "nsfw": False, + "num": 0, + "password_protected": False, + "tags": "fanart, commission, altair, recreators, ", + "title": "Altair", + "user": dict, + "views": int, + }, + }) + + def get_images(self): + url = "{}/api/gallery/image/{}".format(self.api_root, self.item_id) + return (self.request(url).json(),) diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py new file mode 100644 index 0000000..fa8cd48 --- /dev/null +++ b/gallery_dl/extractor/pinterest.py @@ -0,0 +1,260 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extract images from https://www.pinterest.com/""" + +from .common import Extractor, Message +from .. import text, exception +import json + + +BASE_PATTERN = r"(?:https?://)?(?:\w+\.)?pinterest\.\w+" + + +class PinterestExtractor(Extractor): + """Base class for pinterest extractors""" + category = "pinterest" + filename_fmt = "{category}_{id}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = PinterestAPI(self) + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + + for pin in self.pins(): + if "images" in pin: + url, pin_data = self.data_from_pin(pin) + pin_data.update(data) + yield Message.Url, url, pin_data + + def metadata(self): + """Return general metadata""" + + def pins(self): + """Return all relevant pin-objects""" + + @staticmethod + def data_from_pin(pin): + """Get image url and metadata from a pin-object""" + img = pin["images"]["orig"] + url = img["url"] + pin["width"] = img["width"] + pin["height"] = img["height"] + return url, text.nameext_from_url(url, pin) + + +class PinterestPinExtractor(PinterestExtractor): + """Extractor for images from a single pin from pinterest.com""" + subcategory = "pin" + pattern = BASE_PATTERN + r"/pin/([^/?#&]+)(?!.*#related$)" + test = ( + ("https://www.pinterest.com/pin/858146903966145189/", { + "url": "afb3c26719e3a530bb0e871c480882a801a4e8a5", + # image version depends on CDN server used + # "content": "d3e24bc9f7af585e8c23b9136956bd45a4d9b947", + # "content": "4c435a66f6bb82bb681db2ecc888f76cf6c5f9ca", + }), + ("https://www.pinterest.com/pin/858146903966145188/", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.pin_id = match.group(1) + self.pin = None + + def metadata(self): + self.pin = self.api.pin(self.pin_id) + return self.data_from_pin(self.pin)[1] + + def pins(self): + return (self.pin,) + + +class PinterestBoardExtractor(PinterestExtractor): + """Extractor for images from a board from pinterest.com""" + subcategory = "board" + directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") + archive_fmt = "{board[id]}_{id}" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)(?!.*#related$)" + test = ( + ("https://www.pinterest.com/g1952849/test-/", { + "pattern": r"https://i\.pinimg\.com/originals/", + "count": 2, + }), + ("https://www.pinterest.com/g1952848/test/", { + "exception": exception.GalleryDLException, + }), + ) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.user = text.unquote(match.group(1)) + self.board = text.unquote(match.group(2)) + self.board_id = 0 + + def metadata(self): + board = self.api.board(self.user, self.board) + self.board_id = board["id"] + return {"board": board} + + def pins(self): + return self.api.board_pins(self.board_id) + + +class PinterestRelatedPinExtractor(PinterestPinExtractor): + """Extractor for related pins of another pin from pinterest.com""" + subcategory = "related-pin" + directory_fmt = ("{category}", "related {original_pin[id]}") + pattern = BASE_PATTERN + r"/pin/([^/?#&]+).*#related$" + test = ("https://www.pinterest.com/pin/858146903966145189/#related", { + "range": "31-50", + "count": 20, + }) + + def metadata(self): + pin = self.api.pin(self.pin_id) + return {"original_pin": self.data_from_pin(pin)[1]} + + def pins(self): + return self.api.pin_related(self.pin_id) + + +class 
PinterestRelatedBoardExtractor(PinterestBoardExtractor): + """Extractor for related pins of a board from pinterest.com""" + subcategory = "related-board" + directory_fmt = ("{category}", "{board[owner][username]}", + "{board[name]}", "related") + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+).*#related$" + test = ("https://www.pinterest.com/g1952849/test-/#related", { + "range": "31-50", + "count": 20, + }) + + def pins(self): + return self.api.board_related(self.board_id) + + +class PinterestPinitExtractor(PinterestExtractor): + """Extractor for images from a pin.it URL""" + subcategory = "pinit" + pattern = r"(?:https?://)?pin\.it/([^/?#&]+)" + + test = ( + ("https://pin.it/Hvt8hgT", { + "url": "8daad8558382c68f0868bdbd17d05205184632fa", + }), + ("https://pin.it/Hvt8hgS", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + PinterestExtractor.__init__(self, match) + self.shortened_id = match.group(1) + + def items(self): + url = "https://api.pinterest.com/url_shortener/{}/redirect".format( + self.shortened_id) + response = self.request(url, method="HEAD", allow_redirects=False) + location = response.headers.get("Location") + if not location or location in ("https://api.pinterest.com/None", + "https://pin.it/None", + "https://www.pinterest.com"): + raise exception.NotFoundError("pin") + yield Message.Queue, location, {} + + +class PinterestAPI(): + """Minimal interface for the Pinterest Web API + + For a better and more complete implementation in PHP, see + - https://github.com/seregazhuk/php-pinterest-bot + """ + + BASE_URL = "https://www.pinterest.com" + HEADERS = { + "Accept" : "application/json, text/javascript, " + "*/*, q=0.01", + "Accept-Language" : "en-US,en;q=0.5", + "X-Pinterest-AppState": "active", + "X-APP-VERSION" : "cb1c7f9", + "X-Requested-With" : "XMLHttpRequest", + "Origin" : BASE_URL + "/", + } + + def __init__(self, extractor): + self.extractor = extractor + + def pin(self, pin_id): + """Query information about a pin""" + options = {"id": pin_id, "field_set_key": "detailed"} + return self._call("Pin", options)["resource_response"]["data"] + + def pin_related(self, pin_id): + """Yield related pins of another pin""" + options = {"pin": pin_id, "add_vase": True, "pins_only": True} + return self._pagination("RelatedPinFeed", options) + + def board(self, user, board): + """Query information about a board""" + options = {"slug": board, "username": user, + "field_set_key": "detailed"} + return self._call("Board", options)["resource_response"]["data"] + + def board_pins(self, board_id): + """Yield all pins of a specific board""" + options = {"board_id": board_id} + return self._pagination("BoardFeed", options) + + def board_related(self, board_id): + """Yield related pins of a specific board""" + options = {"board_id": board_id, "add_vase": True} + return self._pagination("BoardRelatedPixieFeed", options) + + def _call(self, resource, options): + url = "{}/resource/{}Resource/get/".format(self.BASE_URL, resource) + params = {"data": json.dumps({"options": options}), "source_url": ""} + + response = self.extractor.request( + url, params=params, headers=self.HEADERS, expect=range(400, 500)) + + try: + data = response.json() + except ValueError: + data = {} + + if 200 <= response.status_code < 400 and not response.history: + return data + + if response.status_code == 404 or response.history: + resource = self.extractor.subcategory.rpartition("-")[2] + raise exception.NotFoundError(resource) + self.extractor.log.error("API request failed") 
+ self.extractor.log.debug("%s", response.text) + raise exception.StopExtraction() + + def _pagination(self, resource, options): + while True: + data = self._call(resource, options) + yield from data["resource_response"]["data"] + + try: + bookmarks = data["resource"]["options"]["bookmarks"] + if (not bookmarks or bookmarks[0] == "-end-" or + bookmarks[0].startswith("Y2JOb25lO")): + return + options["bookmarks"] = bookmarks + except KeyError: + return diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py new file mode 100644 index 0000000..af29c4b --- /dev/null +++ b/gallery_dl/extractor/pixiv.py @@ -0,0 +1,517 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images and ugoira from https://www.pixiv.net/""" + +from .common import Extractor, Message +from .. import text, exception +from ..cache import cache +from datetime import datetime, timedelta + + +class PixivExtractor(Extractor): + """Base class for pixiv extractors""" + category = "pixiv" + directory_fmt = ("{category}", "{user[id]} {user[account]}") + filename_fmt = "{category}_{user[id]}_{id}{num}.{extension}" + archive_fmt = "{id}{num}.{extension}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = PixivAppAPI(self) + self.user_id = -1 + self.load_ugoira = self.config("ugoira", True) + + def items(self): + metadata = self.get_metadata() + yield Message.Version, 1 + + for work in self.works(): + if not work["user"]["id"]: + continue + + meta_single_page = work["meta_single_page"] + meta_pages = work["meta_pages"] + del work["meta_single_page"] + del work["image_urls"] + del work["meta_pages"] + work["num"] = "" + work["tags"] = [tag["name"] for tag in work["tags"]] + work["date"] = text.parse_datetime(work["create_date"]) + work.update(metadata) + + yield Message.Directory, work + + if work["type"] == "ugoira": + if not self.load_ugoira: + continue + ugoira = self.api.ugoira_metadata(work["id"]) + + url = ugoira["zip_urls"]["medium"].replace( + "_ugoira600x600", "_ugoira1920x1080") + work["frames"] = ugoira["frames"] + work["extension"] = "zip" + yield Message.Url, url, work + + elif work["page_count"] == 1: + url = meta_single_page["original_image_url"] + work["extension"] = url.rpartition(".")[2] + yield Message.Url, url, work + + else: + for num, img in enumerate(meta_pages): + url = img["image_urls"]["original"] + work["num"] = "_p{:02}".format(num) + work["extension"] = url.rpartition(".")[2] + yield Message.Url, url, work + + def works(self): + """Return an iterable containing all relevant 'work'-objects""" + + def get_metadata(self, user=None): + """Collect metadata for extractor-job""" + if not user: + user = self.api.user_detail(self.user_id) + return {"user": user} + + +class PixivUserExtractor(PixivExtractor): + """Extractor for works of a pixiv-user""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/" + r"(?:member(?:_illust)?\.php\?id=(\d+)(?:&([^#]+))?" 
+ r"|(?:u(?:ser)?/|(?:mypage\.php)?#id=)(\d+))") + test = ( + ("http://www.pixiv.net/member_illust.php?id=173530", { + "url": "852c31ad83b6840bacbce824d85f2a997889efb7", + }), + # illusts with specific tag + (("https://www.pixiv.net/member_illust.php?id=173530" + "&tag=%E6%89%8B%E3%81%B6%E3%82%8D"), { + "url": "25b1cd81153a8ff82eec440dd9f20a4a22079658", + }), + ("http://www.pixiv.net/member_illust.php?id=173531", { + "exception": exception.NotFoundError, + }), + ("https://www.pixiv.net/u/173530"), + ("https://www.pixiv.net/user/173530"), + ("https://www.pixiv.net/mypage.php#id=173530"), + ("https://www.pixiv.net/#id=173530"), + ("https://touch.pixiv.net/member_illust.php?id=173530"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.user_id = match.group(1) or match.group(3) + self.query = text.parse_query(match.group(2)) + + def works(self): + works = self.api.user_illusts(self.user_id) + + if "tag" in self.query: + tag = text.unquote(self.query["tag"]).lower() + works = ( + work for work in works + if tag in [t["name"].lower() for t in work["tags"]] + ) + + return works + + +class PixivMeExtractor(PixivExtractor): + """Extractor for pixiv.me URLs""" + subcategory = "me" + pattern = r"(?:https?://)?pixiv\.me/([^/?&#]+)" + test = ( + ("https://pixiv.me/del_shannon", { + "url": "0b1a18c3e3553c44ee6e0ccc36a7fd906c498e8f", + }), + ("https://pixiv.me/del_shanno", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.account = match.group(1) + + def items(self): + url = "https://pixiv.me/" + self.account + response = self.request( + url, method="HEAD", allow_redirects=False, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError("user") + yield Message.Version, 1 + yield Message.Queue, response.headers["Location"], {} + + +class PixivWorkExtractor(PixivExtractor): + """Extractor for a single pixiv work/illustration""" + subcategory = "work" + pattern = (r"(?:https?://)?(?:(?:www\.|touch\.)?pixiv\.net" + r"/member(?:_illust)?\.php\?(?:[^&]+&)*illust_id=(\d+)" + r"|(?:i(?:\d+\.pixiv|\.pximg)\.net" + r"/(?:(?:.*/)?img-[^/]+/img/\d{4}(?:/\d\d){5}|img\d+/img/[^/]+)" + r"|img\d*\.pixiv\.net/img/[^/]+|(?:www\.)?pixiv\.net/i)/(\d+))") + test = ( + (("http://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=966412"), { + "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba", + "content": "69a8edfb717400d1c2e146ab2b30d2c235440c5a", + }), + (("http://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=966411"), { + "exception": exception.NotFoundError, + }), + # ugoira + (("https://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=66806629"), { + "url": "7267695a985c4db8759bebcf8d21dbdd2d2317ef", + "keywords": {"frames": list}, + }), + ("http://i1.pixiv.net/c/600x600/img-master" + "/img/2008/06/13/00/29/13/966412_p0_master1200.jpg"), + ("https://i.pximg.net/img-original" + "/img/2017/04/25/07/33/29/62568267_p0.png"), + ("https://www.pixiv.net/i/966412"), + ("http://img.pixiv.net/img/soundcross/42626136.jpg"), + ("http://i2.pixiv.net/img76/img/snailrin/42672235.jpg"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.illust_id = match.group(1) or match.group(2) + self.load_ugoira = True + self.work = None + + def works(self): + return (self.work,) + + def get_metadata(self, user=None): + self.work = self.api.illust_detail(self.illust_id) + return PixivExtractor.get_metadata(self, self.work["user"]) + + +class 
PixivFavoriteExtractor(PixivExtractor): + """Extractor for all favorites/bookmarks of a pixiv-user""" + subcategory = "favorite" + directory_fmt = ("{category}", "bookmarks", + "{user_bookmark[id]} {user_bookmark[account]}") + archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/bookmark\.php(?:\?([^#]*))?") + test = ( + ("https://www.pixiv.net/bookmark.php?id=173530", { + "url": "e717eb511500f2fa3497aaee796a468ecf685cc4", + }), + # bookmarks with specific tag + (("https://www.pixiv.net/bookmark.php?id=3137110" + "&tag=%E3%81%AF%E3%82%93%E3%82%82%E3%82%93&p=1"), { + "count": 2, + }), + # own bookmarks + ("https://www.pixiv.net/bookmark.php", { + "url": "90c1715b07b0d1aad300bce256a0bc71f42540ba", + }), + # touch URLs + ("https://touch.pixiv.net/bookmark.php?id=173530"), + ("https://touch.pixiv.net/bookmark.php"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.query = text.parse_query(match.group(1)) + if "id" not in self.query: + self.subcategory = "bookmark" + + def works(self): + tag = None + restrict = "public" + + if "tag" in self.query: + tag = text.unquote(self.query["tag"]) + if "rest" in self.query and self.query["rest"] == "hide": + restrict = "private" + + return self.api.user_bookmarks_illust(self.user_id, tag, restrict) + + def get_metadata(self, user=None): + if "id" in self.query: + user = self.api.user_detail(self.query["id"]) + else: + self.api.login() + user = self.api.user + + self.user_id = user["id"] + return {"user_bookmark": user} + + +class PixivRankingExtractor(PixivExtractor): + """Extractor for pixiv ranking pages""" + subcategory = "ranking" + archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}" + directory_fmt = ("{category}", "rankings", + "{ranking[mode]}", "{ranking[date]}") + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/ranking\.php(?:\?([^#]*))?") + test = ( + ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"), + ("https://www.pixiv.net/ranking.php"), + ("https://touch.pixiv.net/ranking.php"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.query = match.group(1) + self.mode = self.date = None + + def works(self): + return self.api.illust_ranking(self.mode, self.date) + + def get_metadata(self, user=None): + query = text.parse_query(self.query) + + mode = query.get("mode", "daily").lower() + mode_map = { + "daily": "day", + "daily_r18": "day_r18", + "weekly": "week", + "weekly_r18": "week_r18", + "monthly": "month", + "male": "day_male", + "male_r18": "day_male_r18", + "female": "day_female", + "female_r18": "day_female_r18", + "original": "week_original", + "rookie": "week_rookie", + "r18g": "week_r18g", + } + if mode not in mode_map: + self.log.warning("invalid mode '%s'", mode) + mode = "daily" + self.mode = mode_map[mode] + + date = query.get("date") + if date: + if len(date) == 8 and date.isdecimal(): + date = "{}-{}-{}".format(date[0:4], date[4:6], date[6:8]) + else: + self.log.warning("invalid date '%s'", date) + date = None + if not date: + date = (datetime.utcnow() - timedelta(days=1)).strftime("%Y-%m-%d") + self.date = date + + return {"ranking": { + "mode": mode, + "date": self.date, + }} + + +class PixivSearchExtractor(PixivExtractor): + """Extractor for pixiv search results""" + subcategory = "search" + archive_fmt = "s_{search[word]}_{id}{num}.{extension}" + directory_fmt = ("{category}", "search", "{search[word]}") + pattern = 
(r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/search\.php\?([^#]+)") + test = ( + ("https://www.pixiv.net/search.php?s_mode=s_tag&word=Original"), + ("https://touch.pixiv.net/search.php?word=Original"), + ) + + def __init__(self, match): + PixivExtractor.__init__(self, match) + self.query = match.group(1) + self.word = self.sort = self.target = None + + def works(self): + return self.api.search_illust(self.word, self.sort, self.target) + + def get_metadata(self, user=None): + query = text.parse_query(self.query) + + if "word" in query: + self.word = text.unescape(query["word"]) + else: + self.log.error("missing search term") + raise exception.StopExtraction() + + sort = query.get("order", "date_d") + sort_map = { + "date": "date_asc", + "date_d": "date_desc", + } + if sort not in sort_map: + self.log.warning("invalid sort order '%s'", sort) + sort = "date_d" + self.sort = sort_map[sort] + + target = query.get("s_mode", "s_tag") + target_map = { + "s_tag": "partial_match_for_tags", + "s_tag_full": "exact_match_for_tags", + "s_tc": "title_and_caption", + } + if target not in target_map: + self.log.warning("invalid search target '%s'", target) + target = "s_tag" + self.target = target_map[target] + + return {"search": { + "word": self.word, + "sort": self.sort, + "target": self.target, + }} + + +class PixivFollowExtractor(PixivExtractor): + """Extractor for new illustrations from your followed artists""" + subcategory = "follow" + archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}" + directory_fmt = ("{category}", "following") + pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" + r"/bookmark_new_illust\.php") + test = ( + ("https://www.pixiv.net/bookmark_new_illust.php"), + ("https://touch.pixiv.net/bookmark_new_illust.php"), + ) + + def works(self): + return self.api.illust_follow() + + def get_metadata(self, user=None): + self.api.login() + return {"user_follow": self.api.user} + + +class PixivAppAPI(): + """Minimal interface for the Pixiv App API for mobile devices + + For a more complete implementation or documentation, see + - https://github.com/upbit/pixivpy + - https://gist.github.com/ZipFile/3ba99b47162c23f8aea5d5942bb557b1 + """ + CLIENT_ID = "MOBrBDS8blbauoSck0ZfDbtuzpyT" + CLIENT_SECRET = "lsACyCD94FhDUtGTXi3QzcFE2uU1hqtDaKeqrdwj" + + def __init__(self, extractor): + self.extractor = extractor + self.log = extractor.log + self.username, self.password = extractor._get_auth_info() + self.user = None + + self.client_id = extractor.config( + "client-id", self.CLIENT_ID) + self.client_secret = extractor.config( + "client-secret", self.CLIENT_SECRET) + + extractor.session.headers.update({ + "App-OS": "ios", + "App-OS-Version": "10.3.1", + "App-Version": "6.7.1", + "User-Agent": "PixivIOSApp/6.7.1 (iOS 10.3.1; iPhone8,1)", + "Referer": "https://app-api.pixiv.net/", + }) + + def login(self): + """Login and gain an access token""" + self.user, auth = self._login_impl(self.username, self.password) + self.extractor.session.headers["Authorization"] = auth + + @cache(maxage=3600, keyarg=1) + def _login_impl(self, username, password): + url = "https://oauth.secure.pixiv.net/auth/token" + data = { + "client_id": self.client_id, + "client_secret": self.client_secret, + "get_secure_url": 1, + } + refresh_token = _refresh_token_cache(username) + + if refresh_token: + self.log.info("Refreshing access token") + data["grant_type"] = "refresh_token" + data["refresh_token"] = refresh_token + else: + self.log.info("Logging in as %s", username) + data["grant_type"] = "password" + 
data["username"] = username + data["password"] = password + + response = self.extractor.request( + url, method="POST", data=data, expect=(400,)) + if response.status_code >= 400: + raise exception.AuthenticationError() + + data = response.json()["response"] + if not refresh_token: + _refresh_token_cache.update(username, data["refresh_token"]) + return data["user"], "Bearer " + data["access_token"] + + def illust_detail(self, illust_id): + params = {"illust_id": illust_id} + return self._call("v1/illust/detail", params)["illust"] + + def illust_follow(self, restrict="all"): + params = {"restrict": restrict} + return self._pagination("v2/illust/follow", params) + + def illust_ranking(self, mode="day", date=None): + params = {"mode": mode, "date": date} + return self._pagination("v1/illust/ranking", params) + + def search_illust(self, word, sort=None, target=None, duration=None): + params = {"word": word, "search_target": target, + "sort": sort, "duration": duration} + return self._pagination("v1/search/illust", params) + + def user_bookmarks_illust(self, user_id, tag=None, restrict="public"): + params = {"user_id": user_id, "tag": tag, "restrict": restrict} + return self._pagination("v1/user/bookmarks/illust", params) + + def user_detail(self, user_id): + params = {"user_id": user_id} + return self._call("v1/user/detail", params)["user"] + + def user_illusts(self, user_id): + params = {"user_id": user_id} + return self._pagination("v1/user/illusts", params) + + def ugoira_metadata(self, illust_id): + params = {"illust_id": illust_id} + return self._call("v1/ugoira/metadata", params)["ugoira_metadata"] + + def _call(self, endpoint, params=None): + url = "https://app-api.pixiv.net/" + endpoint + + self.login() + response = self.extractor.request( + url, params=params, expect=range(400, 500)) + + if 200 <= response.status_code < 400: + return response.json() + if response.status_code == 404: + raise exception.NotFoundError() + self.log.error("API request failed: %s", response.text) + raise exception.StopExtraction() + + def _pagination(self, endpoint, params): + while True: + data = self._call(endpoint, params) + yield from data["illusts"] + + if not data["next_url"]: + return + query = data["next_url"].rpartition("?")[2] + params = text.parse_query(query) + + +@cache(maxage=10*365*24*3600, keyarg=0) +def _refresh_token_cache(username): + return None diff --git a/gallery_dl/extractor/pixnet.py b/gallery_dl/extractor/pixnet.py new file mode 100644 index 0000000..9cada6b --- /dev/null +++ b/gallery_dl/extractor/pixnet.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.pixnet.net/""" + +from .common import Extractor, Message +from .. 
import text + + +BASE_PATTERN = r"(?:https?://)?(?!www\.)([^.]+)\.pixnet.net" + + +class PixnetExtractor(Extractor): + """Base class for pixnet extractors""" + category = "pixnet" + filename_fmt = "{num:>03}_{id}.{extension}" + archive_fmt = "{id}" + url_fmt = "" + + def __init__(self, match): + Extractor.__init__(self, match) + self.blog, self.item_id = match.groups() + self.root = "https://{}.pixnet.net".format(self.blog) + + def items(self): + url = self.url_fmt.format(self.root, self.item_id) + page = self.request(url, encoding="utf-8").text + user = text.extract(page, '<meta name="author" content="', '";')[0] + data = { + "blog": self.blog, + "user": user.rpartition(" (")[0], + } + + for info in self._pagination(page): + url, pos = text.extract(info, ' href="', '"') + alt, pos = text.extract(info, ' alt="', '"', pos) + item = { + "id" : text.parse_int(url.rpartition("/")[2]), + "title" : text.unescape(alt), + "_extractor": (PixnetFolderExtractor if "/folder/" in url else + PixnetSetExtractor), + } + item.update(data) + yield Message.Queue, url, item + + def _pagination(self, page): + while True: + yield from text.extract_iter(page, '<li id="', '</li>') + + pnext = text.extract(page, 'class="nextBtn"', '>')[0] + if "href" not in pnext: + return + url = self.root + text.extract(pnext, 'href="', '"')[0] + page = self.request(url, encoding="utf-8").text + + +class PixnetImageExtractor(PixnetExtractor): + """Extractor for a single photo from pixnet.net""" + subcategory = "image" + filename_fmt = "{id}.{extension}" + directory_fmt = ("{category}", "{blog}") + pattern = BASE_PATTERN + r"/album/photo/(\d+)" + test = ("https://albertayu773.pixnet.net/album/photo/159443828", { + "url": "156564c422138914c9fa5b42191677b45c414af4", + "keyword": "19971bcd056dfef5593f4328a723a9602be0f087", + "content": "0e097bdf49e76dd9b9d57a016b08b16fa6a33280", + }) + + def items(self): + url = "https://api.pixnet.cc/oembed" + params = { + "url": "https://{}.pixnet.net/album/photo/{}".format( + self.blog, self.item_id), + "format": "json", + } + + data = self.request(url, params=params).json() + data["id"] = text.parse_int( + data["url"].rpartition("/")[2].partition("-")[0]) + data["filename"], _, data["extension"] = data["title"].rpartition(".") + data["blog"] = self.blog + data["user"] = data.pop("author_name") + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, data["url"], data + + +class PixnetSetExtractor(PixnetExtractor): + """Extractor for images from a pixnet set""" + subcategory = "set" + url_fmt = "{}/album/set/{}" + directory_fmt = ("{category}", "{blog}", + "{folder_id} {folder_title}", "{set_id} {set_title}") + pattern = BASE_PATTERN + r"/album/set/(\d+)" + test = ( + ("https://albertayu773.pixnet.net/album/set/15078995", { + "url": "6535712801af47af51110542f4938a7cef44557f", + "keyword": "bf25d59e5b0959cb1f53e7fd2e2a25f2f67e5925", + }), + ("https://anrine910070.pixnet.net/album/set/5917493", { + "url": "b3eb6431aea0bcf5003432a4a0f3a3232084fc13", + "keyword": "bf7004faa1cea18cf9bd856f0955a69be51b1ec6", + }), + ) + + def items(self): + url = self.url_fmt.format(self.root, self.item_id) + page = self.request(url, encoding="utf-8").text + data = self.metadata(page) + + yield Message.Version, 1 + yield Message.Directory, data + for num, info in enumerate(self._pagination(page), 1): + url, pos = text.extract(info, ' href="', '"') + src, pos = text.extract(info, ' src="', '"', pos) + alt, pos = text.extract(info, ' alt="', '"', pos) + + photo = { + "id": 
text.parse_int(url.rpartition("/")[2].partition("#")[0]), + "url": src.replace("_s.", "."), + "num": num, + "filename": alt, + "extension": src.rpartition(".")[2], + } + photo.update(data) + yield Message.Url, photo["url"], photo + + def metadata(self, page): + user , pos = text.extract(page, '<meta name="author" content="', '";') + _ , pos = text.extract(page, 'id="breadcrumb"', '', pos) + fid , pos = text.extract(page, '/folder/', '"', pos) + fname, pos = text.extract(page, '>', '<', pos) + sid , pos = text.extract(page, '/set/', '"', pos) + sname, pos = text.extract(page, '>', '<', pos) + return { + "blog": self.blog, + "user": user.rpartition(" (")[0], + "folder_id" : text.parse_int(fid, ""), + "folder_title": text.unescape(fname).strip(), + "set_id" : text.parse_int(sid), + "set_title" : text.unescape(sname), + } + + +class PixnetFolderExtractor(PixnetExtractor): + """Extractor for all sets in a pixnet folder""" + subcategory = "folder" + url_fmt = "{}/album/folder/{}" + pattern = BASE_PATTERN + r"/album/folder/(\d+)" + test = ("https://albertayu773.pixnet.net/album/folder/1405768", { + "pattern": PixnetSetExtractor.pattern, + "count": ">= 15", + }) + + +class PixnetUserExtractor(PixnetExtractor): + """Extractor for all sets and folders of a pixnet user""" + subcategory = "user" + url_fmt = "{}{}/album/list" + pattern = BASE_PATTERN + r"()(?:/blog|/album(?:/list)?)?/?(?:$|[?&#])" + test = ( + ("https://albertayu773.pixnet.net/"), + ("https://albertayu773.pixnet.net/blog"), + ("https://albertayu773.pixnet.net/album"), + ("https://albertayu773.pixnet.net/album/list", { + "pattern": PixnetFolderExtractor.pattern, + "count": ">= 30", + }), + ("https://anrine910070.pixnet.net/album/list", { + "pattern": PixnetSetExtractor.pattern, + "count": ">= 14", + }), + ) diff --git a/gallery_dl/extractor/plurk.py b/gallery_dl/extractor/plurk.py new file mode 100644 index 0000000..325c6a0 --- /dev/null +++ b/gallery_dl/extractor/plurk.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.plurk.com/""" + +from .common import Extractor, Message +from .. 
import text, extractor, exception +import datetime +import json +import re + + +class PlurkExtractor(Extractor): + """Base class for plurk extractors""" + category = "plurk" + root = "https://www.plurk.com" + + def items(self): + urls = self._urls_ex if self.config("comments", False) else self._urls + + yield Message.Version, 1 + with extractor.blacklist(("plurk",)): + for plurk in self.plurks(): + for url in urls(plurk): + yield Message.Queue, url, plurk + + def plurks(self): + """Return an iterable with all relevant 'plurk' objects""" + + @staticmethod + def _urls(obj): + """Extract URLs from a 'plurk' object""" + return text.extract_iter(obj["content"], ' href="', '"') + + def _urls_ex(self, plurk): + """Extract URLs from a 'plurk' and its comments""" + yield from self._urls(plurk) + for comment in self._comments(plurk): + yield from self._urls(comment) + + def _comments(self, plurk): + """Return an iterable with a 'plurk's comments""" + url = "https://www.plurk.com/Responses/get" + data = {"plurk_id": plurk["id"], "count": "200"} + + while True: + info = self.request(url, "POST", data=data).json() + yield from info["responses"] + if not info["has_newer"]: + return + data["from_response_id"] = info["responses"][-1]["id"] + + @staticmethod + def _load(data): + if not data: + raise exception.NotFoundError("user") + return json.loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data)) + + +class PlurkTimelineExtractor(PlurkExtractor): + """Extractor for URLs from all posts in a Plurk timeline""" + subcategory = "timeline" + pattern = r"(?:https?://)?(?:www\.)?plurk\.com/(?!p/)(\w+)/?(?:$|[?&#])" + test = ("https://www.plurk.com/plurkapi", { + "pattern": r"https?://.+", + "count": ">= 23" + }) + + def __init__(self, match): + PlurkExtractor.__init__(self, match) + self.user = match.group(1) + + def plurks(self): + url = "{}/{}".format(self.root, self.user) + page = self.request(url).text + user_id, pos = text.extract(page, '"user_id":', ',') + plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0]) + + url = "https://www.plurk.com/TimeLine/getPlurks" + data = {"user_id": user_id.strip()} + headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"} + + while plurks: + yield from plurks + + offset = datetime.datetime.strptime( + plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z") + data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z") + response = self.request(url, "POST", headers=headers, data=data) + plurks = response.json()["plurks"] + + +class PlurkPostExtractor(PlurkExtractor): + """Extractor for URLs from a Plurk post""" + subcategory = "post" + pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)" + test = ( + ("https://www.plurk.com/p/i701j1", { + "url": "2115f208564591b8748525c2807a84596aaaaa5f", + "count": 3, + }), + ("https://www.plurk.com/p/i701j1", { + "options": (("comments", True),), + "count": ">= 210", + }), + ) + + def __init__(self, match): + PlurkExtractor.__init__(self, match) + self.plurk_id = match.group(1) + + def plurks(self): + url = "{}/p/{}".format(self.root, self.plurk_id) + page = self.request(url).text + user, pos = text.extract(page, " GLOBAL = ", "\n") + data, pos = text.extract(page, "plurk = ", ";\n", pos) + + data = self._load(data) + data["user"] = self._load(user)["page_user"] + return (data,) diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py new file mode 100644 index 0000000..40816b3 --- /dev/null +++ b/gallery_dl/extractor/pornhub.py @@ -0,0 +1,157 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike 
Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.pornhub.com/""" + +from .common import Extractor, Message +from .. import text, exception + + +BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?pornhub\.com" + + +class PornhubExtractor(Extractor): + """Base class for pornhub extractors""" + category = "pornhub" + root = "https://www.pornhub.com" + + +class PornhubGalleryExtractor(PornhubExtractor): + """Extractor for image galleries on pornhub.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user}", "{gallery[id]} {gallery[title]}") + filename_fmt = "{num:>03}_{id}.{extension}" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/album/(\d+)" + test = ( + ("https://www.pornhub.com/album/1708982", { + "pattern": r"https://\w+.phncdn.com/pics/albums/\d+/\d+/\d+/\d+/", + "count": 93, + "keyword": { + "id": int, + "num": int, + "score": int, + "views": int, + "caption": str, + "user": "Unknown", + "gallery": { + "id" : 1708982, + "score": int, + "views": int, + "tags" : list, + "title": "Random Hentai", + }, + }, + }), + ("https://www.pornhub.com/album/37180171", { + "exception": exception.AuthorizationError, + }), + ) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.gallery_id = match.group(1) + self._first = None + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for num, image in enumerate(self.images(), 1): + url = image["url"] + image.update(data) + image["num"] = num + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + url = "{}/album/{}".format( + self.root, self.gallery_id) + extr = text.extract_from(self.request(url).text) + + title = extr("<title>", "</title>") + score = extr('<div id="albumGreenBar" style="width:', '"') + views = extr('<div id="viewsPhotAlbumCounter">', '<') + tags = extr('<div id="photoTagsBox"', '<script') + self._first = extr('<a href="/photo/', '"') + title, _, user = title.rpartition(" - ") + + return { + "user" : text.unescape(user[:-14]), + "gallery": { + "id" : text.parse_int(self.gallery_id), + "title": text.unescape(title), + "score": text.parse_int(score.partition("%")[0]), + "views": text.parse_int(views.partition(" ")[0]), + "tags" : text.split_html(tags)[2:], + }, + } + + def images(self): + url = "{}/album/show_album_json?album={}".format( + self.root, self.gallery_id) + response = self.request(url) + + if response.content == b"Permission denied": + raise exception.AuthorizationError() + images = response.json() + key = end = self._first + + while True: + img = images[key] + yield { + "url" : img["img_large"], + "caption": img["caption"], + "id" : text.parse_int(img["id"]), + "views" : text.parse_int(img["times_viewed"]), + "score" : text.parse_int(img["vote_percent"]), + } + key = img["next"] + if key == end: + return + + +class PornhubUserExtractor(PornhubExtractor): + """Extractor for all galleries of a pornhub user""" + subcategory = "user" + pattern = (BASE_PATTERN + r"/(users|model)/([^/?&#]+)" + "(?:/photos(?:/(public|private|favorites))?)?/?$") + test = ( + ("https://www.pornhub.com/users/flyings0l0/photos/public", { + "pattern": PornhubGalleryExtractor.pattern, + "count": ">= 8", + }), + ("https://www.pornhub.com/users/flyings0l0/"), + ("https://www.pornhub.com/users/flyings0l0/photos/public"), + 
("https://www.pornhub.com/users/flyings0l0/photos/private"), + ("https://www.pornhub.com/users/flyings0l0/photos/favorites"), + ("https://www.pornhub.com/model/bossgirl/photos"), + ) + + def __init__(self, match): + PornhubExtractor.__init__(self, match) + self.type, self.user, self.cat = match.groups() + + def items(self): + url = "{}/{}/{}/photos/{}/ajax".format( + self.root, self.type, self.user, self.cat or "public") + params = {"page": 1} + headers = { + "Referer": url[:-5], + "X-Requested-With": "XMLHttpRequest", + } + + data = {"_extractor": PornhubGalleryExtractor} + yield Message.Version, 1 + while True: + page = self.request( + url, method="POST", headers=headers, params=params).text + if not page: + return + for gid in text.extract_iter(page, 'id="albumphoto', '"'): + yield Message.Queue, self.root + "/album/" + gid, data + params["page"] += 1 diff --git a/gallery_dl/extractor/pururin.py b/gallery_dl/extractor/pururin.py new file mode 100644 index 0000000..fa4eb81 --- /dev/null +++ b/gallery_dl/extractor/pururin.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://pururin.io/""" + +from .common import GalleryExtractor +from .. import text, util +import json + + +class PururinGalleryExtractor(GalleryExtractor): + """Extractor for image galleries on pururin.io""" + category = "pururin" + pattern = r"(?:https?://)?(?:www\.)?pururin\.io/(?:gallery|read)/(\d+)" + test = ( + ("https://pururin.io/gallery/38661/iowant-2", { + "pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg", + "keyword": { + "title" : "Iowant 2!!", + "title_en" : "Iowant 2!!", + "title_jp" : "", + "gallery_id": 38661, + "count" : 19, + "artist" : ["Shoda Norihiro"], + "group" : ["Obsidian Order"], + "parody" : ["Kantai Collection"], + "characters": ["Iowa", "Teitoku"], + "tags" : list, + "type" : "Doujinshi", + "collection": "", + "convention": "C92", + "rating" : float, + "uploader" : "demo", + "scanlator" : "", + "lang" : "en", + "language" : "English", + } + }), + ("https://pururin.io/gallery/7661/unisis-team-vanilla", { + "count": 17, + }), + ) + root = "https://pururin.io" + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/gallery/{}/x".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + self._ext = "" + self._cnt = 0 + + def metadata(self, page): + extr = text.extract_from(page) + + def _lst(key, e=extr): + return [ + text.unescape(item) + for item in text.extract_iter(e(key, "</td>"), 'title="', '"') + ] + + def _str(key, e=extr): + return text.unescape(text.extract( + e(key, "</td>"), 'title="', '"')[0] or "") + + url = "{}/read/{}/01/x".format(self.root, self.gallery_id) + page = self.request(url).text + info = json.loads(text.unescape(text.extract( + page, ':gallery="', '"')[0])) + self._ext = info["image_extension"] + self._cnt = info["total_pages"] + + data = { + "gallery_id": text.parse_int(self.gallery_id), + "title" : info["title"] or info.get("j_title") or "", + "title_en" : info["title"], + "title_jp" : info.get("j_title") or "", + "artist" : _lst("<td>Artist</td>"), + "group" : _lst("<td>Circle</td>"), + "parody" : _lst("<td>Parody</td>"), + "tags" : _lst("<td>Contents</td>"), + "type" : _str("<td>Category</td>"), + "characters": _lst("<td>Character</td>"), + "collection": 
_str("<td>Collection</td>"), + "language" : _str("<td>Language</td>"), + "scanlator" : _str("<td>Scanlator</td>"), + "convention": _str("<td>Convention</td>"), + "uploader" : text.remove_html(extr("<td>Uploader</td>", "</td>")), + "rating" : text.parse_float(extr(" :rating='" , "'")), + } + data["lang"] = util.language_to_code(data["language"]) + return data + + def images(self, _): + ufmt = "https://cdn.pururin.io/assets/images/data/{}/{{}}.{}".format( + self.gallery_id, self._ext) + return [(ufmt.format(num), None) for num in range(1, self._cnt + 1)] diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py new file mode 100644 index 0000000..59d502a --- /dev/null +++ b/gallery_dl/extractor/reactor.py @@ -0,0 +1,338 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Generic extractors for *reactor sites""" + +from .common import Extractor, Message, SharedConfigMixin +from .. import text +import urllib.parse +import random +import time +import json + + +BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)" + + +class ReactorExtractor(SharedConfigMixin, Extractor): + """Base class for *reactor.cc extractors""" + basecategory = "reactor" + filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" + archive_fmt = "{post_id}_{num}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "http://" + match.group(1) + self.session.headers["Referer"] = self.root + + self.wait_min = self.config("wait-min", 3) + self.wait_max = self.config("wait-max", 6) + if self.wait_max < self.wait_min: + self.wait_max = self.wait_min + + if not self.category: + # set category based on domain name + netloc = urllib.parse.urlsplit(self.root).netloc + self.category = netloc.rpartition(".")[0] + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(): + for image in self._parse_post(post): + url = image["url"] + image.update(data) + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + """Collect metadata for extractor-job""" + return {} + + def posts(self): + """Return all relevant post-objects""" + return self._pagination(self.url) + + def _pagination(self, url): + while True: + time.sleep(random.uniform(self.wait_min, self.wait_max)) + + response = self.request(url) + if response.history: + # sometimes there is a redirect from + # the last page of a listing (.../tag/<tag>/1) + # to the first page (.../tag/<tag>) + # which could cause an endless loop + cnt_old = response.history[0].url.count("/") + cnt_new = response.url.count("/") + if cnt_old == 5 and cnt_new == 4: + return + page = response.text + + yield from text.extract_iter( + page, '<div class="uhead">', '<div class="ufoot">') + + try: + pos = page.index("class='next'") + pos = page.rindex("class='current'", 0, pos) + url = self.root + text.extract(page, "href='", "'", pos)[0] + except (ValueError, TypeError): + return + + def _parse_post(self, post): + post, _, script = post.partition('<script type="application/ld+json">') + images = text.extract_iter(post, '<div class="image">', '</div>') + script = script[:script.index("</")].strip() + + try: + data = json.loads(script) + except ValueError: + try: + # remove control characters and escape backslashes + mapping = dict.fromkeys(range(32)) + script 
= script.translate(mapping).replace("\\", "\\\\") + data = json.loads(script) + except ValueError as exc: + self.log.warning("Unable to parse JSON data: %s", exc) + return + + num = 0 + date = text.parse_datetime(data["datePublished"]) + user = data["author"]["name"] + description = text.unescape(data["description"]) + title, _, tags = text.unescape(data["headline"]).partition(" / ") + post_id = text.parse_int( + data["mainEntityOfPage"]["@id"].rpartition("/")[2]) + + if not tags: + title, tags = tags, title + tags = tags.split(" :: ") + + for image in images: + url = text.extract(image, ' src="', '"')[0] + if not url: + continue + width = text.extract(image, ' width="', '"')[0] + height = text.extract(image, ' height="', '"')[0] + image_id = url.rpartition("-")[2].partition(".")[0] + num += 1 + + if image.startswith("<iframe "): # embed + url = "ytdl:" + text.unescape(url) + elif "/post/webm/" not in url and "/post/mp4/" not in url: + url = url.replace("/post/", "/post/full/") + + yield { + "url": url, + "post_id": post_id, + "image_id": text.parse_int(image_id), + "width": text.parse_int(width), + "height": text.parse_int(height), + "title": title, + "description": description, + "tags": tags, + "date": date, + "user": user, + "num": num, + } + + +class ReactorTagExtractor(ReactorExtractor): + """Extractor for tag searches on *reactor.cc sites""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "{search_tags}_{post_id}_{num}" + pattern = BASE_PATTERN + r"/tag/([^/?&#]+)" + test = ("http://anime.reactor.cc/tag/Anime+Art",) + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.tag = match.group(2) + + def metadata(self): + return {"search_tags": text.unescape(self.tag).replace("+", " ")} + + +class ReactorSearchExtractor(ReactorTagExtractor): + """Extractor for search results on *reactor.cc sites""" + subcategory = "search" + directory_fmt = ("{category}", "search", "{search_tags}") + archive_fmt = "s_{search_tags}_{post_id}_{num}" + pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + test = ("http://anime.reactor.cc/search?q=Art",) + + +class ReactorUserExtractor(ReactorExtractor): + """Extractor for all posts of a user on *reactor.cc sites""" + subcategory = "user" + directory_fmt = ("{category}", "user", "{user}") + pattern = BASE_PATTERN + r"/user/([^/?&#]+)" + test = ("http://anime.reactor.cc/user/Shuster",) + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.user = match.group(2) + + def metadata(self): + return {"user": text.unescape(self.user).replace("+", " ")} + + +class ReactorPostExtractor(ReactorExtractor): + """Extractor for single posts on *reactor.cc sites""" + subcategory = "post" + pattern = BASE_PATTERN + r"/post/(\d+)" + test = ("http://anime.reactor.cc/post/3576250",) + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.post_id = match.group(2) + + def items(self): + yield Message.Version, 1 + post = self.request(self.url).text + pos = post.find('class="uhead">') + for image in self._parse_post(post[pos:]): + if image["num"] == 1: + yield Message.Directory, image + url = image["url"] + yield Message.Url, url, text.nameext_from_url(url, image) + + +# -------------------------------------------------------------------- +# JoyReactor + +JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))" + + +class JoyreactorTagExtractor(ReactorTagExtractor): + """Extractor for tag searches on joyreactor.cc""" + category = "joyreactor" + pattern = 
JR_BASE_PATTERN + r"/tag/([^/?&#]+)" + test = ( + ("http://joyreactor.cc/tag/Advent+Cirno", { + "count": ">= 17", + }), + ("http://joyreactor.com/tag/Cirno", { + "url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914", + }), + ) + + +class JoyreactorSearchExtractor(ReactorSearchExtractor): + """Extractor for search results on joyreactor.cc""" + category = "joyreactor" + pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + test = ( + ("http://joyreactor.cc/search/Cirno+Gifs", { + "range": "1-25", + "count": ">= 20", + }), + ("http://joyreactor.com/search?q=Cirno+Gifs", { + "count": 0, # no search results on joyreactor.com + }), + ) + + +class JoyreactorUserExtractor(ReactorUserExtractor): + """Extractor for all posts of a user on joyreactor.cc""" + category = "joyreactor" + pattern = JR_BASE_PATTERN + r"/user/([^/?&#]+)" + test = ( + ("http://joyreactor.cc/user/hemantic"), + ("http://joyreactor.com/user/Tacoman123", { + "url": "452cd0fa23e2ad0e122c296ba75aa7f0b29329f6", + }), + ) + + +class JoyreactorPostExtractor(ReactorPostExtractor): + """Extractor for single posts on joyreactor.cc""" + category = "joyreactor" + pattern = JR_BASE_PATTERN + r"/post/(\d+)" + test = ( + ("http://joyreactor.com/post/3721876", { # single image + "url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663", + "keyword": "966d2acd462732a9ed823a9db5ed19f95734fd10", + }), + ("http://joyreactor.com/post/3713804", { # 4 images + "url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304", + "keyword": "84e34d402342607045a65fab6d4d593d146c238a", + }), + ("http://joyreactor.com/post/3726210", { # gif / video + "url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b", + "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47", + }), + ("http://joyreactor.com/post/3668724", { # youtube embed + "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a", + "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651", + }), + ("http://joyreactor.cc/post/1299", { # "malformed" JSON + "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde", + }), + ) + + +# -------------------------------------------------------------------- +# PornReactor + +PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)" + + +class PornreactorTagExtractor(ReactorTagExtractor): + """Extractor for tag searches on pornreactor.cc""" + category = "pornreactor" + pattern = PR_BASE_PATTERN + r"/tag/([^/?&#]+)" + test = ( + ("http://pornreactor.cc/tag/RiceGnat", { + "range": "1-25", + "count": ">= 25", + }), + ("http://fapreactor.com/tag/RiceGnat"), + ) + + +class PornreactorSearchExtractor(ReactorSearchExtractor): + """Extractor for search results on pornreactor.cc""" + category = "pornreactor" + pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + test = ( + ("http://pornreactor.cc/search?q=ecchi+hentai", { + "range": "1-25", + "count": ">= 25", + }), + ("http://fapreactor.com/search/ecchi+hentai"), + ) + + +class PornreactorUserExtractor(ReactorUserExtractor): + """Extractor for all posts of a user on pornreactor.cc""" + category = "pornreactor" + pattern = PR_BASE_PATTERN + r"/user/([^/?&#]+)" + test = ( + ("http://pornreactor.cc/user/Disillusion", { + "range": "1-25", + "count": ">= 25", + }), + ("http://fapreactor.com/user/Disillusion"), + ) + + +class PornreactorPostExtractor(ReactorPostExtractor): + """Extractor for single posts on pornreactor.cc""" + category = "pornreactor" + subcategory = "post" + pattern = PR_BASE_PATTERN + r"/post/(\d+)" + test = ( + ("http://pornreactor.cc/post/863166", { + "url": "680db1e33ca92ff70b2c0e1708c471cbe2201324", + "content": 
"ec6b0568bfb1803648744077da082d14de844340", + }), + ("http://fapreactor.com/post/863166", { + "url": "864ecd5785e4898301aa8d054dd653b1165be158", + }), + ) diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py new file mode 100644 index 0000000..dda4809 --- /dev/null +++ b/gallery_dl/extractor/readcomiconline.py @@ -0,0 +1,97 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract comic-issues and entire comics from https://readcomiconline.to/""" + +from .common import ChapterExtractor, MangaExtractor +from .kissmanga import RedirectMixin +from .. import text +import re + + +class ReadcomiconlineBase(RedirectMixin): + """Base class for readcomiconline extractors""" + category = "readcomiconline" + directory_fmt = ("{category}", "{comic}", "{issue:>03}") + filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}" + archive_fmt = "{issue_id}_{page}" + root = "https://readcomiconline.to" + + +class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): + """Extractor for comic-issues from readcomiconline.to""" + subcategory = "issue" + pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" + r"(/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+))") + test = ("https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289", { + "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682", + "keyword": "30fe110273e871305001f33c18634516a0a51421", + }) + + def __init__(self, match): + ChapterExtractor.__init__(self, match) + self.issue_id = match.group(2) + + def metadata(self, page): + comic, pos = text.extract(page, " - Read\r\n ", "\r\n") + iinfo, pos = text.extract(page, " ", "\r\n", pos) + match = re.match(r"(?:Issue )?#(\d+)|(.+)", iinfo) + return { + "comic": comic, + "issue": match.group(1) or match.group(2), + "issue_id": text.parse_int(self.issue_id), + "lang": "en", + "language": "English", + } + + def images(self, page): + return [ + (url, None) + for url in text.extract_iter( + page, 'lstImages.push("', '"' + ) + ] + + +class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor): + """Extractor for comics from readcomiconline.to""" + chapterclass = ReadcomiconlineIssueExtractor + subcategory = "comic" + pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to" + r"(/Comic/[^/?&#]+/?)$") + test = ( + ("https://readcomiconline.to/Comic/W-i-t-c-h", { + "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14", + "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c", + }), + ("https://readcomiconline.to/Comic/Bazooka-Jules", { + "url": "711674cb78ed10bd2557315f7a67552d01b33985", + "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516", + }), + ) + + def chapters(self, page): + results = [] + comic, pos = text.extract(page, ' class="barTitle">', '<') + page , pos = text.extract(page, ' class="listing">', '</table>', pos) + + comic = comic.rpartition("information")[0].strip() + needle = ' title="Read {} '.format(comic) + comic = text.unescape(comic) + + for item in text.extract_iter(page, ' href="', ' comic online '): + url, _, issue = item.partition(needle) + url = url.rpartition('"')[0] + if issue.startswith('Issue #'): + issue = issue[7:] + results.append((self.root + url, { + "comic": comic, "issue": issue, + "issue_id": text.parse_int(url.rpartition("=")[2]), + "lang": "en", "language": "English", + })) + return results diff --git 
a/gallery_dl/extractor/recursive.py b/gallery_dl/extractor/recursive.py new file mode 100644 index 0000000..1a793a0 --- /dev/null +++ b/gallery_dl/extractor/recursive.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Recursive extractor""" + +from .common import Extractor, Message +from .. import extractor, util +import requests +import re + + +class RecursiveExtractor(Extractor): + """Extractor that fetches URLs from a remote or local source""" + category = "recursive" + pattern = r"r(?:ecursive)?:" + test = ("recursive:https://pastebin.com/raw/FLwrCYsT", { + "url": "eee86d65c346361b818e8f4b2b307d9429f136a2", + }) + + def items(self): + blist = self.config( + "blacklist", {"directlink"} | util.SPECIAL_EXTRACTORS) + + self.session.mount("file://", FileAdapter()) + page = self.request(self.url.partition(":")[2]).text + + yield Message.Version, 1 + with extractor.blacklist(blist): + for match in re.finditer(r"https?://[^\s\"']+", page): + yield Message.Queue, match.group(0), {} + + +class FileAdapter(requests.adapters.BaseAdapter): + """Requests adapter for local files""" + + def send(self, request, **kwargs): + response = requests.Response() + try: + response.raw = open(request.url[7:], "rb") + except OSError: + import io + response.raw = io.BytesIO() + response.status_code = requests.codes.bad_request + else: + response.raw.release_conn = response.raw.close + response.status_code = requests.codes.ok + return response + + def close(self): + pass diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py new file mode 100644 index 0000000..0c5a924 --- /dev/null +++ b/gallery_dl/extractor/reddit.py @@ -0,0 +1,313 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from subreddits at https://www.reddit.com/""" + +from .common import Extractor, Message +from .. 
import text, util, extractor, exception +from ..cache import cache +import datetime +import time + + +class RedditExtractor(Extractor): + """Base class for reddit extractors""" + category = "reddit" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = RedditAPI(self) + self.max_depth = int(self.config("recursion", 0)) + self._visited = set() + + def items(self): + subre = RedditSubmissionExtractor.pattern + submissions = self.submissions() + depth = 0 + + yield Message.Version, 1 + with extractor.blacklist( + util.SPECIAL_EXTRACTORS, [RedditSubredditExtractor]): + while True: + extra = [] + for url, data in self._urls(submissions): + if url[0] == "#": + continue + if url[0] == "/": + url = "https://www.reddit.com" + url + + match = subre.match(url) + if match: + extra.append(match.group(1)) + else: + yield Message.Queue, text.unescape(url), data + + if not extra or depth == self.max_depth: + return + depth += 1 + submissions = ( + self.api.submission(sid) for sid in extra + if sid not in self._visited + ) + + def submissions(self): + """Return an iterable containing all (submission, comments) tuples""" + + def _urls(self, submissions): + for submission, comments in submissions: + self._visited.add(submission["id"]) + + if not submission["is_self"]: + yield submission["url"], submission + + for url in text.extract_iter( + submission["selftext_html"] or "", ' href="', '"'): + yield url, submission + + for comment in comments: + for url in text.extract_iter( + comment["body_html"] or "", ' href="', '"'): + yield url, comment + + +class RedditSubredditExtractor(RedditExtractor): + """Extractor for images from subreddits on reddit.com""" + subcategory = "subreddit" + pattern = (r"(?:https?://)?(?:\w+\.)?reddit\.com/r/([^/?&#]+)" + r"(/[a-z]+)?/?" 
+ r"(?:\?.*?(?:\bt=([a-z]+))?)?$") + test = ( + ("https://www.reddit.com/r/lavaporn/"), + ("https://www.reddit.com/r/lavaporn/top/?sort=top&t=month"), + ("https://old.reddit.com/r/lavaporn/"), + ("https://np.reddit.com/r/lavaporn/"), + ("https://m.reddit.com/r/lavaporn/"), + ) + + def __init__(self, match): + RedditExtractor.__init__(self, match) + self.subreddit, self.order, self.timeframe = match.groups() + + def submissions(self): + subreddit = self.subreddit + (self.order or "") + params = {"t": self.timeframe} if self.timeframe else {} + return self.api.submissions_subreddit(subreddit, params) + + +class RedditSubmissionExtractor(RedditExtractor): + """Extractor for images from a submission on reddit.com""" + subcategory = "submission" + pattern = (r"(?:https?://)?(?:" + r"(?:\w+\.)?reddit\.com/r/[^/?&#]+/comments|" + r"redd\.it" + r")/([a-z0-9]+)") + test = ( + ("https://www.reddit.com/r/lavaporn/comments/2a00np/", { + "pattern": r"https?://i\.imgur\.com/AaAUCgy\.jpg", + }), + ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://redd.it/2a00np/"), + ) + + def __init__(self, match): + RedditExtractor.__init__(self, match) + self.submission_id = match.group(1) + + def submissions(self): + return (self.api.submission(self.submission_id),) + + +class RedditImageExtractor(Extractor): + """Extractor for reddit-hosted images""" + category = "reddit" + subcategory = "image" + archive_fmt = "{filename}" + pattern = (r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)" + r"/[^/?&#]+(?:\?[^#]*)?") + test = ( + ("https://i.redd.it/upjtjcx2npzz.jpg", { + "url": "0de614900feef103e580b632190458c0b62b641a", + "content": "cc9a68cf286708d5ce23c68e79cd9cf7826db6a3", + }), + (("https://i.reddituploads.com/0f44f1b1fca2461f957c713d9592617d" + "?fit=max&h=1536&w=1536&s=e96ce7846b3c8e1f921d2ce2671fb5e2"), { + "url": "f24f25efcedaddeec802e46c60d77ef975dc52a5", + "content": "541dbcc3ad77aa01ee21ca49843c5e382371fae7", + }), + ) + + def items(self): + data = text.nameext_from_url(self.url) + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, self.url, data + + +class RedditAPI(): + """Minimal interface for the reddit API""" + CLIENT_ID = "6N9uN0krSDE-ig" + USER_AGENT = "Python:gallery-dl:0.8.4 (by /u/mikf1)" + + def __init__(self, extractor): + self.extractor = extractor + self.comments = extractor.config("comments", 500) + self.morecomments = extractor.config("morecomments", False) + self.refresh_token = extractor.config("refresh-token") + self.log = extractor.log + + client_id = extractor.config("client-id", self.CLIENT_ID) + user_agent = extractor.config("user-agent", self.USER_AGENT) + + if (client_id == self.CLIENT_ID) ^ (user_agent == self.USER_AGENT): + self.client_id = None + self.log.warning( + "Conflicting values for 'client-id' and 'user-agent': " + "override either both or none of them.") + else: + self.client_id = client_id + extractor.session.headers["User-Agent"] = user_agent + + def submission(self, submission_id): + """Fetch the (submission, comments)=-tuple for a submission id""" + endpoint = "/comments/" + submission_id + "/.json" + link_id = "t3_" + submission_id if self.morecomments else None + submission, comments = self._call(endpoint, {"limit": self.comments}) + return (submission["data"]["children"][0]["data"], + self._flatten(comments, link_id)) + + def submissions_subreddit(self, subreddit, params): + """Collect all (submission, 
comments)-tuples of a subreddit""" + endpoint = "/r/" + subreddit + "/.json" + params["limit"] = 100 + return self._pagination(endpoint, params) + + def morechildren(self, link_id, children): + """Load additional comments from a submission""" + endpoint = "/api/morechildren" + params = {"link_id": link_id, "api_type": "json"} + index, done = 0, False + while not done: + if len(children) - index < 100: + done = True + params["children"] = ",".join(children[index:index + 100]) + index += 100 + + data = self._call(endpoint, params)["json"] + for thing in data["data"]["things"]: + if thing["kind"] == "more": + children.extend(thing["data"]["children"]) + else: + yield thing["data"] + + def authenticate(self): + """Authenticate the application by requesting an access token""" + access_token = self._authenticate_impl(self.refresh_token) + self.extractor.session.headers["Authorization"] = access_token + + @cache(maxage=3600, keyarg=1) + def _authenticate_impl(self, refresh_token=None): + """Actual authenticate implementation""" + url = "https://www.reddit.com/api/v1/access_token" + if refresh_token: + self.log.info("Refreshing private access token") + data = {"grant_type": "refresh_token", + "refresh_token": refresh_token} + else: + self.log.info("Requesting public access token") + data = {"grant_type": ("https://oauth.reddit.com/" + "grants/installed_client"), + "device_id": "DO_NOT_TRACK_THIS_DEVICE"} + response = self.extractor.request( + url, method="POST", data=data, auth=(self.client_id, "")) + if response.status_code != 200: + raise exception.AuthenticationError('"{} ({})"'.format( + response.json().get("message"), response.status_code)) + return "Bearer " + response.json()["access_token"] + + def _call(self, endpoint, params): + url = "https://oauth.reddit.com" + endpoint + params["raw_json"] = 1 + self.authenticate() + response = self.extractor.request( + url, params=params, expect=range(400, 500)) + remaining = response.headers.get("x-ratelimit-remaining") + if remaining and float(remaining) < 2: + wait = int(response.headers["x-ratelimit-reset"]) + self.log.info("Waiting %d seconds for ratelimit reset", wait) + time.sleep(wait) + data = response.json() + if "error" in data: + if data["error"] == 403: + raise exception.AuthorizationError() + if data["error"] == 404: + raise exception.NotFoundError() + raise Exception(data["message"]) + return data + + def _pagination(self, endpoint, params, _empty=()): + date_fmt = self.extractor.config("date-format", "%Y-%m-%dT%H:%M:%S") + date_min = self._parse_datetime("date-min", 0, date_fmt) + date_max = self._parse_datetime("date-max", 253402210800, date_fmt) + + id_min = self._parse_id("id-min", 0) + id_max = self._parse_id("id-max", 2147483647) + + while True: + data = self._call(endpoint, params)["data"] + + for submission in data["children"]: + submission = submission["data"] + if (date_min <= submission["created_utc"] <= date_max and + id_min <= self._decode(submission["id"]) <= id_max): + if submission["num_comments"] and self.comments: + try: + yield self.submission(submission["id"]) + except exception.AuthorizationError: + pass + else: + yield submission, _empty + + if not data["after"]: + return + params["after"] = data["after"] + + def _flatten(self, comments, link_id=None): + extra = [] + queue = comments["data"]["children"] + while queue: + comment = queue.pop(0) + if comment["kind"] == "more": + if link_id: + extra.extend(comment["data"]["children"]) + continue + comment = comment["data"] + yield comment + if comment["replies"]: + 
queue += comment["replies"]["data"]["children"] + if link_id and extra: + yield from self.morechildren(link_id, extra) + + def _parse_datetime(self, key, default, fmt): + ts = self.extractor.config(key, default) + if isinstance(ts, str): + try: + ts = int(datetime.datetime.strptime(ts, fmt).timestamp()) + except ValueError as exc: + self.log.warning("Unable to parse '%s': %s", key, exc) + ts = default + return ts + + def _parse_id(self, key, default): + sid = self.extractor.config(key) + return self._decode(sid.rpartition("_")[2].lower()) if sid else default + + @staticmethod + def _decode(sid): + return util.bdecode(sid, "0123456789abcdefghijklmnopqrstuvwxyz") diff --git a/gallery_dl/extractor/rule34.py b/gallery_dl/extractor/rule34.py new file mode 100644 index 0000000..de7ef45 --- /dev/null +++ b/gallery_dl/extractor/rule34.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://rule34.xxx/""" + +from . import booru + + +class Rule34Extractor(booru.XmlParserMixin, + booru.GelbooruPageMixin, + booru.BooruExtractor): + """Base class for rule34 extractors""" + category = "rule34" + api_url = "https://rule34.xxx/index.php" + post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}" + pool_url = "https://rule34.xxx/index.php?page=pool&s=show&id={}" + page_limit = 4000 + + def __init__(self, match): + super().__init__(match) + self.params.update({"page": "dapi", "s": "post", "q": "index"}) + + +class Rule34TagExtractor(booru.TagMixin, Rule34Extractor): + """Extractor for images from rule34.xxx based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?" + r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") + test = ("https://rule34.xxx/index.php?page=post&s=list&tags=danraku", { + "content": "97e4bbf86c3860be18de384d02d544251afe1d45", + "pattern": r"https?://([^.]+\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg", + "count": 1, + }) + + +class Rule34PoolExtractor(booru.GelbooruPoolMixin, Rule34Extractor): + """Extractor for image-pools from rule34.xxx""" + pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?" + r"\?page=pool&s=show&id=(?P<pool>\d+)") + test = ("https://rule34.xxx/index.php?page=pool&s=show&id=179", { + "count": 3, + }) + + +class Rule34PostExtractor(booru.PostMixin, Rule34Extractor): + """Extractor for single images from rule34.xxx""" + pattern = (r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?" + r"\?page=post&s=view&id=(?P<post>\d+)") + test = ("https://rule34.xxx/index.php?page=post&s=view&id=1995545", { + "content": "97e4bbf86c3860be18de384d02d544251afe1d45", + "options": (("tags", True),), + "keyword": { + "tags_artist": "danraku", + "tags_character": "kashima_(kantai_collection)", + "tags_copyright": "kantai_collection", + "tags_general": str, + "tags_metadata": str, + }, + }) diff --git a/gallery_dl/extractor/safebooru.py b/gallery_dl/extractor/safebooru.py new file mode 100644 index 0000000..f5f058c --- /dev/null +++ b/gallery_dl/extractor/safebooru.py @@ -0,0 +1,61 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://safebooru.org/""" + +from . 
import booru + + +class SafebooruExtractor(booru.XmlParserMixin, + booru.GelbooruPageMixin, + booru.BooruExtractor): + """Base class for safebooru extractors""" + category = "safebooru" + api_url = "https://safebooru.org/index.php" + post_url = "https://safebooru.org/index.php?page=post&s=view&id={}" + pool_url = "https://safebooru.org/index.php?page=pool&s=show&id={}" + + def __init__(self, match): + super().__init__(match) + self.params.update({"page": "dapi", "s": "post", "q": "index"}) + + +class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor): + """Extractor for images from safebooru.org based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?" + r"\?page=post&s=list&tags=(?P<tags>[^&#]+)") + test = ("https://safebooru.org/index.php?page=post&s=list&tags=bonocho", { + "url": "17c61b386530cf4c30842c9f580d15ef1cd09586", + "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb", + }) + + +class SafebooruPoolExtractor(booru.GelbooruPoolMixin, SafebooruExtractor): + """Extractor for image-pools from safebooru.org""" + pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?" + r"\?page=pool&s=show&id=(?P<pool>\d+)") + test = ("https://safebooru.org/index.php?page=pool&s=show&id=11", { + "count": 5, + }) + + +class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor): + """Extractor for single images from safebooru.org""" + pattern = (r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?" + r"\?page=post&s=view&id=(?P<post>\d+)") + test = ("https://safebooru.org/index.php?page=post&s=view&id=1169132", { + "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b", + "content": "93b293b27dabd198afafabbaf87c49863ac82f27", + "options": (("tags", True),), + "keyword": { + "tags_artist": "kawanakajima", + "tags_character": "heath_ledger ronald_mcdonald the_joker", + "tags_copyright": "dc_comics mcdonald's the_dark_knight", + "tags_general": str, + }, + }) diff --git a/gallery_dl/extractor/sankaku.py b/gallery_dl/extractor/sankaku.py new file mode 100644 index 0000000..012cb8b --- /dev/null +++ b/gallery_dl/extractor/sankaku.py @@ -0,0 +1,299 @@ +# -*- coding: utf-8 -*- + +# Copyright 2014-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://chan.sankakucomplex.com/""" + +from .common import Extractor, Message, SharedConfigMixin +from .. 
import text, util, exception +from ..cache import cache +import collections +import random +import time +import re + + +class SankakuExtractor(SharedConfigMixin, Extractor): + """Base class for sankaku extractors""" + basecategory = "booru" + category = "sankaku" + filename_fmt = "{category}_{id}_{md5}.{extension}" + cookienames = ("login", "pass_hash") + cookiedomain = "chan.sankakucomplex.com" + subdomain = "chan" + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "https://" + self.cookiedomain + self.logged_in = True + self.start_page = 1 + self.start_post = 0 + self.extags = self.config("tags", False) + self.wait_min = self.config("wait-min", 3.0) + self.wait_max = self.config("wait-max", 6.0) + if self.wait_max < self.wait_min: + self.wait_max = self.wait_min + + def items(self): + self.login() + data = self.get_metadata() + + yield Message.Version, 1 + yield Message.Directory, data + + for post_id in util.advance(self.get_posts(), self.start_post): + self.wait() + post = self.get_post_data(post_id) + url = post["file_url"] + post.update(data) + yield Message.Url, url, text.nameext_from_url(url, post) + + def skip(self, num): + self.start_post += num + return num + + def get_metadata(self): + """Return general metadata""" + return {} + + def get_posts(self): + """Return an iterable containing all relevant post ids""" + + def get_post_data(self, post_id, extr=text.extract): + """Extract metadata of a single post""" + url = self.root + "/post/show/" + post_id + page = self.request(url, retries=10).text + + tags , pos = extr(page, "<title>", " | ") + vavg , pos = extr(page, "itemprop=ratingValue>", "<", pos) + vcnt , pos = extr(page, "itemprop=reviewCount>", "<", pos) + _ , pos = extr(page, "Posted: <", "", pos) + created, pos = extr(page, ' title="', '"', pos) + rating = extr(page, "<li>Rating: ", "<", pos)[0] + + file_url, pos = extr(page, '<li>Original: <a href="', '"', pos) + if file_url: + width , pos = extr(page, '>', 'x', pos) + height, pos = extr(page, '', ' ', pos) + else: + width , pos = extr(page, '<object width=', ' ', pos) + height, pos = extr(page, 'height=', '>', pos) + file_url = extr(page, '<embed src="', '"', pos)[0] + + data = { + "id": text.parse_int(post_id), + "md5": file_url.rpartition("/")[2].partition(".")[0], + "tags": text.unescape(tags), + "vote_average": text.parse_float(vavg), + "vote_count": text.parse_int(vcnt), + "created_at": created, + "rating": (rating or "?")[0].lower(), + "file_url": "https:" + text.unescape(file_url), + "width": text.parse_int(width), + "height": text.parse_int(height), + } + + if self.extags: + tags = collections.defaultdict(list) + tags_html = text.extract(page, '<ul id=tag-sidebar>', '</ul>')[0] + pattern = re.compile(r'tag-type-([^>]+)><a href="/\?tags=([^"]+)') + for tag_type, tag_name in pattern.findall(tags_html or ""): + tags[tag_type].append(text.unquote(tag_name)) + for key, value in tags.items(): + data["tags_" + key] = " ".join(value) + + return data + + def wait(self): + """Wait for a randomly chosen amount of seconds""" + time.sleep(random.uniform(self.wait_min, self.wait_max)) + + def login(self): + """Login and set necessary cookies""" + if self._check_cookies(self.cookienames): + return + username, password = self._get_auth_info() + if username: + cookies = self._login_impl((username, self.subdomain), password) + self._update_cookies(cookies) + else: + self.logged_in = False + + @cache(maxage=90*24*3600, keyarg=1) + def _login_impl(self, usertuple, password): + username = usertuple[0] + 
self.log.info("Logging in as %s", username) + url = self.root + "/user/authenticate" + data = { + "url": "", + "user[name]": username, + "user[password]": password, + "commit": "Login", + } + response = self.request(url, method="POST", data=data) + + if not response.history or response.url != self.root + "/user/home": + raise exception.AuthenticationError() + cookies = response.history[0].cookies + return {c: cookies[c] for c in self.cookienames} + + +class SankakuTagExtractor(SankakuExtractor): + """Extractor for images from chan.sankakucomplex.com by search-tags""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "t_{search_tags}_{id}" + pattern = r"(?:https?://)?chan\.sankakucomplex\.com/\?([^#]*)" + test = ( + ("https://chan.sankakucomplex.com/?tags=bonocho", { + "count": 5, + "pattern": r"https://cs\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}" + r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+", + }), + # respect 'page' query parameter + ("https://chan.sankakucomplex.com/?tags=bonocho&page=2", { + "count": 0, + }), + # respect 'next' query parameter + ("https://chan.sankakucomplex.com/?tags=bonocho&next=182284", { + "count": 1, + }), + # error on five or more tags + ("https://chan.sankakucomplex.com/?tags=bonocho+a+b+c+d", { + "options": (("username", None),), + "exception": exception.StopExtraction, + }), + # match arbitrary query parameters + ("https://chan.sankakucomplex.com" + "/?tags=marie_rose&page=98&next=3874906&commit=Search"), + ) + per_page = 20 + + def __init__(self, match): + SankakuExtractor.__init__(self, match) + query = text.parse_query(match.group(1)) + self.tags = text.unquote(query.get("tags", "").replace("+", " ")) + self.start_page = text.parse_int(query.get("page"), 1) + self.next = text.parse_int(query.get("next"), 0) + + def skip(self, num): + if self.next: + self.start_post += num + else: + pages, posts = divmod(num, self.per_page) + self.start_page += pages + self.start_post += posts + return num + + def get_metadata(self): + if not self.next: + max_page = 50 if self.logged_in else 25 + if self.start_page > max_page: + self.log.info("Traversing from page %d to page %d", + max_page, self.start_page) + self.start_post += self.per_page * (self.start_page - max_page) + self.start_page = max_page + + tags = self.tags.split() + if not self.logged_in and len(tags) > 4: + self.log.error("Unauthenticated users cannot use " + "more than 4 tags at once.") + raise exception.StopExtraction() + return {"search_tags": " ".join(tags)} + + def get_posts(self): + params = {"tags": self.tags} + + if self.next: + params["next"] = self.next + else: + params["page"] = self.start_page + + while True: + self.wait() + page = self.request(self.root, params=params, retries=10).text + pos = page.find("<div id=more-popular-posts-link>") + 1 + + ids = list(text.extract_iter(page, '" id=p', '>', pos)) + if not ids: + return + yield from ids + + next_qs = text.extract(page, 'next-page-url="/?', '"', pos)[0] + next_id = text.parse_query(next_qs).get("next") + + # stop if the same "next" parameter occurs twice in a row (#265) + if "next" in params and params["next"] == next_id: + return + + params["next"] = next_id or (text.parse_int(ids[-1]) - 1) + params["page"] = "2" + + +class SankakuPoolExtractor(SankakuExtractor): + """Extractor for image-pools from chan.sankakucomplex.com""" + subcategory = "pool" + directory_fmt = ("{category}", "pool", "{pool}") + archive_fmt = "p_{pool}_{id}" + pattern = r"(?:https?://)?chan\.sankakucomplex\.com/pool/show/(\d+)" + test = 
("https://chan.sankakucomplex.com/pool/show/90", { + "count": 5, + }) + per_page = 24 + + def __init__(self, match): + SankakuExtractor.__init__(self, match) + self.pool_id = match.group(1) + + def skip(self, num): + pages, posts = divmod(num, self.per_page) + self.start_page += pages + self.start_post += posts + return num + + def get_metadata(self): + return {"pool": self.pool_id} + + def get_posts(self): + url = self.root + "/pool/show/" + self.pool_id + params = {"page": self.start_page} + + while True: + page = self.request(url, params=params, retries=10).text + ids = list(text.extract_iter(page, '" id=p', '>')) + + yield from ids + if len(ids) < self.per_page: + return + + params["page"] += 1 + + +class SankakuPostExtractor(SankakuExtractor): + """Extractor for single images from chan.sankakucomplex.com""" + subcategory = "post" + archive_fmt = "{id}" + pattern = r"(?:https?://)?chan\.sankakucomplex\.com/post/show/(\d+)" + test = ("https://chan.sankakucomplex.com/post/show/360451", { + "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "options": (("tags", True),), + "keyword": { + "tags_artist": "bonocho", + "tags_copyright": "batman_(series) the_dark_knight", + "tags_medium": "sketch copyright_name", + "tags_studio": "dc_comics", + "tags_character": str, + "tags_general": str, + }, + }) + + def __init__(self, match): + SankakuExtractor.__init__(self, match) + self.post_id = match.group(1) + + def get_posts(self): + return (self.post_id,) diff --git a/gallery_dl/extractor/sankakucomplex.py b/gallery_dl/extractor/sankakucomplex.py new file mode 100644 index 0000000..22b2b63 --- /dev/null +++ b/gallery_dl/extractor/sankakucomplex.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.sankakucomplex.com/""" + +from .common import Extractor, Message +from .. 
import text +import re + + +class SankakucomplexExtractor(Extractor): + """Base class for sankakucomplex extractors""" + category = "sankakucomplex" + root = "https://www.sankakucomplex.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + + +class SankakucomplexArticleExtractor(SankakucomplexExtractor): + """Extractor for articles on www.sankakucomplex.com""" + subcategory = "article" + directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}") + filename_fmt = "{filename}.{extension}" + archive_fmt = "{date:%Y%m%d}_{filename}" + pattern = (r"(?:https?://)?www\.sankakucomplex\.com" + r"/(\d{4}/\d\d/\d\d/[^/?&#]+)") + test = ( + ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", { + "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d", + "keyword": "4b3b5766b277a5d0acbec90fa8f2343262b07efd", + }), + ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", { + "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c", + "keyword": "f47a416d680717855bbc3e4f0cd44479f61d9aa4", + }), + ) + + def items(self): + url = "{}/{}/?pg=X".format(self.root, self.path) + extr = text.extract_from(self.request(url).text) + data = { + "title" : text.unescape( + extr('property="og:title" content="', '"')), + "description": text.unescape( + extr('property="og:description" content="', '"')), + "date" : text.parse_datetime( + extr('property="article:published_time" content="', '"')), + } + imgs = self.images(extr) + data["count"] = len(imgs) + data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2] + + yield Message.Version, 1 + yield Message.Directory, data + for img in imgs: + img.update(data) + yield Message.Url, img["url"], img + + def images(self, extr): + num = 0 + imgs = [] + urls = set() + orig = re.compile(r"-\d+x\d+\.") + + extr('<div class="entry-content">', '') + while True: + url = extr('data-lazy-src="', '"') + if not url: + return imgs + if url in urls: + continue + if url[0] == "/": + url = text.urljoin(self.root, url) + url = orig.sub(".", url) + num += 1 + imgs.append(text.nameext_from_url(url, { + "url" : url, + "num" : num, + })) + urls.add(url) + + +class SankakucomplexTagExtractor(SankakucomplexExtractor): + """Extractor for sankakucomplex blog articles by tag or author""" + subcategory = "tag" + pattern = (r"(?:https?://)?www\.sankakucomplex\.com" + r"/((?:tag|category|author)/[^/&?#]+)") + test = ( + ("https://www.sankakucomplex.com/tag/cosplay/", { + "range": "1-50", + "count": 50, + "pattern": SankakucomplexArticleExtractor.pattern, + }), + ("https://www.sankakucomplex.com/category/anime/"), + ("https://www.sankakucomplex.com/author/rift/page/5/"), + ) + + def items(self): + pnum = 1 + last = None + data = {"_extractor": SankakucomplexArticleExtractor} + + yield Message.Version, 1 + while True: + url = "{}/{}/page/{}/".format(self.root, self.path, pnum) + response = self.request(url, expect=(404,)) + if response.status_code == 404: + return + for url in text.extract_iter(response.text, 'data-direct="', '"'): + if url != last: + last = url + yield Message.Queue, url, data + pnum += 1 diff --git a/gallery_dl/extractor/seiga.py b/gallery_dl/extractor/seiga.py new file mode 100644 index 0000000..f63c999 --- /dev/null +++ b/gallery_dl/extractor/seiga.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extract images from https://seiga.nicovideo.jp/""" + +from .common import Extractor, Message +from .. import text, util, exception +from ..cache import cache + + +class SeigaExtractor(Extractor): + """Base class for seiga extractors""" + category = "seiga" + archive_fmt = "{image_id}" + cookiedomain = ".nicovideo.jp" + root = "https://seiga.nicovideo.jp" + + def __init__(self, match): + Extractor.__init__(self, match) + self.start_image = 0 + + def items(self): + self.login() + images = iter(self.get_images()) + data = next(images) + + yield Message.Version, 1 + yield Message.Directory, data + for image in util.advance(images, self.start_image): + data.update(image) + data["extension"] = None + yield Message.Url, self.get_image_url(data["image_id"]), data + + def get_images(self): + """Return iterable containing metadata and images""" + + def get_image_url(self, image_id): + """Get url for an image with id 'image_id'""" + url = "{}/image/source/{}".format(self.root, image_id) + response = self.request( + url, method="HEAD", allow_redirects=False, expect=(404,)) + if response.status_code == 404: + raise exception.NotFoundError("image") + return response.headers["Location"].replace("/o/", "/priv/", 1) + + def login(self): + """Login and set necessary cookies""" + if not self._check_cookies(("user_session",)): + username, password = self._get_auth_info() + self._update_cookies(self._login_impl(username, password)) + + @cache(maxage=7*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + url = "https://account.nicovideo.jp/api/v1/login" + data = {"mail_tel": username, "password": password} + + self.request(url, method="POST", data=data) + if "user_session" not in self.session.cookies: + raise exception.AuthenticationError() + del self.session.cookies["nicosid"] + return self.session.cookies + + +class SeigaUserExtractor(SeigaExtractor): + """Extractor for images of a user from seiga.nicovideo.jp""" + subcategory = "user" + directory_fmt = ("{category}", "{user[id]}") + filename_fmt = "{category}_{user[id]}_{image_id}.{extension}" + pattern = (r"(?:https?://)?(?:www\.|seiga\.)?nicovideo\.jp/" + r"user/illust/(\d+)(?:\?(?:[^&]+&)*sort=([^&#]+))?") + test = ( + ("https://seiga.nicovideo.jp/user/illust/39537793", { + "pattern": r"https://lohas\.nicoseiga\.jp/priv/[0-9a-f]+/\d+/\d+", + "count": ">= 4", + "keyword": { + "user": { + "id": 39537793, + "message": str, + "name": str, + }, + "clips": int, + "comments": int, + "count": int, + "extension": None, + "image_id": int, + "title": str, + "views": int, + }, + }), + ("https://seiga.nicovideo.jp/user/illust/79433", { + "exception": exception.NotFoundError, + }), + ("https://seiga.nicovideo.jp/user/illust/39537793" + "?sort=image_view&target=illust_all"), + ) + + def __init__(self, match): + SeigaExtractor.__init__(self, match) + self.user_id, self.order = match.groups() + self.start_page = 1 + + def skip(self, num): + pages, images = divmod(num, 40) + self.start_page += pages + self.start_image += images + return num + + def get_metadata(self, page): + """Collect metadata from 'page'""" + data = text.extract_all(page, ( + ("name" , '<img alt="', '"'), + ("msg" , '<li class="user_message">', '</li>'), + (None , '<span class="target_name">すべて</span>', ''), + ("count", '<span class="count ">', '</span>'), + ))[0] + + if not data["name"] and "ユーザー情報が取得出来ませんでした" in page: + raise exception.NotFoundError("user") + + return { + "user": { + "id": text.parse_int(self.user_id), + "name": 
data["name"], + "message": (data["msg"] or "").strip(), + }, + "count": text.parse_int(data["count"]), + } + + def get_images(self): + url = "{}/user/illust/{}".format(self.root, self.user_id) + params = {"sort": self.order, "page": self.start_page, + "target": "illust_all"} + + while True: + cnt = 0 + page = self.request(url, params=params).text + + if params["page"] == self.start_page: + yield self.get_metadata(page) + + for info in text.extract_iter( + page, '<li class="list_item', '</a></li> '): + data = text.extract_all(info, ( + ("image_id", '/seiga/im', '"'), + ("title" , '<li class="title">', '</li>'), + ("views" , '</span>', '</li>'), + ("comments", '</span>', '</li>'), + ("clips" , '</span>', '</li>'), + ))[0] + for key in ("image_id", "views", "comments", "clips"): + data[key] = text.parse_int(data[key]) + yield data + cnt += 1 + + if cnt < 40: + return + params["page"] += 1 + + +class SeigaImageExtractor(SeigaExtractor): + """Extractor for single images from seiga.nicovideo.jp""" + subcategory = "image" + filename_fmt = "{category}_{image_id}.{extension}" + pattern = (r"(?:https?://)?(?:" + r"(?:seiga\.|www\.)?nicovideo\.jp/(?:seiga/im|image/source/)" + r"|lohas\.nicoseiga\.jp/(?:thumb|(?:priv|o)/[^/]+/\d+)/)(\d+)") + test = ( + ("https://seiga.nicovideo.jp/seiga/im5977527", { + "keyword": "f66ba5de33d4ce2cb57f23bb37e1e847e0771c10", + "content": "d9202292012178374d57fb0126f6124387265297", + }), + ("https://seiga.nicovideo.jp/seiga/im123", { + "exception": exception.NotFoundError, + }), + ("https://seiga.nicovideo.jp/image/source/5977527"), + ("https://lohas.nicoseiga.jp/thumb/5977527i"), + ("https://lohas.nicoseiga.jp/priv" + "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"), + ("https://lohas.nicoseiga.jp/o" + "/759a4ef1c639106ba4d665ee6333832e647d0e4e/1549727594/5977527"), + ) + + def __init__(self, match): + SeigaExtractor.__init__(self, match) + self.image_id = match.group(1) + + def skip(self, num): + self.start_image += num + return num + + def get_images(self): + return ({}, {"image_id": text.parse_int(self.image_id)}) diff --git a/gallery_dl/extractor/senmanga.py b/gallery_dl/extractor/senmanga.py new file mode 100644 index 0000000..736173f --- /dev/null +++ b/gallery_dl/extractor/senmanga.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract manga-chapters from from https://raw.senmanga.com/""" + +from .common import Extractor, Message +from .. 
import text + + +class SenmangaChapterExtractor(Extractor): + """Extractor for manga-chapters from raw.senmanga.com""" + category = "senmanga" + subcategory = "chapter" + directory_fmt = ("{category}", "{manga}", "{chapter_string}") + filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}" + archive_fmt = "{manga}_{chapter_string}_{page}" + pattern = r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)" + test = ( + ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", { + "url": "5f95140ff511d8497e2ec08fa7267c6bb231faec", + "keyword": "705d941a150765edb33cd2707074bd703a93788c", + "content": "0e37b1995708ffc175f2e175d91a518e6948c379", + }), + ("http://raw.senmanga.com/Love-Lab/2016-03/1", { + "url": "8347b9f00c14b864dd3c19a1f5ae52adb2ef00de", + "keyword": "8a8ab2529ba2edfc83a6b3a8bede1d6c580db7b4", + }), + ) + root = "https://raw.senmanga.com" + + def __init__(self, match): + Extractor.__init__(self, match) + part = match.group(1) + self.chapter_url = "{}/{}/".format(self.root, part) + self.img_url = "{}/viewer/{}/".format(self.root, part) + self.session.headers["Referer"] = self.chapter_url + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for data["page"] in range(1, data["count"]+1): + data["extension"] = None + yield Message.Url, self.img_url + str(data["page"]), data + + def metadata(self): + """Collect metadata for extractor-job""" + page = self.request(self.chapter_url).text + self.session.cookies.clear() + title, pos = text.extract(page, '<title>', '</title>') + count, pos = text.extract(page, '</select> of ', '\n', pos) + manga, _, chapter = title.partition(" - Chapter ") + + return { + "manga": text.unescape(manga).replace("-", " "), + "chapter_string": chapter.partition(" - Page ")[0], + "count": text.parse_int(count), + "lang": "jp", + "language": "Japanese", + } diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py new file mode 100644 index 0000000..aa2b16b --- /dev/null +++ b/gallery_dl/extractor/sexcom.py @@ -0,0 +1,194 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.sex.com/""" + +from .common import Extractor, Message +from .. import text + + +class SexcomExtractor(Extractor): + """Base class for sexcom extractors""" + category = "sexcom" + directory_fmt = ("{category}") + filename_fmt = "{pin_id}{title:? 
//}.{extension}" + archive_fmt = "{pin_id}" + root = "https://www.sex.com" + + def items(self): + yield Message.Version, 1 + yield Message.Directory, self.metadata() + for url in self.pins(): + pin = self._parse_pin(url) + yield Message.Url, pin["url"], pin + + def metadata(self): + return {} + + def pins(self): + return () + + def _pagination(self, url): + while True: + extr = text.extract_from(self.request(url).text) + + while True: + href = extr('<a class="image_wrapper" href="', '"') + if not href: + break + yield self.root + href + + pager = extr('id="pagenum"', '</div>') + url = text.extract(pager, ' href="', '"')[0] + if not url: + return + url = text.urljoin(self.root, url) + + def _parse_pin(self, pin_url): + extr = text.extract_from(self.request(pin_url).text) + data = {} + + data["thumbnail"] = extr('itemprop="thumbnail" content="', '"') + data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower() + data["title"] = text.unescape(extr('itemprop="name">' , '<')) + data["repins"] = text.parse_int(text.extract( + extr('"btn-group"', '</div>'), '"btn btn-primary">' , '<')[0]) + data["likes"] = text.parse_int(text.extract( + extr('"btn-group"', '</div>'), '"btn btn-default">' , '<')[0]) + data["pin_id"] = text.parse_int(extr('data-id="', '"')) + + if data["type"] == "video": + info = extr("player.updateSrc(", ");") + + if info: + path = text.extract(info, "src: '", "'")[0] + data["filename"] = path.rpartition("/")[2] + data["extension"] = "mp4" + if "'HD'" in info: + path += "/hd" + data["url"] = self.root + path + else: + data["url"] = "ytdl:" + text.extract( + extr('<iframe', '>'), ' src="', '"')[0] + else: + data["url"] = extr(' src="', '"') + text.nameext_from_url(data["url"], data) + + data["uploader"] = extr('itemprop="author">', '<') + data["date"] = extr('datetime="', '"') + data["tags"] = text.split_html(extr('class="tags"> Tags', '</div>')) + data["comments"] = text.parse_int(extr('Comments (', ')')) + + return data + + +class SexcomPinExtractor(SexcomExtractor): + """Extractor a pinned image or video on www.sex.com""" + subcategory = "pin" + directory_fmt = ("{category}",) + pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)" + test = ( + # picture + ("https://www.sex.com/pin/56714360/", { + "url": "599190d6e3d79f9f49dda194a0a58cb0ffa3ab86", + "keyword": { + "comments": int, + "date": "2018-10-02T21:18:17-04:00", + "extension": "jpg", + "filename": "20037816", + "likes": int, + "pin_id": 56714360, + "repins": int, + "tags": list, + "thumbnail": str, + "title": "Pin #56714360", + "type": "picture", + "uploader": "alguem", + "url": str, + }, + }), + # gif + ("https://www.sex.com/pin/11465040-big-titted-hentai-gif/", { + "url": "98a82c5ae7a65c8228e1405ac740f80d4d556de1", + }), + # video + ("https://www.sex.com/pin/55748381/", { + "pattern": "https://www.sex.com/video/stream/776238/hd", + }), + # pornhub embed + ("https://www.sex.com/pin/55847384-very-nicely-animated/", { + "pattern": "ytdl:https://www.pornhub.com/embed/ph56ef24b6750f2", + }), + ) + + def __init__(self, match): + SexcomExtractor.__init__(self, match) + self.pin_id = match.group(1) + + def pins(self): + return ("{}/pin/{}/".format(self.root, self.pin_id),) + + +class SexcomBoardExtractor(SexcomExtractor): + """Extractor for pins from a board on www.sex.com""" + subcategory = "board" + directory_fmt = ("{category}", "{user}", "{board}") + pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user" + r"/([^/?&#]+)/(?!(?:following|pins|repins|likes)/)([^/?&#]+)") + test = 
("https://www.sex.com/user/ronin17/exciting-hentai/", { + "count": ">= 15", + }) + + def __init__(self, match): + SexcomExtractor.__init__(self, match) + self.user, self.board = match.groups() + + def metadata(self): + return { + "user" : text.unquote(self.user), + "board": text.unquote(self.board), + } + + def pins(self): + url = "{}/user/{}/{}/".format(self.root, self.user, self.board) + return self._pagination(url) + + +class SexcomSearchExtractor(SexcomExtractor): + """Extractor for search results on www.sex.com""" + subcategory = "search" + directory_fmt = ("{category}", "search", "{search[query]}") + pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:" + r"(pic|gif|video)s/([^/?&#]+)|search/(pic|gif|video)s" + r")/?(?:\?([^#]+))?)") + test = ( + ("https://www.sex.com/search/pics?query=ecchi", { + "range": "1-10", + "count": 10, + }), + ("https://www.sex.com/videos/hentai/", { + "range": "1-10", + "count": 10, + }), + ) + + def __init__(self, match): + SexcomExtractor.__init__(self, match) + self.path = match.group(1) + + self.search = text.parse_query(match.group(5)) + self.search["type"] = match.group(2) or match.group(4) + if "query" not in self.search: + self.search["query"] = match.group(3) or "" + + def metadata(self): + return {"search": self.search} + + def pins(self): + url = "{}/{}".format(self.root, self.path) + return self._pagination(url) diff --git a/gallery_dl/extractor/shopify.py b/gallery_dl/extractor/shopify.py new file mode 100644 index 0000000..35895bb --- /dev/null +++ b/gallery_dl/extractor/shopify.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for Shopify instances""" + +from .common import Extractor, Message, SharedConfigMixin, generate_extractors +from .. 
import text +import time +import re + + +class ShopifyExtractor(SharedConfigMixin, Extractor): + """Base class for Shopify extractors""" + basecategory = "shopify" + filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}" + archive_fmt = "{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.item_url = self.root + match.group(1) + + def request(self, url, method="GET", expect=range(400, 500), **kwargs): + tries = 0 + kwargs["expect"] = expect + while True: + response = Extractor.request(self, url, method, **kwargs) + if response.status_code not in (429, 430): + return response + tries += 1 + waittime = 2 ** (tries + 2) + self.log.warning( + "HTTP status %s: %s - Waiting for %d seconds", + response.status_code, response.reason, waittime) + time.sleep(waittime) + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + + headers = {"X-Requested-With": "XMLHttpRequest"} + for url in self.products(): + response = self.request(url + ".json", headers=headers) + if response.status_code >= 400: + self.log.warning('Skipping %s ("%d: %s")', + url, response.status_code, response.reason) + continue + product = response.json()["product"] + del product["image"] + + for num, image in enumerate(product.pop("images"), 1): + text.nameext_from_url(image["src"], image) + image.update(data) + image["product"] = product + image["num"] = num + yield Message.Url, image["src"], image + + def metadata(self): + """Return general metadata""" + return {} + + def products(self): + """Return an iterable with all relevant product URLs""" + + +class ShopifyCollectionExtractor(ShopifyExtractor): + """Base class for collection extractors for Shopify based sites""" + subcategory = "collection" + directory_fmt = ("{category}", "{collection[title]}") + pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)" + + def __init__(self, match): + ShopifyExtractor.__init__(self, match) + self.params = match.group(2) + + def metadata(self): + return self.request(self.item_url + ".json").json() + + def products(self): + params = text.parse_query(self.params) + params["page"] = text.parse_int(params.get("page"), 1) + search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+") + + while True: + page = self.request(self.item_url, params=params).text + urls = search_re.findall(page) + + if not urls: + return + for path in urls: + yield self.root + path + params["page"] += 1 + + +class ShopifyProductExtractor(ShopifyExtractor): + """Base class for product extractors for Shopify based sites""" + subcategory = "product" + directory_fmt = ("{category}", "Products") + pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)" + + def products(self): + return (self.item_url,) + + +EXTRACTORS = { + "fashionnova": { + "root": "https://www.fashionnova.com", + "pattern": r"(?:www\.)?fashionnova\.com", + "test-product": ( + ("https://www.fashionnova.com/products/essential-slide-red", { + "pattern": r"https?://cdn\.shopify.com/", + "count": 3, + }), + ("https://www.fashionnova.com/collections/flats/products/name"), + ), + "test-collection": ( + ("https://www.fashionnova.com/collections/mini-dresses", { + "range": "1-20", + "count": 20, + }), + ("https://www.fashionnova.com/collections/mini-dresses/?page=1"), + ("https://www.fashionnova.com/collections/mini-dresses#1"), + ), + + }, +} + +generate_extractors(EXTRACTORS, globals(), ( + ShopifyProductExtractor, + ShopifyCollectionExtractor, +)) diff --git a/gallery_dl/extractor/simplyhentai.py 
b/gallery_dl/extractor/simplyhentai.py new file mode 100644 index 0000000..44dc6fe --- /dev/null +++ b/gallery_dl/extractor/simplyhentai.py @@ -0,0 +1,187 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract hentai-manga from https://www.simply-hentai.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. import text, util, exception + + +class SimplyhentaiGalleryExtractor(GalleryExtractor): + """Extractor for image galleries from simply-hentai.com""" + category = "simplyhentai" + archive_fmt = "{image_id}" + pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com" + r"(?!/(?:album|gifs?|images?|series)(?:/|$))" + r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?&#]+)+)") + test = ( + (("https://original-work.simply-hentai.com" + "/amazon-no-hiyaku-amazon-elixir"), { + "url": "258289249990502c3138719cb89e995a60861e49", + "keyword": "18ab9defca53dbb2aeb7965193e93e0ea125b76b", + }), + ("https://www.simply-hentai.com/notfound", { + "exception": exception.GalleryDLException, + }), + # custom subdomain + ("https://pokemon.simply-hentai.com/mao-friends-9bc39"), + # www subdomain, two path segments + ("https://www.simply-hentai.com/vocaloid/black-magnet"), + ) + + def __init__(self, match): + url = "https://" + match.group(1) + GalleryExtractor.__init__(self, match, url) + self.session.headers["Referer"] = url + + def metadata(self, page): + extr = text.extract + title , pos = extr(page, '<meta property="og:title" content="', '"') + if not title: + raise exception.NotFoundError("gallery") + gid , pos = extr(page, '/Album/', '/', pos) + series, pos = extr(page, 'box-title">Series</div>', '</div>', pos) + lang , pos = extr(page, 'box-title">Language</div>', '</div>', pos) + chars , pos = extr(page, 'box-title">Characters</div>', '</div>', pos) + tags , pos = extr(page, 'box-title">Tags</div>', '</div>', pos) + artist, pos = extr(page, 'box-title">Artists</div>', '</div>', pos) + date , pos = extr(page, 'Uploaded', '</div>', pos) + lang = text.remove_html(lang) if lang else None + + return { + "gallery_id": text.parse_int(gid), + "title" : text.unescape(title), + "artist" : text.split_html(artist), + "parody" : text.split_html(series), + "characters": text.split_html(chars), + "tags" : text.split_html(tags), + "lang" : util.language_to_code(lang), + "language" : lang, + "date" : text.remove_html(date), + } + + def images(self, _): + url = self.chapter_url + "/all-pages" + headers = {"Accept": "application/json"} + images = self.request(url, headers=headers).json() + return [ + (urls["full"], {"image_id": text.parse_int(image_id)}) + for image_id, urls in sorted(images.items()) + ] + + +class SimplyhentaiImageExtractor(Extractor): + """Extractor for individual images from simply-hentai.com""" + category = "simplyhentai" + subcategory = "image" + directory_fmt = ("{category}", "{type}s") + filename_fmt = "{category}_{token}{title:?_//}.{extension}" + archive_fmt = "{token}" + pattern = (r"(?:https?://)?(?:www\.)?(simply-hentai\.com" + r"/(image|gif)/[^/?&#]+)") + test = ( + (("https://www.simply-hentai.com/image" + "/pheromomania-vol-1-kanzenban-isao-3949d8b3-400c-4b6"), { + "url": "0338eb137830ab6f81e5f410d3936ef785d063d9", + "keyword": "e10e5588481cab68329ef6ec1e5325206b2079a2", + }), + ("https://www.simply-hentai.com/gif/8915dfcf-0b6a-47c", { + "url": 
"11c060d7ec4dfd0bd105300b6e1fd454674a5af1", + "keyword": "dd97a4bb449c397d6fec9f43a1303c0fb168ae65", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.page_url = "https://www." + match.group(1) + self.type = match.group(2) + + def items(self): + page = self.request(self.page_url).text + url_search = 'data-src="' if self.type == "image" else '<source src="' + + title, pos = text.extract(page, '"og:title" content="', '"') + descr, pos = text.extract(page, '"og:description" content="', '"', pos) + url , pos = text.extract(page, url_search, '"', pos) + + tags = text.extract(descr, " tagged with ", " online for free ")[0] + if tags: + tags = tags.split(", ") + tags[-1] = tags[-1].partition(" ")[2] + else: + tags = [] + + data = text.nameext_from_url(url, { + "title": text.unescape(title) if title else "", + "tags": tags, + "type": self.type, + }) + data["token"] = data["filename"].rpartition("_")[2] + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, url, data + + +class SimplyhentaiVideoExtractor(Extractor): + """Extractor for hentai videos from simply-hentai.com""" + category = "simplyhentai" + subcategory = "video" + directory_fmt = ("{category}", "{type}s") + filename_fmt = "{title}{episode:?_//>02}.{extension}" + archive_fmt = "{title}_{episode}" + pattern = r"(?:https?://)?(videos\.simply-hentai\.com/[^/?&#]+)" + test = ( + ("https://videos.simply-hentai.com/creamy-pie-episode-02", { + "pattern": r"https://www\.googleapis\.com/drive/v3/files" + r"/0B1ecQ8ZVLm3JcHZzQzBnVy1ZUmc\?alt=media&key=[\w-]+", + "keyword": "29d63987fed33f0a9f4b3786d1d71b03d793250a", + "count": 1, + }), + (("https://videos.simply-hentai.com" + "/1715-tifa-in-hentai-gang-bang-3d-movie"), { + "url": "ad9a36ae06c601b6490e3c401834b4949d947eb0", + "keyword": "c561341aa3c6999f615abf1971d28fb2a83da2a7", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.page_url = "https://" + match.group(1) + + def items(self): + page = self.request(self.page_url).text + + title, pos = text.extract(page, "<title>", "</title>") + tags , pos = text.extract(page, ">Tags</div>", "</div>", pos) + date , pos = text.extract(page, ">Upload Date</div>", "</div>", pos) + title = title.rpartition(" - ")[0] + + if "<video" in page: + video_url = text.extract(page, '<source src="', '"', pos)[0] + episode = 0 + else: + # video url from myhentai.tv embed + pos = page.index('<div class="video-frame-container">', pos) + embed_url = text.extract(page, 'src="', '"', pos)[0].replace( + "embedplayer.php?link=", "embed.php?name=") + embed_page = self.request(embed_url).text + video_url = text.extract(embed_page, '"file":"', '"')[0] + title, _, episode = title.rpartition(" Episode ") + + data = text.nameext_from_url(video_url, { + "title": text.unescape(title), + "episode": text.parse_int(episode), + "tags": text.split_html(tags)[::2], + "date": text.remove_html(date), + "type": "video", + }) + + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, video_url, data diff --git a/gallery_dl/extractor/slickpic.py b/gallery_dl/extractor/slickpic.py new file mode 100644 index 0000000..127cce8 --- /dev/null +++ b/gallery_dl/extractor/slickpic.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extractors for https://www.slickpic.com/""" + +from .common import Extractor, Message +from .. import text +import time + + +BASE_PATTERN = r"(?:https?://)?([^.]+)\.slickpic\.com" + + +class SlickpicExtractor(Extractor): + """Base class for slickpic extractors""" + category = "slickpic" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + self.root = "https://{}.slickpic.com".format(self.user) + + +class SlickpicAlbumExtractor(SlickpicExtractor): + """Extractor for albums on slickpic.com""" + subcategory = "album" + directory_fmt = ("{category}", "{user[name]}", + "{album[id]} {album[title]}") + filename_fmt = "{num:>03}_{id}{title:?_//}.{extension}" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/albums/([^/?&#]+)" + test = ( + ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", { + "url": "58bd94ebc80fd906e9879826970b408d54c6da07", + "keyword": "54a9d6f9e42ae43c644aa9316186fb9d9955fe53", + }), + ("https://mattcrandall.slickpic.com/albums/LamborghiniMurcielago/", { + "range": "34", + "content": "cec6630e659dc72db1ee1a9a6f3b525189261988", + }), + ) + + def __init__(self, match): + SlickpicExtractor.__init__(self, match) + self.album = match.group(2) + + def items(self): + data = self.metadata() + imgs = self.images(data) + + data = { + "album": { + "id" : text.parse_int(data["aid"]), + "title": text.unescape(data["title"]), + }, + "user": { + "id" : text.parse_int(data["uid"]), + "name": text.unescape(data["user"]), + "nick": self.user + }, + "count": len(imgs), + } + + yield Message.Version, 1 + yield Message.Directory, data + for num, img in enumerate(imgs, 1): + url = img["url_rsz"] + "/o/" + img["fname"] + img = text.nameext_from_url(img["fname"], { + "url" : url, + "num" : num, + "id" : text.parse_int(img["id"]), + "width" : text.parse_int(img["width"]), + "height" : text.parse_int(img["height"]), + "title" : img["title"], + "description": img["descr"], + }) + img.update(data) + yield Message.Url, url, img + + def metadata(self): + url = "{}/albums/{}/?wallpaper".format(self.root, self.album) + extr = text.extract_from(self.request(url).text) + + title = text.unescape(extr("<title>", "</title>")) + title, _, user = title.rpartition(" by ") + + return { + "title": title, + "user" : user, + "tk" : extr('tk = "', '"'), + "shd" : extr('shd = "', '"'), + "aid" : extr('data-aid="', '"', ), + "uid" : extr('data-uid="', '"', ), + } + + def images(self, data): + url = self.root + "/xhr/photo/get/list" + data = { + "tm" : time.time(), + "tk" : data["tk"], + "shd" : data["shd"], + "aid" : data["aid"], + "uid" : data["uid"], + "col" : "0", + "sys" : self.album, + "vw" : "1280", + "vh" : "1024", + "skey" : "", + "viewer": "false", + "pub" : "1", + "sng" : "0", + "whq" : "1", + } + return self.request(url, method="POST", data=data).json()["list"] + + +class SlickpicUserExtractor(SlickpicExtractor): + subcategory = "user" + pattern = BASE_PATTERN + r"(?:/gallery)?/?(?:$|[?#])" + test = ( + ("https://mattcrandall.slickpic.com/gallery/", { + "count": ">= 358", + "pattern": SlickpicAlbumExtractor.pattern, + }), + ("https://mattcrandall.slickpic.com/"), + ) + + def items(self): + page = self.request(self.root + "/gallery?viewer").text + data = {"_extractor": SlickpicAlbumExtractor} + base = self.root + "/albums/" + + yield Message.Version, 1 + for album in text.extract_iter(page, 'href="' + base, '"'): + yield Message.Queue, base + album, data diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py new 
file mode 100644 index 0000000..30420a8 --- /dev/null +++ b/gallery_dl/extractor/slideshare.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann, Leonardo Taccari +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.slideshare.net/""" + +from .common import Extractor, Message +from .. import text + + +class SlidesharePresentationExtractor(Extractor): + """Extractor for images from a presentation on slideshare.net""" + category = "slideshare" + subcategory = "presentation" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{presentation}-{num:>02}.{extension}" + archive_fmt = "{presentation}_{num}" + pattern = (r"(?:https?://)?(?:www\.)?slideshare\.net" + r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)") + test = ( + (("https://www.slideshare.net" + "/Slideshare/get-started-with-slide-share"), { + "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18", + "content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc", + }), + # long title + (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren" + "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), { + "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7", + }), + # mobile URL + (("https://www.slideshare.net" + "/mobile/uqudent/introduction-to-fixed-prosthodontics"), { + "url": "59993ad7b0cb93c73011547eedcd02c622649e9d", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.user, self.presentation = match.groups() + + def items(self): + page = self.request("https://www.slideshare.net/" + self.user + + "/" + self.presentation).text + data = self.get_job_metadata(page) + imgs = self.get_image_urls(page) + data["count"] = len(imgs) + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], url in enumerate(imgs, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + def get_job_metadata(self, page): + """Collect metadata for extractor-job""" + descr, pos = text.extract( + page, '<meta name="description" content="', '"') + title, pos = text.extract( + page, '<span class="j-title-breadcrumb">', '</span>', pos) + views, pos = text.extract( + page, '<span class="notranslate pippin-data">', 'views<', pos) + published, pos = text.extract( + page, '<time datetime="', '"', pos) + alt_descr, pos = text.extract( + page, 'id="slideshow-description-paragraph" class="notranslate">', + '</p>', pos) + + if descr.endswith("…") and alt_descr: + descr = text.remove_html(alt_descr).strip() + + return { + "user": self.user, + "presentation": self.presentation, + "title": text.unescape(title.strip()), + "description": text.unescape(descr), + "views": text.parse_int(views.replace(",", "")), + "published": published, + } + + @staticmethod + def get_image_urls(page): + """Extract and return a list of all image-urls""" + return list(text.extract_iter(page, 'data-full="', '"')) diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py new file mode 100644 index 0000000..80348ae --- /dev/null +++ b/gallery_dl/extractor/smugmug.py @@ -0,0 +1,316 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extract images from https://www.smugmug.com/""" + +from .common import Extractor, Message +from .. import text, oauth, exception + +BASE_PATTERN = ( + r"(?:smugmug:(?!album:)(?:https?://)?([^/]+)|" + r"(?:https?://)?([^.]+)\.smugmug\.com)") + + +class SmugmugExtractor(Extractor): + """Base class for smugmug extractors""" + category = "smugmug" + filename_fmt = ("{category}_{User[NickName]:?/_/}" + "{Image[UploadKey]}_{Image[ImageKey]}.{extension}") + empty_user = { + "Uri": "", + "ResponseLevel": "Public", + "Name": "", + "NickName": "", + "QuickShare": False, + "RefTag": "", + "ViewPassHint": "", + "WebUri": "", + "Uris": None, + } + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = SmugmugAPI(self) + self.videos = self.config("videos", True) + self.session = self.api.session + + def _select_format(self, image): + details = image["Uris"]["ImageSizeDetails"] + media = None + + if self.videos and image["IsVideo"]: + fltr = "VideoSize" + elif "ImageSizeOriginal" in details: + media = details["ImageSizeOriginal"] + else: + fltr = "ImageSize" + + if not media: + sizes = filter(lambda s: s[0].startswith(fltr), details.items()) + media = max(sizes, key=lambda s: s[1]["Width"])[1] + del image["Uris"] + + for key in ("Url", "Width", "Height", "MD5", "Size", "Watermarked", + "Bitrate", "Duration"): + if key in media: + image[key] = media[key] + return image["Url"] + + +class SmugmugAlbumExtractor(SmugmugExtractor): + """Extractor for smugmug albums""" + subcategory = "album" + directory_fmt = ("{category}", "{User[NickName]}", "{Album[Name]}") + archive_fmt = "a_{Album[AlbumKey]}_{Image[ImageKey]}" + pattern = r"smugmug:album:([^:]+)$" + test = ( + ("smugmug:album:ddvxpg", { + "url": "0429e9bf50ee600674e448934e3882ca1761ae7b", + }), + # empty + ("smugmug:album:SXvjbW", { + "count": 0, + }), + # no "User" + ("smugmug:album:6VRT8G", { + "url": "c4a0f4c4bfd514b93cbdeb02b3345bf7ef6604df", + }), + ) + + def __init__(self, match): + SmugmugExtractor.__init__(self, match) + self.album_id = match.group(1) + + def items(self): + album = self.api.album(self.album_id, "User") + user = album["Uris"].get("User") or self.empty_user.copy() + + del user["Uris"] + del album["Uris"] + data = {"Album": album, "User": user} + + yield Message.Version, 1 + yield Message.Directory, data + + for image in self.api.album_images(self.album_id, "ImageSizeDetails"): + url = self._select_format(image) + data["Image"] = image + yield Message.Url, url, text.nameext_from_url(url, data) + + +class SmugmugImageExtractor(SmugmugExtractor): + """Extractor for individual smugmug images""" + subcategory = "image" + archive_fmt = "{Image[ImageKey]}" + pattern = BASE_PATTERN + r"(?:/[^/?&#]+)+/i-([^/?&#-]+)" + test = ( + ("https://acapella.smugmug.com/Micro-Macro/Drops/i-g2Dmf9z", { + "url": "78f0bf3516b6d670b7319216bdeccb35942ca4cf", + "keyword": "b298ef7ed2b1918263b6a7dc6f56e54401584381", + "content": "64a8f69a1d824921eebbdf2420087937adfa45cd", + }), + # video + ("https://tstravels.smugmug.com/Dailies/Daily-Dose-2015/i-39JFNzB", { + "url": "04d0ab1ff829ca7d78f5acb5548953df08e9a5ee", + "keyword": "c708c4b9527a2fb29396c19f7628f9cf4b0b3a39", + }), + ) + + def __init__(self, match): + SmugmugExtractor.__init__(self, match) + self.image_id = match.group(3) + + def items(self): + image = self.api.image(self.image_id, "ImageSizeDetails") + url = self._select_format(image) + + data = {"Image": image} + text.nameext_from_url(url, data) + + yield Message.Version, 1 + yield Message.Directory, data + yield 
Message.Url, url, data + + +class SmugmugPathExtractor(SmugmugExtractor): + """Extractor for smugmug albums from URL paths and users""" + subcategory = "path" + pattern = BASE_PATTERN + r"((?:/[^/?&#a-fh-mo-z][^/?&#]*)*)/?$" + test = ( + ("https://acapella.smugmug.com/Micro-Macro/Drops/", { + "pattern": "smugmug:album:ddvxpg$", + }), + ("https://acapella.smugmug.com/", { + "pattern": SmugmugAlbumExtractor.pattern, + "url": "797eb1cbbf5ad8ecac8ee4eedc6466ed77a65d68", + }), + # gallery node without owner + ("https://www.smugmug.com/gallery/n-GLCjnD/", { + "pattern": "smugmug:album:6VRT8G$", + }), + # custom domain + ("smugmug:www.creativedogportraits.com/PortfolioGallery/", { + "pattern": "smugmug:album:txWXzs$", + }), + ("smugmug:www.creativedogportraits.com/", { + "pattern": "smugmug:album:txWXzs$", + }), + ("smugmug:https://www.creativedogportraits.com/"), + ) + + def __init__(self, match): + SmugmugExtractor.__init__(self, match) + self.domain, self.user, self.path = match.groups() + + def items(self): + yield Message.Version, 1 + + if not self.user: + self.user = self.api.site_user(self.domain)["NickName"] + + if self.path: + if self.path.startswith("/gallery/n-"): + node = self.api.node(self.path[11:]) + else: + data = self.api.user_urlpathlookup(self.user, self.path) + node = data["Uris"]["Node"] + + if node["Type"] == "Album": + nodes = (node,) + elif node["Type"] == "Folder": + nodes = self.album_nodes(node) + else: + nodes = () + + for node in nodes: + album_id = node["Uris"]["Album"].rpartition("/")[2] + node["_extractor"] = SmugmugAlbumExtractor + yield Message.Queue, "smugmug:album:" + album_id, node + + else: + for album in self.api.user_albums(self.user): + uri = "smugmug:album:" + album["AlbumKey"] + album["_extractor"] = SmugmugAlbumExtractor + yield Message.Queue, uri, album + + def album_nodes(self, root): + """Yield all descendant album nodes of 'root'""" + for node in self.api.node_children(root["NodeID"]): + if node["Type"] == "Album": + yield node + elif node["Type"] == "Folder": + yield from self.album_nodes(node) + + +class SmugmugAPI(oauth.OAuth1API): + """Minimal interface for the smugmug API v2""" + API_DOMAIN = "api.smugmug.com" + API_KEY = "DFqxg4jf7GrtsQ5PnbNB8899zKfnDrdK" + API_SECRET = ("fknV35p9r9BwZC4XbTzvCXpcSJRdD83S" + "9nMFQm25ndGBzNPnwRDbRnnVBvqt4xTq") + HEADERS = {"Accept": "application/json"} + + def album(self, album_id, expands=None): + return self._expansion("album/" + album_id, expands) + + def image(self, image_id, expands=None): + return self._expansion("image/" + image_id, expands) + + def node(self, node_id, expands=None): + return self._expansion("node/" + node_id, expands) + + def user(self, username, expands=None): + return self._expansion("user/" + username, expands) + + def album_images(self, album_id, expands=None): + return self._pagination("album/" + album_id + "!images", expands) + + def node_children(self, node_id, expands=None): + return self._pagination("node/" + node_id + "!children", expands) + + def user_albums(self, username, expands=None): + return self._pagination("user/" + username + "!albums", expands) + + def site_user(self, domain): + return self._call("!siteuser", domain=domain)["Response"]["User"] + + def user_urlpathlookup(self, username, path): + endpoint = "user/" + username + "!urlpathlookup" + params = {"urlpath": path} + return self._expansion(endpoint, "Node", params) + + def _call(self, endpoint, params=None, domain=API_DOMAIN): + url = "https://{}/api/v2/{}".format(domain, endpoint) + params = params or {} + if 
self.api_key: + params["APIKey"] = self.api_key + params["_verbosity"] = "1" + + response = self.request(url, params=params, headers=self.HEADERS) + data = response.json() + + if 200 <= data["Code"] < 400: + return data + if data["Code"] == 404: + raise exception.NotFoundError() + if data["Code"] == 429: + self.log.error("Rate limit reached") + else: + self.log.error("API request failed") + self.log.debug(data) + raise exception.StopExtraction() + + def _expansion(self, endpoint, expands, params=None): + endpoint = self._extend(endpoint, expands) + result = self._apply_expansions(self._call(endpoint, params), expands) + if not result: + raise exception.NotFoundError() + return result[0] + + def _pagination(self, endpoint, expands=None): + endpoint = self._extend(endpoint, expands) + params = {"start": 1, "count": 100} + + while True: + data = self._call(endpoint, params) + yield from self._apply_expansions(data, expands) + + if "NextPage" not in data["Response"]["Pages"]: + return + params["start"] += params["count"] + + @staticmethod + def _extend(endpoint, expands): + if expands: + endpoint += "?_expand=" + expands + return endpoint + + @staticmethod + def _apply_expansions(data, expands): + + def unwrap(response): + locator = response["Locator"] + return response[locator] if locator in response else [] + + objs = unwrap(data["Response"]) + if not isinstance(objs, list): + objs = (objs,) + + if "Expansions" in data: + expansions = data["Expansions"] + expands = expands.split(",") + + for obj in objs: + uris = obj["Uris"] + + for name in expands: + if name in uris: + uri = uris[name] + uris[name] = unwrap(expansions[uri]) + + return objs diff --git a/gallery_dl/extractor/test.py b/gallery_dl/extractor/test.py new file mode 100644 index 0000000..2f4992c --- /dev/null +++ b/gallery_dl/extractor/test.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2017 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Utility extractor to execute tests of other extractors""" + +from .common import Extractor, Message +from .. import extractor, exception + + +class TestExtractor(Extractor): + """Extractor to select and run the test URLs of other extractors + + The general form is 'test:<categories>:<subcategories>:<indices>', where + <categories> and <subcategories> are comma-separated (sub)category names + and <indices> is a comma-separated list of array indices. + To select all possible values for a field use the star '*' character or + leave the field empty. 
+ + Examples: + - test:pixiv + run all pixiv tests + + - test:pixiv:user,favorite:0 + run the first test of the PixivUser- and PixivFavoriteExtractor + + - test: + run all tests + """ + category = "test" + pattern = r"t(?:est)?:([^:]*)(?::([^:]*)(?::(\*|[\d,]*))?)?$" + test = ( + ("test:pixiv"), + ("test:pixiv:user,favorite:0"), + ("test:"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + categories, subcategories, indices = match.groups() + self.categories = self._split(categories) + self.subcategories = self._split(subcategories) + self.indices = self._split(indices) or self + + def items(self): + extractors = extractor.extractors() + + if self.categories: + extractors = [ + extr for extr in extractors + if extr.category in self.categories + ] + + if self.subcategories: + extractors = [ + extr for extr in extractors + if extr.subcategory in self.subcategories + ] + + tests = [ + test + for extr in extractors + for index, test in enumerate(extr._get_tests()) + if str(index) in self.indices + ] + + if not tests: + raise exception.NotFoundError("test") + + yield Message.Version, 1 + for test in tests: + yield Message.Queue, test[0], {} + + @staticmethod + def __contains__(_): + return True + + @staticmethod + def _split(value): + if value and value != "*": + return value.split(",") + return None diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py new file mode 100644 index 0000000..62a9173 --- /dev/null +++ b/gallery_dl/extractor/tsumino.py @@ -0,0 +1,343 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.tsumino.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text, exception +from ..cache import cache + + +class TsuminoBase(): + """Base class for tsumino extractors""" + category = "tsumino" + cookiedomain = "www.tsumino.com" + root = "https://www.tsumino.com" + + def login(self): + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + else: + self.session.cookies.setdefault( + "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5") + + @cache(maxage=14*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + url = "{}/Account/Login".format(self.root) + headers = {"Referer": url} + data = {"Username": username, "Password": password} + + response = self.request(url, method="POST", headers=headers, data=data) + if not response.history: + raise exception.AuthenticationError() + return {".aotsumino": response.history[0].cookies[".aotsumino"]} + + +class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): + """Extractor for image galleries on tsumino.com""" + pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com" + r"/(?:Book/Info|Read/View)/(\d+)") + test = ( + ("https://www.tsumino.com/Book/Info/40996", { + "url": "84bf30a86623039fc87855680fada884dc8a1ddd", + "keyword": { + "title" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou", + "title_en" : r"re:Shikoshiko Daisuki Nightingale \+ Kaijou", + "title_jp" : "シコシコ大好きナイチンゲール + 会場限定おまけ本", + "gallery_id": 40996, + "date" : "2018 June 29", + "count" : 42, + "collection": "", + "artist" : ["Itou Life"], + "group" : ["Itou Life"], + "parody" : ["Fate/Grand Order"], + "characters": list, + "tags" : list, + "type" : "Doujinshi", + "rating" : float, + "uploader" : "sehki", + "lang" : "en", + "language" : "English", + "thumbnail" : "http://www.tsumino.com/Image/Thumb/40996", + }, + }), + ("https://www.tsumino.com/Read/View/45834"), + ) + + def __init__(self, match): + self.gallery_id = match.group(1) + url = "{}/Book/Info/{}".format(self.root, self.gallery_id) + GalleryExtractor.__init__(self, match, url) + + def metadata(self, page): + extr = text.extract_from(page) + title = extr('"og:title" content="', '"') + title_en, _, title_jp = text.unescape(title).partition("/") + title_en = title_en.strip() + title_jp = title_jp.strip() + + return { + "gallery_id": text.parse_int(self.gallery_id), + "title" : title_en or title_jp, + "title_en" : title_en, + "title_jp" : title_jp, + "thumbnail" : extr('"og:image" content="', '"'), + "uploader" : text.remove_html(extr('id="Uploader">', '</div>')), + "date" : extr('id="Uploaded">', '</div>').strip(), + "rating" : text.parse_float(extr( + 'id="Rating">', '</div>').partition(" ")[0]), + "type" : text.remove_html(extr('id="Category">' , '</div>')), + "collection": text.remove_html(extr('id="Collection">', '</div>')), + "group" : text.split_html(extr('id="Group">' , '</div>')), + "artist" : text.split_html(extr('id="Artist">' , '</div>')), + "parody" : text.split_html(extr('id="Parody">' , '</div>')), + "characters": text.split_html(extr('id="Character">' , '</div>')), + "tags" : text.split_html(extr('id="Tag">' , '</div>')), + "language" : "English", + "lang" : "en", + } + + def images(self, page): + url = "{}/Read/Load/?q={}".format(self.root, self.gallery_id) + headers = {"Referer": self.chapter_url} + response = self.request(url, headers=headers, expect=(404,)) + + if response.status_code == 404: + url = "{}/Read/View/{}".format(self.root, self.gallery_id) + self.log.error( + "Failed to get gallery JSON data. 
Visit '%s' in a browser " + "and solve the CAPTCHA to continue.", url) + raise exception.StopExtraction() + + base = self.root + "/Image/Object?name=" + return [ + (base + text.quote(name), None) + for name in response.json()["reader_page_urls"] + ] + + +class TsuminoSearchExtractor(TsuminoBase, Extractor): + """Extractor for search results on tsumino.com""" + subcategory = "search" + pattern = (r"(?i)(?:https?://)?(?:www\.)?tsumino\.com" + r"/(?:Books/?)?#(.+)") + test = ( + ("https://www.tsumino.com/Books#?Character=Reimu+Hakurei", { + "pattern": TsuminoGalleryExtractor.pattern, + "range": "1-40", + "count": 40, + }), + (("http://www.tsumino.com/Books#~(Tags~(~" + "(Type~7~Text~'Reimu*20Hakurei~Exclude~false)~" + "(Type~'1~Text~'Pantyhose~Exclude~false)))#"), { + "pattern": TsuminoGalleryExtractor.pattern, + "count": ">= 3", + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.query = match.group(1) + + def items(self): + yield Message.Version, 1 + for gallery in self.galleries(): + url = "{}/Book/Info/{}".format(self.root, gallery["Id"]) + gallery["_extractor"] = TsuminoGalleryExtractor + yield Message.Queue, url, gallery + + def galleries(self): + """Return all gallery results matching 'self.query'""" + url = "{}/Books/Operate".format(self.root) + headers = { + "Referer": "{}/".format(self.root), + "X-Requested-With": "XMLHttpRequest", + } + data = { + "PageNumber": 1, + "Text": "", + "Sort": "Newest", + "List": "0", + "Length": "0", + "MinimumRating": "0", + "ExcludeList": "0", + "CompletelyExcludeHated": "false", + } + data.update(self._parse(self.query)) + + while True: + info = self.request( + url, method="POST", headers=headers, data=data).json() + + for gallery in info["Data"]: + yield gallery["Entry"] + + if info["PageNumber"] >= info["PageCount"]: + return + data["PageNumber"] += 1 + + def _parse(self, query): + try: + if query.startswith("?"): + return self._parse_simple(query) + return self._parse_jsurl(query) + except Exception as exc: + self.log.error("Invalid search query: '%s' (%s)", query, exc) + raise exception.StopExtraction() + + @staticmethod + def _parse_simple(query): + """Parse search query with format '?<key>=value>'""" + key, _, value = query.partition("=") + tag_types = { + "Tag": "1", + "Category": "2", + "Collection": "3", + "Group": "4", + "Artist": "5", + "Parody": "6", + "Character": "7", + "Uploader": "100", + } + + return { + "Tags[0][Type]": tag_types[key[1:].capitalize()], + "Tags[0][Text]": text.unquote(value).replace("+", " "), + "Tags[0][Exclude]": "false", + } + + @staticmethod + def _parse_jsurl(data): + """Parse search query in JSURL format + + Nested lists and dicts are handled in a special way to deal + with the way Tsumino expects its parameters -> expand(...) 
+ + Example: ~(name~'John*20Doe~age~42~children~(~'Mary~'Bill)) + Ref: https://github.com/Sage/jsurl + """ + if not data: + return {} + i = 0 + imax = len(data) + + def eat(expected): + nonlocal i + + if data[i] != expected: + error = "bad JSURL syntax: expected '{}', got {}".format( + expected, data[i]) + raise ValueError(error) + i += 1 + + def decode(): + nonlocal i + + beg = i + result = "" + + while i < imax: + ch = data[i] + + if ch not in "~)*!": + i += 1 + + elif ch == "*": + if beg < i: + result += data[beg:i] + if data[i + 1] == "*": + result += chr(int(data[i+2:i+6], 16)) + i += 6 + else: + result += chr(int(data[i+1:i+3], 16)) + i += 3 + beg = i + + elif ch == "!": + if beg < i: + result += data[beg:i] + result += "$" + i += 1 + beg = i + + else: + break + + return result + data[beg:i] + + def parse_one(): + nonlocal i + + eat('~') + result = "" + ch = data[i] + + if ch == "(": + i += 1 + + if data[i] == "~": + result = [] + if data[i+1] == ")": + i += 1 + else: + result.append(parse_one()) + while data[i] == "~": + result.append(parse_one()) + + else: + result = {} + + if data[i] != ")": + while True: + key = decode() + value = parse_one() + for ekey, evalue in expand(key, value): + result[ekey] = evalue + if data[i] != "~": + break + i += 1 + eat(")") + + elif ch == "'": + i += 1 + result = decode() + + else: + beg = i + i += 1 + + while i < imax and data[i] not in "~)": + i += 1 + + sub = data[beg:i] + if ch in "0123456789-": + fval = float(sub) + ival = int(fval) + result = ival if ival == fval else fval + else: + if sub not in ("true", "false", "null"): + raise ValueError("bad value keyword: " + sub) + result = sub + + return result + + def expand(key, value): + if isinstance(value, list): + for index, cvalue in enumerate(value): + ckey = "{}[{}]".format(key, index) + yield from expand(ckey, cvalue) + elif isinstance(value, dict): + for ckey, cvalue in value.items(): + ckey = "{}[{}]".format(key, ckey) + yield from expand(ckey, cvalue) + else: + yield key, value + + return parse_one() diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py new file mode 100644 index 0000000..5679cdc --- /dev/null +++ b/gallery_dl/extractor/tumblr.py @@ -0,0 +1,425 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.tumblr.com/""" + +from .common import Extractor, Message +from .. import text, oauth, extractor, exception +from datetime import datetime, timedelta +import re +import time + + +def _original_inline_image(url): + return re.sub( + (r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?" 
+ r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"), + r"https://\1_1280.\2", url + ) + + +def _original_video(url): + return re.sub( + (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" + r"/tumblr_[^_]+)_\d+\.([0-9a-z]+)"), + r"https://\1.\2", url + ) + + +POST_TYPES = frozenset(( + "text", "quote", "link", "answer", "video", "audio", "photo", "chat")) + +BASE_PATTERN = ( + r"(?:tumblr:(?:https?://)?([^/]+)|" + r"(?:https?://)?([^.]+\.tumblr\.com))") + + +class TumblrExtractor(Extractor): + """Base class for tumblr extractors""" + category = "tumblr" + directory_fmt = ("{category}", "{name}") + filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}" + archive_fmt = "{id}_{num}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.blog = match.group(1) or match.group(2) + self.api = TumblrAPI(self) + + self.types = self._setup_posttypes() + self.avatar = self.config("avatar", False) + self.inline = self.config("inline", True) + self.reblogs = self.config("reblogs", True) + self.external = self.config("external", False) + + if len(self.types) == 1: + self.api.posts_type = next(iter(self.types)) + elif not self.types: + self.log.warning("no valid post types selected") + + if self.reblogs == "same-blog": + self._skip_reblog = self._skip_reblog_same_blog + + def items(self): + blog = None + yield Message.Version, 1 + + for post in self.posts(): + if post["type"] not in self.types: + continue + if not blog: + blog = self.api.info(self.blog) + blog["uuid"] = self.blog + yield Message.Directory, blog.copy() + + if self.avatar: + url = self.api.avatar(self.blog) + yield self._prepare_avatar(url, post.copy(), blog) + + reblog = "reblogged_from_id" in post + if reblog and self._skip_reblog(post): + continue + post["reblogged"] = reblog + + post["blog"] = blog + post["date"] = text.parse_timestamp(post["timestamp"]) + post["num"] = 0 + + if "trail" in post: + del post["trail"] + + if "photos" in post: # type "photo" or "link" + photos = post["photos"] + del post["photos"] + + for photo in photos: + post["photo"] = photo + photo.update(photo["original_size"]) + del photo["original_size"] + del photo["alt_sizes"] + yield self._prepare_image(photo["url"], post) + + url = post.get("audio_url") # type: "audio" + if url: + yield self._prepare(url, post) + + url = post.get("video_url") # type: "video" + if url: + yield self._prepare(_original_video(url), post) + + if self.inline and "reblog" in post: # inline media + # only "chat" posts are missing a "reblog" key in their + # API response, but they can't contain images/videos anyway + body = post["reblog"]["comment"] + post["reblog"]["tree_html"] + for url in re.findall('<img src="([^"]+)"', body): + url = _original_inline_image(url) + yield self._prepare_image(url, post) + for url in re.findall('<source src="([^"]+)"', body): + url = _original_video(url) + yield self._prepare(url, post) + + if self.external: # external links + post["extension"] = None + with extractor.blacklist(("tumblr",)): + for key in ("permalink_url", "url"): + url = post.get(key) + if url: + yield Message.Queue, url, post + break + + def posts(self): + """Return an iterable containing all relevant posts""" + + def _setup_posttypes(self): + types = self.config("posts", "all") + + if types == "all": + return POST_TYPES + + elif not types: + return frozenset() + + else: + if isinstance(types, str): + types = types.split(",") + types = frozenset(types) + + invalid = types - POST_TYPES + if invalid: + types = types & POST_TYPES + self.log.warning('invalid 
post types: "%s"', + '", "'.join(sorted(invalid))) + return types + + @staticmethod + def _prepare(url, post): + text.nameext_from_url(url, post) + post["num"] += 1 + post["hash"] = post["filename"].partition("_")[2] + return Message.Url, url, post + + @staticmethod + def _prepare_image(url, post): + text.nameext_from_url(url, post) + post["num"] += 1 + + parts = post["filename"].split("_") + try: + post["hash"] = parts[1] if parts[1] != "inline" else parts[2] + except IndexError: + # filename doesn't follow the usual pattern (#129) + post["hash"] = post["filename"] + + return Message.Url, url, post + + @staticmethod + def _prepare_avatar(url, post, blog): + text.nameext_from_url(url, post) + post["num"] = 1 + post["blog"] = blog + post["reblogged"] = False + post["type"] = post["id"] = post["hash"] = "avatar" + return Message.Url, url, post + + def _skip_reblog(self, _): + return not self.reblogs + + def _skip_reblog_same_blog(self, post): + return self.blog != post["reblogged_root_uuid"] + + +class TumblrUserExtractor(TumblrExtractor): + """Extractor for all images from a tumblr-user""" + subcategory = "user" + pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$" + test = ( + ("http://demo.tumblr.com/", { + "pattern": r"https://\d+\.media\.tumblr\.com" + r"/tumblr_[^/_]+_\d+\.jpg", + "count": 1, + "options": (("posts", "photo"),), + }), + ("http://demo.tumblr.com/", { + "pattern": (r"https?://(?:$|" + r"\d+\.media\.tumblr\.com/.+_1280\.jpg|" + r"w+\.tumblr\.com/audio_file/demo/\d+/tumblr_\w+)"), + "count": 3, + "options": (("posts", "all"), ("external", True)) + }), + ("https://mikf123-hidden.tumblr.com/", { # dashboard-only + "count": 2, + "keyword": {"tags": ["test", "hidden"]}, + }), + ("https://mikf123-private.tumblr.com/", { # password protected + "count": 2, + "keyword": {"tags": ["test", "private"]}, + }), + ("https://mikf123-private-hidden.tumblr.com/", { # both + "count": 2, + "keyword": {"tags": ["test", "private", "hidden"]}, + }), + ("https://demo.tumblr.com/page/2"), + ("https://demo.tumblr.com/archive"), + ("tumblr:http://www.b-authentique.com/"), + ("tumblr:www.b-authentique.com"), + ) + + def posts(self): + return self.api.posts(self.blog, {}) + + +class TumblrPostExtractor(TumblrExtractor): + """Extractor for images from a single post on tumblr""" + subcategory = "post" + pattern = BASE_PATTERN + r"/(?:post|image)/(\d+)" + test = ( + ("http://demo.tumblr.com/post/459265350", { + "pattern": (r"https://\d+\.media\.tumblr\.com" + r"/tumblr_[^/_]+_1280.jpg"), + "count": 1, + }), + ("https://mikf123.tumblr.com/post/167770226574/text-post", { + "count": 2, + }), + ("https://mikf123.tumblr.com/post/181022561719/quote-post", { + "count": 1, + }), + ("https://mikf123.tumblr.com/post/167623351559/link-post", { + "count": 2, + }), + ("https://muyanna.tumblr.com/post/180692431632/answer-post", { + "count": 1, + }), + ("https://mikf123.tumblr.com/post/167633596145/video-post", { + "count": 2, + }), + ("https://mikf123.tumblr.com/post/167770026604/audio-post", { + "count": 2, + }), + ("https://mikf123.tumblr.com/post/172687798174/photo-post", { + "count": 4, + }), + ("https://mikf123.tumblr.com/post/181022380064/chat-post", { + "count": 0, + }), + ("http://pinetre-3.tumblr.com/post/181904381470/via", { + "count": 0, # audio post with "null" as URL (#165) + }), + ("http://ziemniax.tumblr.com/post/109697912859/", { + "exception": exception.NotFoundError, # HTML response (#297) + }), + ("http://demo.tumblr.com/image/459265350"), + ) + + def __init__(self, match): + 
TumblrExtractor.__init__(self, match) + self.post_id = match.group(3) + self.reblogs = True + + def posts(self): + return self.api.posts(self.blog, {"id": self.post_id}) + + @staticmethod + def _setup_posttypes(): + return POST_TYPES + + +class TumblrTagExtractor(TumblrExtractor): + """Extractor for images from a tumblr-user by tag""" + subcategory = "tag" + pattern = BASE_PATTERN + r"/tagged/([^/?&#]+)" + test = ("http://demo.tumblr.com/tagged/Times%20Square", { + "pattern": (r"https://\d+\.media\.tumblr\.com/tumblr_[^/_]+_1280.jpg"), + "count": 1, + }) + + def __init__(self, match): + TumblrExtractor.__init__(self, match) + self.tag = text.unquote(match.group(3)) + + def posts(self): + return self.api.posts(self.blog, {"tag": self.tag}) + + +class TumblrLikesExtractor(TumblrExtractor): + """Extractor for images from a tumblr-user's liked posts""" + subcategory = "likes" + directory_fmt = ("{category}", "{name}", "likes") + archive_fmt = "f_{blog[name]}_{id}_{num}" + pattern = BASE_PATTERN + r"/likes" + test = ("http://mikf123.tumblr.com/likes", { + "count": 1, + }) + + def posts(self): + return self.api.likes(self.blog) + + +class TumblrAPI(oauth.OAuth1API): + """Minimal interface for the Tumblr API v2""" + API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B" + API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03" + BLOG_CACHE = {} + + def __init__(self, extractor): + oauth.OAuth1API.__init__(self, extractor) + self.posts_type = None + + def info(self, blog): + """Return general information about a blog""" + if blog not in self.BLOG_CACHE: + self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"] + return self.BLOG_CACHE[blog] + + def avatar(self, blog, size="512"): + """Retrieve a blog avatar""" + if self.api_key: + url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}" + return url_fmt.format(blog, size, self.api_key) + params = {"size": size} + data = self._call(blog, "avatar", params, allow_redirects=False) + return data["avatar_url"] + + def posts(self, blog, params): + """Retrieve published posts""" + params.update({"offset": 0, "limit": 50, "reblog_info": "true"}) + if self.posts_type: + params["type"] = self.posts_type + while True: + data = self._call(blog, "posts", params) + self.BLOG_CACHE[blog] = data["blog"] + yield from data["posts"] + params["offset"] += params["limit"] + if params["offset"] >= data["total_posts"]: + return + + def likes(self, blog): + """Retrieve liked posts""" + params = {"limit": 50} + while True: + posts = self._call(blog, "likes", params)["liked_posts"] + if not posts: + return + yield from posts + params["before"] = posts[-1]["liked_timestamp"] + + def _call(self, blog, endpoint, params, **kwargs): + if self.api_key: + params["api_key"] = self.api_key + url = "https://api.tumblr.com/v2/blog/{}/{}".format( + blog, endpoint) + + response = self.request(url, params=params, **kwargs) + + try: + data = response.json() + except ValueError: + data = response.text + status = response.status_code + else: + status = data["meta"]["status"] + if 200 <= status < 400: + return data["response"] + + if status == 403: + raise exception.AuthorizationError() + elif status == 404: + raise exception.NotFoundError("user or post") + elif status == 429: + + # daily rate limit + if response.headers.get("x-ratelimit-perday-remaining") == "0": + reset = response.headers.get("x-ratelimit-perday-reset") + self.log.error( + "Daily API rate limit exceeded: aborting; " + "rate limit will reset at %s", + self._to_time(reset), + ) + raise 
exception.StopExtraction() + + # hourly rate limit + reset = response.headers.get("x-ratelimit-perhour-reset") + if reset: + self.log.info( + "Hourly API rate limit exceeded; " + "waiting until %s for rate limit reset", + self._to_time(reset), + ) + time.sleep(int(reset) + 1) + return self._call(blog, endpoint, params) + + self.log.error(data) + raise exception.StopExtraction() + + @staticmethod + def _to_time(reset): + try: + reset_time = datetime.now() + timedelta(seconds=int(reset)) + except (ValueError, TypeError): + return "?" + return reset_time.strftime("%H:%M:%S") diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py new file mode 100644 index 0000000..ad4dc46 --- /dev/null +++ b/gallery_dl/extractor/twitter.py @@ -0,0 +1,202 @@ +# -*- coding: utf-8 -*- + +# Copyright 2016-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://twitter.com/""" + +from .common import Extractor, Message +from .. import text, exception +from ..cache import cache + + +class TwitterExtractor(Extractor): + """Base class for twitter extractors""" + category = "twitter" + directory_fmt = ("{category}", "{user}") + filename_fmt = "{tweet_id}_{num}.{extension}" + archive_fmt = "{tweet_id}_{retweet_id}_{num}" + root = "https://twitter.com" + sizes = (":orig", ":large", ":medium", ":small") + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + self.retweets = self.config("retweets", True) + self.videos = self.config("videos", False) + + def items(self): + self.login() + yield Message.Version, 1 + yield Message.Directory, self.metadata() + + for tweet in self.tweets(): + data = self._data_from_tweet(tweet) + if not self.retweets and data["retweet_id"]: + continue + + images = text.extract_iter( + tweet, 'data-image-url="', '"') + for data["num"], url in enumerate(images, 1): + text.nameext_from_url(url, data) + urls = [url + size for size in self.sizes] + yield Message.Urllist, urls, data + + if self.videos and "-videoContainer" in tweet: + data["num"] = 1 + url = "ytdl:{}/{}/status/{}".format( + self.root, data["user"], data["tweet_id"]) + yield Message.Url, url, data + + def metadata(self): + """Return general metadata""" + return {"user": self.user} + + def tweets(self): + """Yield HTML content of all relevant tweets""" + + def login(self): + username, password = self._get_auth_info() + if username: + self._update_cookies(self._login_impl(username, password)) + + @cache(maxage=360*24*3600, keyarg=1) + def _login_impl(self, username, password): + self.log.info("Logging in as %s", username) + + page = self.request(self.root + "/login").text + pos = page.index('name="authenticity_token"') + token = text.extract(page, 'value="', '"', pos-80)[0] + + url = self.root + "/sessions" + data = { + "session[username_or_email]": username, + "session[password]" : password, + "authenticity_token" : token, + "ui_metrics" : '{"rf":{},"s":""}', + "scribe_log" : "", + "redirect_after_login" : "", + "remember_me" : "1", + } + response = self.request(url, method="POST", data=data) + + if "/error" in response.url: + raise exception.AuthenticationError() + return self.session.cookies + + @staticmethod + def _data_from_tweet(tweet): + extr = text.extract_from(tweet) + return { + "tweet_id" : text.parse_int(extr('data-tweet-id="' , '"')), + "retweet_id": 
text.parse_int(extr('data-retweet-id="', '"')), + "retweeter" : extr('data-retweeter="' , '"'), + "user" : extr('data-screen-name="', '"'), + "username" : extr('data-name="' , '"'), + "user_id" : text.parse_int(extr('data-user-id="' , '"')), + "date" : text.parse_timestamp(extr('data-time="', '"')), + } + + def _tweets_from_api(self, url): + params = { + "include_available_features": "1", + "include_entities": "1", + "reset_error_state": "false", + "lang": "en", + } + headers = { + "X-Requested-With": "XMLHttpRequest", + "X-Twitter-Active-User": "yes", + "Referer": "{}/{}".format(self.root, self.user) + } + + while True: + data = self.request(url, params=params, headers=headers).json() + if "inner" in data: + data = data["inner"] + + for tweet in text.extract_iter( + data["items_html"], '<div class="tweet ', '\n</li>'): + yield tweet + + if not data["has_more_items"]: + return + + position = text.parse_int(text.extract( + tweet, 'data-tweet-id="', '"')[0]) + if "max_position" in params and position >= params["max_position"]: + return + params["max_position"] = position + + +class TwitterTimelineExtractor(TwitterExtractor): + """Extractor for all images from a user's timeline""" + subcategory = "timeline" + pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/([^/?&#]+)/?$") + test = ("https://twitter.com/supernaturepics", { + "range": "1-40", + "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", + "keyword": "7210d679606240405e0cf62cbc67596e81a7a250", + }) + + def tweets(self): + url = "{}/i/profiles/show/{}/timeline/tweets".format( + self.root, self.user) + return self._tweets_from_api(url) + + +class TwitterMediaExtractor(TwitterExtractor): + """Extractor for all images from a user's Media Tweets""" + subcategory = "media" + pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/([^/?&#]+)/media(?!\w)") + test = ("https://twitter.com/supernaturepics/media", { + "range": "1-40", + "url": "0106229d408f4111d9a52c8fd2ad687f64842aa4", + }) + + def tweets(self): + url = "{}/i/profiles/show/{}/media_timeline".format( + self.root, self.user) + return self._tweets_from_api(url) + + +class TwitterTweetExtractor(TwitterExtractor): + """Extractor for images from individual tweets""" + subcategory = "tweet" + pattern = (r"(?:https?://)?(?:www\.|mobile\.)?twitter\.com" + r"/([^/?&#]+)/status/(\d+)") + test = ( + ("https://twitter.com/supernaturepics/status/604341487988576256", { + "url": "0e801d2f98142dd87c3630ded9e4be4a4d63b580", + "keyword": "1b8afb93cc04a9f44d89173f8facc61c3a6caf91", + "content": "ab05e1d8d21f8d43496df284d31e8b362cd3bcab", + }), + # 4 images + ("https://twitter.com/perrypumas/status/894001459754180609", { + "url": "c8a262a9698cb733fb27870f5a8f75faf77d79f6", + "keyword": "43d98ab448193f0d4f30aa571a4b6bda9b6a5692", + }), + # video + ("https://twitter.com/perrypumas/status/1065692031626829824", { + "options": (("videos", True),), + "pattern": r"ytdl:https://twitter.com/perrypumas/status/\d+", + }), + ) + + def __init__(self, match): + TwitterExtractor.__init__(self, match) + self.tweet_id = match.group(2) + + def metadata(self): + return {"user": self.user, "tweet_id": self.tweet_id} + + def tweets(self): + url = "{}/{}/status/{}".format(self.root, self.user, self.tweet_id) + page = self.request(url).text + return (text.extract( + page, '<div class="tweet ', '<ul class="stats')[0],) diff --git a/gallery_dl/extractor/vanillarock.py b/gallery_dl/extractor/vanillarock.py new file mode 100644 index 0000000..687ce3c --- /dev/null +++ 
b/gallery_dl/extractor/vanillarock.py @@ -0,0 +1,95 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://vanilla-rock.com/""" + +from .common import Extractor, Message +from .. import text + + +class VanillarockExtractor(Extractor): + """Base class for vanillarock extractors""" + category = "vanillarock" + root = "https://vanilla-rock.com" + + def __init__(self, match): + Extractor.__init__(self, match) + self.path = match.group(1) + + +class VanillarockPostExtractor(VanillarockExtractor): + """Extractor for blogposts on vanilla-rock.com""" + subcategory = "post" + directory_fmt = ("{category}", "{path}") + filename_fmt = "{num:>02}.{extension}" + archive_fmt = "{filename}" + pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com" + r"(/(?!category/|tag/)[^/?&#]+)/?$") + test = ("https://vanilla-rock.com/mizuhashi_parsee-5", { + "url": "7fb9a4d18d9fa22d7295fee8d94ab5a7a52265dd", + "keyword": "b91df99b714e1958d9636748b1c81a07c3ef52c9", + }) + + def items(self): + extr = text.extract_from(self.request(self.root + self.path).text) + name = extr("<title>", "</title>") + + imgs = [] + while True: + img = extr('<div class="main-img">', '</div>') + if not img: + break + imgs.append(text.extract(img, 'href="', '"')[0]) + + data = { + "count": len(imgs), + "title": text.unescape(name.rpartition(" | ")[0]), + "path" : self.path.strip("/"), + "date" : text.parse_datetime(extr( + '<div class="date">', '</div>'), "%Y-%m-%d %H:%M"), + "tags" : text.split_html(extr( + '<div class="cat-tag">', '</div>'))[::2], + } + + yield Message.Version, 1 + yield Message.Directory, data + for data["num"], url in enumerate(imgs, 1): + yield Message.Url, url, text.nameext_from_url(url, data) + + +class VanillarockTagExtractor(VanillarockExtractor): + """Extractor for vanillarock blog posts by tag or category""" + subcategory = "tag" + pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com" + r"(/(?:tag|category)/[^?&#]+)") + test = ( + ("https://vanilla-rock.com/tag/%e5%b0%84%e5%91%bd%e4%b8%b8%e6%96%87", { + "pattern": VanillarockPostExtractor.pattern, + "count": ">= 12", + }), + (("https://vanilla-rock.com/category/%e4%ba%8c%e6%ac%a1%e3%82%a8%e3%83" + "%ad%e7%94%bb%e5%83%8f/%e8%90%8c%e3%81%88%e3%83%bb%e3%82%bd%e3%83%95" + "%e3%83%88%e3%82%a8%e3%83%ad"), { + "pattern": VanillarockPostExtractor.pattern, + "count": 3, + }), + ) + + def items(self): + url = self.root + self.path + data = {"_extractor": VanillarockPostExtractor} + + yield Message.Version, 1 + while url: + extr = text.extract_from(self.request(url).text) + while True: + post = extr('<h2 class="entry-title">', '</h2>') + if not post: + break + yield Message.Queue, text.extract(post, 'href="', '"')[0], data + url = text.unescape(extr('class="next page-numbers" href="', '"')) diff --git a/gallery_dl/extractor/wallhaven.py b/gallery_dl/extractor/wallhaven.py new file mode 100644 index 0000000..4326582 --- /dev/null +++ b/gallery_dl/extractor/wallhaven.py @@ -0,0 +1,148 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://wallhaven.cc/""" + +from .common import Extractor, Message +from .. 
import text + + +class WallhavenExtractor(Extractor): + """Base class for wallhaven extractors""" + category = "wallhaven" + filename_fmt = "{category}_{id}_{resolution}.{extension}" + root = "https://wallhaven.cc" + + def __init__(self, match): + Extractor.__init__(self, match) + self.api = WallhavenAPI(self) + + +class WallhavenSearchExtractor(WallhavenExtractor): + """Extractor for search results on wallhaven.cc""" + subcategory = "search" + directory_fmt = ("{category}", "{search[q]}") + archive_fmt = "s_{search[q]}_{id}" + pattern = r"(?:https?://)?wallhaven\.cc/search(?:/?\?([^/?#]+))?" + test = ( + ("https://wallhaven.cc/search?q=touhou"), + (("https://wallhaven.cc/search?q=id%3A87" + "&categories=111&purity=100&sorting=date_added&order=asc&page=3"), { + "count": 4, + "url": "d024bc11895d758b76ffdb0fa85a627e53f072cf", + }), + ) + + def __init__(self, match): + WallhavenExtractor.__init__(self, match) + self.params = text.parse_query(match.group(1)) + + def items(self): + yield Message.Version, 1 + yield Message.Directory, {"search": self.params} + for wp in self.api.search(self.params.copy()): + wp["search"] = self.params + yield Message.Url, wp["url"], wp + + +class WallhavenImageExtractor(WallhavenExtractor): + """Extractor for individual wallpaper on wallhaven.cc""" + subcategory = "image" + archive_fmt = "{id}" + pattern = (r"(?:https?://)?(?:wallhaven\.cc/w/|whvn\.cc/" + r"|w\.wallhaven\.cc/[a-z]+/\w\w/wallhaven-)(\w+)") + test = ( + ("https://wallhaven.cc/w/01w334", { + "pattern": "https://[^.]+.wallhaven.cc/full/01/[^-]+-01w334.jpg", + "content": "497212679383a465da1e35bd75873240435085a2", + "keyword": { + "id" : "01w334", + "width" : 1920, + "height" : 1200, + "resolution" : "1920x1200", + "ratio" : 1.6, + "colors" : list, + "tags" : list, + "file_size" : 278799, + "file_type" : "image/jpeg", + "purity" : "sfw", + "short_url" : "https://whvn.cc/01w334", + "source" : str, + "uploader" : { + "group" : "Owner/Developer", + "username" : "AksumkA", + }, + "date" : "type:datetime", + "wh_category": "anime", + "views" : int, + "favorites" : int, + }, + }), + # NSFW + ("https://wallhaven.cc/w/dge6v3", { + "url": "e4b802e70483f659d790ad5d0bd316245badf2ec", + }), + ("https://whvn.cc/01w334"), + ("https://w.wallhaven.cc/full/01/wallhaven-01w334.jpg"), + ) + + def __init__(self, match): + WallhavenExtractor.__init__(self, match) + self.wallpaper_id = match.group(1) + + def items(self): + data = self.api.info(self.wallpaper_id) + yield Message.Version, 1 + yield Message.Directory, data + yield Message.Url, data["url"], data + + +class WallhavenAPI(): + """Minimal interface to wallhaven's API""" + + def __init__(self, extractor): + self.extractor = extractor + + key = extractor.config("api-key") + if key is None: + key = "25HYZenXTICjzBZXzFSg98uJtcQVrDs2" + extractor.log.debug("Using default API Key") + else: + extractor.log.debug("Using custom API Key") + self.headers = {"X-API-Key": key} + + def info(self, wallpaper_id): + url = "https://wallhaven.cc/api/v1/w/" + wallpaper_id + return self._update(self._call(url)["data"]) + + def search(self, params): + url = "https://wallhaven.cc/api/v1/search" + while True: + data = self._call(url, params) + yield from map(self._update, data["data"]) + if data["meta"]["current_page"] >= data["meta"]["last_page"]: + return + params["page"] = data["meta"]["current_page"] + 1 + + def _call(self, url, params=None): + return self.extractor.request( + url, headers=self.headers, params=params).json() + + @staticmethod + def _update(wp): + width, _, height = 
wp["resolution"].partition("x") + wp["url"] = wp.pop("path") + if "tags" in wp: + wp["tags"] = [t["name"] for t in wp["tags"]] + wp["date"] = text.parse_datetime( + wp.pop("created_at"), "%Y-%m-%d %H:%M:%S") + wp["ratio"] = text.parse_float(wp["ratio"]) + wp["width"] = wp.pop("dimension_x") + wp["height"] = wp.pop("dimension_y") + wp["wh_category"] = wp["category"] + return text.nameext_from_url(wp["url"], wp) diff --git a/gallery_dl/extractor/warosu.py b/gallery_dl/extractor/warosu.py new file mode 100644 index 0000000..d353144 --- /dev/null +++ b/gallery_dl/extractor/warosu.py @@ -0,0 +1,108 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://warosu.org/""" + +from .common import Extractor, Message +from .. import text + + +class WarosuThreadExtractor(Extractor): + """Extractor for images from threads on warosu.org""" + category = "warosu" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread} - {title}") + filename_fmt = "{tim}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)" + test = ( + ("https://warosu.org/jp/thread/16656025", { + "url": "889d57246ed67e491e5b8f7f124e50ea7991e770", + "keyword": "c00ea4c5460c5986994f17bb8416826d42ca57c0", + }), + ("https://warosu.org/jp/thread/16658073", { + "url": "4500cf3184b067424fd9883249bd543c905fbecd", + "keyword": "7534edf4ec51891dbf44d775b73fbbefd52eec71", + "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c", + }), + ) + root = "https://warosu.org" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/thread/{}".format(self.root, self.board, self.thread) + page = self.request(url).text + data = self.get_metadata(page) + posts = self.posts(page) + + if not data["title"]: + title = text.remove_html(posts[0]["com"]) + data["title"] = text.unescape(title)[:50] + + yield Message.Version, 1 + yield Message.Directory, data + for post in posts: + if "image" in post: + for key in ("w", "h", "no", "time", "tim"): + post[key] = text.parse_int(post[key]) + post.update(data) + yield Message.Url, post["image"], post + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + boardname = text.extract(page, "<title>", "</title>")[0] + title = text.extract(page, 'filetitle" itemprop="name">', '<')[0] + return { + "board": self.board, + "board_name": boardname.rpartition(" - ")[2], + "thread": self.thread, + "title": title, + } + + def posts(self, page): + """Build a list of all post-objects""" + page = text.extract(page, '<div class="content">', '<table>')[0] + needle = '<table itemscope itemtype="http://schema.org/Comment">' + return [self.parse(post) for post in page.split(needle)] + + def parse(self, post): + """Build post-object by extracting data from an HTML post""" + data = self._extract_post(post) + if "<span>File:" in post: + self._extract_image(post, data) + part = data["image"].rpartition("/")[2] + data["tim"], _, data["extension"] = part.partition(".") + data["ext"] = "." 
+ data["extension"] + return data + + @staticmethod + def _extract_post(post): + data = text.extract_all(post, ( + ("no" , 'id="p', '"'), + ("name", '<span itemprop="name">', '</span>'), + ("time", '<span class="posttime" title="', '000">'), + ("now" , '', '<'), + ("com" , '<blockquote><p itemprop="text">', '</p></blockquote>'), + ))[0] + data["com"] = text.unescape(text.remove_html(data["com"].strip())) + return data + + @staticmethod + def _extract_image(post, data): + text.extract_all(post, ( + ("fsize" , '<span>File: ', ', '), + ("w" , '', 'x'), + ("h" , '', ', '), + ("filename", '', '<'), + ("image" , '<br />\n<a href="', '"'), + ), 0, data) + data["filename"] = text.unquote(data["filename"].rpartition(".")[0]) + data["image"] = "https:" + data["image"] diff --git a/gallery_dl/extractor/weibo.py b/gallery_dl/extractor/weibo.py new file mode 100644 index 0000000..7a4ee8f --- /dev/null +++ b/gallery_dl/extractor/weibo.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.weibo.com/""" + +from .common import Extractor, Message +from .. import text +import json + + +class WeiboExtractor(Extractor): + category = "weibo" + directory_fmt = ("{category}", "{user[screen_name]}") + filename_fmt = "{status[id]}_{num:>02}.{extension}" + archive_fmt = "{status[id]}_{num}" + root = "https://m.weibo.cn" + + def __init__(self, match): + Extractor.__init__(self, match) + self.retweets = self.config("retweets", True) + + def items(self): + first = True + + for status in self.statuses(): + + obj = status + num = 1 + + if first: + yield Message.Version, 1 + yield Message.Directory, status + first = False + + while True: + + if "pics" in obj: + for image in obj["pics"]: + pid = image["pid"] + if "large" in image: + image = image["large"] + data = text.nameext_from_url(image["url"], { + "num": num, + "pid": pid, + "width": text.parse_int(image["geo"]["width"]), + "height": text.parse_int(image["geo"]["height"]), + "status": status, + }) + yield Message.Url, image["url"], data + num += 1 + + if "page_info" in obj and "media_info" in obj["page_info"]: + info = obj["page_info"]["media_info"] + url = info.get("stream_url_hd") or info["stream_url"] + data = text.nameext_from_url(url, { + "num": num, + "url": url, + "width": 0, + "height": 0, + "status": status, + }) + yield Message.Url, url, data + + if self.retweets and "retweeted_status" in obj: + obj = obj["retweeted_status"] + else: + break + + def statuses(self): + """Returns an iterable containing all relevant 'status' objects""" + + +class WeiboUserExtractor(WeiboExtractor): + """Extractor for all images of a user on weibo.cn""" + subcategory = "user" + pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)" + r"/(?:u|p(?:rofile)?)/(\d+)") + test = ( + ("https://m.weibo.cn/u/2314621010", { + "range": "1-30", + }), + ("https://m.weibo.cn/profile/2314621010"), + ("https://m.weibo.cn/p/2304132314621010_-_WEIBO_SECOND_PROFILE_WEIBO"), + ("https://www.weibo.com/p/1003062314621010/home"), + ) + + def __init__(self, match): + WeiboExtractor.__init__(self, match) + self.user_id = match.group(1) + + def statuses(self): + url = self.root + "/api/container/getIndex" + params = {"page": 1, "containerid": "107603" + self.user_id[-10:]} + + while True: + data = self.request(url, params=params).json() + + for card in 
data["data"]["cards"]: + if "mblog" in card: + yield card["mblog"] + + if len(data["data"]["cards"]) < 5: + return + params["page"] += 1 + + +class WeiboStatusExtractor(WeiboExtractor): + """Extractor for images from a status on weibo.cn""" + subcategory = "status" + pattern = (r"(?:https?://)?(?:www\.|m\.)?weibo\.c(?:om|n)" + r"/(?:detail|status|\d+)/(\d+)") + test = ( + ("https://m.weibo.cn/detail/4323047042991618", { + "pattern": r"https?://wx\d+.sinaimg.cn/large/\w+.jpg", + }), + ("https://m.weibo.cn/detail/4339748116375525", { + "pattern": r"https?://f.us.sinaimg.cn/\w+\.mp4\?label=mp4_hd", + }), + ("https://m.weibo.cn/status/4339748116375525"), + ("https://m.weibo.cn/5746766133/4339748116375525"), + ) + + def __init__(self, match): + WeiboExtractor.__init__(self, match) + self.status_id = match.group(1) + + def statuses(self): + url = "{}/detail/{}".format(self.root, self.status_id) + page = self.request(url).text + data = json.loads(text.extract( + page, " var $render_data = [", "][0] || {};")[0]) + return (data["status"],) diff --git a/gallery_dl/extractor/wikiart.py b/gallery_dl/extractor/wikiart.py new file mode 100644 index 0000000..b9c223c --- /dev/null +++ b/gallery_dl/extractor/wikiart.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://www.wikiart.org/""" + +from .common import Extractor, Message +from .. import text + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?wikiart\.org/([a-z]+)" + + +class WikiartExtractor(Extractor): + """Base class for wikiart extractors""" + category = "wikiart" + filename_fmt = "{id}_{title}.{extension}" + archive_fmt = "{id}" + root = "https://www.wikiart.org" + + def __init__(self, match): + Extractor.__init__(self, match) + self.lang = match.group(1) + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for painting in self.paintings(): + url = painting["image"] + painting.update(data) + yield Message.Url, url, text.nameext_from_url(url, painting) + + def metadata(self): + """Return a dict with general metadata""" + + def paintings(self): + """Return an iterable containing all relevant 'painting' objects""" + + def _pagination(self, url, extra_params=None, key="Paintings"): + headers = { + "X-Requested-With": "XMLHttpRequest", + "Referer": url, + } + params = { + "json": "2", + "layout": "new", + "page": 1, + "resultType": "masonry", + } + if extra_params: + params.update(extra_params) + + while True: + data = self.request(url, headers=headers, params=params).json() + items = data.get(key) + if not items: + return + yield from items + params["page"] += 1 + + +class WikiartArtistExtractor(WikiartExtractor): + """Extractor for an artist's paintings on wikiart.org""" + subcategory = "artist" + directory_fmt = ("{category}", "{artist[artistName]}") + pattern = BASE_PATTERN + r"/(?!\w+-by-)([\w-]+)" + test = ("https://www.wikiart.org/en/thomas-cole", { + "url": "f1eee8158f5b8b7380382ab730a8f53884715c8b", + "keyword": "b62678394ce645815963883d5c9642255307225f", + }) + + def __init__(self, match): + WikiartExtractor.__init__(self, match) + self.artist = match.group(2) + + def metadata(self): + url = "{}/{}/{}?json=2".format(self.root, self.lang, self.artist) + return {"artist": self.request(url).json()} + + def paintings(self): + url = 
"{}/{}/{}/mode/all-paintings".format( + self.root, self.lang, self.artist) + return self._pagination(url) + + +class WikiartArtworksExtractor(WikiartExtractor): + """Extractor for artwork collections on wikiart.org""" + subcategory = "artworks" + directory_fmt = ("{category}", "Artworks by {group!c}", "{type}") + pattern = BASE_PATTERN + r"/paintings-by-([\w-]+)/([\w-]+)" + test = ("https://www.wikiart.org/en/paintings-by-media/grisaille", { + "url": "f92d55669fa949491c26a5437527adb14b35b8cc", + }) + + def __init__(self, match): + WikiartExtractor.__init__(self, match) + self.group = match.group(2) + self.type = match.group(3) + + def metadata(self): + return {"group": self.group, "type": self.type} + + def paintings(self): + url = "{}/{}/paintings-by-{}/{}".format( + self.root, self.lang, self.group, self.type) + return self._pagination(url) + + +class WikiartArtistsExtractor(WikiartExtractor): + """Extractor for artist collections on wikiart.org""" + subcategory = "artists" + pattern = (BASE_PATTERN + r"/artists-by-([\w-]+)/([\w-]+)") + test = ("https://www.wikiart.org/en/artists-by-century/12", { + "pattern": WikiartArtistExtractor.pattern, + "count": 7, + }) + + def __init__(self, match): + WikiartExtractor.__init__(self, match) + self.group = match.group(2) + self.type = match.group(3) + + def items(self): + url = "{}/{}/App/Search/Artists-by-{}".format( + self.root, self.lang, self.group) + params = {"json": "3", "searchterm": self.type} + + for artist in self._pagination(url, params, "Artists"): + artist["_extractor"] = WikiartArtistExtractor + yield Message.Queue, self.root + artist["artistUrl"], artist diff --git a/gallery_dl/extractor/xhamster.py b/gallery_dl/extractor/xhamster.py new file mode 100644 index 0000000..9699806 --- /dev/null +++ b/gallery_dl/extractor/xhamster.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://xhamster.com/""" + +from .common import Extractor, Message +from .. 
import text +import json + + +BASE_PATTERN = r"(?:https?://)?(?:[^.]+\.)?xhamster\.(?:com|one|desi)" + + +class XhamsterExtractor(Extractor): + """Base class for xhamster extractors""" + category = "xhamster" + root = "https://xhamster.com" + + +class XhamsterGalleryExtractor(XhamsterExtractor): + """Extractor for image galleries on xhamster.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user[name]}", + "{gallery[id]} {gallery[title]}") + filename_fmt = "{num:>03}_{id}.{extension}" + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"(/photos/gallery/[^/?&#]+)" + test = ( + ("https://xhamster.com/photos/gallery/11748968", { + "pattern": r"https://thumb-p\d+.xhcdn.com/./[\w/-]+_1000.jpg$", + "count": ">= 144", + "keyword": { + "comments": int, + "count": int, + "favorite": bool, + "id": int, + "num": int, + "height": int, + "width": int, + "imageURL": str, + "pageURL": str, + "thumbURL": str, + "gallery": { + "date": "type:datetime", + "description": "", + "dislikes": int, + "id": 11748968, + "likes": int, + "tags": ["NON-Porn"], + "thumbnail": str, + "title": "Make the world better.", + "views": int, + }, + "user": { + "id": 16874672, + "name": "Anonymousrants", + "retired": bool, + "subscribers": int, + "url": "https://xhamster.com/users/anonymousrants", + "verified": bool, + }, + }, + }), + ("https://xhamster.com/photos/gallery/make-the-world-better-11748968"), + ("https://xhamster.com/photos/gallery/11748968"), + ("https://xhamster.one/photos/gallery/11748968"), + ("https://xhamster.desi/photos/gallery/11748968"), + ("https://en.xhamster.com/photos/gallery/11748968"), + ) + + def __init__(self, match): + XhamsterExtractor.__init__(self, match) + self.path = match.group(1) + self.data = None + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for num, image in enumerate(self.images(), 1): + url = image["imageURL"] + image.update(data) + image["num"] = num + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + self.data = self._data(self.root + self.path) + user = self.data["authorModel"] + imgs = self.data["photosGalleryModel"] + + return { + "user": + { + "id" : text.parse_int(user["id"]), + "url" : user["pageURL"], + "name" : user["name"], + "retired" : user["retired"], + "verified" : user["verified"], + "subscribers": user["subscribers"], + }, + "gallery": + { + "id" : text.parse_int(imgs["id"]), + "tags" : [c["name"] for c in imgs["categories"]], + "date" : text.parse_timestamp(imgs["created"]), + "views" : text.parse_int(imgs["views"]), + "likes" : text.parse_int(imgs["rating"]["likes"]), + "dislikes" : text.parse_int(imgs["rating"]["dislikes"]), + "title" : imgs["title"], + "description": imgs["description"], + "thumbnail" : imgs["thumbURL"], + }, + "count": text.parse_int(imgs["quantity"]), + } + + def images(self): + data = self.data + self.data = None + + while True: + for image in data["photosGalleryModel"]["photos"]: + del image["modelName"] + yield image + + pgntn = data["pagination"] + if pgntn["active"] == pgntn["maxPage"]: + return + url = pgntn["pageLinkTemplate"][:-3] + str(pgntn["next"]) + data = self._data(url) + + def _data(self, url): + page = self.request(url).text + return json.loads(text.extract( + page, "window.initials =", "</script>")[0].rstrip("\n\r;")) + + +class XhamsterUserExtractor(XhamsterExtractor): + """Extractor for all galleries of an xhamster user""" + subcategory = "user" + pattern = BASE_PATTERN + r"/users/([^/?&#]+)(?:/photos)?/?(?:$|[?#])" + 
test = ( + ("https://xhamster.com/users/nickname68/photos", { + "pattern": XhamsterGalleryExtractor.pattern, + "count": 50, + "range": "1-50", + }), + ("https://xhamster.com/users/nickname68"), + ) + + def __init__(self, match): + XhamsterExtractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + yield Message.Version, 1 + url = "{}/users/{}/photos".format(self.root, self.user) + data = {"_extractor": XhamsterGalleryExtractor} + + while url: + extr = text.extract_from(self.request(url).text) + while True: + url = extr('thumb-image-container" href="', '"') + if not url: + break + yield Message.Queue, url, data + url = extr('data-page="next" href="', '"') diff --git a/gallery_dl/extractor/xvideos.py b/gallery_dl/extractor/xvideos.py new file mode 100644 index 0000000..7eec18b --- /dev/null +++ b/gallery_dl/extractor/xvideos.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- + +# Copyright 2017-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://www.xvideos.com/""" + +from .common import Extractor, Message +from .. import text, exception +import json + + +class XvideosExtractor(Extractor): + """Base class for xvideos extractors""" + category = "xvideos" + root = "https://www.xvideos.com" + + def get_page(self, url, codes=(403, 404)): + response = self.request(url, expect=codes) + if response.status_code in codes: + raise exception.NotFoundError(self.subcategory) + return response.text + + +class XvideosGalleryExtractor(XvideosExtractor): + """Extractor for user profile galleries from xvideos.com""" + subcategory = "gallery" + directory_fmt = ("{category}", "{user[name]}", "{title}") + filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}" + archive_fmt = "{gallery_id}_{num}" + pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" + r"/profiles/([^/?&#]+)/photos/(\d+)") + test = ( + (("https://www.xvideos.com/profiles" + "/pervertedcouple/photos/751031/random_stuff"), { + "url": "4f0d992e5dc39def2c3ac8e099d17bf09e76e3c7", + "keyword": "8d637b372c6231cc4ada92dd5918db5fdbd06520", + }), + ("https://www.xvideos.com/profiles/pervertedcouple/photos/751032/", { + "exception": exception.NotFoundError, + }), + ) + + def __init__(self, match): + XvideosExtractor.__init__(self, match) + self.user, self.gid = match.groups() + + def items(self): + url = "{}/profiles/{}/photos/{}".format(self.root, self.user, self.gid) + page = self.get_page(url) + data = self.get_metadata(page) + imgs = self.get_images(page) + data["count"] = len(imgs) + yield Message.Version, 1 + yield Message.Directory, data + for url in imgs: + data["num"] = text.parse_int(url.rsplit("_", 2)[1]) + data["extension"] = url.rpartition(".")[2] + yield Message.Url, url, data + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + data = text.extract_all(page, ( + ("userid" , '"id_user":', ','), + ("display", '"display":"', '"'), + ("title" , '"title":"', '"'), + ("descr" , '<small class="mobile-hide">', '</small>'), + ("tags" , '<em>Tagged:</em>', '<'), + ))[0] + + return { + "user": { + "id": text.parse_int(data["userid"]), + "name": self.user, + "display": data["display"], + "description": data["descr"].strip(), + }, + "tags": text.unescape(data["tags"] or "").strip().split(", "), + "title": text.unescape(data["title"]), + "gallery_id": text.parse_int(self.gid), + } + + @staticmethod + def 
get_images(page): + """Return a list of all image urls for this gallery""" + return list(text.extract_iter( + page, '<a class="embed-responsive-item" href="', '"')) + + +class XvideosUserExtractor(XvideosExtractor): + """Extractor for user profiles from xvideos.com""" + subcategory = "user" + categorytransfer = True + pattern = (r"(?:https?://)?(?:www\.)?xvideos\.com" + r"/profiles/([^/?&#]+)/?(?:#.*)?$") + test = ( + ("https://www.xvideos.com/profiles/pervertedcouple", { + "url": "a413f3e60d6d3a2de79bd44fa3b7a9c03db4336e", + "keyword": "a796760d34732adc7ec52a8feb057515209a2ca6", + }), + ("https://www.xvideos.com/profiles/niwehrwhernvh", { + "exception": exception.NotFoundError, + }), + ("https://www.xvideos.com/profiles/pervertedcouple#_tabPhotos"), + ) + + def __init__(self, match): + XvideosExtractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + url = "{}/profiles/{}".format(self.root, self.user) + page = self.get_page(url) + data = json.loads(text.extract( + page, "xv.conf=", ";</script>")[0])["data"] + + if not isinstance(data["galleries"], dict): + return + if "0" in data["galleries"]: + del data["galleries"]["0"] + + galleries = [ + { + "gallery_id": text.parse_int(gid), + "title": text.unescape(gdata["title"]), + "count": gdata["nb_pics"], + "_extractor": XvideosGalleryExtractor, + } + for gid, gdata in data["galleries"].items() + ] + galleries.sort(key=lambda x: x["gallery_id"]) + + yield Message.Version, 1 + for gallery in galleries: + url = "https://www.xvideos.com/profiles/{}/photos/{}".format( + self.user, gallery["gallery_id"]) + yield Message.Queue, url, gallery diff --git a/gallery_dl/extractor/yandere.py b/gallery_dl/extractor/yandere.py new file mode 100644 index 0000000..623e7a8 --- /dev/null +++ b/gallery_dl/extractor/yandere.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- + +# Copyright 2015-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extract images from https://yande.re/""" + +from . 
import booru + + +class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor): + """Base class for yandere extractors""" + category = "yandere" + api_url = "https://yande.re/post.json" + post_url = "https://yande.re/post/show/{}" + + +class YandereTagExtractor(booru.TagMixin, YandereExtractor): + """Extractor for images from yande.re based on search-tags""" + pattern = (r"(?:https?://)?(?:www\.)?yande\.re" + r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)") + test = ("https://yande.re/post?tags=ouzoku+armor", { + "content": "59201811c728096b2d95ce6896fd0009235fe683", + }) + + +class YanderePoolExtractor(booru.PoolMixin, YandereExtractor): + """Extractor for image-pools from yande.re""" + pattern = r"(?:https?://)?(?:www\.)?yande\.re/pool/show/(?P<pool>\d+)" + test = ("https://yande.re/pool/show/318", { + "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68", + }) + + +class YanderePostExtractor(booru.PostMixin, YandereExtractor): + """Extractor for single images from yande.re""" + pattern = r"(?:https?://)?(?:www\.)?yande\.re/post/show/(?P<post>\d+)" + test = ("https://yande.re/post/show/51824", { + "content": "59201811c728096b2d95ce6896fd0009235fe683", + "options": (("tags", True),), + "keyword": { + "tags_artist": "sasaki_tamaru", + "tags_circle": "softhouse_chara", + "tags_copyright": "ouzoku", + "tags_general": str, + }, + }) + + +class YanderePopularExtractor(booru.MoebooruPopularMixin, YandereExtractor): + """Extractor for popular images from yande.re""" + pattern = (r"(?:https?://)?(?:www\.)?yande\.re" + r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)" + r"(?:\?(?P<query>[^#]*))?") + test = ( + ("https://yande.re/post/popular_by_month?month=6&year=2014", { + "count": 40, + }), + ("https://yande.re/post/popular_recent"), + ) + + def __init__(self, match): + super().__init__(match) + self.api_url = "https://yande.re/post/popular_{scale}.json".format( + scale=self.scale) diff --git a/gallery_dl/extractor/yaplog.py b/gallery_dl/extractor/yaplog.py new file mode 100644 index 0000000..b3c5501 --- /dev/null +++ b/gallery_dl/extractor/yaplog.py @@ -0,0 +1,109 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://yaplog.jp/""" + +from .common import Extractor, Message, AsynchronousMixin +from .. 
import text, util + + +class YaplogExtractor(AsynchronousMixin, Extractor): + """Base class for yaplog extractors""" + category = "yaplog" + root = "https://yaplog.jp" + filename_fmt = "{post[id]}_{post[title]}_{id}.{extension}" + directory_fmt = ("{category}", "{post[user]}") + archive_fmt = "{post[user]}_{id}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.user = match.group(1) + + def items(self): + yield Message.Version, 1 + for post, urls in self.posts(): + yield Message.Directory, {"post": post} + for num, url in enumerate(urls, 1): + page = self.request(url).text if num > 1 else url + iurl = text.extract(page, '<img src="', '"')[0] + iid, _, ext = iurl.rpartition("/")[2].rpartition(".") + image = { + "url" : iurl, + "num" : num, + "id" : text.parse_int(iid.partition("_")[0]), + "extension": ext, + "post" : post, + } + yield Message.Url, iurl, image + + def posts(self): + """Return an iterable with (data, image page URLs) tuples""" + + def _parse_post(self, url): + page = self.request(url).text + title, pos = text.extract(page, 'class="title">', '<') + date , pos = text.extract(page, 'class="date">' , '<', pos) + pid , pos = text.extract(page, '/archive/' , '"', pos) + prev , pos = text.extract(page, 'class="last"><a href="', '"', pos) + + urls = list(text.extract_iter(page, '<li><a href="', '"', pos)) + urls[0] = page # cache HTML of first page + + if len(urls) == 24 and text.extract(page, '(1/', ')')[0] != '24': + # there are a maximum of 24 image entries in an /image/ page + # -> search /archive/ page for the rest + url = "{}/{}/archive/{}".format(self.root, self.user, pid) + page = self.request(url).text + + base = "{}/{}/image/{}/".format(self.root, self.user, pid) + for part in util.advance(text.extract_iter( + page, base, '"', pos), 24): + urls.append(base + part) + + return prev, urls, { + "id" : text.parse_int(pid), + "title": text.unescape(title[:-3]), + "user" : self.user, + "date" : date, + } + + +class YaplogBlogExtractor(YaplogExtractor): + """Extractor for a user's blog on yaplog.jp""" + subcategory = "blog" + pattern = r"(?:https?://)?(?:www\.)?yaplog\.jp/(\w+)/?(?:$|[?&#])" + test = ("https://yaplog.jp/omitakashi3", { + "pattern": r"https://img.yaplog.jp/img/18/pc/o/m/i/omitakashi3/0/", + "count": ">= 2", + }) + + def posts(self): + url = "{}/{}/image/".format(self.root, self.user) + while url: + url, images, data = self._parse_post(url) + yield data, images + + +class YaplogPostExtractor(YaplogExtractor): + """Extractor for images from a blog post on yaplog.jp""" + subcategory = "post" + pattern = (r"(?:https?://)?(?:www\.)?yaplog\.jp" + r"/(\w+)/(?:archive|image)/(\d+)") + test = ("https://yaplog.jp/imamiami0726/image/1299", { + "url": "896cae20fa718735a57e723c48544e830ff31345", + "keyword": "f8d8781e61c4c38238a7622d6df6c905f864e5d3", + }) + + def __init__(self, match): + YaplogExtractor.__init__(self, match) + self.post_id = match.group(2) + + def posts(self): + url = "{}/{}/image/{}".format(self.root, self.user, self.post_id) + _, images, data = self._parse_post(url) + return ((data, images),) diff --git a/gallery_dl/extractor/yuki.py b/gallery_dl/extractor/yuki.py new file mode 100644 index 0000000..0844c40 --- /dev/null +++ b/gallery_dl/extractor/yuki.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- + +# Copyright 2018-2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. 
+ +"""Extract images from https://yuki.la/""" + +from .common import Extractor, Message +from .. import text + + +class YukiThreadExtractor(Extractor): + """Extractor for images from threads on yuki.la""" + category = "yuki" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", "{thread}{title:? - //}") + filename_fmt = "{time}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{tim}" + pattern = r"(?:https?://)?yuki\.la/([^/?&#]+)/(\d+)" + test = ( + ("https://yuki.la/gd/309639", { + "url": "289e86c5caf673a2515ec5f5f521ac0ae7e189e9", + "keyword": "01cbe29ae207a5cb7556bcbd5ed481ecdaf32727", + "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573", + }), + ("https://yuki.la/a/159767162", { + "url": "cd94d0eb646d279c3b7efb9b7898888e5d44fa93", + "keyword": "7a4ff90e423c74bd3126fb65d13015decec2fa45", + }), + # old thread - missing board name in title and multi-line HTML + ("https://yuki.la/gif/6877752", { + "url": "3dbb2f8453490d002416c5fc2fe95b56c129faf9", + "keyword": "563ef4ae80134d845dddaed7ebe56f5fc41056be", + }), + # even older thread - no thread title + ("https://yuki.la/a/9357051", { + "url": "010560bf254bd485e48366c3531728bda4b22583", + "keyword": "7b736c41e307dcfcb84ef495f29299a6ddd06d67", + }), + ) + root = "https://yuki.la" + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/{}/{}".format(self.root, self.board, self.thread) + page = self.request(url).text + data = self.get_metadata(page) + + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(page): + if "image" in post: + for key in ("w", "h", "no", "time"): + post[key] = text.parse_int(post[key]) + post.update(data) + yield Message.Url, post["image"], post + + def get_metadata(self, page): + """Collect metadata for extractor-job""" + title = text.extract(page, "<title>", "</title>")[0] + try: + title, boardname, _ = title.rsplit(" - ", 2) + except ValueError: + title = boardname = "" + else: + title = title.partition(" - ")[2] + if not title: + title, boardname = boardname, "" + return { + "board": self.board, + "board_name": boardname, + "thread": text.parse_int(self.thread), + "title": text.unescape(title), + } + + def posts(self, page): + """Build a list of all post-objects""" + return [ + self.parse(post) for post in text.extract_iter( + page, '<div class="postContainer', '</blockquote>') + ] + + def parse(self, post): + """Build post-object by extracting data from an HTML post""" + data = self._extract_post(post) + if 'class="file"' in post: + self._extract_image(post, data) + part = data["image"].rpartition("/")[2] + data["tim"], _, data["extension"] = part.partition(".") + data["ext"] = "." 
+ data["extension"] + return data + + @staticmethod + def _extract_post(post): + data, pos = text.extract_all(post, ( + ("no" , 'id="pc', '"'), + ("name", '<span class="name">', '</span>'), + ("time", 'data-utc="', '"'), + ("now" , '>', ' <'), + )) + data["com"] = text.unescape(text.remove_html( + post[post.index("<blockquote ", pos):].partition(">")[2])) + return data + + @staticmethod + def _extract_image(post, data): + text.extract_all(post, ( + (None , '>File:', ''), + ("fullname", '<a title="', '"'), + ("image" , 'href="', '"'), + ("filename", '>', '<'), + ("fsize" , '(', ', '), + ("w" , '', 'x'), + ("h" , '', ')'), + ), 0, data) + filename = data["fullname"] or data["filename"] + data["filename"] = text.unescape(filename.rpartition(".")[0]) + data["image"] = "https:" + data["image"] + del data["fullname"] |
