diff options
| author | 2019-07-02 04:33:45 -0400 | |
|---|---|---|
| committer | 2019-07-02 04:33:45 -0400 | |
| commit | 195c45911e79c33cf0bb986721365fb06df5a153 (patch) | |
| tree | ac0c9b6ef40bea7aa7ab0c5c3cb500eb510668fa /gallery_dl/extractor/reactor.py | |
Import Upstream version 1.8.7upstream/1.8.7
Diffstat (limited to 'gallery_dl/extractor/reactor.py')
| -rw-r--r-- | gallery_dl/extractor/reactor.py | 338 |
1 files changed, 338 insertions, 0 deletions
diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py new file mode 100644 index 0000000..59d502a --- /dev/null +++ b/gallery_dl/extractor/reactor.py @@ -0,0 +1,338 @@ +# -*- coding: utf-8 -*- + +# Copyright 2019 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Generic extractors for *reactor sites""" + +from .common import Extractor, Message, SharedConfigMixin +from .. import text +import urllib.parse +import random +import time +import json + + +BASE_PATTERN = r"(?:https?://)?([^/.]+\.reactor\.cc)" + + +class ReactorExtractor(SharedConfigMixin, Extractor): + """Base class for *reactor.cc extractors""" + basecategory = "reactor" + filename_fmt = "{post_id}_{num:>02}{title[:100]:?_//}.{extension}" + archive_fmt = "{post_id}_{num}" + + def __init__(self, match): + Extractor.__init__(self, match) + self.root = "http://" + match.group(1) + self.session.headers["Referer"] = self.root + + self.wait_min = self.config("wait-min", 3) + self.wait_max = self.config("wait-max", 6) + if self.wait_max < self.wait_min: + self.wait_max = self.wait_min + + if not self.category: + # set category based on domain name + netloc = urllib.parse.urlsplit(self.root).netloc + self.category = netloc.rpartition(".")[0] + + def items(self): + data = self.metadata() + yield Message.Version, 1 + yield Message.Directory, data + for post in self.posts(): + for image in self._parse_post(post): + url = image["url"] + image.update(data) + yield Message.Url, url, text.nameext_from_url(url, image) + + def metadata(self): + """Collect metadata for extractor-job""" + return {} + + def posts(self): + """Return all relevant post-objects""" + return self._pagination(self.url) + + def _pagination(self, url): + while True: + time.sleep(random.uniform(self.wait_min, self.wait_max)) + + response = self.request(url) + if response.history: + # sometimes there is a redirect from + # the last page of a listing (.../tag/<tag>/1) + # to the first page (.../tag/<tag>) + # which could cause an endless loop + cnt_old = response.history[0].url.count("/") + cnt_new = response.url.count("/") + if cnt_old == 5 and cnt_new == 4: + return + page = response.text + + yield from text.extract_iter( + page, '<div class="uhead">', '<div class="ufoot">') + + try: + pos = page.index("class='next'") + pos = page.rindex("class='current'", 0, pos) + url = self.root + text.extract(page, "href='", "'", pos)[0] + except (ValueError, TypeError): + return + + def _parse_post(self, post): + post, _, script = post.partition('<script type="application/ld+json">') + images = text.extract_iter(post, '<div class="image">', '</div>') + script = script[:script.index("</")].strip() + + try: + data = json.loads(script) + except ValueError: + try: + # remove control characters and escape backslashes + mapping = dict.fromkeys(range(32)) + script = script.translate(mapping).replace("\\", "\\\\") + data = json.loads(script) + except ValueError as exc: + self.log.warning("Unable to parse JSON data: %s", exc) + return + + num = 0 + date = text.parse_datetime(data["datePublished"]) + user = data["author"]["name"] + description = text.unescape(data["description"]) + title, _, tags = text.unescape(data["headline"]).partition(" / ") + post_id = text.parse_int( + data["mainEntityOfPage"]["@id"].rpartition("/")[2]) + + if not tags: + title, tags = tags, title + tags = tags.split(" :: ") + + for image in images: + url = text.extract(image, ' src="', '"')[0] + if not url: + continue + width = text.extract(image, ' width="', '"')[0] + height = text.extract(image, ' height="', '"')[0] + image_id = url.rpartition("-")[2].partition(".")[0] + num += 1 + + if image.startswith("<iframe "): # embed + url = "ytdl:" + text.unescape(url) + elif "/post/webm/" not in url and "/post/mp4/" not in url: + url = url.replace("/post/", "/post/full/") + + yield { + "url": url, + "post_id": post_id, + "image_id": text.parse_int(image_id), + "width": text.parse_int(width), + "height": text.parse_int(height), + "title": title, + "description": description, + "tags": tags, + "date": date, + "user": user, + "num": num, + } + + +class ReactorTagExtractor(ReactorExtractor): + """Extractor for tag searches on *reactor.cc sites""" + subcategory = "tag" + directory_fmt = ("{category}", "{search_tags}") + archive_fmt = "{search_tags}_{post_id}_{num}" + pattern = BASE_PATTERN + r"/tag/([^/?&#]+)" + test = ("http://anime.reactor.cc/tag/Anime+Art",) + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.tag = match.group(2) + + def metadata(self): + return {"search_tags": text.unescape(self.tag).replace("+", " ")} + + +class ReactorSearchExtractor(ReactorTagExtractor): + """Extractor for search results on *reactor.cc sites""" + subcategory = "search" + directory_fmt = ("{category}", "search", "{search_tags}") + archive_fmt = "s_{search_tags}_{post_id}_{num}" + pattern = BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + test = ("http://anime.reactor.cc/search?q=Art",) + + +class ReactorUserExtractor(ReactorExtractor): + """Extractor for all posts of a user on *reactor.cc sites""" + subcategory = "user" + directory_fmt = ("{category}", "user", "{user}") + pattern = BASE_PATTERN + r"/user/([^/?&#]+)" + test = ("http://anime.reactor.cc/user/Shuster",) + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.user = match.group(2) + + def metadata(self): + return {"user": text.unescape(self.user).replace("+", " ")} + + +class ReactorPostExtractor(ReactorExtractor): + """Extractor for single posts on *reactor.cc sites""" + subcategory = "post" + pattern = BASE_PATTERN + r"/post/(\d+)" + test = ("http://anime.reactor.cc/post/3576250",) + + def __init__(self, match): + ReactorExtractor.__init__(self, match) + self.post_id = match.group(2) + + def items(self): + yield Message.Version, 1 + post = self.request(self.url).text + pos = post.find('class="uhead">') + for image in self._parse_post(post[pos:]): + if image["num"] == 1: + yield Message.Directory, image + url = image["url"] + yield Message.Url, url, text.nameext_from_url(url, image) + + +# -------------------------------------------------------------------- +# JoyReactor + +JR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(joyreactor\.c(?:c|om))" + + +class JoyreactorTagExtractor(ReactorTagExtractor): + """Extractor for tag searches on joyreactor.cc""" + category = "joyreactor" + pattern = JR_BASE_PATTERN + r"/tag/([^/?&#]+)" + test = ( + ("http://joyreactor.cc/tag/Advent+Cirno", { + "count": ">= 17", + }), + ("http://joyreactor.com/tag/Cirno", { + "url": "de1e60c15bfb07a0e9603b00dc3d05f60edc7914", + }), + ) + + +class JoyreactorSearchExtractor(ReactorSearchExtractor): + """Extractor for search results on joyreactor.cc""" + category = "joyreactor" + pattern = JR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + test = ( + ("http://joyreactor.cc/search/Cirno+Gifs", { + "range": "1-25", + "count": ">= 20", + }), + ("http://joyreactor.com/search?q=Cirno+Gifs", { + "count": 0, # no search results on joyreactor.com + }), + ) + + +class JoyreactorUserExtractor(ReactorUserExtractor): + """Extractor for all posts of a user on joyreactor.cc""" + category = "joyreactor" + pattern = JR_BASE_PATTERN + r"/user/([^/?&#]+)" + test = ( + ("http://joyreactor.cc/user/hemantic"), + ("http://joyreactor.com/user/Tacoman123", { + "url": "452cd0fa23e2ad0e122c296ba75aa7f0b29329f6", + }), + ) + + +class JoyreactorPostExtractor(ReactorPostExtractor): + """Extractor for single posts on joyreactor.cc""" + category = "joyreactor" + pattern = JR_BASE_PATTERN + r"/post/(\d+)" + test = ( + ("http://joyreactor.com/post/3721876", { # single image + "url": "6ce09f239d8b7fdf6dd1664c2afc39618cc87663", + "keyword": "966d2acd462732a9ed823a9db5ed19f95734fd10", + }), + ("http://joyreactor.com/post/3713804", { # 4 images + "url": "f08ac8493ca0619a3e3c6bedb8d8374af3eec304", + "keyword": "84e34d402342607045a65fab6d4d593d146c238a", + }), + ("http://joyreactor.com/post/3726210", { # gif / video + "url": "33a48e1eca6cb2d298fbbb6536b3283799d6515b", + "keyword": "dbe148d576f2fc9431020c557ddb78f449e48c47", + }), + ("http://joyreactor.com/post/3668724", { # youtube embed + "url": "be2589e2e8f3ffcaf41b34bc28bfad850ccea34a", + "keyword": "da61b9e2887db95759950df5fb89c9d32f8e7651", + }), + ("http://joyreactor.cc/post/1299", { # "malformed" JSON + "url": "ac900743ed7cf1baf3db3b531c3bc414bf1ffcde", + }), + ) + + +# -------------------------------------------------------------------- +# PornReactor + +PR_BASE_PATTERN = r"(?:https?://)?(?:www\.)?(pornreactor\.cc|fapreactor.com)" + + +class PornreactorTagExtractor(ReactorTagExtractor): + """Extractor for tag searches on pornreactor.cc""" + category = "pornreactor" + pattern = PR_BASE_PATTERN + r"/tag/([^/?&#]+)" + test = ( + ("http://pornreactor.cc/tag/RiceGnat", { + "range": "1-25", + "count": ">= 25", + }), + ("http://fapreactor.com/tag/RiceGnat"), + ) + + +class PornreactorSearchExtractor(ReactorSearchExtractor): + """Extractor for search results on pornreactor.cc""" + category = "pornreactor" + pattern = PR_BASE_PATTERN + r"/search(?:/|\?q=)([^/?&#]+)" + test = ( + ("http://pornreactor.cc/search?q=ecchi+hentai", { + "range": "1-25", + "count": ">= 25", + }), + ("http://fapreactor.com/search/ecchi+hentai"), + ) + + +class PornreactorUserExtractor(ReactorUserExtractor): + """Extractor for all posts of a user on pornreactor.cc""" + category = "pornreactor" + pattern = PR_BASE_PATTERN + r"/user/([^/?&#]+)" + test = ( + ("http://pornreactor.cc/user/Disillusion", { + "range": "1-25", + "count": ">= 25", + }), + ("http://fapreactor.com/user/Disillusion"), + ) + + +class PornreactorPostExtractor(ReactorPostExtractor): + """Extractor for single posts on pornreactor.cc""" + category = "pornreactor" + subcategory = "post" + pattern = PR_BASE_PATTERN + r"/post/(\d+)" + test = ( + ("http://pornreactor.cc/post/863166", { + "url": "680db1e33ca92ff70b2c0e1708c471cbe2201324", + "content": "ec6b0568bfb1803648744077da082d14de844340", + }), + ("http://fapreactor.com/post/863166", { + "url": "864ecd5785e4898301aa8d054dd653b1165be158", + }), + ) |
