diff options
Diffstat (limited to 'gallery_dl/extractor/sexcom.py')
| -rw-r--r-- | gallery_dl/extractor/sexcom.py | 121 |
1 files changed, 83 insertions, 38 deletions
diff --git a/gallery_dl/extractor/sexcom.py b/gallery_dl/extractor/sexcom.py index 7708b5c..9e7d75d 100644 --- a/gallery_dl/extractor/sexcom.py +++ b/gallery_dl/extractor/sexcom.py @@ -10,6 +10,9 @@ from .common import Extractor, Message from .. import text +from datetime import datetime + +BASE_PATTERN = r"(?:https?://)?(?:www\.)?sex\.com" class SexcomExtractor(Extractor): @@ -23,8 +26,20 @@ class SexcomExtractor(Extractor): def items(self): yield Message.Directory, self.metadata() for pin in map(self._parse_pin, self.pins()): - if pin: - yield Message.Url, pin["url"], pin + if not pin: + continue + + url = pin["url"] + parts = url.rsplit("/", 4) + try: + pin["date_url"] = dt = datetime( + int(parts[1]), int(parts[2]), int(parts[3])) + if "date" not in pin: + pin["date"] = dt + except Exception: + pass + + yield Message.Url, url, pin def metadata(self): return {} @@ -53,10 +68,18 @@ class SexcomExtractor(Extractor): self.log.warning('Unable to fetch %s ("%s %s")', url, response.status_code, response.reason) return None + + if "/pin/" in response.url: + return self._parse_pin_legacy(response) + if "/videos/" in response.url: + return self._parse_pin_video(response) + return self._parse_pin_gifs(response) + + def _parse_pin_legacy(self, response): extr = text.extract_from(response.text) data = {} - data["_http_headers"] = {"Referer": url} + data["_http_headers"] = {"Referer": response.url} data["thumbnail"] = extr('itemprop="thumbnail" content="', '"') data["type"] = extr('<h1>' , '<').rstrip(" -").strip().lower() data["title"] = text.unescape(extr('itemprop="name">' , '<')) @@ -82,7 +105,8 @@ class SexcomExtractor(Extractor): src = (text.extr(iframe, ' src="', '"') or text.extr(iframe, " src='", "'")) if not src: - self.log.warning("Unable to fetch media from %s", url) + self.log.warning( + "Unable to fetch media from %s", response.url) return None data["extension"] = None data["url"] = "ytdl:" + src @@ -100,27 +124,60 @@ class SexcomExtractor(Extractor): return data + def _parse_pin_gifs(self, response): + extr = text.extract_from(response.text) + + data = { + "_http_headers": {"Referer": response.url}, + "type": "gif", + "url": extr(' href="', '"'), + "title": text.unescape(extr("<title>", " Gif | Sex.com<")), + "pin_id": text.parse_int(extr( + 'rel="canonical" href="', '"').rpartition("/")[2]), + "tags": text.split_html(extr("</h1>", "</section>")), + } + + return text.nameext_from_url(data["url"], data) + + def _parse_pin_video(self, response): + extr = text.extract_from(response.text) + + if not self.cookies.get("CloudFront-Key-Pair-Id", domain=".sex.com"): + self.log.warning("CloudFront cookies required for video downloads") + + data = { + "_ytdl_manifest": "hls", + "extension": "mp4", + "type": "video", + "title": text.unescape(extr("<title>", " | Sex.com<")), + "pin_id": text.parse_int(extr( + 'rel="canonical" href="', '"').rpartition("/")[2]), + "tags": text.split_html(extr( + 'event_name="video_tags_click"', "<div data-testid=") + .partition(">")[2]), + "url": "ytdl:" + extr('<source src="', '"'), + } + + return data + class SexcomPinExtractor(SexcomExtractor): """Extractor for a pinned image or video on www.sex.com""" subcategory = "pin" directory_fmt = ("{category}",) - pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+)(?!.*#related$)" + pattern = (BASE_PATTERN + + r"(/(?:pin|\w\w/(?:gif|video)s)/\d+/?)(?!.*#related$)") example = "https://www.sex.com/pin/12345-TITLE/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.pin_id = match.group(1) - def pins(self): - return ("{}/pin/{}/".format(self.root, self.pin_id),) + return (self.root + self.groups[0],) class SexcomRelatedPinExtractor(SexcomPinExtractor): """Extractor for related pins on www.sex.com""" subcategory = "related-pin" directory_fmt = ("{category}", "related {original_pin[pin_id]}") - pattern = r"(?:https?://)?(?:www\.)?sex\.com/pin/(\d+).*#related$" + pattern = BASE_PATTERN + r"(/pin/(\d+)/?).*#related$" example = "https://www.sex.com/pin/12345#related" def metadata(self): @@ -129,7 +186,7 @@ class SexcomRelatedPinExtractor(SexcomPinExtractor): def pins(self): url = "{}/pin/related?pinId={}&limit=24&offset=0".format( - self.root, self.pin_id) + self.root, self.groups[1]) return self._pagination(url) @@ -137,18 +194,14 @@ class SexcomPinsExtractor(SexcomExtractor): """Extractor for a user's pins on www.sex.com""" subcategory = "pins" directory_fmt = ("{category}", "{user}") - pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/pins/" + pattern = BASE_PATTERN + r"/user/([^/?#]+)/pins/" example = "https://www.sex.com/user/USER/pins/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.user = match.group(1) - def metadata(self): - return {"user": text.unquote(self.user)} + return {"user": text.unquote(self.groups[0])} def pins(self): - url = "{}/user/{}/pins/".format(self.root, self.user) + url = "{}/user/{}/pins/".format(self.root, self.groups[0]) return self._pagination(url) @@ -156,18 +209,14 @@ class SexcomLikesExtractor(SexcomExtractor): """Extractor for a user's liked pins on www.sex.com""" subcategory = "likes" directory_fmt = ("{category}", "{user}", "Likes") - pattern = r"(?:https?://)?(?:www\.)?sex\.com/user/([^/?#]+)/likes/" + pattern = BASE_PATTERN + r"/user/([^/?#]+)/likes/" example = "https://www.sex.com/user/USER/likes/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.user = match.group(1) - def metadata(self): - return {"user": text.unquote(self.user)} + return {"user": text.unquote(self.groups[0])} def pins(self): - url = "{}/user/{}/likes/".format(self.root, self.user) + url = "{}/user/{}/likes/".format(self.root, self.groups[0]) return self._pagination(url) @@ -175,15 +224,12 @@ class SexcomBoardExtractor(SexcomExtractor): """Extractor for pins from a board on www.sex.com""" subcategory = "board" directory_fmt = ("{category}", "{user}", "{board}") - pattern = (r"(?:https?://)?(?:www\.)?sex\.com/user" + pattern = (BASE_PATTERN + r"/user" r"/([^/?#]+)/(?!(?:following|pins|repins|likes)/)([^/?#]+)") example = "https://www.sex.com/user/USER/BOARD/" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.user, self.board = match.groups() - def metadata(self): + self.user, self.board = self.groups return { "user" : text.unquote(self.user), "board": text.unquote(self.board), @@ -198,19 +244,18 @@ class SexcomSearchExtractor(SexcomExtractor): """Extractor for search results on www.sex.com""" subcategory = "search" directory_fmt = ("{category}", "search", "{search[query]}") - pattern = (r"(?:https?://)?(?:www\.)?sex\.com/((?:" + pattern = (BASE_PATTERN + r"/((?:" r"(pic|gif|video)s/([^/?#]*)|search/(pic|gif|video)s" r")/?(?:\?([^#]+))?)") example = "https://www.sex.com/search/pics?query=QUERY" - def __init__(self, match): - SexcomExtractor.__init__(self, match) - self.path = match.group(1) + def _init(self): + self.path, t1, query_alt, t2, query = self.groups - self.search = text.parse_query(match.group(5)) - self.search["type"] = match.group(2) or match.group(4) + self.search = text.parse_query(query) + self.search["type"] = t1 or t2 if "query" not in self.search: - self.search["query"] = match.group(3) or "" + self.search["query"] = query_alt or "" def metadata(self): return {"search": self.search} |
