aboutsummaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/simpcity.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/simpcity.py')
-rw-r--r--gallery_dl/extractor/simpcity.py186
1 files changed, 0 insertions, 186 deletions
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
deleted file mode 100644
index d8227fa..0000000
--- a/gallery_dl/extractor/simpcity.py
+++ /dev/null
@@ -1,186 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2025 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extractors for https://simpcity.cr/"""
-
-from .common import Extractor, Message
-from .. import text, exception
-
-BASE_PATTERN = r"(?:https?://)?(?:www\.)?simpcity\.(?:cr|su)"
-
-
class SimpcityExtractor(Extractor):
    """Shared logic for all simpcity extractors

    Subclasses supply posts(); this class turns each post into a
    Directory message followed by one Queue message per embedded URL.
    """
    category = "simpcity"
    root = "https://simpcity.cr"

    def items(self):
        """Yield a Directory message per post, then its linked URLs"""
        find_links = text.re(
            r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall

        for post in self.posts():
            links = find_links(post["content"])
            total = len(links)
            post["count"] = total
            metadata = {"post": post, "count": total}
            yield Message.Directory, metadata

            # the same dict object is reused for every queued URL;
            # only its "num" entry changes between yields
            for index, link in enumerate(links, 1):
                metadata["num"] = index
                yield Message.Queue, link, metadata

    def request_page(self, url):
        """Request 'url' and return the response

        A 403 response whose body contains '>Log in<' is re-raised as
        exception.AuthRequired; any other HttpError propagates unchanged.
        """
        try:
            response = self.request(url)
        except exception.HttpError as exc:
            login_wall = (exc.status == 403 and
                          b">Log in<" in exc.response.content)
            if not login_wall:
                raise
            # keep only the text after the last '>' of the error block
            block = text.extr(
                exc.response.text, "blockMessage--error", "</")
            reason = block.rpartition(">")[2].strip()
            raise exception.AuthRequired(
                "'authenticated cookies'", None, reason)
        return response

    def _pagination(self, base, pnum=None):
        """Yield page texts for 'base' in ascending page order

        With an explicit 'pnum' only that single page is yielded.
        Otherwise iteration starts at the first page and continues for
        as long as a page contains 'pageNav-jump--next'.
        """
        prefix = f"{self.root}{base}"

        if pnum is not None:
            yield self.request_page(f"{prefix}/page-{pnum}").text
            return

        url = f"{prefix}/"
        current = 1
        while True:
            page = self.request_page(url).text
            yield page

            if "pageNav-jump--next" not in page:
                break
            current += 1
            url = f"{prefix}/page-{current}"

    def _pagination_reverse(self, base, pnum=None):
        """Yield page texts for 'base' from the last page down to the first

        NOTE(review): 'pnum' is accepted but never used as a starting
        point; the page number is always taken from the final URL of
        the 'page-9999' request. Confirm whether that is intended.
        """
        prefix = f"{self.root}{base}"

        # requesting page 9999 is meant to force a redirect to the last page
        with self.request_page(f"{prefix}/page-9999") as response:
            landed = response.url
            if landed[-1] == "/":
                # a URL ending in '/' is treated as page 1
                last = 1
            else:
                last = text.parse_int(landed[landed.rfind("-")+1:], 1)
            page = response.text

        yield page
        for number in range(last - 1, 0, -1):
            if number > 1:
                url = f"{prefix}/page-{number}"
            else:
                url = f"{prefix}/"
            yield self.request_page(url).text

    def _parse_thread(self, page):
        """Build thread metadata from the JSON-LD 'mainEntity' of 'page'"""
        entity = self._extract_jsonld(page)["mainEntity"]
        author = entity["author"]
        counters = entity["interactionStatistic"]
        thread_url = entity["url"]
        author_url = author.get("url") or ""

        return {
            # text after the last '.', minus the final character
            "id"   : thread_url[thread_url.rfind(".")+1:-1],
            "url"  : thread_url,
            "title": entity["headline"],
            "date" : text.parse_datetime(entity["datePublished"]),
            "views": counters[0]["userInteractionCount"],
            "posts": counters[1]["userInteractionCount"],
            "tags" : (
                entity["keywords"].split(", ")
                if "keywords" in entity else ()
            ),
            "section"   : entity["articleSection"],
            "author"    : author.get("name") or "",
            # without a profile URL the ID falls back to the author name
            # minus its first 15 characters (presumably a fixed prefix
            # such as "Deleted member " - TODO confirm)
            "author_id" : (
                author_url[author_url.rfind(".")+1:-1]
                if author_url else
                (author.get("name") or "")[15:]
            ),
            "author_url": author_url,
        }

    def _parse_post(self, html):
        """Build post metadata from the markup of one <article> element"""
        # the markers are consumed one after another through the same
        # extractor object, so the call order below is kept as-is
        grab = text.extract_from(html)

        author = grab('data-author="', '"')
        post_id = grab('data-content="post-', '"')
        author_url = grab('itemprop="url" content="', '"')
        date = text.parse_datetime(grab('datetime="', '"'))
        content = grab('<div itemprop="text">',
                       '<div class="js-selectToQuote').strip()

        return {
            "author"    : author,
            "id"        : post_id,
            "author_url": author_url,
            "date"      : date,
            "content"   : content,
            "author_id" : author_url[author_url.rfind(".")+1:-1],
        }
-
-
class SimpcityPostExtractor(SimpcityExtractor):
    """Extractor for a single post addressed by its numeric post ID"""
    subcategory = "post"
    pattern = rf"{BASE_PATTERN}/(?:threads/[^/?#]+/post-|posts/)(\d+)"
    example = "https://simpcity.cr/threads/TITLE.12345/post-54321"

    def posts(self):
        """Return a 1-tuple holding the parsed post named in the URL

        Also stores the parsed thread metadata in self.kwdict["thread"].

        Raises exception.NotFoundError when the fetched page has no
        'data-content="post-<id>"' marker, or when no <article> element
        could be extracted around that marker.
        """
        post_id = self.groups[0]
        url = f"{self.root}/posts/{post_id}/"
        page = self.request_page(url).text

        pos = page.find(f'data-content="post-{post_id}"')
        if pos < 0:
            raise exception.NotFoundError("post")

        # Search for the opening tag starting up to 200 characters before
        # the marker. The offset is clamped at 0 because a negative start
        # index would be counted from the end of the string and skip the
        # wanted element.
        start = max(pos - 200, 0)
        html = text.extract(page, "<article ", "</article>", start)[0]
        if html is None:
            # assumes text.extract() reports a failed match as None -
            # verify against the text module; the check is a no-op otherwise
            raise exception.NotFoundError("post")

        self.kwdict["thread"] = self._parse_thread(page)
        return (self._parse_post(html),)
-
-
class SimpcityThreadExtractor(SimpcityExtractor):
    """Extractor for all posts of a thread"""
    subcategory = "thread"
    pattern = rf"{BASE_PATTERN}(/threads/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/threads/TITLE.12345/"

    def posts(self):
        """Yield every parsed post of the thread

        Pages and posts are walked in ascending order only when the
        'order-posts' option is set and does not start with "d" or "r";
        in every other case, including an unset option, the thread is
        walked from its last post to its first.
        """
        order = self.config("order-posts")
        ascending = bool(order) and order[0] not in ("d", "r")
        paginate = (self._pagination if ascending else
                    self._pagination_reverse)

        for page in paginate(*self.groups):
            # thread metadata is parsed once, from the first page fetched
            if "thread" not in self.kwdict:
                self.kwdict["thread"] = self._parse_thread(page)

            articles = text.extract_iter(page, "<article ", "</article>")
            if not ascending:
                articles = reversed(list(articles))

            for html in articles:
                yield self._parse_post(html)
-
-
class SimpcityForumExtractor(SimpcityExtractor):
    """Extractor that queues the threads listed on a forum's pages"""
    subcategory = "forum"
    pattern = rf"{BASE_PATTERN}(/forums/(?:[^/?#]+\.)?\d+)(?:/page-(\d+))?"
    example = "https://simpcity.cr/forums/TITLE.123/"

    def items(self):
        """Yield a Queue message for every ' uix-href' path on each page

        Each queued URL is tagged with SimpcityThreadExtractor as its
        '_extractor'.
        """
        metadata = {"_extractor": SimpcityThreadExtractor}
        pages = self._pagination(*self.groups)

        for html in pages:
            hrefs = text.extract_iter(html, ' uix-href="', '"')
            for href in hrefs:
                target = f"{self.root}{text.unquote(href)}"
                yield Message.Queue, target, metadata