diff options
Diffstat (limited to 'gallery_dl/extractor/simpcity.py')
| -rw-r--r-- | gallery_dl/extractor/simpcity.py | 56 |
1 files changed, 48 insertions, 8 deletions
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
index 8cc7e38..3354289 100644
--- a/gallery_dl/extractor/simpcity.py
+++ b/gallery_dl/extractor/simpcity.py
@@ -20,18 +20,20 @@ class SimpcityExtractor(Extractor):
     root = "https://simpcity.cr"
 
     def items(self):
-        extract_urls = text.re(r' href="([^"]+)').findall
+        extract_urls = text.re(
+            r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)').findall
 
         for post in self.posts():
             urls = extract_urls(post["content"])
             data = {"post": post}
             post["count"] = data["count"] = len(urls)
+            yield Message.Directory, data
             for data["num"], url in enumerate(urls, 1):
                 yield Message.Queue, url, data
 
     def request_page(self, url):
         try:
-            return self.request(url).text
+            return self.request(url)
         except exception.HttpError as exc:
             if exc.status == 403 and b">Log in<" in exc.response.content:
                 msg = text.extr(exc.response.text, "blockMessage--error", "</")
@@ -44,14 +46,14 @@ class SimpcityExtractor(Extractor):
         base = f"{self.root}{base}"
 
         if pnum is None:
-            url = base
+            url = f"{base}/"
             pnum = 1
         else:
             url = f"{base}/page-{pnum}"
             pnum = None
 
         while True:
-            page = self.request_page(url)
+            page = self.request_page(url).text
 
             yield page
 
@@ -60,6 +62,31 @@ class SimpcityExtractor(Extractor):
             pnum += 1
             url = f"{base}/page-{pnum}"
 
+    def _pagination_reverse(self, base, pnum=None):
+        base = f"{self.root}{base}"
+
+        url = f"{base}/page-9999"  # force redirect to last page
+        with self.request_page(url) as response:
+            url = response.url
+            if url[-1] == "/":
+                pnum = 1
+            else:
+                pnum = text.parse_int(url[url.rfind("-")+1:], 1)
+            page = response.text
+
+        while True:
+            yield page
+
+            pnum -= 1
+            if pnum > 1:
+                url = f"{base}/page-{pnum}"
+            elif pnum == 1:
+                url = f"{base}/"
+            else:
+                return
+
+            page = self.request_page(url).text
+
     def _parse_thread(self, page):
         schema = self._extract_jsonld(page)["mainEntity"]
         author = schema["author"]
@@ -92,7 +119,8 @@ class SimpcityExtractor(Extractor):
             "id": extr('data-content="post-', '"'),
             "author_url": extr('itemprop="url" content="', '"'),
             "date": text.parse_datetime(extr('datetime="', '"')),
-            "content": extr('<div itemprop="text">', "\t\t</div>").strip(),
+            "content": extr('<div itemprop="text">',
+                            '<div class="js-selectToQuote').strip(),
         }
 
         url_a = post["author_url"]
@@ -109,7 +137,7 @@ class SimpcityPostExtractor(SimpcityExtractor):
     def posts(self):
         post_id = self.groups[0]
         url = f"{self.root}/posts/{post_id}/"
-        page = self.request_page(url)
+        page = self.request_page(url).text
 
         pos = page.find(f'data-content="post-{post_id}"')
         if pos < 0:
@@ -126,10 +154,22 @@ class SimpcityThreadExtractor(SimpcityExtractor):
     example = "https://simpcity.cr/threads/TITLE.12345/"
 
     def posts(self):
-        for page in self._pagination(*self.groups):
+        if (order := self.config("order-posts")) and \
+                order[0] not in ("d", "r"):
+            pages = self._pagination(*self.groups)
+            reverse = False
+        else:
+            pages = self._pagination_reverse(*self.groups)
+            reverse = True
+
+        for page in pages:
             if "thread" not in self.kwdict:
                 self.kwdict["thread"] = self._parse_thread(page)
-            for html in text.extract_iter(page, "<article ", "</article>"):
+            posts = text.extract_iter(page, "<article ", "</article>")
+            if reverse:
+                posts = list(posts)
+                posts.reverse()
+            for html in posts:
                 yield self._parse_post(html)
 
 
