summary | refs | log | tree | commit | diff | stats
path: root/gallery_dl/extractor/simpcity.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/simpcity.py')
-rw-r--r-- gallery_dl/extractor/simpcity.py 56
1 files changed, 48 insertions, 8 deletions
diff --git a/gallery_dl/extractor/simpcity.py b/gallery_dl/extractor/simpcity.py
index 8cc7e38..3354289 100644
--- a/gallery_dl/extractor/simpcity.py
+++ b/gallery_dl/extractor/simpcity.py
@@ -20,18 +20,20 @@ class SimpcityExtractor(Extractor):
root = "https://simpcity.cr"
def items(self):
    """Yield a Directory message per post, then a Queue message per link.

    Links are the 'href' values of <a> tags and the 'src' values of
    <iframe> tags found in each post's "content".
    """
    link_pattern = text.re(
        r'<(?:a [^>]*?href|iframe [^>]*?src)="([^"]+)')

    for post in self.posts():
        links = link_pattern.findall(post["content"])
        total = len(links)

        post["count"] = total
        data = {"post": post, "count": total}
        yield Message.Directory, data

        # the same 'data' dict is reused for every link of this post;
        # only its "num" entry is updated before each Queue message
        for num, url in enumerate(links, 1):
            data["num"] = num
            yield Message.Queue, url, data
def request_page(self, url):
try:
- return self.request(url).text
+ return self.request(url)
except exception.HttpError as exc:
if exc.status == 403 and b">Log in<" in exc.response.content:
msg = text.extr(exc.response.text, "blockMessage--error", "</")
@@ -44,14 +46,14 @@ class SimpcityExtractor(Extractor):
base = f"{self.root}{base}"
if pnum is None:
- url = base
+ url = f"{base}/"
pnum = 1
else:
url = f"{base}/page-{pnum}"
pnum = None
while True:
- page = self.request_page(url)
+ page = self.request_page(url).text
yield page
@@ -60,6 +62,31 @@ class SimpcityExtractor(Extractor):
pnum += 1
url = f"{base}/page-{pnum}"
def _pagination_reverse(self, base, pnum=None):
    """Yield the HTML of each page, starting at the last one.

    'base' is appended to 'self.root' to form the base URL.
    Any 'pnum' passed in is ignored; the starting page number is
    derived from the URL the initial request ends up at.
    """
    base = f"{self.root}{base}"

    # force redirect to last page
    with self.request_page(f"{base}/page-9999") as response:
        final_url = response.url
        if final_url[-1] == "/":
            # URL without a '/page-N' suffix: only one page
            pnum = 1
        else:
            # take whatever follows the last '-' as the page number
            suffix = final_url[final_url.rfind("-")+1:]
            pnum = text.parse_int(suffix, 1)
        page = response.text

    yield page

    # walk back through the remaining pages; page 1 has no suffix
    for num in range(pnum - 1, 0, -1):
        if num == 1:
            url = f"{base}/"
        else:
            url = f"{base}/page-{num}"
        yield self.request_page(url).text
+
def _parse_thread(self, page):
schema = self._extract_jsonld(page)["mainEntity"]
author = schema["author"]
@@ -92,7 +119,8 @@ class SimpcityExtractor(Extractor):
"id": extr('data-content="post-', '"'),
"author_url": extr('itemprop="url" content="', '"'),
"date": text.parse_datetime(extr('datetime="', '"')),
- "content": extr('<div itemprop="text">', "\t\t</div>").strip(),
+ "content": extr('<div itemprop="text">',
+ '<div class="js-selectToQuote').strip(),
}
url_a = post["author_url"]
@@ -109,7 +137,7 @@ class SimpcityPostExtractor(SimpcityExtractor):
def posts(self):
post_id = self.groups[0]
url = f"{self.root}/posts/{post_id}/"
- page = self.request_page(url)
+ page = self.request_page(url).text
pos = page.find(f'data-content="post-{post_id}"')
if pos < 0:
@@ -126,10 +154,22 @@ class SimpcityThreadExtractor(SimpcityExtractor):
example = "https://simpcity.cr/threads/TITLE.12345/"
def posts(self):
    """Yield the parsed posts of every page of the thread.

    Pages and the posts on each page are traversed in reverse order
    unless the 'order-posts' option is set to a value whose first
    character is neither "d" nor "r".
    """
    order = self.config("order-posts")
    reverse = not order or order[0] in ("d", "r")

    if reverse:
        pages = self._pagination_reverse(*self.groups)
    else:
        pages = self._pagination(*self.groups)

    for page in pages:
        # thread metadata is parsed once, from the first page received
        if "thread" not in self.kwdict:
            self.kwdict["thread"] = self._parse_thread(page)

        articles = text.extract_iter(page, "<article ", "</article>")
        if reverse:
            articles = reversed(list(articles))

        for html in articles:
            yield self._parse_post(html)