Diffstat (limited to 'gallery_dl/extractor/wikimedia.py')
-rw-r--r--  gallery_dl/extractor/wikimedia.py  98
1 file changed, 58 insertions(+), 40 deletions(-)
diff --git a/gallery_dl/extractor/wikimedia.py b/gallery_dl/extractor/wikimedia.py
index ba020d5..70e42c6 100644
--- a/gallery_dl/extractor/wikimedia.py
+++ b/gallery_dl/extractor/wikimedia.py
@@ -22,25 +22,32 @@ class WikimediaExtractor(BaseExtractor):
     request_interval = (1.0, 2.0)

     def __init__(self, match):
-        BaseExtractor.__init__(self, match)
+        self._init_category(match)
+        self.format = False

         if self.category == "wikimedia":
-            self.category = self.root.split(".")[-2]
+            labels = self.root.split(".")
+            self.lang = labels[-3][-2:]
+            self.category = labels[-2]
         elif self.category in ("fandom", "wikigg"):
+            self.lang = "en"
+            self.format = "original"
+            self.basesubcategory = self.category
             self.category = (
                 f"{self.category}-"
                 f"{self.root.partition('.')[0].rpartition('/')[2]}")
-
-        self.per_page = self.config("limit", 50)
-        self.subcategories = False
+        else:
+            self.lang = ""

         if useragent := self.config_instance("useragent"):
             self.useragent = useragent

+        BaseExtractor.__init__(self, match)
+
     def _init(self):
         if api_path := self.config_instance("api-path"):
             if api_path[0] == "/":
-                self.api_url = self.root + api_path
+                self.api_url = f"{self.root}{api_path}"
             else:
                 self.api_url = api_path
         else:
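
Note: the "wikimedia" branch now derives both language and category from the hostname labels. A quick standalone sketch (not part of the patch) of what that parsing yields for a typical root URL, with the resulting values shown as comments:

    root = "https://en.wikipedia.org"
    labels = root.split(".")   # ["https://en", "wikipedia", "org"]
    lang = labels[-3][-2:]     # "en"  (last two characters of the subdomain)
    category = labels[-2]      # "wikipedia"
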
@@ -51,12 +58,15 @@ class WikimediaExtractor(BaseExtractor):
         # https://www.mediawiki.org/wiki/API:Revisions
         # https://www.mediawiki.org/wiki/API:Imageinfo
         self.image_revisions = self.config("image-revisions", 1)
+        self.format = self.config("format", self.format)
+        self.per_page = self.config("limit", 50)
+        self.subcategories = False

     @cache(maxage=36500*86400, keyarg=1)
     def _search_api_path(self, root):
         self.log.debug("Probing possible API endpoints")
         for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
-            url = root + path
+            url = f"{root}{path}"
             response = self.request(url, method="HEAD", fatal=None)
             if response.status_code < 400:
                 return url
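
Note: _search_api_path() probes the standard MediaWiki endpoint locations with HEAD requests and caches the result. A self-contained approximation using plain requests (gallery-dl's own session, retries, and rate limiting omitted):

    import requests

    def search_api_path(root):
        # Probe common MediaWiki API locations; return the first reachable one.
        for path in ("/api.php", "/w/api.php", "/wiki/api.php"):
            url = f"{root}{path}"
            response = requests.head(url, timeout=10)
            if response.status_code < 400:
                return url
        return None

    # search_api_path("https://en.wikipedia.org")
    # -> "https://en.wikipedia.org/w/api.php"
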
@@ -74,12 +84,19 @@ class WikimediaExtractor(BaseExtractor):
                 m["name"]: m["value"]
                 for m in image["commonmetadata"] or ()}

-        text.nameext_from_url(image["canonicaltitle"].partition(":")[2], image)
-        image["date"] = text.parse_datetime(
-            image["timestamp"], "%Y-%m-%dT%H:%M:%SZ")
+        text.nameext_from_name(
+            image["canonicaltitle"].partition(":")[2], image)
+        image["date"] = self.parse_datetime_iso(image["timestamp"])
+
+        if self.format:
+            url = image["url"]
+            image["url"] = (f"{url}{'&' if '?' in url else '?'}"
+                            f"format={self.format}")

     def items(self):
-        for info in self._pagination(self.params):
+        params = self.params()
+
+        for info in self._pagination(params):
             try:
                 images = info.pop("imageinfo")
             except KeyError:
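
Note: the new format handling appends a query parameter to each file URL, picking "&" or "?" depending on whether the URL already carries a query string. The same logic as a standalone helper (hypothetical example URLs):

    def append_format(url, fmt):
        # Append a format= parameter, respecting an existing query string.
        separator = "&" if "?" in url else "?"
        return f"{url}{separator}format={fmt}"

    append_format("https://img.example/a.png", "original")
    # -> "https://img.example/a.png?format=original"
    append_format("https://img.example/a.png?cb=123", "original")
    # -> "https://img.example/a.png?cb=123&format=original"
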
@@ -88,7 +105,7 @@ class WikimediaExtractor(BaseExtractor):

             info["count"] = len(images)
             self.prepare_info(info)
-            yield Message.Directory, info
+            yield Message.Directory, "", info

             num = 0
             for image in images:
@@ -105,10 +122,10 @@ class WikimediaExtractor(BaseExtractor):

                 yield Message.Url, image["url"], image

         if self.subcategories:
-            base = self.root + "/wiki/"
-            self.params["gcmtype"] = "subcat"
-            for subcat in self._pagination(self.params):
-                url = base + subcat["title"].replace(" ", "_")
+            base = f"{self.root}/wiki/"
+            params["gcmtype"] = "subcat"
+            for subcat in self._pagination(params):
+                url = f"{base}{subcat['title'].replace(' ', '_')}"
                 subcat["_extractor"] = WikimediaArticleExtractor
                 yield Message.Queue, url, subcat
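
Note: _pagination() (unchanged by this diff) drives these generator queries through the MediaWiki API's "continue" mechanism. A rough standalone equivalent, assuming plain requests and query parameters like the ones built above:

    import requests

    def paginate(api_url, params):
        # Yield page objects from a MediaWiki generator query,
        # following the "continue" tokens the API returns.
        params = {"action": "query", "format": "json",
                  "prop": "imageinfo", "iiprop": "url|timestamp", **params}
        while True:
            data = requests.get(api_url, params=params, timeout=10).json()
            yield from data.get("query", {}).get("pages", {}).values()
            if "continue" not in data:
                return
            params.update(data["continue"])  # e.g. gcmcontinue + continue
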
@@ -219,7 +236,7 @@ class WikimediaArticleExtractor(WikimediaExtractor):
     """Extractor for wikimedia articles"""
     subcategory = "article"
     directory_fmt = ("{category}", "{page}")
-    pattern = BASE_PATTERN + r"/(?!static/)([^?#]+)"
+    pattern = rf"{BASE_PATTERN}/(?!static/)([^?#]+)"
     example = "https://en.wikipedia.org/wiki/TITLE"

     def __init__(self, match):
@@ -227,53 +244,54 @@ class WikimediaArticleExtractor(WikimediaExtractor):
         path = self.groups[-1]
         if path[2] == "/":
-            self.root = self.root + "/" + path[:2]
+            self.lang = lang = path[:2]
+            self.root = f"{self.root}/{lang}"
             path = path[3:]

         if path.startswith("wiki/"):
             path = path[5:]
+        self.path = text.unquote(path)

         pre, sep, _ = path.partition(":")
-        prefix = pre.lower() if sep else None
-
-        self.title = path = text.unquote(path)
-        if prefix:
+        self.prefix = prefix = pre.lower() if sep else None
+        if prefix is not None:
             self.subcategory = prefix

-        if prefix == "category":
+    def params(self):
+        if self.prefix == "category":
             if self.config("subcategories", True):
                 self.subcategories = True
-            self.params = {
+            return {
                 "generator": "categorymembers",
-                "gcmtitle" : path,
+                "gcmtitle" : self.path,
                 "gcmtype"  : "file",
                 "gcmlimit" : self.per_page,
             }
-        elif prefix == "file":
-            self.params = {
-                "titles"   : path,
-            }
-        else:
-            self.params = {
-                "generator": "images",
-                "gimlimit" : self.per_page,
-                "titles"   : path,
+
+        if self.prefix == "file":
+            return {
+                "titles": self.path,
             }
+        return {
+            "generator": "images",
+            "gimlimit" : self.per_page,
+            "titles"   : self.path,
+        }
+

     def prepare_info(self, info):
-        info["page"] = self.title
+        info["page"] = self.path
+        info["lang"] = self.lang


 class WikimediaWikiExtractor(WikimediaExtractor):
     """Extractor for all files on a MediaWiki instance"""
     subcategory = "wiki"
-    pattern = BASE_PATTERN + r"/?$"
+    pattern = rf"{BASE_PATTERN}/?$"
     example = "https://en.wikipedia.org/"

-    def __init__(self, match):
-        WikimediaExtractor.__init__(self, match)
-
+    def params(self):
         # ref: https://www.mediawiki.org/wiki/API:Allpages
-        self.params = {
+        return {
             "generator" : "allpages",
             "gapnamespace": 6,  # "File" namespace
             "gaplimit"  : self.per_page,