diff options
| author | 2025-07-31 01:22:01 -0400 | |
|---|---|---|
| committer | 2025-07-31 01:22:01 -0400 | |
| commit | a6e995c093de8aae2e91a0787281bb34c0b871eb (patch) | |
| tree | 2d79821b05300d34d8871eb6c9662b359a2de85d /gallery_dl/extractor/generic.py | |
| parent | 7672a750cb74bf31e21d76aad2776367fd476155 (diff) | |
New upstream version 1.30.2 (tag: upstream/1.30.2)
Diffstat (limited to 'gallery_dl/extractor/generic.py')
| -rw-r--r-- | gallery_dl/extractor/generic.py | 29 |
1 file changed, 14 insertions, 15 deletions
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py index 4b04732..407e478 100644 --- a/gallery_dl/extractor/generic.py +++ b/gallery_dl/extractor/generic.py @@ -7,9 +7,8 @@ """Generic information extractor""" from .common import Extractor, Message -from .. import config, text +from .. import config, text, util import os.path -import re class GenericExtractor(Extractor): @@ -37,28 +36,28 @@ class GenericExtractor(Extractor): example = "generic:https://www.nongnu.org/lzip/" def __init__(self, match): - self.subcategory = match.group('domain') + self.subcategory = match['domain'] Extractor.__init__(self, match) # Strip the "g(eneric):" prefix # and inform about "forced" or "fallback" mode - if match.group('generic'): - self.url = match.group(0).partition(":")[2] + if match['generic']: + self.url = match[0].partition(":")[2] else: self.log.info("Falling back on generic information extractor.") - self.url = match.group(0) + self.url = match[0] # Make sure we have a scheme, or use https - if match.group('scheme'): - self.scheme = match.group('scheme') + if match['scheme']: + self.scheme = match['scheme'] else: self.scheme = 'https://' self.url = text.ensure_http_scheme(self.url, self.scheme) - self.path = match.group('path') + self.path = match['path'] # Used to resolve relative image urls - self.root = self.scheme + match.group('domain') + self.root = self.scheme + match['domain'] def items(self): """Get page, extract metadata & images, yield them in suitable messages @@ -172,8 +171,8 @@ class GenericExtractor(Extractor): r"(?:[^\"'<>\s]*)?" 
# optional query and fragment ) - imageurls_src = re.findall(imageurl_pattern_src, page) - imageurls_ext = re.findall(imageurl_pattern_ext, page) + imageurls_src = util.re(imageurl_pattern_src).findall(page) + imageurls_ext = util.re(imageurl_pattern_ext).findall(page) imageurls = imageurls_src + imageurls_ext # Resolve relative urls @@ -182,10 +181,10 @@ class GenericExtractor(Extractor): # by prepending a suitable base url. # # If the page contains a <base> element, use it as base url - basematch = re.search( - r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)", page) + basematch = util.re( + r"(?i)(?:<base\s.*?href=[\"']?)(?P<url>[^\"' >]+)").search(page) if basematch: - self.baseurl = basematch.group('url').rstrip('/') + self.baseurl = basematch['url'].rstrip('/') # Otherwise, extract the base url from self.url else: if self.url.endswith("/"): |
