aboutsummaryrefslogtreecommitdiffstats
path: root/gallery_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'gallery_dl/extractor/generic.py')
-rw-r--r--gallery_dl/extractor/generic.py58
1 files changed, 32 insertions, 26 deletions
diff --git a/gallery_dl/extractor/generic.py b/gallery_dl/extractor/generic.py
index 9999283..4ab26ae 100644
--- a/gallery_dl/extractor/generic.py
+++ b/gallery_dl/extractor/generic.py
@@ -1,16 +1,19 @@
# -*- coding: utf-8 -*-
-"""Extractor for images in a generic web page."""
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Generic information extractor"""
from .common import Extractor, Message
from .. import config, text
-import re
import os.path
+import re
class GenericExtractor(Extractor):
"""Extractor for images in a generic web page."""
-
category = "generic"
directory_fmt = ("{category}", "{pageurl}")
archive_fmt = "{imageurl}"
@@ -18,19 +21,19 @@ class GenericExtractor(Extractor):
# By default, the generic extractor is disabled
# and the "g(eneric):" prefix in url is required.
# If the extractor is enabled, make the prefix optional
- pattern = r"(?ix)(?P<generic>g(?:eneric)?:)"
+ pattern = r"(?i)(?P<generic>g(?:eneric)?:)"
if config.get(("extractor", "generic"), "enabled"):
pattern += r"?"
# The generic extractor pattern should match (almost) any valid url
# Based on: https://tools.ietf.org/html/rfc3986#appendix-B
- pattern += r"""
- (?P<scheme>https?://)? # optional http(s) scheme
- (?P<domain>[-\w\.]+) # required domain
- (?P<path>/[^?#]*)? # optional path
- (?:\?(?P<query>[^#]*))? # optional query
- (?:\#(?P<fragment>.*))? # optional fragment
- """
+ pattern += (
+ r"(?P<scheme>https?://)?" # optional http(s) scheme
+ r"(?P<domain>[-\w\.]+)" # required domain
+ r"(?P<path>/[^?#]*)?" # optional path
+ r"(?:\?(?P<query>[^#]*))?" # optional query
+ r"(?:\#(?P<fragment>.*))?" # optional fragment
+ )
test = (
("generic:https://www.nongnu.org/lzip/", {
@@ -49,19 +52,20 @@ class GenericExtractor(Extractor):
"count": 2,
"pattern": "^https://räksmörgås.josefsson.org/",
}),
+ ("g:https://en.wikipedia.org/Main_Page"),
+ ("g:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
+ ("g:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
("generic:https://en.wikipedia.org/Main_Page"),
("generic:https://example.org/path/to/file?que=1?&ry=2/#fragment"),
("generic:https://example.org/%27%3C%23/%23%3E%27.htm?key=%3C%26%3E"),
)
def __init__(self, match):
- """Init."""
Extractor.__init__(self, match)
# Strip the "g(eneric):" prefix
# and inform about "forced" or "fallback" mode
if match.group('generic'):
- self.log.info("Forcing use of generic information extractor.")
self.url = match.group(0).partition(":")[2]
else:
self.log.info("Falling back on generic information extractor.")
@@ -93,7 +97,6 @@ class GenericExtractor(Extractor):
pass
images = enumerate(imgs, 1)
- yield Message.Version, 1
yield Message.Directory, data
for data["num"], (url, imgdata) in images:
@@ -158,11 +161,13 @@ class GenericExtractor(Extractor):
image urls; this pattern matches only the first url; remaining urls
will be matched by the "imageurl_pattern_ext" pattern below.
"""
- imageurl_pattern_src = r"""(?ix)
- <(?:img|video|source)\s.*? # <img>, <video> or <source>
- src(?:set)?=["']? # src or srcset attributes
- (?P<URL>[^"'\s>]+) # url
- """
+
+ imageurl_pattern_src = (
+ r"(?i)"
+ r"<(?:img|video|source)\s[^>]*" # <img>, <video> or <source>
+ r"src(?:set)?=[\"']?" # src or srcset attributes
+ r"(?P<URL>[^\"'\s>]+)" # url
+ )
"""
2: Look anywhere for urls containing common image/video extensions
@@ -176,12 +181,13 @@ class GenericExtractor(Extractor):
urls in html tags.
"""
- imageurl_pattern_ext = r"""(?ix)
- (?:[^?&#"'>\s]+) # anything until dot+extension
- \.(?:jpe?g|jpe|png|gif
- |web[mp]|mp4|mkv|og[gmv]|opus) # dot + image/video extensions
- (?:[^"'<>\s]*)? # optional query and fragment
- """
+ imageurl_pattern_ext = (
+ r"(?i)"
+ r"(?:[^?&#\"'>\s]+)" # anything until dot+extension
+ # dot + image/video extensions
+ r"\.(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus)"
+ r"(?:[^\"'<>\s]*)?" # optional query and fragment
+ )
imageurls_src = re.findall(imageurl_pattern_src, page)
imageurls_ext = re.findall(imageurl_pattern_ext, page)
@@ -221,7 +227,7 @@ class GenericExtractor(Extractor):
absimageurls.append(self.baseurl + '/' + u)
# Remove duplicates
- absimageurls = set(absimageurls)
+ absimageurls = dict.fromkeys(absimageurls)
# Create the image metadata dict and add imageurl to it
# (image filename and extension are added by items())