diff options
Diffstat (limited to 'gallery_dl/text.py')
| -rw-r--r-- | gallery_dl/text.py | 84 |
1 files changed, 58 insertions, 26 deletions
diff --git a/gallery_dl/text.py b/gallery_dl/text.py index c1dde94..a7539ad 100644 --- a/gallery_dl/text.py +++ b/gallery_dl/text.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -8,14 +8,29 @@ """Collection of functions that work on strings/text""" -import re import sys import html import time import datetime import urllib.parse +import re as re_module -HTML_RE = re.compile("<[^>]+>") +try: + re_compile = re_module._compiler.compile +except AttributeError: + re_compile = re_module.sre_compile.compile + +HTML_RE = re_compile(r"<[^>]+>") +PATTERN_CACHE = {} + + +def re(pattern): + """Compile a regular expression pattern""" + try: + return PATTERN_CACHE[pattern] + except KeyError: + p = PATTERN_CACHE[pattern] = re_compile(pattern) + return p def remove_html(txt, repl=" ", sep=" "): @@ -47,8 +62,13 @@ def slugify(value): Adapted from: https://github.com/django/django/blob/master/django/utils/text.py """ - value = re.sub(r"[^\w\s-]", "", str(value).lower()) - return re.sub(r"[-\s]+", "-", value).strip("-_") + value = re(r"[^\w\s-]").sub("", str(value).lower()) + return re(r"[-\s]+").sub("-", value).strip("-_") + + +def sanitize_whitespace(value): + """Replace all whitespace characters with a single space""" + return re(r"\s+").sub(" ", value.strip()) def ensure_http_scheme(url, scheme="https://"): @@ -100,7 +120,7 @@ def nameext_from_url(url, data=None): return data -def extract(txt, begin, end, pos=0): +def extract(txt, begin, end, pos=None): """Extract the text between 'begin' and 'end' from 'txt' Args: @@ -125,7 +145,7 @@ def extract(txt, begin, end, pos=0): last = txt.index(end, first) return txt[first:last], last+len(end) except Exception: - return None, pos + return None, 0 if pos is None else pos def extr(txt, begin, end, default=""): @@ -137,17 +157,26 @@ def extr(txt, begin, end, default=""): return default -def rextract(txt, begin, end, pos=-1): +def rextract(txt, begin, end, pos=None): try: lbeg = len(begin) - first = txt.rindex(begin, 0, pos) + first = txt.rindex(begin, None, pos) last = txt.index(end, first + lbeg) return txt[first + lbeg:last], first except Exception: - return None, pos + return None, -1 if pos is None else pos + + +def rextr(txt, begin, end, pos=None, default=""): + """Stripped-down version of 'rextract()'""" + try: + first = txt.rindex(begin, None, pos) + len(begin) + return txt[first:txt.index(end, first)] + except Exception: + return default -def extract_all(txt, rules, pos=0, values=None): +def extract_all(txt, rules, pos=None, values=None): """Calls extract for each rule and returns the result in a dict""" if values is None: values = {} @@ -155,10 +184,10 @@ def extract_all(txt, rules, pos=0, values=None): result, pos = extract(txt, begin, end, pos) if key: values[key] = result - return values, pos + return values, 0 if pos is None else pos -def extract_iter(txt, begin, end, pos=0): +def extract_iter(txt, begin, end, pos=None): """Yield values that would be returned by repeated calls of extract()""" try: index = txt.index @@ -173,7 +202,7 @@ def extract_iter(txt, begin, end, pos=0): return -def extract_from(txt, pos=0, default=""): +def extract_from(txt, pos=None, default=""): """Returns a function object that extracts from 'txt'""" def extr(begin, end, index=txt.index, txt=txt): nonlocal pos @@ -190,21 +219,22 @@ def extract_from(txt, pos=0, default=""): def parse_unicode_escapes(txt): """Convert JSON Unicode escapes in 'txt' into actual characters""" if "\\u" in txt: - return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt) + return re(r"\\u([0-9a-fA-F]{4})").sub(_hex_to_char, txt) return txt def _hex_to_char(match): - return chr(int(match.group(1), 16)) + return chr(int(match[1], 16)) def parse_bytes(value, default=0, suffixes="bkmgtp"): """Convert a bytes-amount ("500k", "2.5M", ...) to int""" - try: - last = value[-1].lower() - except Exception: + if not value: return default + value = str(value).strip() + last = value[-1].lower() + if last in suffixes: mul = 1024 ** suffixes.index(last) value = value[:-1] @@ -279,12 +309,19 @@ def parse_query_list(qs, as_list=()): else: result[name] = [value] elif name not in result: - result[name] = unquote(value.replace("+", " ")) + result[name] = value except Exception: pass return result +def build_query(params): + return "&".join([ + f"{quote(name)}={quote(value)}" + for name, value in params.items() + ]) + + if sys.hexversion < 0x30c0000: # Python <= 3.11 def parse_timestamp(ts, default=None): @@ -307,12 +344,7 @@ else: def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z", utcoffset=0): """Create a datetime object by parsing 'date_string'""" try: - if format.endswith("%z") and date_string[-3] == ":": - # workaround for Python < 3.7: +00:00 -> +0000 - ds = date_string[:-3] + date_string[-2:] - else: - ds = date_string - d = datetime.datetime.strptime(ds, format) + d = datetime.datetime.strptime(date_string, format) o = d.utcoffset() if o is not None: # convert to naive UTC |
