diff options
Diffstat (limited to 'gallery_dl/text.py')
| -rw-r--r-- | gallery_dl/text.py | 278 |
1 files changed, 278 insertions, 0 deletions
# -*- coding: utf-8 -*-

# Copyright 2015-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Collection of functions that work on strings/text"""

import re
import html
import os.path
import datetime
import urllib.parse


# Control characters that clean_xml() replaces or removes.
# Tab (\x09), line feed (\x0a) and carriage return (\x0d) are not listed.
INVALID_XML_CHARS = (
    "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
    "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
    "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a",
    "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
)


def clean_xml(xmldata, repl=""):
    """Replace/Remove invalid control characters in 'xmldata'

    Non-string input is joined into a single string first; if it cannot
    be joined, an empty string is returned.
    Every character listed in INVALID_XML_CHARS is replaced with 'repl'.
    """
    if not isinstance(xmldata, str):
        try:
            xmldata = "".join(xmldata)
        except TypeError:
            return ""
    for char in INVALID_XML_CHARS:
        if char in xmldata:
            xmldata = xmldata.replace(char, repl)
    return xmldata


def remove_html(txt):
    """Remove html-tags from a string

    Tags are replaced by whitespace and all whitespace runs are collapsed
    into single spaces. Returns "" if 'txt' is not a string.
    """
    try:
        return " ".join(re.sub("<[^>]+>", " ", txt).split())
    except TypeError:
        return ""


def split_html(txt, sep=None):
    """Split input string by html-tags

    Returns a list of the stripped, non-blank text segments between tags,
    or an empty list if 'txt' is not a string.
    'sep' is accepted for interface compatibility but is not used.
    """
    try:
        return [
            x.strip() for x in re.split("<[^>]+>", txt)
            if x and not x.isspace()
        ]
    except TypeError:
        return []


def filename_from_url(url):
    """Extract the last part of an URL to use as a filename

    Only the path component is considered. Returns "" for input that
    cannot be parsed as an URL.
    """
    try:
        return urllib.parse.urlsplit(url).path.rpartition("/")[2]
    except (TypeError, AttributeError):
        return ""


def ext_from_url(url):
    """Extract the filename extension of an URL (lowercase, without dot)"""
    filename = filename_from_url(url)
    ext = os.path.splitext(filename)[1]
    return ext[1:].lower()


def nameext_from_url(url, data=None):
    """Extract the last part of an URL and fill 'data' accordingly

    Stores the unquoted name without extension as data["filename"] and the
    lowercase extension (without dot) as data["extension"].
    A new dict is created if 'data' is None; the dict is returned.
    """
    if data is None:
        data = {}
    name = unquote(filename_from_url(url))
    data["filename"], ext = os.path.splitext(name)
    data["extension"] = ext[1:].lower()
    return data


def clean_path_windows(path):
    """Remove illegal characters from a path-segment (Windows)"""
    try:
        return re.sub(r'[<>:"\\/|?*]', "_", path)
    except TypeError:
        return ""


def clean_path_posix(path):
    """Remove illegal characters from a path-segment (Posix)"""
    try:
        return path.replace("/", "_")
    except AttributeError:
        return ""


def extract(txt, begin, end, pos=0):
    """Extract the text between 'begin' and 'end' from 'txt'

    Args:
        txt: String to search in
        begin: First string to be searched for
        end: Second string to be searched for after 'begin'
        pos: Starting position for searches in 'txt'

    Returns:
        The string between the two search-strings 'begin' and 'end' beginning
        with position 'pos' in 'txt' as well as the position after 'end'.

        If at least one of 'begin' or 'end' is not found, None and the original
        value of 'pos' is returned

    Examples:
        extract("abcde", "b", "d")    -> "c" , 4
        extract("abcde", "b", "d", 3) -> None, 3
    """
    try:
        first = txt.index(begin, pos) + len(begin)
        last = txt.index(end, first)
        return txt[first:last], last+len(end)
    except (ValueError, TypeError, AttributeError):
        return None, pos


def rextract(txt, begin, end, pos=-1):
    """Extract the text between the last 'begin' before 'pos' and 'end'

    Searches backwards for 'begin' in txt[0:pos] and then forwards for
    'end' after that match.

    Returns:
        The string between 'begin' and 'end' as well as the position at
        which 'begin' was found.

        If at least one of 'begin' or 'end' is not found, None and the
        original value of 'pos' is returned
    """
    try:
        lbeg = len(begin)
        first = txt.rindex(begin, 0, pos)
        last = txt.index(end, first + lbeg)
        return txt[first + lbeg:last], first
    except (ValueError, TypeError, AttributeError):
        return None, pos


def extract_all(txt, rules, pos=0, values=None):
    """Calls extract for each rule and returns the result in a dict

    'rules' is an iterable of (key, begin, end) tuples. Results for rules
    with a falsy key are discarded, but still advance the position.
    Returns the dict of results and the final position.
    """
    if values is None:
        values = {}
    for key, begin, end in rules:
        result, pos = extract(txt, begin, end, pos)
        if key:
            values[key] = result
    return values, pos


def extract_iter(txt, begin, end, pos=0):
    """Yield values that would be returned by repeated calls of extract()

    Iteration stops as soon as 'begin' or 'end' can no longer be found.
    Unsuitable input (e.g. None) yields nothing, matching extract().
    """
    try:
        # the setup is inside the 'try' so that invalid argument types
        # end the iteration instead of raising
        index = txt.index
        lbeg = len(begin)
        lend = len(end)
        while True:
            first = index(begin, pos) + lbeg
            last = index(end, first)
            pos = last + lend
            yield txt[first:last]
    except (ValueError, TypeError, AttributeError):
        return


def extract_from(txt, pos=0, default=""):
    """Returns a function object that extracts from 'txt'

    The returned function takes 'begin' and 'end' and returns the text
    between them, starting at the position after the previous successful
    extraction. It returns 'default' and leaves the position unchanged
    if nothing is found.
    """
    def extr(begin, end, index=txt.index, txt=txt):
        nonlocal pos
        try:
            first = index(begin, pos) + len(begin)
            last = index(end, first)
            pos = last + len(end)
            return txt[first:last]
        except (ValueError, TypeError, AttributeError):
            return default
    return extr


def parse_unicode_escapes(txt):
    """Convert JSON Unicode escapes in 'txt' into actual characters"""
    if "\\u" in txt:
        return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
    return txt


def _hex_to_char(match):
    """Return the character for the 4-digit hex code captured by 'match'"""
    return chr(int(match.group(1), 16))


def parse_bytes(value, default=0, suffixes="bkmgtp"):
    """Convert a bytes-amount ("500k", "2.5M", ...) to int

    The last character of 'value' selects a power of 1024 by its index
    in 'suffixes' (case-insensitive). 'default' is returned for anything
    that cannot be converted, including non-finite numbers.
    """
    try:
        last = value[-1].lower()
    except (TypeError, KeyError, IndexError, AttributeError):
        # AttributeError: e.g. bytes input, where value[-1] is an int
        return default

    if last in suffixes:
        mul = 1024 ** suffixes.index(last)
        value = value[:-1]
    else:
        mul = 1

    try:
        return round(float(value) * mul)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: round() of an infinite float ("inf", "1e400")
        return default


def parse_int(value, default=0):
    """Convert 'value' to int

    Returns 'default' for falsy or non-convertible values.
    """
    if not value:
        return default
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: int() of an infinite float
        return default


def parse_float(value, default=0.0):
    """Convert 'value' to float

    Returns 'default' for falsy or non-convertible values.
    """
    if not value:
        return default
    try:
        return float(value)
    except (ValueError, TypeError):
        return default


def parse_query(qs):
    """Parse a query string into key-value pairs

    Only the first value is kept for keys that occur more than once.
    """
    result = {}
    try:
        for key, value in urllib.parse.parse_qsl(qs):
            if key not in result:
                result[key] = value
    except AttributeError:
        pass
    return result


def parse_timestamp(ts, default=None):
    """Create a datetime object from a unix timestamp

    Returns a naive datetime in UTC, or 'default' if 'ts' cannot be
    converted.
    """
    try:
        # build an aware UTC datetime and strip the tzinfo afterwards;
        # datetime.utcfromtimestamp() is deprecated since Python 3.12
        aware = datetime.datetime.fromtimestamp(
            int(ts), datetime.timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        return default
    return aware.replace(tzinfo=None)


def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z"):
    """Create a datetime object by parsing 'date_string'

    A parsed UTC offset is applied and removed, so that the result is a
    naive datetime in UTC.

    Returns None if 'date_string' has an unsuitable type or is too short
    for the offset check, and 'date_string' itself if it does not match
    'format'.
    """
    try:
        if format.endswith("%z") and date_string[-3] == ":":
            # workaround for Python < 3.7: +00:00 -> +0000
            ds = date_string[:-3] + date_string[-2:]
        else:
            ds = date_string
        d = datetime.datetime.strptime(ds, format)
        o = d.utcoffset()
        if o is not None:
            d = d.replace(tzinfo=None) - o  # convert to naive UTC
        return d
    except (TypeError, IndexError, KeyError):
        return None
    except (ValueError, OverflowError):
        return date_string


# select the path-cleaning function for the current platform
if os.name == "nt":
    clean_path = clean_path_windows
else:
    clean_path = clean_path_posix


urljoin = urllib.parse.urljoin

quote = urllib.parse.quote
unquote = urllib.parse.unquote

escape = html.escape
unescape = html.unescape
