author     Unit 193 <unit193@ubuntu.com>  2019-07-02 04:33:45 -0400
committer  Unit 193 <unit193@ubuntu.com>  2019-07-02 04:33:45 -0400
commit     195c45911e79c33cf0bb986721365fb06df5a153 (patch)
tree       ac0c9b6ef40bea7aa7ab0c5c3cb500eb510668fa /gallery_dl/text.py
Import Upstream version 1.8.7 (upstream/1.8.7)
Diffstat (limited to 'gallery_dl/text.py')
-rw-r--r--  gallery_dl/text.py  278
1 file changed, 278 insertions, 0 deletions
diff --git a/gallery_dl/text.py b/gallery_dl/text.py
new file mode 100644
index 0000000..151fa30
--- /dev/null
+++ b/gallery_dl/text.py
@@ -0,0 +1,278 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2015-2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Collection of functions that work on strings/text"""
+
+import re
+import html
+import os.path
+import datetime
+import urllib.parse
+
+
+INVALID_XML_CHARS = (
+    "\x00", "\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07",
+    "\x08", "\x0b", "\x0c", "\x0e", "\x0f", "\x10", "\x11", "\x12",
+    "\x13", "\x14", "\x15", "\x16", "\x17", "\x18", "\x19", "\x1a",
+    "\x1b", "\x1c", "\x1d", "\x1e", "\x1f",
+)
+
+
+def clean_xml(xmldata, repl=""):
+    """Replace/Remove invalid control characters in 'xmldata'"""
+    if not isinstance(xmldata, str):
+        try:
+            xmldata = "".join(xmldata)
+        except TypeError:
+            return ""
+    for char in INVALID_XML_CHARS:
+        if char in xmldata:
+            xmldata = xmldata.replace(char, repl)
+    return xmldata
+
+
+def remove_html(txt):
+    """Remove html-tags from a string"""
+    try:
+        return " ".join(re.sub("<[^>]+>", " ", txt).split())
+    except TypeError:
+        return ""
+
+
+def split_html(txt, sep=None):
+    """Split input string by html-tags"""
+    try:
+        return [
+            x.strip() for x in re.split("<[^>]+>", txt)
+            if x and not x.isspace()
+        ]
+    except TypeError:
+        return []
+
+
+def filename_from_url(url):
+    """Extract the last part of an URL to use as a filename"""
+    try:
+        return urllib.parse.urlsplit(url).path.rpartition("/")[2]
+    except (TypeError, AttributeError):
+        return ""
+
+
+def ext_from_url(url):
+    """Extract the filename extension of an URL"""
+    filename = filename_from_url(url)
+    ext = os.path.splitext(filename)[1]
+    return ext[1:].lower()
+
+
+def nameext_from_url(url, data=None):
+    """Extract the last part of an URL and fill 'data' accordingly"""
+    if data is None:
+        data = {}
+    name = unquote(filename_from_url(url))
+    data["filename"], ext = os.path.splitext(name)
+    data["extension"] = ext[1:].lower()
+    return data
+
+
+def clean_path_windows(path):
+    """Remove illegal characters from a path-segment (Windows)"""
+    try:
+        return re.sub(r'[<>:"\\/|?*]', "_", path)
+    except TypeError:
+        return ""
+
+
+def clean_path_posix(path):
+    """Remove illegal characters from a path-segment (Posix)"""
+    try:
+        return path.replace("/", "_")
+    except AttributeError:
+        return ""
+
+
+def extract(txt, begin, end, pos=0):
+    """Extract the text between 'begin' and 'end' from 'txt'
+
+    Args:
+        txt: String to search in
+        begin: First string to be searched for
+        end: Second string to be searched for after 'begin'
+        pos: Starting position for searches in 'txt'
+
+    Returns:
+        The string between the two search-strings 'begin' and 'end' beginning
+        with position 'pos' in 'txt' as well as the position after 'end'.
+
+        If at least one of 'begin' or 'end' is not found, None and the original
+        value of 'pos' is returned
+
+    Examples:
+        extract("abcde", "b", "d")    -> "c" , 4
+        extract("abcde", "b", "d", 3) -> None, 3
+    """
+    try:
+        first = txt.index(begin, pos) + len(begin)
+        last = txt.index(end, first)
+        return txt[first:last], last+len(end)
+    except (ValueError, TypeError, AttributeError):
+        return None, pos
+
+
+def rextract(txt, begin, end, pos=-1):
+    try:
+        lbeg = len(begin)
+        first = txt.rindex(begin, 0, pos)
+        last = txt.index(end, first + lbeg)
+        return txt[first + lbeg:last], first
+    except (ValueError, TypeError, AttributeError):
+        return None, pos
+
+
+def extract_all(txt, rules, pos=0, values=None):
+    """Calls extract for each rule and returns the result in a dict"""
+    if values is None:
+        values = {}
+    for key, begin, end in rules:
+        result, pos = extract(txt, begin, end, pos)
+        if key:
+            values[key] = result
+    return values, pos
+
+
+def extract_iter(txt, begin, end, pos=0):
+    """Yield values that would be returned by repeated calls of extract()"""
+    index = txt.index
+    lbeg = len(begin)
+    lend = len(end)
+    try:
+        while True:
+            first = index(begin, pos) + lbeg
+            last = index(end, first)
+            pos = last + lend
+            yield txt[first:last]
+    except (ValueError, TypeError, AttributeError):
+        return
+
+
+def extract_from(txt, pos=0, default=""):
+    """Returns a function object that extracts from 'txt'"""
+    def extr(begin, end, index=txt.index, txt=txt):
+        nonlocal pos
+        try:
+            first = index(begin, pos) + len(begin)
+            last = index(end, first)
+            pos = last + len(end)
+            return txt[first:last]
+        except (ValueError, TypeError, AttributeError):
+            return default
+    return extr
+
+
+def parse_unicode_escapes(txt):
+    """Convert JSON Unicode escapes in 'txt' into actual characters"""
+    if "\\u" in txt:
+        return re.sub(r"\\u([0-9a-fA-F]{4})", _hex_to_char, txt)
+    return txt
+
+
+def _hex_to_char(match):
+    return chr(int(match.group(1), 16))
+
+
+def parse_bytes(value, default=0, suffixes="bkmgtp"):
+    """Convert a bytes-amount ("500k", "2.5M", ...) to int"""
+    try:
+        last = value[-1].lower()
+    except (TypeError, KeyError, IndexError):
+        return default
+
+    if last in suffixes:
+        mul = 1024 ** suffixes.index(last)
+        value = value[:-1]
+    else:
+        mul = 1
+
+    try:
+        return round(float(value) * mul)
+    except ValueError:
+        return default
+
+
+def parse_int(value, default=0):
+    """Convert 'value' to int"""
+    if not value:
+        return default
+    try:
+        return int(value)
+    except (ValueError, TypeError):
+        return default
+
+
+def parse_float(value, default=0.0):
+    """Convert 'value' to float"""
+    if not value:
+        return default
+    try:
+        return float(value)
+    except (ValueError, TypeError):
+        return default
+
+
+def parse_query(qs):
+    """Parse a query string into key-value pairs"""
+    result = {}
+    try:
+        for key, value in urllib.parse.parse_qsl(qs):
+            if key not in result:
+                result[key] = value
+    except AttributeError:
+        pass
+    return result
+
+
+def parse_timestamp(ts, default=None):
+    """Create a datetime object from a unix timestamp"""
+    try:
+        return datetime.datetime.utcfromtimestamp(int(ts))
+    except (TypeError, ValueError, OverflowError):
+        return default
+
+
+def parse_datetime(date_string, format="%Y-%m-%dT%H:%M:%S%z"):
+    """Create a datetime object by parsing 'date_string'"""
+    try:
+        if format.endswith("%z") and date_string[-3] == ":":
+            # workaround for Python < 3.7: +00:00 -> +0000
+            ds = date_string[:-3] + date_string[-2:]
+        else:
+            ds = date_string
+        d = datetime.datetime.strptime(ds, format)
+        o = d.utcoffset()
+        if o is not None:
+            d = d.replace(tzinfo=None) - o  # convert to naive UTC
+        return d
+    except (TypeError, IndexError, KeyError):
+        return None
+    except (ValueError, OverflowError):
+        return date_string
+
+
+if os.name == "nt":
+    clean_path = clean_path_windows
+else:
+    clean_path = clean_path_posix
+
+
+urljoin = urllib.parse.urljoin
+
+quote = urllib.parse.quote
+unquote = urllib.parse.unquote
+
+escape = html.escape
+unescape = html.unescape
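
For reference, a minimal usage sketch of the helpers added in this file, assuming the package is importable as gallery_dl; the example URL and input values below are illustrative only, not taken from the upstream test suite:

# Usage sketch for gallery_dl/text.py (illustrative values)
from gallery_dl import text

# filename/extension handling
url = "https://example.org/images/12345_sample.JPG?token=abc"  # hypothetical URL
text.nameext_from_url(url)  # {'filename': '12345_sample', 'extension': 'jpg'}
text.ext_from_url(url)      # 'jpg'

# substring extraction between two markers
text.extract("abcde", "b", "d")                             # ('c', 4)
list(text.extract_iter("<b>a</b><b>b</b>", "<b>", "</b>"))  # ['a', 'b']
text.remove_html("<p>Hello <b>world</b></p>")               # 'Hello world'

# lenient parsing helpers that fall back to a default on bad input
text.parse_bytes("2.5M")          # 2621440
text.parse_int("not a number")    # 0
text.parse_timestamp(1561939200)  # datetime.datetime(2019, 7, 1, 0, 0)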