import html
import re
from typing import Callable, Optional
from urllib.parse import urlparse, urlunparse, quote, unquote  # noqa: F401

from .utils import ESCAPABLE

# TODO below we port the use of the JS packages:
# var mdurl = require('mdurl')
# var punycode = require('punycode')
#
# e.g. mdurl: parsed = mdurl.parse(url, True)
#
# but need to check these fixes from https://www.npmjs.com/package/mdurl:
#
# Parse url string. Similar to node's url.parse,
# but without any normalizations and query string parse.
#     url - input url (string)
#     slashesDenoteHost - if url starts with //, expect a hostname after it.
#         Optional, false.
# Difference with node's url:
# - No leading slash in paths, e.g. in url.parse('http://foo?bar')
#   pathname is ``, not /
# - Backslashes are not replaced with slashes, so http:\\example.org\
#   is treated like a relative path
# - Trailing colon is treated like a part of the path, i.e. in
#   http://example.org:foo pathname is :foo
# - Nothing is URL-encoded in the resulting object,
#   (in joyent/node some chars in auth and paths are encoded)
# - url.parse() does not have parseQueryString argument
# - Removed extraneous result properties: host, path, query, etc.,
#   which can be constructed using other parts of the url.


# ################# Copied from Commonmark.py #################

# Regex source for one character reference: hexadecimal (`&#x` + 1-6 hex
# digits), decimal (`&#` + 1-7 digits) or named (a letter followed by 1-31
# letters/digits), each terminated by `;`.  Only lower-case ranges are
# spelled out; the pattern that embeds it is compiled with re.IGNORECASE.
ENTITY = "&(?:#x[a-f0-9]{1,6}|#[0-9]{1,7}|[a-z][a-z0-9]{1,31});"

# Matches a single backslash or ampersand.
reBackslashOrAmp = re.compile(r"[\\&]")

# Matches either a backslash followed by one character from the ESCAPABLE
# character class, or a complete ENTITY.
reEntityOrEscapedChar = re.compile(
    "\\\\" + "[" + ESCAPABLE + "]|" + ENTITY, re.IGNORECASE
)
def unescape_string(s: str) -> str:
    """Replace entities and backslash escapes with literal characters.

    :param s: text that may contain backslash escapes and/or character
        references.
    :return: ``s`` with every match of ``reEntityOrEscapedChar`` replaced by
        the result of ``unescape_char`` for that match; ``s`` itself when it
        contains neither a backslash nor an ampersand.
    """
    # Fast path: without a backslash or ampersand nothing can match, so the
    # substitution pass is skipped entirely.
    if not reBackslashOrAmp.search(s):
        return s
    return reEntityOrEscapedChar.sub(lambda m: unescape_char(m.group()), s)
# TODO the selective encoding below should probably be done here,
# something like:
# url_check = urllib.parse.urlparse(destination)
# if url_check.scheme in RECODE_HOSTNAME_FOR: ...
#
# parsed = urlparse(url)
# if parsed.hostname:
#     # Encode hostnames in urls like:
#     # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
#     #
#     # We don't encode unknown schemes, because it's likely that we encode
#     # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
#     #
#     if (not parsed.scheme) or parsed.scheme in RECODE_HOSTNAME_FOR:
#         try:
#             parsed.hostname = punycode.toASCII(parsed.hostname)
#         except Exception:
#             pass
# return quote(urlunparse(parsed))
# TODO the selective encoding below should probably be done here,
# something like:
# url_check = urllib.parse.urlparse(destination)
# if url_check.scheme in RECODE_HOSTNAME_FOR: ...
#
# parsed = urlparse(url)
# if parsed.hostname:
#     # Encode hostnames in urls like:
#     # `http://host/`, `https://host/`, `mailto:user@host`, `//host/`
#     #
#     # We don't encode unknown schemes, because it's likely that we encode
#     # something we shouldn't (e.g. `skype:name` treated as `skype:host`)
#     #
#     if (not parsed.protocol) or parsed.protocol in RECODE_HOSTNAME_FOR:
#         try:
#             parsed.hostname = punycode.toUnicode(parsed.hostname)
#         except Exception:
#             pass
# return unquote(urlunparse(parsed))

# Schemes rejected by default; matched against the stripped, lower-cased URL.
BAD_PROTO_RE = re.compile(r"^(vbscript|javascript|file|data):")
# The only `data:` URLs let through: gif/png/jpeg/webp images.
GOOD_DATA_RE = re.compile(r"^data:image\/(gif|png|jpeg|webp);")


def validateLink(
    url: str, validator: Optional[Callable[[str], bool]] = None
) -> bool:
    """Validate URL link is allowed in output.

    This validator can prohibit more than really needed to prevent XSS.
    It's a tradeoff to keep code simple and to be secure by default.

    Note: url should be normalized at this point, and existing entities
    decoded.

    :param url: the link destination to check.
    :param validator: optional callable; when given, it is called with the
        untouched ``url`` and its result is returned as-is, bypassing the
        built-in check.
    :return: ``False`` for ``vbscript:``, ``javascript:``, ``file:`` and
        ``data:`` URLs (except ``data:image/`` gif, png, jpeg or webp),
        ``True`` for everything else.
    """
    if validator is not None:
        return validator(url)

    # Surrounding whitespace and letter case must not hide a scheme.
    candidate = url.strip().lower()
    if not BAD_PROTO_RE.search(candidate):
        return True
    # A blacklisted scheme is only acceptable when it is a whitelisted image.
    return bool(GOOD_DATA_RE.search(candidate))