From 1466a391d1b4428ae244689933edfcbdb24dadd5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Mar 2023 20:55:01 -0800 Subject: [PATCH] Bump bleach from 5.0.1 to 6.0.0 (#1979) * Bump bleach from 5.0.1 to 6.0.0 Bumps [bleach](https://github.com/mozilla/bleach) from 5.0.1 to 6.0.0. - [Release notes](https://github.com/mozilla/bleach/releases) - [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES) - [Commits](https://github.com/mozilla/bleach/compare/v5.0.1...v6.0.0) --- updated-dependencies: - dependency-name: bleach dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com> * Update bleach==6.0.0 --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> [skip ci] --- lib/bleach/__init__.py | 6 +- lib/bleach/html5lib_shim.py | 245 ++++++++++++++++++------------------ lib/bleach/linkifier.py | 82 ++++++++++-- lib/bleach/sanitizer.py | 154 +++++++++++++---------- requirements.txt | 2 +- 5 files changed, 291 insertions(+), 198 deletions(-) diff --git a/lib/bleach/__init__.py b/lib/bleach/__init__.py index bbcc2e03..4e87eb80 100644 --- a/lib/bleach/__init__.py +++ b/lib/bleach/__init__.py @@ -11,9 +11,9 @@ from bleach.sanitizer import ( # yyyymmdd -__releasedate__ = "20220627" +__releasedate__ = "20230123" # x.y.z or x.y.z.dev0 -- semver -__version__ = "5.0.1" +__version__ = "6.0.0" __all__ = ["clean", "linkify"] @@ -52,7 +52,7 @@ def clean( :arg str text: the text to clean - :arg list tags: allowed list of tags; defaults to + :arg set tags: set of allowed tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` :arg dict attributes: allowed attributes; can be a callable, list or dict; diff --git a/lib/bleach/html5lib_shim.py b/lib/bleach/html5lib_shim.py index 
d121953b..aa5189b1 100644 --- a/lib/bleach/html5lib_shim.py +++ b/lib/bleach/html5lib_shim.py @@ -38,6 +38,9 @@ from bleach._vendor.html5lib.filters.sanitizer import ( allowed_protocols, allowed_css_properties, allowed_svg_properties, + attr_val_is_uri, + svg_attr_val_allows_ref, + svg_allow_local_href, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( Filter as SanitizerFilter, @@ -78,127 +81,129 @@ TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"] #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17 #: https://html.spec.whatwg.org/multipage/indices.html#elements-3 -HTML_TAGS = [ - "a", - "abbr", - "address", - "area", - "article", - "aside", - "audio", - "b", - "base", - "bdi", - "bdo", - "blockquote", - "body", - "br", - "button", - "canvas", - "caption", - "cite", - "code", - "col", - "colgroup", - "data", - "datalist", - "dd", - "del", - "details", - "dfn", - "dialog", - "div", - "dl", - "dt", - "em", - "embed", - "fieldset", - "figcaption", - "figure", - "footer", - "form", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "head", - "header", - "hgroup", - "hr", - "html", - "i", - "iframe", - "img", - "input", - "ins", - "kbd", - "keygen", - "label", - "legend", - "li", - "link", - "map", - "mark", - "menu", - "meta", - "meter", - "nav", - "noscript", - "object", - "ol", - "optgroup", - "option", - "output", - "p", - "param", - "picture", - "pre", - "progress", - "q", - "rp", - "rt", - "ruby", - "s", - "samp", - "script", - "section", - "select", - "slot", - "small", - "source", - "span", - "strong", - "style", - "sub", - "summary", - "sup", - "table", - "tbody", - "td", - "template", - "textarea", - "tfoot", - "th", - "thead", - "time", - "title", - "tr", - "track", - "u", - "ul", - "var", - "video", - "wbr", -] +HTML_TAGS = frozenset( + ( + "a", + "abbr", + "address", + "area", + "article", + "aside", + "audio", + "b", + "base", + "bdi", + "bdo", + 
"blockquote", + "body", + "br", + "button", + "canvas", + "caption", + "cite", + "code", + "col", + "colgroup", + "data", + "datalist", + "dd", + "del", + "details", + "dfn", + "dialog", + "div", + "dl", + "dt", + "em", + "embed", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "head", + "header", + "hgroup", + "hr", + "html", + "i", + "iframe", + "img", + "input", + "ins", + "kbd", + "keygen", + "label", + "legend", + "li", + "link", + "map", + "mark", + "menu", + "meta", + "meter", + "nav", + "noscript", + "object", + "ol", + "optgroup", + "option", + "output", + "p", + "param", + "picture", + "pre", + "progress", + "q", + "rp", + "rt", + "ruby", + "s", + "samp", + "script", + "section", + "select", + "slot", + "small", + "source", + "span", + "strong", + "style", + "sub", + "summary", + "sup", + "table", + "tbody", + "td", + "template", + "textarea", + "tfoot", + "th", + "thead", + "time", + "title", + "tr", + "track", + "u", + "ul", + "var", + "video", + "wbr", + ) +) #: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369 #: from mozilla on 2019.07.11 #: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements HTML_TAGS_BLOCK_LEVEL = frozenset( - [ + ( "address", "article", "aside", @@ -232,7 +237,7 @@ HTML_TAGS_BLOCK_LEVEL = frozenset( "section", "table", "ul", - ] + ) ) @@ -473,7 +478,7 @@ class BleachHTMLParser(HTMLParser): def __init__(self, tags, strip, consume_entities, **kwargs): """ - :arg tags: list of allowed tags--everything else is either stripped or + :arg tags: set of allowed tags--everything else is either stripped or escaped; if None, then this doesn't look at tags at all :arg strip: whether to strip disallowed tags (True) or escape them (False); if tags=None, then this doesn't have any effect @@ -481,7 +486,9 @@ class BleachHTMLParser(HTMLParser): leave them as is when tokenizing (BleachHTMLTokenizer-added behavior) """ - 
self.tags = [tag.lower() for tag in tags] if tags is not None else None + self.tags = ( + frozenset((tag.lower() for tag in tags)) if tags is not None else None + ) self.strip = strip self.consume_entities = consume_entities super().__init__(**kwargs) @@ -691,7 +698,7 @@ class BleachHTMLSerializer(HTMLSerializer): # Only leave entities in that are not ambiguous. If they're # ambiguous, then we escape the ampersand. if entity is not None and convert_entity(entity) is not None: - yield "&" + entity + ";" + yield f"&{entity};" # Length of the entity plus 2--one for & at the beginning # and one for ; at the end diff --git a/lib/bleach/linkifier.py b/lib/bleach/linkifier.py index b3b83e62..679d7ead 100644 --- a/lib/bleach/linkifier.py +++ b/lib/bleach/linkifier.py @@ -120,9 +120,10 @@ class Linker: :arg list callbacks: list of callbacks to run when adjusting tag attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` - :arg list skip_tags: list of tags that you don't want to linkify the - contents of; for example, you could set this to ``['pre']`` to skip - linkifying contents of ``pre`` tags + :arg set skip_tags: set of tags that you don't want to linkify the + contents of; for example, you could set this to ``{'pre'}`` to skip + linkifying contents of ``pre`` tags; ``None`` means you don't + want linkify to skip any tags :arg bool parse_email: whether or not to linkify email addresses @@ -130,7 +131,7 @@ class Linker: :arg email_re: email matching regex - :arg list recognized_tags: the list of tags that linkify knows about; + :arg set recognized_tags: the set of tags that linkify knows about; everything else gets escaped :returns: linkified text as unicode @@ -145,15 +146,18 @@ class Linker: # Create a parser/tokenizer that allows all HTML tags and escapes # anything not in that list. 
self.parser = html5lib_shim.BleachHTMLParser( - tags=recognized_tags, + tags=frozenset(recognized_tags), strip=False, - consume_entities=True, + consume_entities=False, namespaceHTMLElements=False, ) self.walker = html5lib_shim.getTreeWalker("etree") self.serializer = html5lib_shim.BleachHTMLSerializer( quote_attr_values="always", omit_optional_tags=False, + # We want to leave entities as they are without escaping or + # resolving or expanding + resolve_entities=False, # linkify does not sanitize sanitize=False, # linkify preserves attr order @@ -218,8 +222,8 @@ class LinkifyFilter(html5lib_shim.Filter): :arg list callbacks: list of callbacks to run when adjusting tag attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS`` - :arg list skip_tags: list of tags that you don't want to linkify the - contents of; for example, you could set this to ``['pre']`` to skip + :arg set skip_tags: set of tags that you don't want to linkify the + contents of; for example, you could set this to ``{'pre'}`` to skip linkifying contents of ``pre`` tags :arg bool parse_email: whether or not to linkify email addresses @@ -232,7 +236,7 @@ class LinkifyFilter(html5lib_shim.Filter): super().__init__(source) self.callbacks = callbacks or [] - self.skip_tags = skip_tags or [] + self.skip_tags = skip_tags or {} self.parse_email = parse_email self.url_re = url_re @@ -510,6 +514,62 @@ class LinkifyFilter(html5lib_shim.Filter): yield {"type": "Characters", "data": str(new_text)} yield token_buffer[-1] + def extract_entities(self, token): + """Handles Characters tokens with entities + + Our overridden tokenizer doesn't do anything with entities. However, + that means that the serializer will convert all ``&`` in Characters + tokens to ``&amp;``. + + Since we don't want that, we extract entities here and convert them to + Entity tokens so the serializer will let them be. 
+ + :arg token: the Characters token to work on + + :returns: generator of tokens + + """ + data = token.get("data", "") + + # If there isn't a & in the data, we can return now + if "&" not in data: + yield token + return + + new_tokens = [] + + # For each possible entity that starts with a "&", we try to extract an + # actual entity and re-tokenize accordingly + for part in html5lib_shim.next_possible_entity(data): + if not part: + continue + + if part.startswith("&"): + entity = html5lib_shim.match_entity(part) + if entity is not None: + if entity == "amp": + # LinkifyFilter can't match urls across token boundaries + # which is problematic with &amp; since that shows up in + # querystrings all the time. This special-cases &amp; + # and converts it to a & and sticks it in as a + # Characters token. It'll get merged with surrounding + # tokens in the BleachSanitizerfilter.__iter__ and + # escaped in the serializer. + new_tokens.append({"type": "Characters", "data": "&"}) + else: + new_tokens.append({"type": "Entity", "name": entity}) + + # Length of the entity plus 2--one for & at the beginning + # and one for ; at the end + remainder = part[len(entity) + 2 :] + if remainder: + new_tokens.append({"type": "Characters", "data": remainder}) + continue + + new_tokens.append({"type": "Characters", "data": part}) + + yield from new_tokens + def __iter__(self): in_a = False in_skip_tag = None @@ -564,8 +624,8 @@ class LinkifyFilter(html5lib_shim.Filter): new_stream = self.handle_links(new_stream) - for token in new_stream: - yield token + for new_token in new_stream: + yield from self.extract_entities(new_token) # We've already yielded this token, so continue continue diff --git a/lib/bleach/sanitizer.py b/lib/bleach/sanitizer.py index 6527ac03..8662a879 100644 --- a/lib/bleach/sanitizer.py +++ b/lib/bleach/sanitizer.py @@ -8,21 +8,23 @@ from bleach import html5lib_shim from bleach import parse_shim -#: List of allowed tags -ALLOWED_TAGS = [ - "a", - "abbr", - "acronym", - "b", - 
"blockquote", - "code", - "em", - "i", - "li", - "ol", - "strong", - "ul", -] +#: Set of allowed tags +ALLOWED_TAGS = frozenset( + ( + "a", + "abbr", + "acronym", + "b", + "blockquote", + "code", + "em", + "i", + "li", + "ol", + "strong", + "ul", + ) +) #: Map of allowed attributes by tag @@ -33,7 +35,7 @@ ALLOWED_ATTRIBUTES = { } #: List of allowed protocols -ALLOWED_PROTOCOLS = ["http", "https", "mailto"] +ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto")) #: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr) INVISIBLE_CHARACTERS = "".join( @@ -48,6 +50,10 @@ INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICOD INVISIBLE_REPLACEMENT_CHAR = "?" +class NoCssSanitizerWarning(UserWarning): + pass + + class Cleaner: """Cleaner for cleaning HTML fragments of malicious content @@ -89,7 +95,7 @@ class Cleaner: ): """Initializes a Cleaner - :arg list tags: allowed list of tags; defaults to + :arg set tags: set of allowed tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` :arg dict attributes: allowed attributes; can be a callable, list or dict; @@ -143,6 +149,25 @@ class Cleaner: alphabetical_attributes=False, ) + if css_sanitizer is None: + # FIXME(willkg): this doesn't handle when attributes or an + # attributes value is a callable + attributes_values = [] + if isinstance(attributes, list): + attributes_values = attributes + + elif isinstance(attributes, dict): + attributes_values = [] + for values in attributes.values(): + if isinstance(values, (list, tuple)): + attributes_values.extend(values) + + if "style" in attributes_values: + warnings.warn( + "'style' attribute specified, but css_sanitizer not set.", + category=NoCssSanitizerWarning, + ) + def clean(self, text): """Cleans text and returns sanitized result as unicode @@ -155,9 +180,8 @@ class Cleaner: """ if not isinstance(text, str): message = ( - "argument cannot be of '{name}' type, must be of text type".format( - name=text.__class__.__name__ - 
) + f"argument cannot be of {text.__class__.__name__!r} type, " + + "must be of text type" ) raise TypeError(message) @@ -167,13 +191,11 @@ class Cleaner: dom = self.parser.parseFragment(text) filtered = BleachSanitizerFilter( source=self.walker(dom), - # Bleach-sanitizer-specific things + allowed_tags=self.tags, attributes=self.attributes, - strip_disallowed_elements=self.strip, + strip_disallowed_tags=self.strip, strip_html_comments=self.strip_comments, css_sanitizer=self.css_sanitizer, - # html5lib-sanitizer things - allowed_elements=self.tags, allowed_protocols=self.protocols, ) @@ -237,19 +259,21 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): def __init__( self, source, - allowed_elements=ALLOWED_TAGS, + allowed_tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, allowed_protocols=ALLOWED_PROTOCOLS, - strip_disallowed_elements=False, + attr_val_is_uri=html5lib_shim.attr_val_is_uri, + svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref, + svg_allow_local_href=html5lib_shim.svg_allow_local_href, + strip_disallowed_tags=False, strip_html_comments=True, css_sanitizer=None, - **kwargs, ): """Creates a BleachSanitizerFilter instance :arg source: html5lib TreeWalker stream as an html5lib TreeWalker - :arg list allowed_elements: allowed list of tags; defaults to + :arg set allowed_tags: set of allowed tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` :arg dict attributes: allowed attributes; can be a callable, list or dict; @@ -258,8 +282,16 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): :arg list allowed_protocols: allowed list of protocols for links; defaults to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` - :arg bool strip_disallowed_elements: whether or not to strip disallowed - elements + :arg attr_val_is_uri: set of attributes that have URI values + + :arg svg_attr_val_allows_ref: set of SVG attributes that can have + references + + :arg svg_allow_local_href: set of SVG elements that can have local + hrefs + + :arg bool 
strip_disallowed_tags: whether or not to strip disallowed + tags :arg bool strip_html_comments: whether or not to strip HTML comments @@ -267,24 +299,24 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): sanitizing style attribute values and style text; defaults to None """ - self.attr_filter = attribute_filter_factory(attributes) - self.strip_disallowed_elements = strip_disallowed_elements - self.strip_html_comments = strip_html_comments - self.css_sanitizer = css_sanitizer + # NOTE(willkg): This is the superclass of + # html5lib.filters.sanitizer.Filter. We call this directly skipping the + # __init__ for html5lib.filters.sanitizer.Filter because that does + # things we don't need to do and kicks up the deprecation warning for + # using Sanitizer. + html5lib_shim.Filter.__init__(self, source) - # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init - warnings.filterwarnings( - "ignore", - message="html5lib's sanitizer is deprecated", - category=DeprecationWarning, - module="bleach._vendor.html5lib", - ) - return super().__init__( - source, - allowed_elements=allowed_elements, - allowed_protocols=allowed_protocols, - **kwargs, - ) + self.allowed_tags = frozenset(allowed_tags) + self.allowed_protocols = frozenset(allowed_protocols) + + self.attr_filter = attribute_filter_factory(attributes) + self.strip_disallowed_tags = strip_disallowed_tags + self.strip_html_comments = strip_html_comments + + self.attr_val_is_uri = attr_val_is_uri + self.svg_attr_val_allows_ref = svg_attr_val_allows_ref + self.css_sanitizer = css_sanitizer + self.svg_allow_local_href = svg_allow_local_href def sanitize_stream(self, token_iterator): for token in token_iterator: @@ -354,10 +386,10 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): """ token_type = token["type"] if token_type in ["StartTag", "EndTag", "EmptyTag"]: - if token["name"] in self.allowed_elements: + if token["name"] in self.allowed_tags: return 
self.allow_token(token) - elif self.strip_disallowed_elements: + elif self.strip_disallowed_tags: return None else: @@ -570,7 +602,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): def disallowed_token(self, token): token_type = token["type"] if token_type == "EndTag": - token["data"] = "</%s>" % token["name"] + token["data"] = f"</{token['name']}>" elif token["data"]: assert token_type in ("StartTag", "EmptyTag") @@ -586,25 +618,19 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): if ns is None or ns not in html5lib_shim.prefixes: namespaced_name = name else: - namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name) + namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}" - attrs.append( - ' %s="%s"' - % ( - namespaced_name, - # NOTE(willkg): HTMLSerializer escapes attribute values - # already, so if we do it here (like HTMLSerializer does), - # then we end up double-escaping. - v, - ) - ) - token["data"] = "<{}{}>".format(token["name"], "".join(attrs)) + # NOTE(willkg): HTMLSerializer escapes attribute values + # already, so if we do it here (like HTMLSerializer does), + # then we end up double-escaping. + attrs.append(f' {namespaced_name}="{v}"') + token["data"] = f"<{token['name']}{''.join(attrs)}>" else: - token["data"] = "<%s>" % token["name"] + token["data"] = f"<{token['name']}>" if token.get("selfClosing"): - token["data"] = token["data"][:-1] + "/>" + token["data"] = f"{token['data'][:-1]}/>" token["type"] = "Characters" diff --git a/requirements.txt b/requirements.txt index 73c9308b..799b09b2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ backports.csv==1.0.7 backports.functools-lru-cache==1.6.4 backports.zoneinfo==0.2.1;python_version<"3.9" beautifulsoup4==4.11.1 -bleach==5.0.1 +bleach==6.0.0 certifi==2022.12.7 cheroot==9.0.0 cherrypy==18.8.0