Bump bleach from 5.0.1 to 6.0.0 (#1979)

* Bump bleach from 5.0.1 to 6.0.0 Bumps [bleach](https://github.com/mozilla/bleach) from 5.0.1 to 6.0.0. - [Release notes](https://github.com/mozilla/bleach/releases) - [Changelog](https://github.com/mozilla/bleach/blob/main/CHANGES) - [Commits](https://github.com/mozilla/bleach/compare/v5.0.1...v6.0.0) --- updated-dependencies: - dependency-name: bleach dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] <support@github.com> * Update bleach==6.0.0 --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: JonnyWong16 <9099342+JonnyWong16@users.noreply.github.com> [skip ci]
2025-03-12 04:35:40 -07:00 · 2023-03-02 20:55:01 -08:00 · 2023-03-02 20:55:01 -08:00 · 1466a391d1
commit 1466a391d1
parent 6b1b6d0f32
5 changed files with 291 additions and 198 deletions
--- a/lib/bleach/__init__.py
+++ b/lib/bleach/__init__.py
@ -11,9 +11,9 @@ from bleach.sanitizer import (


 # yyyymmdd
-__releasedate__ = "20220627"
+__releasedate__ = "20230123"
 # x.y.z or x.y.z.dev0 -- semver
-__version__ = "5.0.1"
+__version__ = "6.0.0"


 __all__ = ["clean", "linkify"]
@ -52,7 +52,7 @@ def clean(

    :arg str text: the text to clean

-    :arg list tags: allowed list of tags; defaults to
+    :arg set tags: set of allowed tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; can be a callable, list or dict;
--- a/lib/bleach/html5lib_shim.py
+++ b/lib/bleach/html5lib_shim.py
@ -38,6 +38,9 @@ from bleach._vendor.html5lib.filters.sanitizer import (
    allowed_protocols,
    allowed_css_properties,
    allowed_svg_properties,
+    attr_val_is_uri,
+    svg_attr_val_allows_ref,
+    svg_allow_local_href,
 )  # noqa: E402 module level import not at top of file
 from bleach._vendor.html5lib.filters.sanitizer import (
    Filter as SanitizerFilter,
@ -78,127 +81,129 @@ TAG_TOKEN_TYPE_PARSEERROR = constants.tokenTypes["ParseError"]

 #: List of valid HTML tags, from WHATWG HTML Living Standard as of 2018-10-17
 #: https://html.spec.whatwg.org/multipage/indices.html#elements-3
-HTML_TAGS = [
-    "a",
-    "abbr",
-    "address",
-    "area",
-    "article",
-    "aside",
-    "audio",
-    "b",
-    "base",
-    "bdi",
-    "bdo",
-    "blockquote",
-    "body",
-    "br",
-    "button",
-    "canvas",
-    "caption",
-    "cite",
-    "code",
-    "col",
-    "colgroup",
-    "data",
-    "datalist",
-    "dd",
-    "del",
-    "details",
-    "dfn",
-    "dialog",
-    "div",
-    "dl",
-    "dt",
-    "em",
-    "embed",
-    "fieldset",
-    "figcaption",
-    "figure",
-    "footer",
-    "form",
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "head",
-    "header",
-    "hgroup",
-    "hr",
-    "html",
-    "i",
-    "iframe",
-    "img",
-    "input",
-    "ins",
-    "kbd",
-    "keygen",
-    "label",
-    "legend",
-    "li",
-    "link",
-    "map",
-    "mark",
-    "menu",
-    "meta",
-    "meter",
-    "nav",
-    "noscript",
-    "object",
-    "ol",
-    "optgroup",
-    "option",
-    "output",
-    "p",
-    "param",
-    "picture",
-    "pre",
-    "progress",
-    "q",
-    "rp",
-    "rt",
-    "ruby",
-    "s",
-    "samp",
-    "script",
-    "section",
-    "select",
-    "slot",
-    "small",
-    "source",
-    "span",
-    "strong",
-    "style",
-    "sub",
-    "summary",
-    "sup",
-    "table",
-    "tbody",
-    "td",
-    "template",
-    "textarea",
-    "tfoot",
-    "th",
-    "thead",
-    "time",
-    "title",
-    "tr",
-    "track",
-    "u",
-    "ul",
-    "var",
-    "video",
-    "wbr",
-]
+HTML_TAGS = frozenset(
+    (
+        "a",
+        "abbr",
+        "address",
+        "area",
+        "article",
+        "aside",
+        "audio",
+        "b",
+        "base",
+        "bdi",
+        "bdo",
+        "blockquote",
+        "body",
+        "br",
+        "button",
+        "canvas",
+        "caption",
+        "cite",
+        "code",
+        "col",
+        "colgroup",
+        "data",
+        "datalist",
+        "dd",
+        "del",
+        "details",
+        "dfn",
+        "dialog",
+        "div",
+        "dl",
+        "dt",
+        "em",
+        "embed",
+        "fieldset",
+        "figcaption",
+        "figure",
+        "footer",
+        "form",
+        "h1",
+        "h2",
+        "h3",
+        "h4",
+        "h5",
+        "h6",
+        "head",
+        "header",
+        "hgroup",
+        "hr",
+        "html",
+        "i",
+        "iframe",
+        "img",
+        "input",
+        "ins",
+        "kbd",
+        "keygen",
+        "label",
+        "legend",
+        "li",
+        "link",
+        "map",
+        "mark",
+        "menu",
+        "meta",
+        "meter",
+        "nav",
+        "noscript",
+        "object",
+        "ol",
+        "optgroup",
+        "option",
+        "output",
+        "p",
+        "param",
+        "picture",
+        "pre",
+        "progress",
+        "q",
+        "rp",
+        "rt",
+        "ruby",
+        "s",
+        "samp",
+        "script",
+        "section",
+        "select",
+        "slot",
+        "small",
+        "source",
+        "span",
+        "strong",
+        "style",
+        "sub",
+        "summary",
+        "sup",
+        "table",
+        "tbody",
+        "td",
+        "template",
+        "textarea",
+        "tfoot",
+        "th",
+        "thead",
+        "time",
+        "title",
+        "tr",
+        "track",
+        "u",
+        "ul",
+        "var",
+        "video",
+        "wbr",
+    )
+)


 #: List of block level HTML tags, as per https://github.com/mozilla/bleach/issues/369
 #: from mozilla on 2019.07.11
 #: https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements#Elements
 HTML_TAGS_BLOCK_LEVEL = frozenset(
-    [
+    (
        "address",
        "article",
        "aside",
@ -232,7 +237,7 @@ HTML_TAGS_BLOCK_LEVEL = frozenset(
        "section",
        "table",
        "ul",
-    ]
+    )
 )


@ -473,7 +478,7 @@ class BleachHTMLParser(HTMLParser):

    def __init__(self, tags, strip, consume_entities, **kwargs):
        """
-        :arg tags: list of allowed tags--everything else is either stripped or
+        :arg tags: set of allowed tags--everything else is either stripped or
            escaped; if None, then this doesn't look at tags at all
        :arg strip: whether to strip disallowed tags (True) or escape them (False);
            if tags=None, then this doesn't have any effect
@ -481,7 +486,9 @@ class BleachHTMLParser(HTMLParser):
            leave them as is when tokenizing (BleachHTMLTokenizer-added behavior)

        """
-        self.tags = [tag.lower() for tag in tags] if tags is not None else None
+        self.tags = (
+            frozenset((tag.lower() for tag in tags)) if tags is not None else None
+        )
        self.strip = strip
        self.consume_entities = consume_entities
        super().__init__(**kwargs)
@ -691,7 +698,7 @@ class BleachHTMLSerializer(HTMLSerializer):
                # Only leave entities in that are not ambiguous. If they're
                # ambiguous, then we escape the ampersand.
                if entity is not None and convert_entity(entity) is not None:
-                    yield "&" + entity + ";"
+                    yield f"&{entity};"

                    # Length of the entity plus 2--one for & at the beginning
                    # and one for ; at the end
--- a/lib/bleach/linkifier.py
+++ b/lib/bleach/linkifier.py
@ -120,9 +120,10 @@ class Linker:
        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
-            linkifying contents of ``pre`` tags
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
+            linkifying contents of ``pre`` tags; ``None`` means you don't
+            want linkify to skip any tags

        :arg bool parse_email: whether or not to linkify email addresses

@ -130,7 +131,7 @@ class Linker:

        :arg email_re: email matching regex

-        :arg list recognized_tags: the list of tags that linkify knows about;
+        :arg set recognized_tags: the set of tags that linkify knows about;
            everything else gets escaped

        :returns: linkified text as unicode
@ -145,15 +146,18 @@ class Linker:
        # Create a parser/tokenizer that allows all HTML tags and escapes
        # anything not in that list.
        self.parser = html5lib_shim.BleachHTMLParser(
-            tags=recognized_tags,
+            tags=frozenset(recognized_tags),
            strip=False,
-            consume_entities=True,
+            consume_entities=False,
            namespaceHTMLElements=False,
        )
        self.walker = html5lib_shim.getTreeWalker("etree")
        self.serializer = html5lib_shim.BleachHTMLSerializer(
            quote_attr_values="always",
            omit_optional_tags=False,
+            # We want to leave entities as they are without escaping or
+            # resolving or expanding
+            resolve_entities=False,
            # linkify does not sanitize
            sanitize=False,
            # linkify preserves attr order
@ -218,8 +222,8 @@ class LinkifyFilter(html5lib_shim.Filter):
        :arg list callbacks: list of callbacks to run when adjusting tag attributes;
            defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

-        :arg list skip_tags: list of tags that you don't want to linkify the
-            contents of; for example, you could set this to ``['pre']`` to skip
+        :arg set skip_tags: set of tags that you don't want to linkify the
+            contents of; for example, you could set this to ``{'pre'}`` to skip
            linkifying contents of ``pre`` tags

        :arg bool parse_email: whether or not to linkify email addresses
@ -232,7 +236,7 @@ class LinkifyFilter(html5lib_shim.Filter):
        super().__init__(source)

        self.callbacks = callbacks or []
-        self.skip_tags = skip_tags or []
+        self.skip_tags = skip_tags or {}
        self.parse_email = parse_email

        self.url_re = url_re
@ -510,6 +514,62 @@ class LinkifyFilter(html5lib_shim.Filter):
                yield {"type": "Characters", "data": str(new_text)}
                yield token_buffer[-1]

+    def extract_entities(self, token):
+        """Handles Characters tokens with entities
+
+        Our overridden tokenizer doesn't do anything with entities. However,
+        that means that the serializer will convert all ``&`` in Characters
+        tokens to ``&amp;``.
+
+        Since we don't want that, we extract entities here and convert them to
+        Entity tokens so the serializer will let them be.
+
+        :arg token: the Characters token to work on
+
+        :returns: generator of tokens
+
+        """
+        data = token.get("data", "")
+
+        # If there isn't a & in the data, we can return now
+        if "&" not in data:
+            yield token
+            return
+
+        new_tokens = []
+
+        # For each possible entity that starts with a "&", we try to extract an
+        # actual entity and re-tokenize accordingly
+        for part in html5lib_shim.next_possible_entity(data):
+            if not part:
+                continue
+
+            if part.startswith("&"):
+                entity = html5lib_shim.match_entity(part)
+                if entity is not None:
+                    if entity == "amp":
+                        # LinkifyFilter can't match urls across token boundaries
+                        # which is problematic with &amp; since that shows up in
+                        # querystrings all the time. This special-cases &amp;
+                        # and converts it to a & and sticks it in as a
+                        # Characters token. It'll get merged with surrounding
+                        # tokens in the BleachSanitizerFilter.__iter__ and
+                        # escaped in the serializer.
+                        new_tokens.append({"type": "Characters", "data": "&"})
+                    else:
+                        new_tokens.append({"type": "Entity", "name": entity})
+
+                    # Length of the entity plus 2--one for & at the beginning
+                    # and one for ; at the end
+                    remainder = part[len(entity) + 2 :]
+                    if remainder:
+                        new_tokens.append({"type": "Characters", "data": remainder})
+                    continue
+
+            new_tokens.append({"type": "Characters", "data": part})
+
+        yield from new_tokens
+
    def __iter__(self):
        in_a = False
        in_skip_tag = None
@ -564,8 +624,8 @@ class LinkifyFilter(html5lib_shim.Filter):

                new_stream = self.handle_links(new_stream)

-                for token in new_stream:
-                    yield token
+                for new_token in new_stream:
+                    yield from self.extract_entities(new_token)

                # We've already yielded this token, so continue
                continue
--- a/lib/bleach/sanitizer.py
+++ b/lib/bleach/sanitizer.py
@ -8,21 +8,23 @@ from bleach import html5lib_shim
 from bleach import parse_shim


-#: List of allowed tags
-ALLOWED_TAGS = [
-    "a",
-    "abbr",
-    "acronym",
-    "b",
-    "blockquote",
-    "code",
-    "em",
-    "i",
-    "li",
-    "ol",
-    "strong",
-    "ul",
-]
+#: Set of allowed tags
+ALLOWED_TAGS = frozenset(
+    (
+        "a",
+        "abbr",
+        "acronym",
+        "b",
+        "blockquote",
+        "code",
+        "em",
+        "i",
+        "li",
+        "ol",
+        "strong",
+        "ul",
+    )
+)


 #: Map of allowed attributes by tag
@ -33,7 +35,7 @@ ALLOWED_ATTRIBUTES = {
 }

 #: List of allowed protocols
-ALLOWED_PROTOCOLS = ["http", "https", "mailto"]
+ALLOWED_PROTOCOLS = frozenset(("http", "https", "mailto"))

 #: Invisible characters--0 to and including 31 except 9 (tab), 10 (lf), and 13 (cr)
 INVISIBLE_CHARACTERS = "".join(
@ -48,6 +50,10 @@ INVISIBLE_CHARACTERS_RE = re.compile("[" + INVISIBLE_CHARACTERS + "]", re.UNICOD
 INVISIBLE_REPLACEMENT_CHAR = "?"


+class NoCssSanitizerWarning(UserWarning):
+    pass
+
+
 class Cleaner:
    """Cleaner for cleaning HTML fragments of malicious content

@ -89,7 +95,7 @@ class Cleaner:
    ):
        """Initializes a Cleaner

-        :arg list tags: allowed list of tags; defaults to
+        :arg set tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
@ -143,6 +149,25 @@ class Cleaner:
            alphabetical_attributes=False,
        )

+        if css_sanitizer is None:
+            # FIXME(willkg): this doesn't handle when attributes or an
+            # attributes value is a callable
+            attributes_values = []
+            if isinstance(attributes, list):
+                attributes_values = attributes
+
+            elif isinstance(attributes, dict):
+                attributes_values = []
+                for values in attributes.values():
+                    if isinstance(values, (list, tuple)):
+                        attributes_values.extend(values)
+
+            if "style" in attributes_values:
+                warnings.warn(
+                    "'style' attribute specified, but css_sanitizer not set.",
+                    category=NoCssSanitizerWarning,
+                )
+
    def clean(self, text):
        """Cleans text and returns sanitized result as unicode

@ -155,9 +180,8 @@ class Cleaner:
        """
        if not isinstance(text, str):
            message = (
-                "argument cannot be of '{name}' type, must be of text type".format(
-                    name=text.__class__.__name__
-                )
+                f"argument cannot be of {text.__class__.__name__!r} type, "
+                + "must be of text type"
            )
            raise TypeError(message)

@ -167,13 +191,11 @@ class Cleaner:
        dom = self.parser.parseFragment(text)
        filtered = BleachSanitizerFilter(
            source=self.walker(dom),
-            # Bleach-sanitizer-specific things
+            allowed_tags=self.tags,
            attributes=self.attributes,
-            strip_disallowed_elements=self.strip,
+            strip_disallowed_tags=self.strip,
            strip_html_comments=self.strip_comments,
            css_sanitizer=self.css_sanitizer,
-            # html5lib-sanitizer things
-            allowed_elements=self.tags,
            allowed_protocols=self.protocols,
        )

@ -237,19 +259,21 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    def __init__(
        self,
        source,
-        allowed_elements=ALLOWED_TAGS,
+        allowed_tags=ALLOWED_TAGS,
        attributes=ALLOWED_ATTRIBUTES,
        allowed_protocols=ALLOWED_PROTOCOLS,
-        strip_disallowed_elements=False,
+        attr_val_is_uri=html5lib_shim.attr_val_is_uri,
+        svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref,
+        svg_allow_local_href=html5lib_shim.svg_allow_local_href,
+        strip_disallowed_tags=False,
        strip_html_comments=True,
        css_sanitizer=None,
-        **kwargs,
    ):
        """Creates a BleachSanitizerFilter instance

        :arg source: html5lib TreeWalker stream as an html5lib TreeWalker

-        :arg list allowed_elements: allowed list of tags; defaults to
+        :arg set allowed_tags: set of allowed tags; defaults to
            ``bleach.sanitizer.ALLOWED_TAGS``

        :arg dict attributes: allowed attributes; can be a callable, list or dict;
@ -258,8 +282,16 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
        :arg list allowed_protocols: allowed list of protocols for links; defaults
            to ``bleach.sanitizer.ALLOWED_PROTOCOLS``

-        :arg bool strip_disallowed_elements: whether or not to strip disallowed
-            elements
+        :arg attr_val_is_uri: set of attributes that have URI values
+
+        :arg svg_attr_val_allows_ref: set of SVG attributes that can have
+            references
+
+        :arg svg_allow_local_href: set of SVG elements that can have local
+            hrefs
+
+        :arg bool strip_disallowed_tags: whether or not to strip disallowed
+            tags

        :arg bool strip_html_comments: whether or not to strip HTML comments

@ -267,24 +299,24 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
            sanitizing style attribute values and style text; defaults to None

        """
-        self.attr_filter = attribute_filter_factory(attributes)
-        self.strip_disallowed_elements = strip_disallowed_elements
-        self.strip_html_comments = strip_html_comments
-        self.css_sanitizer = css_sanitizer
+        # NOTE(willkg): This is the superclass of
+        # html5lib.filters.sanitizer.Filter. We call this directly skipping the
+        # __init__ for html5lib.filters.sanitizer.Filter because that does
+        # things we don't need to do and kicks up the deprecation warning for
+        # using Sanitizer.
+        html5lib_shim.Filter.__init__(self, source)

-        # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
-        warnings.filterwarnings(
-            "ignore",
-            message="html5lib's sanitizer is deprecated",
-            category=DeprecationWarning,
-            module="bleach._vendor.html5lib",
-        )
-        return super().__init__(
-            source,
-            allowed_elements=allowed_elements,
-            allowed_protocols=allowed_protocols,
-            **kwargs,
-        )
+        self.allowed_tags = frozenset(allowed_tags)
+        self.allowed_protocols = frozenset(allowed_protocols)
+
+        self.attr_filter = attribute_filter_factory(attributes)
+        self.strip_disallowed_tags = strip_disallowed_tags
+        self.strip_html_comments = strip_html_comments
+
+        self.attr_val_is_uri = attr_val_is_uri
+        self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
+        self.css_sanitizer = css_sanitizer
+        self.svg_allow_local_href = svg_allow_local_href

    def sanitize_stream(self, token_iterator):
        for token in token_iterator:
@ -354,10 +386,10 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
        """
        token_type = token["type"]
        if token_type in ["StartTag", "EndTag", "EmptyTag"]:
-            if token["name"] in self.allowed_elements:
+            if token["name"] in self.allowed_tags:
                return self.allow_token(token)

-            elif self.strip_disallowed_elements:
+            elif self.strip_disallowed_tags:
                return None

            else:
@ -570,7 +602,7 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
    def disallowed_token(self, token):
        token_type = token["type"]
        if token_type == "EndTag":
-            token["data"] = "</%s>" % token["name"]
+            token["data"] = f"</{token['name']}>"

        elif token["data"]:
            assert token_type in ("StartTag", "EmptyTag")
@ -586,25 +618,19 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
                if ns is None or ns not in html5lib_shim.prefixes:
                    namespaced_name = name
                else:
-                    namespaced_name = "{}:{}".format(html5lib_shim.prefixes[ns], name)
+                    namespaced_name = f"{html5lib_shim.prefixes[ns]}:{name}"

-                attrs.append(
-                    ' %s="%s"'
-                    % (
-                        namespaced_name,
-                        # NOTE(willkg): HTMLSerializer escapes attribute values
-                        # already, so if we do it here (like HTMLSerializer does),
-                        # then we end up double-escaping.
-                        v,
-                    )
-                )
-            token["data"] = "<{}{}>".format(token["name"], "".join(attrs))
+                # NOTE(willkg): HTMLSerializer escapes attribute values
+                # already, so if we do it here (like HTMLSerializer does),
+                # then we end up double-escaping.
+                attrs.append(f' {namespaced_name}="{v}"')
+            token["data"] = f"<{token['name']}{''.join(attrs)}>"

        else:
-            token["data"] = "<%s>" % token["name"]
+            token["data"] = f"<{token['name']}>"

        if token.get("selfClosing"):
-            token["data"] = token["data"][:-1] + "/>"
+            token["data"] = f"{token['data'][:-1]}/>"

        token["type"] = "Characters"

--- a/requirements.txt
+++ b/requirements.txt
@ -5,7 +5,7 @@ backports.csv==1.0.7
 backports.functools-lru-cache==1.6.4
 backports.zoneinfo==0.2.1;python_version<"3.9"
 beautifulsoup4==4.11.1
-bleach==5.0.1
+bleach==6.0.0
 certifi==2022.12.7
 cheroot==9.0.0
 cherrypy==18.8.0