diff --git a/bleach/__init__.py b/bleach/__init__.py index bbcc2e03..8774e273 100644 --- a/bleach/__init__.py +++ b/bleach/__init__.py @@ -52,7 +52,7 @@ def clean( :arg str text: the text to clean - :arg list tags: allowed list of tags; defaults to + :arg set tags: set of allowed tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` :arg dict attributes: allowed attributes; can be a callable, list or dict; diff --git a/bleach/html5lib_shim.py b/bleach/html5lib_shim.py index d121953b..022fe19e 100644 --- a/bleach/html5lib_shim.py +++ b/bleach/html5lib_shim.py @@ -38,6 +38,9 @@ allowed_protocols, allowed_css_properties, allowed_svg_properties, + attr_val_is_uri, + svg_attr_val_allows_ref, + svg_allow_local_href, ) # noqa: E402 module level import not at top of file from bleach._vendor.html5lib.filters.sanitizer import ( Filter as SanitizerFilter, diff --git a/bleach/sanitizer.py b/bleach/sanitizer.py index da9637c1..35ccf71e 100644 --- a/bleach/sanitizer.py +++ b/bleach/sanitizer.py @@ -8,21 +8,23 @@ from bleach import parse_shim -#: List of allowed tags -ALLOWED_TAGS = [ - "a", - "abbr", - "acronym", - "b", - "blockquote", - "code", - "em", - "i", - "li", - "ol", - "strong", - "ul", -] +#: Set of allowed tags +ALLOWED_TAGS = frozenset( + ( + "a", + "abbr", + "acronym", + "b", + "blockquote", + "code", + "em", + "i", + "li", + "ol", + "strong", + "ul", + ) +) #: Map of allowed attributes by tag @@ -93,7 +95,7 @@ def __init__( ): """Initializes a Cleaner - :arg list tags: allowed list of tags; defaults to + :arg set tags: set of allowed tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` :arg dict attributes: allowed attributes; can be a callable, list or dict; @@ -190,13 +192,11 @@ def clean(self, text): dom = self.parser.parseFragment(text) filtered = BleachSanitizerFilter( source=self.walker(dom), - # Bleach-sanitizer-specific things + allowed_tags=self.tags, attributes=self.attributes, - strip_disallowed_elements=self.strip, + strip_disallowed_tags=self.strip, strip_html_comments=self.strip_comments, css_sanitizer=self.css_sanitizer, - # html5lib-sanitizer things - allowed_elements=self.tags, allowed_protocols=self.protocols, ) @@ -260,19 +260,21 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter): def __init__( self, source, - allowed_elements=ALLOWED_TAGS, + allowed_tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES, allowed_protocols=ALLOWED_PROTOCOLS, - strip_disallowed_elements=False, + attr_val_is_uri=html5lib_shim.attr_val_is_uri, + svg_attr_val_allows_ref=html5lib_shim.svg_attr_val_allows_ref, + svg_allow_local_href=html5lib_shim.svg_allow_local_href, + strip_disallowed_tags=False, strip_html_comments=True, css_sanitizer=None, - **kwargs, ): """Creates a BleachSanitizerFilter instance :arg source: html5lib TreeWalker stream as an html5lib TreeWalker - :arg list allowed_elements: allowed list of tags; defaults to + :arg set allowed_tags: set of allowed tags; defaults to ``bleach.sanitizer.ALLOWED_TAGS`` :arg dict attributes: allowed attributes; can be a callable, list or dict; @@ -281,8 +283,16 @@ def __init__( :arg list allowed_protocols: allowed list of protocols for links; defaults to ``bleach.sanitizer.ALLOWED_PROTOCOLS`` - :arg bool strip_disallowed_elements: whether or not to strip disallowed - elements + :arg attr_val_is_uri: set of attributes that have URI values + + :arg svg_attr_val_allows_ref: set of SVG attributes that can have + references + + :arg svg_allow_local_href: set of SVG elements that can have local + hrefs + + :arg bool strip_disallowed_tags: whether or not to strip disallowed + tags :arg bool strip_html_comments: whether or not to strip HTML comments @@ -290,24 +300,24 @@ def __init__( sanitizing style attribute values and style text; defaults to None """ + # NOTE(willkg): This is the superclass of + # html5lib.filters.sanitizer.Filter. We call this directly skipping the + # __init__ for html5lib.filters.sanitizer.Filter because that does + # things we don't need to do and kicks up the deprecation warning for + # using Sanitizer. + html5lib_shim.Filter.__init__(self, source) + + self.allowed_tags = frozenset(allowed_tags) + self.allowed_protocols = allowed_protocols + self.attr_filter = attribute_filter_factory(attributes) - self.strip_disallowed_elements = strip_disallowed_elements + self.strip_disallowed_tags = strip_disallowed_tags self.strip_html_comments = strip_html_comments - self.css_sanitizer = css_sanitizer - # filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init - warnings.filterwarnings( - "ignore", - message="html5lib's sanitizer is deprecated", - category=DeprecationWarning, - module="bleach._vendor.html5lib", - ) - return super().__init__( - source, - allowed_elements=allowed_elements, - allowed_protocols=allowed_protocols, - **kwargs, - ) + self.attr_val_is_uri = attr_val_is_uri + self.svg_attr_val_allows_ref = svg_attr_val_allows_ref + self.css_sanitizer = css_sanitizer + self.svg_allow_local_href = svg_allow_local_href def sanitize_stream(self, token_iterator): for token in token_iterator: @@ -377,10 +387,10 @@ def sanitize_token(self, token): """ token_type = token["type"] if token_type in ["StartTag", "EndTag", "EmptyTag"]: - if token["name"] in self.allowed_elements: + if token["name"] in self.allowed_tags: return self.allow_token(token) - elif self.strip_disallowed_elements: + elif self.strip_disallowed_tags: return None else: