#600 - refactored html helper (#601)

* #600 - refactored html helper.
* #600 - fixed small flake8 error
jackdewinter · Mar 6, 2023 · 8bc4455 · 8bc4455
1 parent 94537c8
commit 8bc4455
Show file tree

Hide file tree

Showing 10 changed files with 353 additions and 303 deletions.
diff --git a/publish/coverage.json b/publish/coverage.json
@@ -6,8 +6,8 @@
         "totalCovered": 3581
     },
     "lineLevel": {
-        "totalMeasured": 14479,
-        "totalCovered": 14479
+        "totalMeasured": 14501,
+        "totalCovered": 14501
     }
 }
 
diff --git a/publish/pylint_suppression.json b/publish/pylint_suppression.json
@@ -66,9 +66,10 @@
             "too-many-arguments": 2
         },
         "pymarkdown/extensions/task_list_items.py": {},
-        "pymarkdown/html_helper.py": {
+        "pymarkdown/html/html_helper.py": {
             "too-many-arguments": 1
         },
+        "pymarkdown/html/html_raw_helper.py": {},
         "pymarkdown/inline/inline_autolink_helper.py": {},
         "pymarkdown/inline/inline_backslash_helper.py": {},
         "pymarkdown/inline/inline_backtick_helper.py": {},

diff --git a/pymarkdown/html_helper.py → pymarkdown/html/html_helper.py b/pymarkdown/html_helper.py → pymarkdown/html/html_helper.py
@@ -6,10 +6,8 @@
 from typing import List, Optional, Tuple, cast
 
 from pymarkdown.block_quote_data import BlockQuoteData
-from pymarkdown.constants import Constants
 from pymarkdown.container_helper import ContainerHelper
-from pymarkdown.inline.inline_request import InlineRequest
-from pymarkdown.inline_markdown_token import RawHtmlMarkdownToken, TextMarkdownToken
+from pymarkdown.inline_markdown_token import TextMarkdownToken
 from pymarkdown.leaf_markdown_token import HtmlBlockMarkdownToken
 from pymarkdown.markdown_token import MarkdownToken
 from pymarkdown.parser_helper import ParserHelper
@@ -21,8 +19,6 @@
 
 POGGER = ParserLogger(logging.getLogger(__name__))
 
-# pylint: disable=too-many-lines
-
 
 class HtmlHelper:
     """
@@ -45,20 +41,14 @@ class HtmlHelper:
     __html_attribute_value_double = '"'
     __html_attribute_name_value_separator = "="
     __html_attribute_separator = ParserHelper.space_character
-    __valid_tag_name_start = string.ascii_letters
     __valid_tag_name_characters = f"{string.ascii_letters}{string.digits}-"
-    __tag_attribute_name_characters = f"{string.ascii_letters}{string.digits}_.:-"
-    __unquoted_attribute_value_stop = f"\"'=<>`{Constants.ascii_whitespace}"
-    __tag_attribute_name_start = f"{string.ascii_letters}_:"
     __html_block_1_start_tag_prefix = ["script", "pre", "style"]
     __html_tag_attribute_value_terminators = " \"'=<>`"
     __html_block_2_to_5_start = "!"
     __html_block_2_continued_start = "--"
-    __html_block_2_xx = f"{__html_block_2_to_5_start}{__html_block_2_continued_start}"
     __html_block_3_continued_start = "?"
     __html_block_4_continued_start = string.ascii_uppercase
     __html_block_5_continued_start = "[CDATA["
-    __html_block_5_xx = f"{__html_block_2_to_5_start}{__html_block_5_continued_start}"
     __html_block_1_end_tags = ["</script>", "</pre>", "</style>"]
     __html_block_2_end = "-->"
     __html_block_3_end = "?>"
@@ -68,13 +58,6 @@ class HtmlHelper:
     __attribute_start_characters = "abcdefghijklmnopqrstuvwxyz1234567890:_"
     __attribute_other_characters = f"{__attribute_start_characters}.-"
 
-    __raw_declaration_start_character = "!"
-    __raw_declaration_whitespace = ParserHelper.space_character
-    __raw_html_exclusion_1 = ">"
-    __raw_html_exclusion_2 = "->"
-    __raw_html_exclusion_3 = "-"
-    __raw_html_exclusion_4 = "--"
-
     __html_block_6_start = [
         "address",
         "article",
@@ -351,281 +334,6 @@ def is_complete_html_start_tag(
             non_whitespace_index,
         )
 
-    @staticmethod
-    def __parse_raw_tag_name(text_to_parse: str, start_index: int) -> str:
-        """
-        Parse a HTML tag name from the string.
-        """
-        if ParserHelper.is_character_at_index_one_of(
-            text_to_parse, start_index, HtmlHelper.__valid_tag_name_start
-        ):
-            index, __ = ParserHelper.collect_while_one_of_characters(
-                text_to_parse, start_index + 1, HtmlHelper.__valid_tag_name_characters
-            )
-            return text_to_parse[:index]
-        return ""
-
-    @staticmethod
-    def __parse_tag_attributes(
-        text_to_parse: str, start_index: int
-    ) -> Tuple[Optional[int], Optional[str]]:
-        """
-        Handle the parsing of the attributes for an open tag.
-        """
-        parse_index, _ = ParserHelper.collect_while_one_of_characters(
-            text_to_parse, start_index, HtmlHelper.__tag_attribute_name_characters
-        )
-        assert parse_index is not None
-        end_name_index, extracted_whitespace = ParserHelper.extract_ascii_whitespace(
-            text_to_parse, parse_index
-        )
-        assert end_name_index is not None
-        if ParserHelper.is_character_at_index(
-            text_to_parse,
-            end_name_index,
-            HtmlHelper.__html_attribute_name_value_separator,
-        ):
-            (
-                value_start_index,
-                _,
-            ) = ParserHelper.extract_ascii_whitespace(text_to_parse, end_name_index + 1)
-            assert value_start_index is not None
-            value_end_index: Optional[int] = None
-            if ParserHelper.is_character_at_index_one_of(
-                text_to_parse,
-                value_start_index,
-                HtmlHelper.__html_attribute_value_single,
-            ):
-                value_end_index, _ = ParserHelper.collect_until_character(
-                    text_to_parse,
-                    value_start_index + 1,
-                    HtmlHelper.__html_attribute_value_single,
-                )
-                assert value_end_index is not None
-                if not ParserHelper.is_character_at_index(
-                    text_to_parse,
-                    value_end_index,
-                    HtmlHelper.__html_attribute_value_single,
-                ):
-                    return None, None
-                value_end_index += 1
-            elif ParserHelper.is_character_at_index_one_of(
-                text_to_parse,
-                value_start_index,
-                HtmlHelper.__html_attribute_value_double,
-            ):
-                value_end_index, _ = ParserHelper.collect_until_character(
-                    text_to_parse,
-                    value_start_index + 1,
-                    HtmlHelper.__html_attribute_value_double,
-                )
-                assert value_end_index is not None
-                if not ParserHelper.is_character_at_index(
-                    text_to_parse,
-                    value_end_index,
-                    HtmlHelper.__html_attribute_value_double,
-                ):
-                    return None, None
-                value_end_index += 1
-            else:
-                value_end_index, _ = ParserHelper.collect_until_one_of_characters(
-                    text_to_parse,
-                    value_start_index,
-                    HtmlHelper.__unquoted_attribute_value_stop,
-                )
-            assert value_end_index is not None
-            (
-                end_name_index,
-                extracted_whitespace,
-            ) = ParserHelper.extract_ascii_whitespace(text_to_parse, value_end_index)
-
-        return end_name_index, extracted_whitespace
-
-    @staticmethod
-    def __parse_raw_open_tag(text_to_parse: str) -> Tuple[Optional[str], int]:
-        """
-        Parse the current line as if it is an open tag, and determine if it is valid.
-        """
-
-        end_parse_index, valid_raw_html, tag_name = (
-            -1,
-            None,
-            HtmlHelper.__parse_raw_tag_name(text_to_parse, 0),
-        )
-        if tag_name:
-            parse_index, extracted_whitespace = ParserHelper.extract_ascii_whitespace(
-                text_to_parse, len(tag_name)
-            )
-            assert parse_index is not None
-            while extracted_whitespace and ParserHelper.is_character_at_index_one_of(
-                text_to_parse,
-                parse_index,
-                HtmlHelper.__tag_attribute_name_start,
-            ):
-                (
-                    parse_index,
-                    extracted_whitespace,
-                ) = HtmlHelper.__parse_tag_attributes(text_to_parse, parse_index)
-                if parse_index is None:
-                    return None, -1
-
-            if ParserHelper.is_character_at_index(
-                text_to_parse, parse_index, HtmlHelper.__html_tag_start
-            ):
-                parse_index += 1
-
-            if ParserHelper.is_character_at_index(
-                text_to_parse, parse_index, HtmlHelper.__html_tag_end
-            ):
-                valid_raw_html = text_to_parse[:parse_index]
-                end_parse_index = parse_index + 1
-
-        return valid_raw_html, end_parse_index
-
-    @staticmethod
-    def __parse_raw_close_tag(text_to_parse: str) -> Optional[str]:
-        """
-        Parse the current line as if it is a close tag, and determine if it is valid.
-        """
-        valid_raw_html = None
-        if ParserHelper.is_character_at_index(
-            text_to_parse, 0, HtmlHelper.__html_tag_start
-        ):
-            if tag_name := HtmlHelper.__parse_raw_tag_name(text_to_parse, 1):
-                parse_index: Optional[int] = len(tag_name)
-                assert parse_index is not None
-                text_to_parse_size = len(text_to_parse)
-                if parse_index != text_to_parse_size:
-                    parse_index, _ = ParserHelper.extract_spaces(
-                        text_to_parse, parse_index
-                    )
-                if parse_index == text_to_parse_size:
-                    valid_raw_html = text_to_parse
-        return valid_raw_html
-
-    @staticmethod
-    def __parse_raw_declaration(text_to_parse: str) -> Optional[str]:
-        """
-        Parse a possible raw html declaration sequence, and return if it is valid.
-        """
-
-        valid_raw_html = None
-        if ParserHelper.is_character_at_index_one_of(
-            text_to_parse, 0, HtmlHelper.__raw_declaration_start_character
-        ):
-            (
-                parse_index,
-                declaration_name,
-            ) = ParserHelper.collect_while_one_of_characters(
-                text_to_parse, 1, HtmlHelper.__html_block_4_continued_start
-            )
-            assert parse_index is not None
-            if declaration_name:
-                whitespace_count, _ = ParserHelper.collect_while_character(
-                    text_to_parse, parse_index, HtmlHelper.__raw_declaration_whitespace
-                )
-                if whitespace_count:
-                    valid_raw_html = text_to_parse
-        return valid_raw_html
-
-    @staticmethod
-    def __process_raw_special(
-        remaining_line: str,
-        special_start: str,
-        special_end: str,
-        do_extra_check: bool = False,
-    ) -> Tuple[Optional[str], int]:
-        """
-        Parse a possible raw html special sequence, and return if it is valid.
-        """
-        valid_raw_html: Optional[str] = None
-        parse_index = -1
-        if remaining_line.startswith(special_start):
-            special_start_size = len(special_start)
-            remaining_line = remaining_line[special_start_size:]
-            parse_index = remaining_line.find(special_end)
-            if parse_index != -1:
-                remaining_line = remaining_line[:parse_index]
-                parse_index = parse_index + special_start_size + len(special_end)
-                if (not do_extra_check) or (
-                    not (
-                        remaining_line[0] == HtmlHelper.__raw_html_exclusion_1
-                        or remaining_line.startswith(HtmlHelper.__raw_html_exclusion_2)
-                        or remaining_line[-1] == HtmlHelper.__raw_html_exclusion_3
-                        or HtmlHelper.__raw_html_exclusion_4 in remaining_line
-                    )
-                ):
-                    valid_raw_html = (
-                        f"{special_start}{remaining_line}{special_end[:-1]}"
-                    )
-        return valid_raw_html, parse_index
-
-    @staticmethod
-    def parse_raw_html(
-        only_between_angles: str,
-        remaining_line: str,
-        line_number: int,
-        column_number: int,
-        inline_request: InlineRequest,
-    ) -> Tuple[Optional[RawHtmlMarkdownToken], int]:
-        """
-        Given an open HTML tag character (<), try the various possibilities for
-        types of tag, and determine if any of them parse validly.
-        """
-
-        valid_raw_html, remaining_line_parse_index = HtmlHelper.__parse_raw_open_tag(
-            remaining_line
-        )
-        if not valid_raw_html:
-            valid_raw_html = HtmlHelper.__parse_raw_close_tag(only_between_angles)
-        if not valid_raw_html:
-            (
-                valid_raw_html,
-                remaining_line_parse_index,
-            ) = HtmlHelper.__process_raw_special(
-                remaining_line,
-                HtmlHelper.__html_block_2_xx,
-                HtmlHelper.__html_block_2_end,
-                True,
-            )
-        if not valid_raw_html:
-            (
-                valid_raw_html,
-                remaining_line_parse_index,
-            ) = HtmlHelper.__process_raw_special(
-                remaining_line,
-                HtmlHelper.__html_block_3_continued_start,
-                HtmlHelper.__html_block_3_end,
-            )
-        if not valid_raw_html:
-            (
-                valid_raw_html,
-                remaining_line_parse_index,
-            ) = HtmlHelper.__process_raw_special(
-                remaining_line,
-                HtmlHelper.__html_block_5_xx,
-                HtmlHelper.__html_block_5_end,
-            )
-        if not valid_raw_html:
-            valid_raw_html = HtmlHelper.__parse_raw_declaration(only_between_angles)
-
-        if not valid_raw_html:
-            return None, -1
-        if inline_request.para_owner:
-            (
-                valid_raw_html,
-                inline_request.para_owner.rehydrate_index,
-            ) = ParserHelper.recombine_string_with_whitespace(
-                valid_raw_html,
-                inline_request.para_owner.extracted_whitespace,
-                inline_request.para_owner.rehydrate_index,
-                add_replace_marker_if_empty=True,
-            )
-        return (
-            RawHtmlMarkdownToken(valid_raw_html, line_number, column_number),
-            remaining_line_parse_index,
-        )
-
     @staticmethod
     def __check_for_special_html_blocks(
         line_to_parse: str, character_index: int