From b882e36888b7f1d228e5fd718be822028cbeaaca Mon Sep 17 00:00:00 2001 From: Marcos Prieto Date: Tue, 12 Dec 2023 10:17:40 +0100 Subject: [PATCH] Extend the HTML service to replace some tags by new lines Keep the duplicate whitespace behaviour outside the service and move it back to the JSTOR one, where it was originally written. --- lms/services/html_service.py | 14 +++++++++----- lms/services/jstor/_article_metadata.py | 5 +++++ tests/unit/lms/services/html_service_test.py | 16 +++++++++------- 3 files changed, 23 insertions(+), 12 deletions(-) diff --git a/lms/services/html_service.py b/lms/services/html_service.py index 74b955a59b..7cce684e7b 100644 --- a/lms/services/html_service.py +++ b/lms/services/html_service.py @@ -2,20 +2,24 @@ class WhiteSpaceHTMLParser(HTMLParser): - def __init__(self): + def __init__(self, tags_to_newline): super().__init__() self._chunks = [] + self._tags_to_new_line = tags_to_newline or [] def handle_data(self, data): self._chunks.append(data) + def handle_endtag(self, tag): + if tag in self._tags_to_new_line: + self._chunks.append("\n") + def get_text(self) -> str: - # Strip leading/trailing whitespace and duplicate spaces - return " ".join("".join(self._chunks).split()) + return "".join(self._chunks).strip() -def strip_html_tags(html: str) -> str: - parser = WhiteSpaceHTMLParser() +def strip_html_tags(html: str, tags_to_newline=None) -> str: + parser = WhiteSpaceHTMLParser(tags_to_newline) parser.feed(html) parser.close() diff --git a/lms/services/jstor/_article_metadata.py b/lms/services/jstor/_article_metadata.py index afdd33a514..7d72cf8dd5 100644 --- a/lms/services/jstor/_article_metadata.py +++ b/lms/services/jstor/_article_metadata.py @@ -127,3 +127,8 @@ def _get_titles(self, title_key, subtitle_key): titles["subtitle"] = strip_html_tags(subtitle) return titles + + @staticmethod + def _strip_html_tags(html: str) -> str: + # Strip leading/trailing whitespace and duplicate spaces + return " ".join(strip_html_tags(html).split()) diff --git a/tests/unit/lms/services/html_service_test.py b/tests/unit/lms/services/html_service_test.py index 64900efa3c..7d7e580800 100644 --- a/tests/unit/lms/services/html_service_test.py +++ b/tests/unit/lms/services/html_service_test.py @@ -4,13 +4,15 @@ @pytest.mark.parametrize( - "text,expected", + "text,expected,tags_to_new_line", [ - ("COLON :", "COLON :"), - ("A B", "A B"), - (" C D E", "C D E"), - ("ACOLON :", "COLON :", None), + ("A B", "A B", None), + (" C D E", "C D E", None), + ("APARAGRAPH

OTHER", "PARAGRAPH\nOTHER", ["p"]), + ("

PARAGRAPH

OTHER
ANOTHER", "PARAGRAPH\nOTHER\nANOTHER", ["p", "br"]), ], ) -def test_strip_html_tags(text, expected): - assert strip_html_tags(text) == expected +def test_strip_html_tags(text, expected, tags_to_new_line): + assert strip_html_tags(text, tags_to_new_line) == expected