Skip to content

Commit

Permalink
Extend the HTML service to replace some tags by new lines
Browse files Browse the repository at this point in the history
Keep the duplicate-whitespace collapsing out of the HTML service and move it
back to the JSTOR service, where it was originally written.
  • Loading branch information
marcospri committed Dec 12, 2023
1 parent 0d1ae24 commit b882e36
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 12 deletions.
14 changes: 9 additions & 5 deletions lms/services/html_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,24 @@


class WhiteSpaceHTMLParser(HTMLParser):
    """Collect the text content of an HTML fragment, discarding the markup.

    Tags named in ``tags_to_newline`` are replaced by a newline character so
    block-level structure (paragraphs, line breaks, ...) survives as line
    breaks in the extracted text. All other tags are dropped silently.

    Usage: ``feed()`` the markup, ``close()`` the parser, then call
    ``get_text()``.
    """

    # Elements that have no closing tag in HTML. For these the parser only
    # reports a start tag (``<br>``) or a start-end tag (``<br/>``), so the
    # newline has to be emitted from the start-tag handlers.
    _VOID_ELEMENTS = frozenset(
        {
            "area",
            "base",
            "br",
            "col",
            "embed",
            "hr",
            "img",
            "input",
            "link",
            "meta",
            "source",
            "track",
            "wbr",
        }
    )

    def __init__(self, tags_to_newline=None):
        """Create a parser.

        :param tags_to_newline: iterable of tag names that should be replaced
            by a newline, or ``None`` to replace no tags.
        """
        super().__init__()
        self._chunks = []
        # HTMLParser reports tag names lower-cased, so normalise the
        # configured names to make the membership tests below reliable.
        self._tags_to_new_line = frozenset(
            tag.lower() for tag in tags_to_newline or ()
        )

    def handle_data(self, data):
        """Record a run of text found between tags."""
        self._chunks.append(data)

    def handle_starttag(self, tag, attrs):
        """Emit a newline for a configured void element such as ``<br>``.

        Non-void elements get their newline when their end tag is seen.
        """
        if tag in self._VOID_ELEMENTS and tag in self._tags_to_new_line:
            self._chunks.append("\n")

    def handle_startendtag(self, tag, attrs):
        """Emit exactly one newline for a configured self-closing tag.

        Overridden because the default implementation calls both
        ``handle_starttag`` and ``handle_endtag``, which would produce two
        newlines for ``<br/>``.
        """
        if tag in self._tags_to_new_line:
            self._chunks.append("\n")

    def handle_endtag(self, tag):
        """Emit a newline when a configured element is closed."""
        if tag in self._tags_to_new_line:
            self._chunks.append("\n")

    def get_text(self) -> str:
        """Return the collected text with leading/trailing whitespace removed.

        Interior whitespace is left untouched; callers that want runs of
        whitespace collapsed must do so themselves.
        """
        return "".join(self._chunks).strip()


def strip_html_tags(html: str) -> str:
parser = WhiteSpaceHTMLParser()
def strip_html_tags(html: str, tags_to_newline=None) -> str:
parser = WhiteSpaceHTMLParser(tags_to_newline)
parser.feed(html)
parser.close()

Expand Down
5 changes: 5 additions & 0 deletions lms/services/jstor/_article_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,8 @@ def _get_titles(self, title_key, subtitle_key):
titles["subtitle"] = strip_html_tags(subtitle)

return titles

@staticmethod
def _strip_html_tags(html: str) -> str:
    """Return ``html`` without tags and with whitespace normalised.

    The markup is removed by ``strip_html_tags``; the remaining text is then
    split on whitespace and re-joined with single spaces, which drops
    leading/trailing whitespace and collapses every interior run of
    whitespace into one space.
    """
    words = strip_html_tags(html).split()
    return " ".join(words)
16 changes: 9 additions & 7 deletions tests/unit/lms/services/html_service_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@


@pytest.mark.parametrize(
    "text,expected,tags_to_new_line",
    [
        ("<b>COLON :</b>", "COLON :", None),
        ("A <em>B</em>", "A B", None),
        (" C <em>D</em> E", "C D E", None),
        ("A<B", "A<B", None),
        # Interior whitespace is preserved; only the ends are stripped.
        ("A  <em>B</em>", "A  B", None),
        ("<p>PARAGRAPH</p>OTHER", "PARAGRAPH\nOTHER", ["p"]),
        ("<p>PARAGRAPH</p>OTHER<br/>ANOTHER", "PARAGRAPH\nOTHER\nANOTHER", ["p", "br"]),
        # A newline produced by the final closing tag is stripped.
        ("<p>PARAGRAPH</p>", "PARAGRAPH", ["p"]),
    ],
)
def test_strip_html_tags(text, expected, tags_to_new_line):
    """Check tag removal, edge stripping and tag-to-newline replacement."""
    assert strip_html_tags(text, tags_to_new_line) == expected

0 comments on commit b882e36

Please sign in to comment.