refactor: Improve sentence streaming logic in slop.py and enhance tests with Hypothesis for better coverage
mbrock committed Nov 27, 2024
1 parent 1ee5377 commit 5c7562a
Showing 2 changed files with 106 additions and 81 deletions.
61 changes: 18 additions & 43 deletions bubble/slop.py
@@ -8,51 +8,30 @@

 async def stream_sentences(stream, initial_sentence=""):
     """Stream sentences from an Anthropic response, yielding each complete sentence."""
+    import re

     # Start with any initial sentence fragment passed in
     current_sentence = initial_sentence

     # Process each chunk from the stream
     for chunk in stream:
         if isinstance(chunk, anthropic.TextEvent):
             # Add the new text to our current sentence buffer
             current_sentence += chunk.text

-            # Look for complete sentences
-            while "</sentence>" in current_sentence:
-                # Split on first </sentence> tag
-                parts = current_sentence.split("</sentence>", 1)
-                sentence_content = parts[0].strip()
-
-                # Extract just the sentence content, ignoring any tags
-                if "<sentence>" in sentence_content:
-                    _, sentence_content = sentence_content.split(
-                        "<sentence>", 1
-                    )
-
-                # Remove trailing period if present
-                sentence_content = sentence_content.rstrip(".")
+            # Keep extracting complete sentences while we have them
+            while match := re.search(
+                r"^(.*?[.!?])[ \n](.*)$", current_sentence, re.DOTALL
+            ):
+                # Extract the complete sentence and yield it
+                sentence_content = match.group(1)
+                # Keep the remainder for next iteration
+                current_sentence = match.group(2)
+                yield sentence_content

-                # Only process if there's actual content
-                if sentence_content.strip():
-                    # Handle multiline content by joining with spaces
-                    cleaned_sentence = " ".join(
-                        line.strip()
-                        for line in sentence_content.splitlines()
-                    )
-                    yield cleaned_sentence
-
-                # Keep remainder for next iteration
-                current_sentence = parts[1]
-
-    # Handle any final incomplete sentence
-    if current_sentence.strip():
-        if "<sentence>" in current_sentence:
-            _, sentence_content = current_sentence.rsplit(
-                "<sentence>", 1
-            )
-            sentence_content = sentence_content.rstrip(".")
-            if sentence_content.strip():
-                cleaned_sentence = " ".join(
-                    line.strip()
-                    for line in sentence_content.splitlines()
-                )
-                yield cleaned_sentence
+    # Yield any remaining text as the final sentence
+    if current_sentence:
+        yield current_sentence


 async def stream_normally(stream) -> str:
@@ -65,10 +44,6 @@ async def stream_normally(stream) -> str:
         console.print(chunk.text, end="")
     console.print()

-    # Ensure we have complete XML tags
-    if text.count("<sentence>") != text.count("</sentence>"):
-        text += "</sentence>"
-
     return text


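For reference, here is a minimal, self-contained sketch of the splitting behaviour the new regex-based loop above implements, using plain string chunks instead of Anthropic TextEvent objects; the helper name split_sentences and the sample chunks are illustrative only, not part of the commit.

    import re

    def split_sentences(chunks):
        """Yield complete sentences from incrementally arriving text chunks."""
        buffer = ""
        for chunk in chunks:
            buffer += chunk
            # A sentence is complete once ., ! or ? is followed by a space or
            # newline; the separator is consumed and the rest stays buffered.
            while match := re.search(r"^(.*?[.!?])[ \n](.*)$", buffer, re.DOTALL):
                yield match.group(1)
                buffer = match.group(2)
        # Anything left over is yielded as the final, possibly unterminated, sentence.
        if buffer:
            yield buffer

    chunks = ["First sentence", ". ", "Second ", "sentence. ", "Third sentence."]
    print(list(split_sentences(chunks)))
    # ['First sentence.', 'Second sentence.', 'Third sentence.']

Note that yielded sentences now keep their trailing punctuation, which is why the expected values in the tests below change from "First sentence" to "First sentence.".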
126 changes: 88 additions & 38 deletions bubble/test/test_slop.py
@@ -1,71 +1,121 @@
 import pytest
 from anthropic import TextEvent
 from bubble.slop import stream_sentences, stream_normally
+import hypothesis.strategies as st
+from hypothesis import given

+sentence_ending_punctuation = st.sampled_from(".!?")
+
+sentence = st.text("abc, \n").flatmap(
+    lambda text: sentence_ending_punctuation.map(
+        lambda punctuation: text + punctuation
+    )
+)
+

 def create_text_stream(*chunks):
     """Create a mock Anthropic text stream from text chunks"""
-    return [TextEvent(type="text", text=chunk, snapshot="") for chunk in chunks]
+    return [
+        TextEvent(type="text", text=chunk, snapshot="")
+        for chunk in chunks
+    ]


 @pytest.fixture
 def text_stream():
     """Create a mock Anthropic text stream"""
     return create_text_stream(
-        "<sentence>First sentence",
-        ".</sentence> ",
-        "<sentence>Second ",
-        "sentence.</sentence>",
-        "<sentence>Third sentence."
+        "First sentence",
+        ". ",
+        "Second ",
+        "sentence. ",
+        "Third sentence.",
     )


 @pytest.mark.trio
 async def test_stream_sentences(text_stream):
-    """Test streaming sentences with XML tags"""
+    """Test streaming sentences"""
     sentences = []
     async for sentence in stream_sentences(text_stream):
         sentences.append(sentence)

     assert sentences == [
-        "First sentence",
-        "Second sentence",
-        "Third sentence"
+        "First sentence.",
+        "Second sentence.",
+        "Third sentence.",
     ]


-@pytest.mark.trio
-async def test_stream_sentences_with_initial(text_stream):
-    """Test streaming with initial partial sentence"""
-    sentences = []
-    async for sentence in stream_sentences(
-        text_stream, initial_sentence="<sentence>Initial "
-    ):
-        sentences.append(sentence)
-
-    assert sentences[0].startswith("Initial")


 @pytest.mark.trio
 async def test_stream_normally(text_stream):
     """Test normal streaming without sentence parsing"""
     result = await stream_normally(text_stream)

-    expected = (
-        "<sentence>First sentence.</sentence> "
-        "<sentence>Second sentence.</sentence>"
-        "<sentence>Third sentence.</sentence>"
-    )
+    expected = "First sentence. " "Second sentence. " "Third sentence."

     assert result == expected


-@pytest.mark.trio
-async def test_stream_multiline_sentences():
-    """Test handling of multiline sentence content"""
-    events = create_text_stream("<sentence>First\nline\nof text.")
+@given(st.text())
+async def test_sentence_stream_yields_same_text(text: str):
+    events = create_text_stream(text)
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert "".join(sentences).strip() == text.strip()

-    sentences = []
-    async for sentence in stream_sentences(events):
-        sentences.append(sentence)
-
-    assert sentences == ["First line of text"]

+@given(st.lists(sentence))
+async def test_sentence_stream_on_list_of_sentences(
+    sentences: list[str],
+):
+    events = create_text_stream(" ".join(sentences))
+    streamed = [sentence async for sentence in stream_sentences(events)]
+    assert streamed == sentences
+
+
+@given(st.lists(sentence))
+async def test_sentence_stream_on_list_of_sentences_with_newlines(
+    sentences: list[str],
+):
+    events = create_text_stream("\n".join(sentences))
+    streamed = [sentence async for sentence in stream_sentences(events)]
+    assert streamed == sentences
+
+
+async def test_sentence_stream_handles_no_sentence_ending():
+    events = create_text_stream("This is a test")
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert sentences == ["This is a test"]
+
+
+async def test_sentence_stream_handles_ellipsis():
+    events = create_text_stream("This is a test... This is a test.")
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert sentences == ["This is a test...", "This is a test."]
+
+
+async def test_sentence_stream_handles_newlines():
+    events = create_text_stream("This is a test.\nThis is a test.")
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert sentences == ["This is a test.", "This is a test."]
+
+
+@given(
+    sentence.flatmap(
+        lambda s: st.integers(1, len(s)).map(lambda n: (s, n))
+    )
+)
+async def test_foo(x):
+    (sentence, n) = x
+    a = sentence[:n]
+    b = sentence[n:]
+    assert a + b == sentence
+    events = create_text_stream(a, b)
+    streamed = [sentence async for sentence in stream_sentences(events)]
+    assert streamed == [sentence]
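As a quick illustration of how the composed Hypothesis strategy used above behaves, the sketch below repeats the same strategy definitions and draws a few sample sentences; .example() is meant only for interactive exploration, not for use inside tests.

    import hypothesis.strategies as st

    # Same construction as in the tests: text over a small alphabet that may
    # include spaces and newlines, always terminated by one of ".", "!" or "?".
    sentence_ending_punctuation = st.sampled_from(".!?")
    sentence = st.text("abc, \n").flatmap(
        lambda text: sentence_ending_punctuation.map(
            lambda punctuation: text + punctuation
        )
    )

    for _ in range(3):
        print(repr(sentence.example()))  # e.g. 'ab c,\na!'

Because the generated text may itself contain spaces and newlines, the list-based round-trip properties above exercise inputs where sentence boundaries are ambiguous, which is where a splitter like this is most likely to misbehave.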
