refactor: Improve sentence streaming logic in slop.py and enhance tests with Hypothesis for better coverage
mbrock committed Nov 27, 2024
1 parent 1ee5377 commit 5c7562a
Showing 2 changed files with 106 additions and 81 deletions.
61 changes: 18 additions & 43 deletions bubble/slop.py
@@ -8,51 +8,30 @@

 async def stream_sentences(stream, initial_sentence=""):
     """Stream sentences from an Anthropic response, yielding each complete sentence."""
+    import re

     # Start with any initial sentence fragment passed in
     current_sentence = initial_sentence

     # Process each chunk from the stream
     for chunk in stream:
         if isinstance(chunk, anthropic.TextEvent):
             # Add the new text to our current sentence buffer
             current_sentence += chunk.text

-            # Look for complete sentences
-            while "</sentence>" in current_sentence:
-                # Split on first </sentence> tag
-                parts = current_sentence.split("</sentence>", 1)
-                sentence_content = parts[0].strip()
-
-                # Extract just the sentence content, ignoring any tags
-                if "<sentence>" in sentence_content:
-                    _, sentence_content = sentence_content.split(
-                        "<sentence>", 1
-                    )
-
-                # Remove trailing period if present
-                sentence_content = sentence_content.rstrip(".")
+            # Keep extracting complete sentences while we have them
+            while match := re.search(
+                r"^(.*?[.!?])[ \n](.*)$", current_sentence, re.DOTALL
+            ):
+                # Extract the complete sentence and yield it
+                sentence_content = match.group(1)
+                # Keep the remainder for next iteration
+                current_sentence = match.group(2)
+                yield sentence_content

-                # Only process if there's actual content
-                if sentence_content.strip():
-                    # Handle multiline content by joining with spaces
-                    cleaned_sentence = " ".join(
-                        line.strip()
-                        for line in sentence_content.splitlines()
-                    )
-                    yield cleaned_sentence
-
-                # Keep remainder for next iteration
-                current_sentence = parts[1]
-
-    # Handle any final incomplete sentence
-    if current_sentence.strip():
-        if "<sentence>" in current_sentence:
-            _, sentence_content = current_sentence.rsplit(
-                "<sentence>", 1
-            )
-            sentence_content = sentence_content.rstrip(".")
-            if sentence_content.strip():
-                cleaned_sentence = " ".join(
-                    line.strip()
-                    for line in sentence_content.splitlines()
-                )
-                yield cleaned_sentence
+    # Yield any remaining text as the final sentence
+    if current_sentence:
+        yield current_sentence


 async def stream_normally(stream) -> str:
@@ -65,10 +44,6 @@ async def stream_normally(stream) -> str:
         console.print(chunk.text, end="")
     console.print()

-    # Ensure we have complete XML tags
-    if text.count("<sentence>") != text.count("</sentence>"):
-        text += "</sentence>"
-
     return text


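For reference, here is a minimal, self-contained sketch of the splitting behaviour the new regex-based loop above implements, using plain string chunks instead of Anthropic TextEvent objects; the helper name split_sentences and the sample chunks are illustrative only, not part of the commit.

    import re

    def split_sentences(chunks):
        """Yield complete sentences from incrementally arriving text chunks."""
        buffer = ""
        for chunk in chunks:
            buffer += chunk
            # A sentence is complete once ., ! or ? is followed by a space or
            # newline; the separator is consumed and the rest stays buffered.
            while match := re.search(r"^(.*?[.!?])[ \n](.*)$", buffer, re.DOTALL):
                yield match.group(1)
                buffer = match.group(2)
        # Anything left over is yielded as the final, possibly unterminated, sentence.
        if buffer:
            yield buffer

    chunks = ["First sentence", ". ", "Second ", "sentence. ", "Third sentence."]
    print(list(split_sentences(chunks)))
    # ['First sentence.', 'Second sentence.', 'Third sentence.']

Note that yielded sentences now keep their trailing punctuation, which is why the expected values in the tests below change from "First sentence" to "First sentence.".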
126 changes: 88 additions & 38 deletions bubble/test/test_slop.py
@@ -1,71 +1,121 @@
 import pytest
 from anthropic import TextEvent
 from bubble.slop import stream_sentences, stream_normally
+import hypothesis.strategies as st
+from hypothesis import given

+sentence_ending_punctuation = st.sampled_from(".!?")
+
+sentence = st.text("abc, \n").flatmap(
+    lambda text: sentence_ending_punctuation.map(
+        lambda punctuation: text + punctuation
+    )
+)
+

 def create_text_stream(*chunks):
     """Create a mock Anthropic text stream from text chunks"""
-    return [TextEvent(type="text", text=chunk, snapshot="") for chunk in chunks]
+    return [
+        TextEvent(type="text", text=chunk, snapshot="")
+        for chunk in chunks
+    ]


 @pytest.fixture
 def text_stream():
     """Create a mock Anthropic text stream"""
     return create_text_stream(
-        "<sentence>First sentence",
-        ".</sentence> ",
-        "<sentence>Second ",
-        "sentence.</sentence>",
-        "<sentence>Third sentence."
+        "First sentence",
+        ". ",
+        "Second ",
+        "sentence. ",
+        "Third sentence.",
     )


 @pytest.mark.trio
 async def test_stream_sentences(text_stream):
-    """Test streaming sentences with XML tags"""
+    """Test streaming sentences"""
     sentences = []
     async for sentence in stream_sentences(text_stream):
         sentences.append(sentence)

     assert sentences == [
-        "First sentence",
-        "Second sentence",
-        "Third sentence"
+        "First sentence.",
+        "Second sentence.",
+        "Third sentence.",
     ]


-@pytest.mark.trio
-async def test_stream_sentences_with_initial(text_stream):
-    """Test streaming with initial partial sentence"""
-    sentences = []
-    async for sentence in stream_sentences(
-        text_stream, initial_sentence="<sentence>Initial "
-    ):
-        sentences.append(sentence)
-
-    assert sentences[0].startswith("Initial")


 @pytest.mark.trio
 async def test_stream_normally(text_stream):
     """Test normal streaming without sentence parsing"""
     result = await stream_normally(text_stream)

-    expected = (
-        "<sentence>First sentence.</sentence> "
-        "<sentence>Second sentence.</sentence>"
-        "<sentence>Third sentence.</sentence>"
-    )
+    expected = "First sentence. " "Second sentence. " "Third sentence."

     assert result == expected


-@pytest.mark.trio
-async def test_stream_multiline_sentences():
-    """Test handling of multiline sentence content"""
-    events = create_text_stream("<sentence>First\nline\nof text.")
+@given(st.text())
+async def test_sentence_stream_yields_same_text(text: str):
+    events = create_text_stream(text)
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert "".join(sentences).strip() == text.strip()

-    sentences = []
-    async for sentence in stream_sentences(events):
-        sentences.append(sentence)
-
-    assert sentences == ["First line of text"]

+@given(st.lists(sentence))
+async def test_sentence_stream_on_list_of_sentences(
+    sentences: list[str],
+):
+    events = create_text_stream(" ".join(sentences))
+    streamed = [sentence async for sentence in stream_sentences(events)]
+    assert streamed == sentences
+
+
+@given(st.lists(sentence))
+async def test_sentence_stream_on_list_of_sentences_with_newlines(
+    sentences: list[str],
+):
+    events = create_text_stream("\n".join(sentences))
+    streamed = [sentence async for sentence in stream_sentences(events)]
+    assert streamed == sentences
+
+
+async def test_sentence_stream_handles_no_sentence_ending():
+    events = create_text_stream("This is a test")
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert sentences == ["This is a test"]
+
+
+async def test_sentence_stream_handles_ellipsis():
+    events = create_text_stream("This is a test... This is a test.")
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert sentences == ["This is a test...", "This is a test."]
+
+
+async def test_sentence_stream_handles_newlines():
+    events = create_text_stream("This is a test.\nThis is a test.")
+    sentences = [
+        sentence async for sentence in stream_sentences(events)
+    ]
+    assert sentences == ["This is a test.", "This is a test."]
+
+
+@given(
+    sentence.flatmap(
+        lambda s: st.integers(1, len(s)).map(lambda n: (s, n))
+    )
+)
+async def test_foo(x):
+    (sentence, n) = x
+    a = sentence[:n]
+    b = sentence[n:]
+    assert a + b == sentence
+    events = create_text_stream(a, b)
+    streamed = [sentence async for sentence in stream_sentences(events)]
+    assert streamed == [sentence]
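As a quick illustration of how the composed Hypothesis strategy used above behaves, the sketch below repeats the same strategy definitions and draws a few sample sentences; .example() is meant only for interactive exploration, not for use inside tests.

    import hypothesis.strategies as st

    # Same construction as in the tests: text over a small alphabet that may
    # include spaces and newlines, always terminated by one of ".", "!" or "?".
    sentence_ending_punctuation = st.sampled_from(".!?")
    sentence = st.text("abc, \n").flatmap(
        lambda text: sentence_ending_punctuation.map(
            lambda punctuation: text + punctuation
        )
    )

    for _ in range(3):
        print(repr(sentence.example()))  # e.g. 'ab c,\na!'

Because the generated text may itself contain spaces and newlines, the list-based round-trip properties above exercise inputs where sentence boundaries are ambiguous, which is where a splitter like this is most likely to misbehave.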
