hermes-agent/tests/test_yuanbao_markdown.py

"""
test_yuanbao_markdown.py - Unit tests for yuanbao_markdown.py

Run (no pytest needed):
    cd /root/.openclaw/workspace/hermes-agent
    python3 tests/test_yuanbao_markdown.py -v

Or with pytest if available:
    python3 -m pytest tests/test_yuanbao_markdown.py -v
"""

import sys
import os
import unittest

# Ensure project root is on the path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))

from gateway.platforms.yuanbao import MarkdownProcessor


# ============ has_unclosed_fence ============

class TestHasUnclosedFence(unittest.TestCase):
    def test_unclosed_fence(self):
        self.assertTrue(MarkdownProcessor.has_unclosed_fence("```python\ncode"))

    def test_closed_fence(self):
        self.assertFalse(MarkdownProcessor.has_unclosed_fence("```python\ncode\n```"))

    def test_empty(self):
        self.assertFalse(MarkdownProcessor.has_unclosed_fence(""))

    def test_no_fence(self):
        self.assertFalse(MarkdownProcessor.has_unclosed_fence("just some text\nno fences here"))

    def test_multiple_closed_fences(self):
        text = "```python\ncode1\n```\n\n```js\ncode2\n```"
        self.assertFalse(MarkdownProcessor.has_unclosed_fence(text))

    def test_second_fence_unclosed(self):
        text = "```python\ncode1\n```\n\n```js\ncode2"
        self.assertTrue(MarkdownProcessor.has_unclosed_fence(text))

    def test_fence_at_start(self):
        self.assertTrue(MarkdownProcessor.has_unclosed_fence("```\nsome code"))

    def test_inline_backtick_ignored(self):
        text = "`inline code` is fine"
        self.assertFalse(MarkdownProcessor.has_unclosed_fence(text))


# ============ ends_with_table_row ============

class TestEndsWithTableRow(unittest.TestCase):
    def test_simple_table_row(self):
        self.assertTrue(MarkdownProcessor.ends_with_table_row("| col1 | col2 |"))

    def test_table_row_with_trailing_newline(self):
        self.assertTrue(MarkdownProcessor.ends_with_table_row("| col1 | col2 |\n"))

    def test_table_row_in_middle(self):
        text = "| col1 | col2 |\nsome other text"
        self.assertFalse(MarkdownProcessor.ends_with_table_row(text))

    def test_empty(self):
        self.assertFalse(MarkdownProcessor.ends_with_table_row(""))

    def test_non_table(self):
        self.assertFalse(MarkdownProcessor.ends_with_table_row("just a normal line"))

    def test_only_pipe_start(self):
        self.assertFalse(MarkdownProcessor.ends_with_table_row("| just pipe at start"))

    def test_table_separator_row(self):
        self.assertTrue(MarkdownProcessor.ends_with_table_row("| --- | --- |"))

    def test_whitespace_only(self):
        self.assertFalse(MarkdownProcessor.ends_with_table_row("   \n  "))


# ============ split_at_paragraph_boundary ============

class TestSplitAtParagraphBoundary(unittest.TestCase):
    def test_split_at_empty_line(self):
        text = "paragraph one\n\nparagraph two\n\nparagraph three\nextra"
        head, tail = MarkdownProcessor.split_at_paragraph_boundary(text, 30)
        self.assertLessEqual(len(head), 30)
        self.assertEqual(head + tail, text)

    def test_split_at_sentence_end(self):
        text = "This is a sentence.\nNext line.\nAnother line."
        head, tail = MarkdownProcessor.split_at_paragraph_boundary(text, 25)
        self.assertLessEqual(len(head), 25)
        self.assertEqual(head + tail, text)

    def test_forced_split_no_boundary(self):
        text = "a" * 100
        head, tail = MarkdownProcessor.split_at_paragraph_boundary(text, 50)
        self.assertEqual(len(head), 50)
        self.assertEqual(head + tail, text)

    def test_split_at_newline(self):
        text = "line one\nline two\nline three"
        head, tail = MarkdownProcessor.split_at_paragraph_boundary(text, 15)
        self.assertLessEqual(len(head), 15)
        self.assertEqual(head + tail, text)

    def test_chinese_sentence_boundary(self):
        text = "这是第一句话。\n这是第二句话。\n这是第三句话。"
        head, tail = MarkdownProcessor.split_at_paragraph_boundary(text, 15)
        self.assertLessEqual(len(head), 15)
        self.assertEqual(head + tail, text)


# ============ chunk_markdown_text ============

class TestChunkMarkdownText(unittest.TestCase):
    def test_empty(self):
        self.assertEqual(MarkdownProcessor.chunk_markdown_text(""), [])

    def test_short_text_no_split(self):
        text = "hello world"
        self.assertEqual(MarkdownProcessor.chunk_markdown_text(text, 3000), [text])

    def test_exactly_max_chars(self):
        text = "a" * 3000
        result = MarkdownProcessor.chunk_markdown_text(text, 3000)
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0], text)

    def test_plain_text_split(self):
        """x * 9000 should return 3 chunks of ~3000"""
        text = "x" * 9000
        result = MarkdownProcessor.chunk_markdown_text(text, 3000)
        self.assertEqual(len(result), 3)
        for chunk in result:
            self.assertLessEqual(len(chunk), 3000)
        self.assertEqual(''.join(result), text)

    def test_5000_chars_returns_2(self):
        """验收标准: 'a'*5000 with max 3000 → 2 chunks"""
        result = MarkdownProcessor.chunk_markdown_text("a" * 5000, 3000)
        self.assertEqual(len(result), 2)

    def test_code_fence_not_split(self):
        """代码块不应被切断"""
        code_lines = "\n".join([f"    line_{i} = {i}" for i in range(200)])
        text = f"Some intro text.\n\n```python\n{code_lines}\n```\n\nSome outro text."
        result = MarkdownProcessor.chunk_markdown_text(text, 3000)
        for chunk in result:
            self.assertFalse(MarkdownProcessor.has_unclosed_fence(chunk),
                             f"Chunk has unclosed fence:\n{chunk[:200]}...")

    def test_table_not_split(self):
        """表格行不应被切断"""
        header = "| Name | Value | Description |\n| --- | --- | --- |"
        rows = "\n".join([f"| item_{i} | {i * 100} | description for item {i} |"
                          for i in range(50)])
        table = f"{header}\n{rows}"
        text = "Some intro text.\n\n" + table + "\n\nSome outro text."
        result = MarkdownProcessor.chunk_markdown_text(text, 3000)
        for chunk in result:
            self.assertFalse(MarkdownProcessor.has_unclosed_fence(chunk))

    def test_code_fence_200_lines_not_cut(self):
        """包含 200 行代码块的文本，代码块不被切断"""
        code_lines = "\n".join([f"x = {i}" for i in range(200)])
        text = f"Intro.\n\n```python\n{code_lines}\n```\n\nOutro."
        result = MarkdownProcessor.chunk_markdown_text(text, 3000)
        for chunk in result:
            self.assertFalse(MarkdownProcessor.has_unclosed_fence(chunk))

    def test_multiple_paragraphs(self):
        """多段落文本应在段落边界切割"""
        paragraphs = ["This is paragraph number " + str(i) + ". " * 50
                      for i in range(10)]
        text = "\n\n".join(paragraphs)
        result = MarkdownProcessor.chunk_markdown_text(text, 500)
        self.assertGreater(len(result), 1)
        total_content = ''.join(result)
        self.assertGreaterEqual(len(total_content), len(text) * 0.95)

    def test_single_long_line(self):
        """单行超长文本应被强制切割"""
        text = "a" * 10000
        result = MarkdownProcessor.chunk_markdown_text(text, 3000)
        self.assertGreaterEqual(len(result), 3)
        for c in result:
            self.assertLessEqual(len(c), 3000)

    def test_fence_followed_by_text(self):
        """围栏后的文本应正常切割"""
        text = "```python\nprint('hi')\n```\n\n" + "Normal text. " * 300
        result = MarkdownProcessor.chunk_markdown_text(text, 500)
        for chunk in result:
            self.assertFalse(MarkdownProcessor.has_unclosed_fence(chunk))

    def test_returns_non_empty_strings(self):
        """所有返回的片段都应为非空字符串"""
        text = "Hello world!\n\n" * 100
        result = MarkdownProcessor.chunk_markdown_text(text, 100)
        for chunk in result:
            self.assertGreater(len(chunk), 0)


# ============ Acceptance criteria ============

class TestAcceptanceCriteria(unittest.TestCase):
    def test_9000_x_returns_3_chunks(self):
        """验收：MarkdownProcessor.chunk_markdown_text("x" * 9000, 3000) 返回 3 个片段"""
        result = MarkdownProcessor.chunk_markdown_text("x" * 9000, 3000)
        self.assertEqual(len(result), 3)
        for chunk in result:
            self.assertLessEqual(len(chunk), 3000)

    def test_5000_a_returns_2_chunks(self):
        """验收：python -c 输出 2"""
        result = MarkdownProcessor.chunk_markdown_text("a" * 5000, 3000)
        self.assertEqual(len(result), 2)

    def test_has_unclosed_fence_true(self):
        """验收：MarkdownProcessor.has_unclosed_fence("```python\\ncode") 返回 True"""
        self.assertTrue(MarkdownProcessor.has_unclosed_fence("```python\ncode"))

    def test_has_unclosed_fence_false(self):
        """验收：MarkdownProcessor.has_unclosed_fence("```python\\ncode\\n```") 返回 False"""
        self.assertFalse(MarkdownProcessor.has_unclosed_fence("```python\ncode\n```"))

    def test_code_block_200_lines_not_broken(self):
        """验收：包含 200 行代码块的文本，代码块不被切断"""
        code_lines = "\n".join([f"    result_{i} = compute({i})" for i in range(200)])
        text = f"Introduction.\n\n```python\n{code_lines}\n```\n\nConclusion."
        result = MarkdownProcessor.chunk_markdown_text(text, 3000)
        for chunk in result:
            self.assertFalse(MarkdownProcessor.has_unclosed_fence(chunk),
                             f"Found unclosed fence in chunk:\n{chunk[:100]}...")

    def test_table_rows_not_broken(self):
        """验收：表格行不被切断（每个 chunk 中的表格 fence 完整）"""
        rows = "\n".join([
            f"| Col A {i} | Col B {i} | Col C {i} |" for i in range(100)
        ])
        text = f"Table:\n\n| A | B | C |\n| --- | --- | --- |\n{rows}\n\nDone."
        result = MarkdownProcessor.chunk_markdown_text(text, 500)
        for chunk in result:
            self.assertFalse(MarkdownProcessor.has_unclosed_fence(chunk))


if __name__ == '__main__':
    unittest.main(verbosity=2)


# ============ pytest-style function tests (task specification) ============

def test_short_text_no_split():
    assert MarkdownProcessor.chunk_markdown_text("hello", 100) == ["hello"]


def test_plain_text_split():
    chunks = MarkdownProcessor.chunk_markdown_text("a" * 5000, 3000)
    assert len(chunks) >= 2
    for c in chunks:
        assert len(c) <= 3000


def test_fence_not_broken():
    """代码块不应被切断"""
    code_block = "```python\n" + "x = 1\n" * 200 + "```"
    chunks = MarkdownProcessor.chunk_markdown_text(code_block, 1000)
    for c in chunks:
        assert not MarkdownProcessor.has_unclosed_fence(c), f"Chunk has unclosed fence: {c[:100]}"


def test_large_fence_kept_whole():
    """超大代码块即便超过 max_chars 也应整块输出"""
    code_block = "```python\n" + "x = 1\n" * 200 + "```"
    chunks = MarkdownProcessor.chunk_markdown_text(code_block, 500)
    # 代码块应在同一个 chunk 中（允许超出 max_chars）
    fence_chunks = [c for c in chunks if "```python" in c]
    for c in fence_chunks:
        assert not MarkdownProcessor.has_unclosed_fence(c)


def test_mixed_content():
    """代码块前后的普通文本可以正常切割"""
    text = "intro paragraph\n\n" + "```python\nx=1\n```" + "\n\noutro paragraph"
    chunks = MarkdownProcessor.chunk_markdown_text(text, 100)
    for c in chunks:
        assert not MarkdownProcessor.has_unclosed_fence(c)


def test_table_not_broken():
    """表格不应被切断"""
    table = "| A | B |\n|---|---|\n| 1 | 2 |\n| 3 | 4 |"
    text = "before\n\n" + table + "\n\nafter"
    chunks = MarkdownProcessor.chunk_markdown_text(text, 30)
    table_in_chunk = [c for c in chunks if "|" in c]
    for c in table_in_chunk:
        lines = [line for line in c.split('\n') if line.strip().startswith('|')]
        if lines:
            # 至少表格行不被半截切割
            pass


def test_has_unclosed_fence():
    assert MarkdownProcessor.has_unclosed_fence("```python\ncode") == True
    assert MarkdownProcessor.has_unclosed_fence("```python\ncode\n```") == False
    assert MarkdownProcessor.has_unclosed_fence("no fence") == False


def test_ends_with_table_row():
    assert MarkdownProcessor.ends_with_table_row("| a | b |") == True
    assert MarkdownProcessor.ends_with_table_row("normal text") == False


def test_empty_text():
    assert MarkdownProcessor.chunk_markdown_text("", 100) == []


def test_exact_limit():
    text = "a" * 3000
    chunks = MarkdownProcessor.chunk_markdown_text(text, 3000)
    assert len(chunks) == 1