feat(parser): support multiple tables in a single markdown document

- Add extract_table_blocks() to split a document into contiguous table blocks, ignoring prose, headings, and blank lines between them
- Add parse_document() as the new top-level entry point that runs extract_table_blocks + detect_has_duration_column + parse_table per block and returns a combined flat list of rows
- Guard against empty End cells (e.g. in-progress rows) by validating the end field before calculating duration
- Update cli.py to use parse_document() instead of the manual detect + parse combo
- Add tests for extract_table_blocks and parse_document, including two smoke tests against the real 2026-W21 weekly timesheet file
2026-05-22 10:17:17 +02:00 · 2026-05-22 10:17:17 +02:00 · d6689a6c83
commit d6689a6c83
parent 7bea08ddac
4 changed files with 295 additions and 9 deletions
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@ -1,9 +1,13 @@
+import os
+
 import pytest

 from timesheets.parser import (
    aggregate_rows,
    build_description,
    detect_has_duration_column,
+    extract_table_blocks,
+    parse_document,
    parse_table,
 )

@ -26,6 +30,8 @@ WITHOUT_DURATION = [
    "| 08:30 | 09:15 | scrum   |             | dsu     |",
 ]

+WEEK_FILE = os.path.join(os.path.dirname(__file__), "2026 - W21.md")
+

 # ---------------------------------------------------------------------------
 # detect_has_duration_column
@ -47,6 +53,44 @@ class TestDetectHasDurationColumn:
        assert detect_has_duration_column(lines) is True


+# ---------------------------------------------------------------------------
+# extract_table_blocks
+# ---------------------------------------------------------------------------
+
+
+class TestExtractTableBlocks:
+    def test_single_table(self):
+        blocks = extract_table_blocks(WITH_DURATION)
+        assert len(blocks) == 1
+        assert blocks[0] == WITH_DURATION
+
+    def test_two_tables_separated_by_prose(self):
+        lines = WITH_DURATION + ["", "# Next day", "some prose", ""] + WITHOUT_DURATION
+        blocks = extract_table_blocks(lines)
+        assert len(blocks) == 2
+
+    def test_prose_between_tables_not_included(self):
+        lines = WITH_DURATION + ["some note"] + WITHOUT_DURATION
+        blocks = extract_table_blocks(lines)
+        assert len(blocks) == 2
+        assert all("some note" not in b for b in blocks)
+
+    def test_single_line_table_discarded(self):
+        lines = ["| Start | End |"]
+        assert extract_table_blocks(lines) == []
+
+    def test_empty_input(self):
+        assert extract_table_blocks([]) == []
+
+    def test_no_tables(self):
+        assert extract_table_blocks(["# heading", "", "prose"]) == []
+
+    def test_table_at_end_of_file_captured(self):
+        lines = ["# heading", ""] + WITH_DURATION  # table is the last line; nothing follows
+        blocks = extract_table_blocks(lines)
+        assert len(blocks) == 1
+
+
 # ---------------------------------------------------------------------------
 # parse_table
 # ---------------------------------------------------------------------------
@ -92,6 +136,14 @@ class TestParseTable:
        ]
        assert parse_table(lines) == []

+    def test_empty_end_time_row_skipped(self):
+        lines = [
+            "| Start | End   | Project | Story | Note |",
+            "|-------|-------|---------|-------|------|",
+            "| 09:55 |       | bugs    |       |      |",
+        ]
+        assert parse_table(lines, has_duration_col=False) == []
+
    def test_empty_input(self):
        assert parse_table([]) == []

@ -101,6 +153,68 @@ class TestParseTable:
        assert len(rows) == 3


+# ---------------------------------------------------------------------------
+# parse_document
+# ---------------------------------------------------------------------------
+
+
+class TestParseDocument:
+    def test_single_table(self):
+        rows = parse_document(WITHOUT_DURATION)
+        assert len(rows) == 2
+
+    def test_multiple_tables_combined(self):
+        lines = WITHOUT_DURATION + ["", "# Next day", ""] + WITHOUT_DURATION
+        rows = parse_document(lines)
+        assert len(rows) == 4
+
+    def test_prose_between_tables_ignored(self):
+        lines = (
+            WITHOUT_DURATION + ["some notes", "- a bullet point", ""] + WITHOUT_DURATION
+        )
+        rows = parse_document(lines)
+        assert len(rows) == 4
+
+    def test_mixed_duration_formats(self):
+        lines = WITH_DURATION + ["", "## Next day", ""] + WITHOUT_DURATION
+        rows = parse_document(lines)
+        assert len(rows) == 5  # 3 from WITH_DURATION + 2 from WITHOUT_DURATION
+
+    def test_empty_input(self):
+        assert parse_document([]) == []
+
+    def test_week_file(self):
+        """Smoke test against the real W21 weekly timesheet file."""
+        with open(WEEK_FILE, encoding="utf-8") as f:
+            lines = f.read().splitlines()
+        rows = parse_document(lines)
+        # File has 5 daily tables; expect a healthy number of rows
+        assert len(rows) > 20
+        # All rows must have expected keys
+        for row in rows:
+            assert "project" in row
+            assert "duration_hours" in row
+            assert row["duration_hours"] > 0
+        # Incomplete row (09:55 | empty End) is skipped: no match may lack a duration
+        incomplete = [
+            r for r in rows if r["start"] == "09:55" and r["project"] == "bugs"
+        ]
+        assert all(r["duration_hours"] > 0 for r in incomplete)
+
+    def test_week_file_no_markdown_links_in_stories(self):
+        """Markdown link syntax must be stripped from story/note fields."""
+        with open(WEEK_FILE, encoding="utf-8") as f:
+            lines = f.read().splitlines()
+        rows = parse_document(lines)
+        for row in rows:
+            assert "](:" not in row["story"], (
+                f"Link not stripped in story: {row['story']!r}"
+            )
+            assert "](:" not in row["note"], (
+                f"Link not stripped in note: {row['note']!r}"
+            )
+
+
 # ---------------------------------------------------------------------------
 # build_description
 # ---------------------------------------------------------------------------