feat(parser): support multiple tables in a single markdown document

- Add extract_table_blocks() to split a document into contiguous table blocks, ignoring prose, headings, and blank lines between them
- Add parse_document() as the new top-level entry point that runs extract_table_blocks + detect_has_duration_column + parse_table per block and returns a combined flat list of rows
- Guard against empty End cells (e.g. in-progress rows) by validating the end field before calculating duration
- Update cli.py to use parse_document() instead of the manual detect + parse combo
- Add tests for extract_table_blocks and parse_document, including two smoke tests against the real 2026-W21 weekly timesheet file
2026-05-22 10:17:17 +02:00 · 2026-05-22 10:17:17 +02:00 · d6689a6c83
commit d6689a6c83
parent 7bea08ddac
4 changed files with 295 additions and 9 deletions
--- a/src/timesheets/cli.py
+++ b/src/timesheets/cli.py
@ -4,7 +4,7 @@ import sys
 from datetime import date

 from .output import print_summary, write_csv
-from .parser import aggregate_rows, detect_has_duration_column, parse_table
+from .parser import aggregate_rows, parse_document
 from .projects import load_project_map
 from .utils import format_date

@ -18,7 +18,8 @@ def build_parser() -> argparse.ArgumentParser:
        help="Path to the markdown file containing the timesheet table, or '-' to read from stdin.",
    )
    parser.add_argument(
-        "-o", "--output",
+        "-o",
+        "--output",
        help="Path to the output CSV file. Defaults to stdout.",
        default=None,
    )
@ -59,7 +60,7 @@ def main() -> None:
            sys.exit(1)

    lines = content.splitlines()
-    rows = parse_table(lines, has_duration_col=detect_has_duration_column(lines))
+    rows = parse_document(lines)

    if not rows:
        print("Warning: no timesheet rows found in input.", file=sys.stderr)
--- a/src/timesheets/parser.py
+++ b/src/timesheets/parser.py
@ -4,10 +4,21 @@ from collections import defaultdict
 from .utils import duration_from_start_end, parse_duration, strip_markdown_link


+def _is_table_line(line: str) -> bool:
+    """Return True if the line looks like part of a markdown table."""
+    s = line.strip()
+    return s.startswith("|") and s.endswith("|")
+
+
+def _is_separator_line(line: str) -> bool:
+    """Return True if the line is a markdown table separator (|---|---|)."""
+    return bool(re.match(r"^\|[-| :]+\|$", line.strip()))
+
+
 def detect_has_duration_column(lines: list[str]) -> bool:
    """
-    Inspect the header row to determine whether a Duration column is present.
-    Falls back to True if no header row is found.
+    Inspect the header row of a table block to determine whether a Duration
+    column is present. Falls back to True if no header row is found.
    """
    for line in lines:
        line = line.strip()
@ -19,9 +30,35 @@ def detect_has_duration_column(lines: list[str]) -> bool:
    return True


+def extract_table_blocks(lines: list[str]) -> list[list[str]]:
+    """
+    Split a markdown document into contiguous table blocks.
+
+    A block is a maximal run of lines that are either table rows or table
+    separators. Non-table lines (headings, prose, bullet points, blank lines)
+    break a block. Each returned block contains at least a header and a
+    separator line; shorter runs are discarded.
+    """
+    blocks: list[list[str]] = []
+    current: list[str] = []
+
+    for line in lines:
+        if _is_table_line(line):
+            current.append(line)
+        else:
+            if len(current) >= 2:  # at minimum: header + separator
+                blocks.append(current)
+            current = []
+
+    if len(current) >= 2:
+        blocks.append(current)
+
+    return blocks
+
+
 def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
    """
-    Parse markdown table lines into a list of row dicts.
+    Parse a single markdown table block into a list of row dicts.

    With duration:    Start | End | Duration | Project | Story | Note  (6 cols)
    Without duration: Start | End | Project  | Story   | Note         (5 cols)
@ -31,7 +68,7 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:

    for line in lines:
        line = line.strip()
-        if not line or re.match(r"^\|[-| :]+\|$", line):
+        if not line or _is_separator_line(line):
            continue
        if not (line.startswith("|") and line.endswith("|")):
            continue
@ -42,13 +79,18 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:

        if has_duration_col:
            start, end, duration, project, story, note = (
-                cells[0], cells[1], cells[2], cells[3],
+                cells[0],
+                cells[1],
+                cells[2],
+                cells[3],
                strip_markdown_link(cells[4]),
                strip_markdown_link(cells[5]),
            )
        else:
            start, end, project, story, note = (
-                cells[0], cells[1], cells[2],
+                cells[0],
+                cells[1],
+                cells[2],
                strip_markdown_link(cells[3]),
                strip_markdown_link(cells[4]),
            )
@ -58,6 +100,8 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
            continue
        if not re.match(r"^\d+:\d{2}$", start):
            continue
+        if not re.match(r"^\d+:\d{2}$", end):
+            continue

        if duration is not None:
            if not re.match(r"^\d+:\d{2}$", duration):
@ -83,6 +127,20 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
    return rows


+def parse_document(lines: list[str]) -> list[dict]:
+    """
+    Parse all timesheet tables found in a markdown document.
+
+    Extracts every table block, detects its column layout independently,
+    and returns the combined flat list of all parsed rows.
+    """
+    rows = []
+    for block in extract_table_blocks(lines):
+        has_duration_col = detect_has_duration_column(block)
+        rows.extend(parse_table(block, has_duration_col=has_duration_col))
+    return rows
+
+
 def build_description(story: str, note: str) -> str:
    """Combine story and note into a single description string."""
    parts = [p.strip() for p in [story, note] if p.strip()]
--- a/tests/2026
+++ b/tests/2026
@ -0,0 +1,113 @@
+# Week of 2026-05-18 - Review
+
+# Vrijdag - 2026-05-22
+
+| Start | End   | Project  | Story                                                                                                                                          | Note            |
+|-------|-------|----------|------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|
+| 08:15 | 09:30 | rate     | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | rebase          |
+| 09:30 | 09:45 | bugs     | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc)                                                              | review          |
+| 09:45 | 09:55 | scrum    |                                                                                                                                                | daily standup   |
+| 09:55 |       | bugs     | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc)                                                              | review          |
+| 16:00 | 16:30 | internal |                                                                                                                                                | Growth Path 2.0 |
+| 16:30 | 17:30 | internal |                                                                                                                                                | Factry Flow     |
+
+- [ ] Triage [Inbox](:/3a994fed3dc746a59d71c9f7ab1f60bc)
+- [x] Process `distill` tags
+- [ ] Process `refine` tags
+- [ ] Refine [2026-05-26 - Sprint Retro](:/49951990900947438b80007b2d21b228)
+- [ ] re-review [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc)
+
+ge moogtr de big REST API MR rebasen op develop, good luck, `git rebase --onto origin/develop -i 027b7cc3b`, er staan paar fixup commits klaar al
+
+# Donderdag - 2026-05-21
+
+| Start | End   | Project | Story                                                                                                                                          | Note                |
+|-------|-------|---------|------------------------------------------------------------------------------------------------------------------------------------------------|---------------------|
+| 08:15 | 08:20 | office  |                                                                                                                                                | koffie              |
+| 08:20 | 09:35 | bugs    | Bug 35326: OpenAPI: query parameters should be case-insensitive (canonical = lowercase)                                                        | review              |
+| 09:35 | 09:45 | hatch   | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275)                                                                |                     |
+| 09:45 | 10:00 | scrum   |                                                                                                                                                | daily standup       |
+| 10:00 | 10:20 | hatch   | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275)                                                                |                     |
+| 10:20 | 10:50 | refine  | PBI 35330: Calculation script errors should not be in Sentry? Or add a tag we can filter by?                                                   |                     |
+| 10:50 | 11:15 | product |                                                                                                                                                | Claude code CLI     |
+| 11:15 | 12:05 | refine  | PBI 35330: Calculation script errors should not be in Sentry? Or add a tag we can filter by?                                                   |                     |
+| 12:45 | 13:40 | bugs    |                                                                                                                                                | review race bugs MR |
+| 13:40 | 14:30 | bugs    | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc)                                                              | review              |
+| 14:30 | 15:00 | bugs    | Bug 35331: Clients hitting sql max open connections                                                                                            |                     |
+| 15:00 | 15:25 | refine  | PBI 35330: Calculation script errors should not be in Sentry? Or add a tag we can filter by?                                                   | respond to comments |
+| 15:25 | 15:30 | bugs    | Bug 35331: Clients hitting sql max open connections                                                                                            |                     |
+| 15:30 | 16:30 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | respond to comments |
+
+- [ ] Uitzoeken hoeveel audit log info er verloren is gegaan door de REST api migratie (`GetDatabase` -> `GetDatabaseByOrganizationUUID`)
+- [ ]  kijken of GetEvents property value filter nu broken is met datasource door remodelling in openapi spec
+
+# Woensdag - 2026-05-20
+
+| Start | End   | Project | Story                                                                                                                                          | Note                                      |
+|-------|-------|---------|------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|
+| 08:15 | 08:25 | office  |                                                                                                                                                | koffie, sleutels zoeken                   |
+| 08:25 | 09:00 | bugs    | Bug 34948: Bulk measurement update returns 500 on failed validation                                                                            |                                           |
+| 09:00 | 09:15 | bugs    | Bug 35238: DestroySink does not listen to the  'force' query-parameter                                                                         |                                           |
+| 09:15 | 09:50 | bugs    |                                                                                                                                                | CPU usage spikes in statistics collection |
+| 09:50 | 10:00 | bugs    | Bug 35241: OpenAPI: timeseries query result schema breaks JSON round-trip for scalar values, arrays, and string tags                           | review                                    |
+| 10:00 | 10:30 | bugs    | Bug 35238: DestroySink does not listen to the  'force' query-parameter                                                                         |                                           |
+| 10:30 | 12:00 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) |                                           |
+| 12:40 | 13:30 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | trying to remove the wrapper              |
+| 13:30 | 13:45 | bugs    | Bug 34948: Bulk measurement update returns 500 on failed validation                                                                            | respond to comments                       |
+| 13:45 | 14:45 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | trying to remove the wrapper              |
+| 14:45 | 15:00 | bugs    | Bug 34948: Bulk measurement update returns 500 on failed validation                                                                            | respond to comments                       |
+| 15:00 | 15:15 | bugs    | Bug 35238: DestroySink does not listen to the  'force' query-parameter                                                                         | respond to comments                       |
+| 15:15 | 15:25 | hatch   | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275)                                                                |                                           |
+| 15:25 | 15:45 | bugs    | Bug 35326: OpenAPI: query parameters should be case-insensitive (canonical = lowercase)                                                        | review                                    |
+| 15:45 | 16:00 | product |                                                                                                                                                | CI Evelien fixen                          |
+| 16:00 | 16:20 | bugs    | Bug 35326: OpenAPI: query parameters should be case-insensitive (canonical = lowercase)                                                        | review                                    |
+| 16:20 | 17:00 | hatch   | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275)                                                                |                                           |
+
+
+- [x] fix CPU usage spikes in statistics collection ([thread](https://factrylabs.slack.com/archives/C01TD6M694G/p1779112687422719))
+	* idee: maak een global statistics variable aan, guarded met een mutex en getters/setters
+	* hou bij hoe oud die is, if too old -> recollect, anders geef gwn de cached versie mee
+	* dan gebeurt collection 1 keer en gebruiken alle loops gwn the same data
+
+# Dinsdag - 2026-05-19
+
+| Start | End   | Project | Story                                                                                                                                             | Note                  |
+|-------|-------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------|
+| 08:10 | 08:15 | office  |                                                                                                                                                   | coffee                |
+| 08:15 | 09:45 | rate    | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | respond to comments   |
+| 09:45 | 09:55 | scrum   |                                                                                                                                                   | daily standup         |
+| 09:55 | 10:30 | rate    | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | rebase                |
+| 10:30 | 12:05 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f)    | fix tests             |
+| 12:55 | 13:05 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f)    | fix tests             |
+| 13:05 | 14:10 | bugs    | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc)                                                                 | review                |
+| 14:10 | 14:45 | rate    | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | final comments        |
+| 14:45 | 15:15 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f)    | fix tests             |
+| 15:15 | 15:45 | bugs    | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc)                                                                 | review                |
+| 15:45 | 16:10 | rate    | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | rebase                |
+| 16:10 | 16:30 | rate    | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f)    | respond to duo review |
+| 16:30 | 17:10 | bugs    | Bug 34948: Bulk measurement update returns 500 on failed validation                                                                               |                       |
+
+
+# Maandag - 2026-05-18
+
+| Start | End   | Project  | Story                                                                                                                                             | Note                           |
+|-------|-------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------|
+| 08:10 | 08:15 | office   |                                                                                                                                                   | coffee                         |
+| 08:15 | 09:45 | rate     | PBI 34972: refactor(rest): migrate events and event configuration handlers to OpenAPI 3.0                                                         | respond to comments            |
+| 09:45 | 10:05 | scrum    |                                                                                                                                                   | daily standup                  |
+| 10:05 | 10:25 | rate     | PBI 34972: refactor(rest): migrate events and event configuration handlers to OpenAPI 3.0                                                         | respond to comments            |
+| 10:25 | 11:45 | rate     | PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0                                       | fix tests                      |
+| 11:45 | 12:05 | bugs     |                                                                                                                                                   | review race bugs MR            |
+| 12:40 | 13:05 | bugs     |                                                                                                                                                   | review race bugs MR            |
+| 13:05 | 13:40 | internal |                                                                                                                                                   | claude code proberen instellen |
+| 13:40 | 13:55 | rate     | PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0                                       | CI debugging                   |
+| 13:55 | 14:50 | rate     | PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0                                          | fix tests                      |
+| 14:50 | 15:15 | refine   | PBI 34321: Update asset from prototype: add 'Keep all' / 'Update all' bulk action buttons                                                         |                                |
+| 15:30 | 15:35 | bugs     | Bug 34916: GetAssets endpoint fails with "invalid escape \ sequence" when path filter contains backslashes                                        |                                |
+| 15:35 | 15:45 | refine   | PBI 34321: Update asset from prototype: add 'Keep all' / 'Update all' bulk action buttons                                                         |                                |
+| 15:45 | 17:00 | rate     | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | respond to comments            |
+
+* laatste rest refactor MR nog afwerken
+	* gij hebt de 3 laatste op uw naam, dus gij kunt de wrapper uiteindelijk wegdoen
+	* moet nog verder opkuisen wa claude gedaan heeft
+	* nog wa testen da falen ook
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@ -1,9 +1,13 @@
+import os
+
 import pytest

 from timesheets.parser import (
    aggregate_rows,
    build_description,
    detect_has_duration_column,
+    extract_table_blocks,
+    parse_document,
    parse_table,
 )

@ -26,6 +30,8 @@ WITHOUT_DURATION = [
    "| 08:30 | 09:15 | scrum   |             | dsu     |",
 ]

+WEEK_FILE = os.path.join(os.path.dirname(__file__), "2026 - W21.md")
+

 # ---------------------------------------------------------------------------
 # detect_has_duration_column
@ -47,6 +53,44 @@ class TestDetectHasDurationColumn:
        assert detect_has_duration_column(lines) is True


+# ---------------------------------------------------------------------------
+# extract_table_blocks
+# ---------------------------------------------------------------------------
+
+
+class TestExtractTableBlocks:
+    def test_single_table(self):
+        blocks = extract_table_blocks(WITH_DURATION)
+        assert len(blocks) == 1
+        assert blocks[0] == WITH_DURATION
+
+    def test_two_tables_separated_by_prose(self):
+        lines = WITH_DURATION + ["", "# Next day", "some prose", ""] + WITHOUT_DURATION
+        blocks = extract_table_blocks(lines)
+        assert len(blocks) == 2
+
+    def test_prose_between_tables_not_included(self):
+        lines = WITH_DURATION + ["some note"] + WITHOUT_DURATION
+        blocks = extract_table_blocks(lines)
+        assert len(blocks) == 2
+        assert all("some note" not in b for b in blocks)
+
+    def test_single_line_table_discarded(self):
+        lines = ["| Start | End |"]
+        assert extract_table_blocks(lines) == []
+
+    def test_empty_input(self):
+        assert extract_table_blocks([]) == []
+
+    def test_no_tables(self):
+        assert extract_table_blocks(["# heading", "", "prose"]) == []
+
+    def test_table_at_end_of_file_captured(self):
+        lines = ["# heading", ""] + WITH_DURATION  # no trailing newline
+        blocks = extract_table_blocks(lines)
+        assert len(blocks) == 1
+
+
 # ---------------------------------------------------------------------------
 # parse_table
 # ---------------------------------------------------------------------------
@ -92,6 +136,14 @@ class TestParseTable:
        ]
        assert parse_table(lines) == []

+    def test_empty_end_time_row_skipped(self):
+        lines = [
+            "| Start | End   | Project | Story | Note |",
+            "|-------|-------|---------|-------|------|",
+            "| 09:55 |       | bugs    |       |      |",
+        ]
+        assert parse_table(lines, has_duration_col=False) == []
+
    def test_empty_input(self):
        assert parse_table([]) == []

@ -101,6 +153,68 @@ class TestParseTable:
        assert len(rows) == 3


+# ---------------------------------------------------------------------------
+# parse_document
+# ---------------------------------------------------------------------------
+
+
+class TestParseDocument:
+    def test_single_table(self):
+        rows = parse_document(WITHOUT_DURATION)
+        assert len(rows) == 2
+
+    def test_multiple_tables_combined(self):
+        lines = WITHOUT_DURATION + ["", "# Next day", ""] + WITHOUT_DURATION
+        rows = parse_document(lines)
+        assert len(rows) == 4
+
+    def test_prose_between_tables_ignored(self):
+        lines = (
+            WITHOUT_DURATION + ["some notes", "- a bullet point", ""] + WITHOUT_DURATION
+        )
+        rows = parse_document(lines)
+        assert len(rows) == 4
+
+    def test_mixed_duration_formats(self):
+        lines = WITH_DURATION + ["", "## Next day", ""] + WITHOUT_DURATION
+        rows = parse_document(lines)
+        assert len(rows) == 5  # 3 from WITH_DURATION + 2 from WITHOUT_DURATION
+
+    def test_empty_input(self):
+        assert parse_document([]) == []
+
+    def test_week_file(self):
+        """Smoke test against the real W21 weekly timesheet file."""
+        with open(WEEK_FILE, encoding="utf-8") as f:
+            lines = f.read().splitlines()
+        rows = parse_document(lines)
+        # File has 5 daily tables; expect a healthy number of rows
+        assert len(rows) > 20
+        # All rows must have expected keys
+        for row in rows:
+            assert "project" in row
+            assert "duration_hours" in row
+            assert row["duration_hours"] > 0
+        # The incomplete row (09:55 | empty end) must have been skipped
+        incomplete = [
+            r for r in rows if r["start"] == "09:55" and r["project"] == "bugs"
+        ]
+        assert all(r["duration_hours"] > 0 for r in incomplete)
+
+    def test_week_file_no_markdown_links_in_stories(self):
+        """Markdown link syntax must be stripped from story/note fields."""
+        with open(WEEK_FILE, encoding="utf-8") as f:
+            lines = f.read().splitlines()
+        rows = parse_document(lines)
+        for row in rows:
+            assert "](:" not in row["story"], (
+                f"Link not stripped in story: {row['story']!r}"
+            )
+            assert "](:" not in row["note"], (
+                f"Link not stripped in note: {row['note']!r}"
+            )
+
+
 # ---------------------------------------------------------------------------
 # build_description
 # ---------------------------------------------------------------------------