feat(parser): support multiple tables in a single markdown document

- Add extract_table_blocks() to split a document into contiguous table
  blocks, ignoring prose, headings, and blank lines between them
- Add parse_document() as the new top-level entry point that runs
  extract_table_blocks + detect_has_duration_column + parse_table per
  block and returns a combined flat list of rows
- Guard against empty End cells (e.g. in-progress rows) by validating the
  end field before calculating duration
- Update cli.py to use parse_document() instead of the manual detect +
  parse combo
- Add tests for extract_table_blocks and parse_document, including two
  smoke tests against the real 2026-W21 weekly timesheet file
2026-05-22 10:17:17 +02:00 · 2026-05-22 10:17:17 +02:00 · d6689a6c83
commit d6689a6c83
parent 7bea08ddac
4 changed files with 295 additions and 9 deletions
--- a/src/timesheets/cli.py
+++ b/src/timesheets/cli.py
@ -4,7 +4,7 @@ import sys
 from datetime import date

 from .output import print_summary, write_csv
-from .parser import aggregate_rows, detect_has_duration_column, parse_table
+from .parser import aggregate_rows, parse_document
 from .projects import load_project_map
 from .utils import format_date

@ -18,7 +18,8 @@ def build_parser() -> argparse.ArgumentParser:
        help="Path to the markdown file containing the timesheet table, or '-' to read from stdin.",
    )
    parser.add_argument(
-        "-o", "--output",
+        "-o",
+        "--output",
        help="Path to the output CSV file. Defaults to stdout.",
        default=None,
    )
@ -59,7 +60,7 @@ def main() -> None:
            sys.exit(1)

    lines = content.splitlines()
-    rows = parse_table(lines, has_duration_col=detect_has_duration_column(lines))
+    rows = parse_document(lines)

    if not rows:
        print("Warning: no timesheet rows found in input.", file=sys.stderr)
--- a/src/timesheets/parser.py
+++ b/src/timesheets/parser.py
@ -4,10 +4,21 @@ from collections import defaultdict
 from .utils import duration_from_start_end, parse_duration, strip_markdown_link


+def _is_table_line(line: str) -> bool:
+    """Return True if the line looks like part of a markdown table."""
+    s = line.strip()
+    return s.startswith("|") and s.endswith("|")
+
+
+def _is_separator_line(line: str) -> bool:
+    """Return True if the line is a markdown table separator (|---|---|)."""
+    return bool(re.match(r"^\|[-| :]+\|$", line.strip()))
+
+
 def detect_has_duration_column(lines: list[str]) -> bool:
    """
-    Inspect the header row to determine whether a Duration column is present.
-    Falls back to True if no header row is found.
+    Inspect the header row of a table block to determine whether a Duration
+    column is present. Falls back to True if no header row is found.
    """
    for line in lines:
        line = line.strip()
@ -19,9 +30,35 @@ def detect_has_duration_column(lines: list[str]) -> bool:
    return True


+def extract_table_blocks(lines: list[str]) -> list[list[str]]:
+    """
+    Split a markdown document into contiguous table blocks.
+
+    A block is a maximal run of lines that are either table rows or table
+    separators. Non-table lines (headings, prose, bullet points, blank lines)
+    break a block. Each returned block contains at least a header and a
+    separator line; shorter runs are discarded.
+    """
+    blocks: list[list[str]] = []
+    current: list[str] = []
+
+    for line in lines:
+        if _is_table_line(line):
+            current.append(line)
+        else:
+            if len(current) >= 2:  # at minimum: header + separator
+                blocks.append(current)
+            current = []
+
+    if len(current) >= 2:
+        blocks.append(current)
+
+    return blocks
+
+
 def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
    """
-    Parse markdown table lines into a list of row dicts.
+    Parse a single markdown table block into a list of row dicts.

    With duration:    Start | End | Duration | Project | Story | Note  (6 cols)
    Without duration: Start | End | Project  | Story   | Note         (5 cols)
@ -31,7 +68,7 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:

    for line in lines:
        line = line.strip()
-        if not line or re.match(r"^\|[-| :]+\|$", line):
+        if not line or _is_separator_line(line):
            continue
        if not (line.startswith("|") and line.endswith("|")):
            continue
@ -42,13 +79,18 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:

        if has_duration_col:
            start, end, duration, project, story, note = (
-                cells[0], cells[1], cells[2], cells[3],
+                cells[0],
+                cells[1],
+                cells[2],
+                cells[3],
                strip_markdown_link(cells[4]),
                strip_markdown_link(cells[5]),
            )
        else:
            start, end, project, story, note = (
-                cells[0], cells[1], cells[2],
+                cells[0],
+                cells[1],
+                cells[2],
                strip_markdown_link(cells[3]),
                strip_markdown_link(cells[4]),
            )
@ -58,6 +100,8 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
            continue
        if not re.match(r"^\d+:\d{2}$", start):
            continue
+        if not re.match(r"^\d+:\d{2}$", end):
+            continue

        if duration is not None:
            if not re.match(r"^\d+:\d{2}$", duration):
@ -83,6 +127,20 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
    return rows


+def parse_document(lines: list[str]) -> list[dict]:
+    """
+    Parse all timesheet tables found in a markdown document.
+
+    Extracts every table block, detects its column layout independently,
+    and returns the combined flat list of all parsed rows.
+    """
+    rows = []
+    for block in extract_table_blocks(lines):
+        has_duration_col = detect_has_duration_column(block)
+        rows.extend(parse_table(block, has_duration_col=has_duration_col))
+    return rows
+
+
 def build_description(story: str, note: str) -> str:
    """Combine story and note into a single description string."""
    parts = [p.strip() for p in [story, note] if p.strip()]