feat(parser): support multiple tables in a single markdown document
- Add extract_table_blocks() to split a document into contiguous table blocks, ignoring prose, headings, and blank lines between them
- Add parse_document() as the new top-level entry point; it runs extract_table_blocks(), then detect_has_duration_column() and parse_table() per block, and returns a combined flat list of rows
- Guard against empty End cells (e.g. in-progress rows) by validating the End field before calculating duration
- Update cli.py to use parse_document() instead of the manual detect + parse combination
- Add tests for extract_table_blocks() and parse_document(), including two smoke tests against the real 2026-W21 weekly timesheet file
This commit is contained in:
parent
7bea08ddac
commit
d6689a6c83
4 changed files with 295 additions and 9 deletions
|
|
@ -4,7 +4,7 @@ import sys
|
|||
from datetime import date
|
||||
|
||||
from .output import print_summary, write_csv
|
||||
from .parser import aggregate_rows, detect_has_duration_column, parse_table
|
||||
from .parser import aggregate_rows, parse_document
|
||||
from .projects import load_project_map
|
||||
from .utils import format_date
|
||||
|
||||
|
|
@ -18,7 +18,8 @@ def build_parser() -> argparse.ArgumentParser:
|
|||
help="Path to the markdown file containing the timesheet table, or '-' to read from stdin.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o", "--output",
|
||||
"-o",
|
||||
"--output",
|
||||
help="Path to the output CSV file. Defaults to stdout.",
|
||||
default=None,
|
||||
)
|
||||
|
|
@ -59,7 +60,7 @@ def main() -> None:
|
|||
sys.exit(1)
|
||||
|
||||
lines = content.splitlines()
|
||||
rows = parse_table(lines, has_duration_col=detect_has_duration_column(lines))
|
||||
rows = parse_document(lines)
|
||||
|
||||
if not rows:
|
||||
print("Warning: no timesheet rows found in input.", file=sys.stderr)
|
||||
|
|
|
|||
|
|
@ -4,10 +4,21 @@ from collections import defaultdict
|
|||
from .utils import duration_from_start_end, parse_duration, strip_markdown_link
|
||||
|
||||
|
||||
def _is_table_line(line: str) -> bool:
|
||||
"""Return True if the line looks like part of a markdown table."""
|
||||
s = line.strip()
|
||||
return s.startswith("|") and s.endswith("|")
|
||||
|
||||
|
||||
def _is_separator_line(line: str) -> bool:
|
||||
"""Return True if the line is a markdown table separator (|---|---|)."""
|
||||
return bool(re.match(r"^\|[-| :]+\|$", line.strip()))
|
||||
|
||||
|
||||
def detect_has_duration_column(lines: list[str]) -> bool:
|
||||
"""
|
||||
Inspect the header row to determine whether a Duration column is present.
|
||||
Falls back to True if no header row is found.
|
||||
Inspect the header row of a table block to determine whether a Duration
|
||||
column is present. Falls back to True if no header row is found.
|
||||
"""
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
|
|
@ -19,9 +30,35 @@ def detect_has_duration_column(lines: list[str]) -> bool:
|
|||
return True
|
||||
|
||||
|
||||
def extract_table_blocks(lines: list[str]) -> list[list[str]]:
    """
    Collect the markdown tables found in a document as separate blocks.

    Consecutive lines accepted by ``_is_table_line`` are grouped into one
    run; any other line (heading, prose, bullet point, blank line) ends the
    run. Runs shorter than two lines are dropped, since a table needs at
    least a header and a separator. Only the run length is checked here,
    not whether a separator line is actually present. Lines are returned
    unmodified, in document order.
    """
    min_block_len = 2  # at minimum: header + separator
    found: list[list[str]] = []
    run: list[str] = []

    # The trailing empty string is a non-table sentinel: it flushes a run
    # that extends to the very end of the input.
    for candidate in [*lines, ""]:
        if _is_table_line(candidate):
            run.append(candidate)
            continue
        if len(run) >= min_block_len:
            found.append(run)
        run = []

    return found
|
||||
|
||||
|
||||
def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
|
||||
"""
|
||||
Parse markdown table lines into a list of row dicts.
|
||||
Parse a single markdown table block into a list of row dicts.
|
||||
|
||||
With duration: Start | End | Duration | Project | Story | Note (6 cols)
|
||||
Without duration: Start | End | Project | Story | Note (5 cols)
|
||||
|
|
@ -31,7 +68,7 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
|
|||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line or re.match(r"^\|[-| :]+\|$", line):
|
||||
if not line or _is_separator_line(line):
|
||||
continue
|
||||
if not (line.startswith("|") and line.endswith("|")):
|
||||
continue
|
||||
|
|
@ -42,13 +79,18 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
|
|||
|
||||
if has_duration_col:
|
||||
start, end, duration, project, story, note = (
|
||||
cells[0], cells[1], cells[2], cells[3],
|
||||
cells[0],
|
||||
cells[1],
|
||||
cells[2],
|
||||
cells[3],
|
||||
strip_markdown_link(cells[4]),
|
||||
strip_markdown_link(cells[5]),
|
||||
)
|
||||
else:
|
||||
start, end, project, story, note = (
|
||||
cells[0], cells[1], cells[2],
|
||||
cells[0],
|
||||
cells[1],
|
||||
cells[2],
|
||||
strip_markdown_link(cells[3]),
|
||||
strip_markdown_link(cells[4]),
|
||||
)
|
||||
|
|
@ -58,6 +100,8 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
|
|||
continue
|
||||
if not re.match(r"^\d+:\d{2}$", start):
|
||||
continue
|
||||
if not re.match(r"^\d+:\d{2}$", end):
|
||||
continue
|
||||
|
||||
if duration is not None:
|
||||
if not re.match(r"^\d+:\d{2}$", duration):
|
||||
|
|
@ -83,6 +127,20 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]:
|
|||
return rows
|
||||
|
||||
|
||||
def parse_document(lines: list[str]) -> list[dict]:
    """
    Parse every timesheet table contained in a markdown document.

    The document is split into table blocks with ``extract_table_blocks``.
    Each block has its column layout detected on its own via
    ``detect_has_duration_column`` and is then handed to ``parse_table``.
    The rows of all blocks are returned as one flat list, in document order.
    """
    return [
        row
        for block in extract_table_blocks(lines)
        for row in parse_table(
            block,
            has_duration_col=detect_has_duration_column(block),
        )
    ]
|
||||
|
||||
|
||||
def build_description(story: str, note: str) -> str:
|
||||
"""Combine story and note into a single description string."""
|
||||
parts = [p.strip() for p in [story, note] if p.strip()]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue