From d6689a6c83f1b0062cef77e1e704b9e6591a960e Mon Sep 17 00:00:00 2001 From: Jef Roosens Date: Fri, 22 May 2026 10:17:17 +0200 Subject: [PATCH] feat(parser): support multiple tables in a single markdown document - Add extract_table_blocks() to split a document into contiguous table blocks, ignoring prose, headings, and blank lines between them - Add parse_document() as the new top-level entry point that runs extract_table_blocks + detect_has_duration_column + parse_table per block and returns a combined flat list of rows - Guard against empty End cells (e.g. in-progress rows) by validating the end field before calculating duration - Update cli.py to use parse_document() instead of the manual detect + parse combo - Add tests for extract_table_blocks and parse_document, including two smoke tests against the real 2026-W21 weekly timesheet file --- src/timesheets/cli.py | 7 +-- src/timesheets/parser.py | 70 +++++++++++++++++++++--- tests/2026 - W21.md | 113 ++++++++++++++++++++++++++++++++++++++ tests/test_parser.py | 114 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 295 insertions(+), 9 deletions(-) create mode 100644 tests/2026 - W21.md diff --git a/src/timesheets/cli.py b/src/timesheets/cli.py index 97dbe93..dc51a33 100644 --- a/src/timesheets/cli.py +++ b/src/timesheets/cli.py @@ -4,7 +4,7 @@ import sys from datetime import date from .output import print_summary, write_csv -from .parser import aggregate_rows, detect_has_duration_column, parse_table +from .parser import aggregate_rows, parse_document from .projects import load_project_map from .utils import format_date @@ -18,7 +18,8 @@ def build_parser() -> argparse.ArgumentParser: help="Path to the markdown file containing the timesheet table, or '-' to read from stdin.", ) parser.add_argument( - "-o", "--output", + "-o", + "--output", help="Path to the output CSV file. Defaults to stdout.", default=None, ) @@ -59,7 +60,7 @@ def main() -> None: sys.exit(1) lines = content.splitlines() - rows = parse_table(lines, has_duration_col=detect_has_duration_column(lines)) + rows = parse_document(lines) if not rows: print("Warning: no timesheet rows found in input.", file=sys.stderr) diff --git a/src/timesheets/parser.py b/src/timesheets/parser.py index eadc9f5..6ac3ae5 100644 --- a/src/timesheets/parser.py +++ b/src/timesheets/parser.py @@ -4,10 +4,21 @@ from collections import defaultdict from .utils import duration_from_start_end, parse_duration, strip_markdown_link +def _is_table_line(line: str) -> bool: + """Return True if the line looks like part of a markdown table.""" + s = line.strip() + return s.startswith("|") and s.endswith("|") + + +def _is_separator_line(line: str) -> bool: + """Return True if the line is a markdown table separator (|---|---|).""" + return bool(re.match(r"^\|[-| :]+\|$", line.strip())) + + def detect_has_duration_column(lines: list[str]) -> bool: """ - Inspect the header row to determine whether a Duration column is present. - Falls back to True if no header row is found. + Inspect the header row of a table block to determine whether a Duration + column is present. Falls back to True if no header row is found. """ for line in lines: line = line.strip() @@ -19,9 +30,35 @@ def detect_has_duration_column(lines: list[str]) -> bool: return True +def extract_table_blocks(lines: list[str]) -> list[list[str]]: + """ + Split a markdown document into contiguous table blocks. + + A block is a maximal run of lines that are either table rows or table + separators. Non-table lines (headings, prose, bullet points, blank lines) + break a block. Each returned block contains at least a header and a + separator line; shorter runs are discarded. + """ + blocks: list[list[str]] = [] + current: list[str] = [] + + for line in lines: + if _is_table_line(line): + current.append(line) + else: + if len(current) >= 2: # at minimum: header + separator + blocks.append(current) + current = [] + + if len(current) >= 2: + blocks.append(current) + + return blocks + + def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]: """ - Parse markdown table lines into a list of row dicts. + Parse a single markdown table block into a list of row dicts. With duration: Start | End | Duration | Project | Story | Note (6 cols) Without duration: Start | End | Project | Story | Note (5 cols) @@ -31,7 +68,7 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]: for line in lines: line = line.strip() - if not line or re.match(r"^\|[-| :]+\|$", line): + if not line or _is_separator_line(line): continue if not (line.startswith("|") and line.endswith("|")): continue @@ -42,13 +79,18 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]: if has_duration_col: start, end, duration, project, story, note = ( - cells[0], cells[1], cells[2], cells[3], + cells[0], + cells[1], + cells[2], + cells[3], strip_markdown_link(cells[4]), strip_markdown_link(cells[5]), ) else: start, end, project, story, note = ( - cells[0], cells[1], cells[2], + cells[0], + cells[1], + cells[2], strip_markdown_link(cells[3]), strip_markdown_link(cells[4]), ) @@ -58,6 +100,8 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]: continue if not re.match(r"^\d+:\d{2}$", start): continue + if not re.match(r"^\d+:\d{2}$", end): + continue if duration is not None: if not re.match(r"^\d+:\d{2}$", duration): @@ -83,6 +127,20 @@ def parse_table(lines: list[str], has_duration_col: bool = True) -> list[dict]: return rows +def parse_document(lines: list[str]) -> list[dict]: + """ + Parse all timesheet tables found in a markdown document. + + Extracts every table block, detects its column layout independently, + and returns the combined flat list of all parsed rows. + """ + rows = [] + for block in extract_table_blocks(lines): + has_duration_col = detect_has_duration_column(block) + rows.extend(parse_table(block, has_duration_col=has_duration_col)) + return rows + + def build_description(story: str, note: str) -> str: """Combine story and note into a single description string.""" parts = [p.strip() for p in [story, note] if p.strip()] diff --git a/tests/2026 - W21.md b/tests/2026 - W21.md new file mode 100644 index 0000000..2246825 --- /dev/null +++ b/tests/2026 - W21.md @@ -0,0 +1,113 @@ +# Week of 2026-05-18 - Review + +# Vrijdag - 2026-05-22 + +| Start | End | Project | Story | Note | +|-------|-------|----------|------------------------------------------------------------------------------------------------------------------------------------------------|-----------------| +| 08:15 | 09:30 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | rebase | +| 09:30 | 09:45 | bugs | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc) | review | +| 09:45 | 09:55 | scrum | | daily standup | +| 09:55 | | bugs | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc) | review | +| 16:00 | 16:30 | internal | | Growth Path 2.0 | +| 16:30 | 17:30 | internal | | Factry Flow | + +- [ ] Triage [Inbox](:/3a994fed3dc746a59d71c9f7ab1f60bc) +- [x] Process `distill` tags +- [ ] Process `refine` tags +- [ ] Refine [2026-05-26 - Sprint Retro](:/49951990900947438b80007b2d21b228) +- [ ] re-review [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc) + +ge moogtr de big REST API MR rebasen op develop, good luck, `git rebase --onto origin/develop -i 027b7cc3b`, er staan paar fixup commits klaar al + +# Donderdag - 2026-05-21 + +| Start | End | Project | Story | Note | +|-------|-------|---------|------------------------------------------------------------------------------------------------------------------------------------------------|---------------------| +| 08:15 | 08:20 | office | | koffie | +| 08:20 | 09:35 | bugs | Bug 35326: OpenAPI: query parameters should be case-insensitive (canonical = lowercase) | review | +| 09:35 | 09:45 | hatch | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275) | | +| 09:45 | 10:00 | scrum | | daily standup | +| 10:00 | 10:20 | hatch | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275) | | +| 10:20 | 10:50 | refine | PBI 35330: Calculation script errors should not be in Sentry? Or add a tag we can filter by? | | +| 10:50 | 11:15 | product | | Claude code CLI | +| 11:15 | 12:05 | refine | PBI 35330: Calculation script errors should not be in Sentry? Or add a tag we can filter by? | | +| 12:45 | 13:40 | bugs | | review race bugs MR | +| 13:40 | 14:30 | bugs | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc) | review | +| 14:30 | 15:00 | bugs | Bug 35331: Clients hitting sql max open connections | | +| 15:00 | 15:25 | refine | PBI 35330: Calculation script errors should not be in Sentry? Or add a tag we can filter by? | respond to comments | +| 15:25 | 15:30 | bugs | Bug 35331: Clients hitting sql max open connections | | +| 15:30 | 16:30 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | respond to comments | + +- [ ] Uitzoeken hoeveel audit log info er verloren is gegaan door de REST api migratie (`GetDatabase` -> `GetDatabaseByOrganizationUUID`) +- [ ] kijken of GetEvents property value filter nu broken is met datasource door remodelling in openapi spec + +# Woensdag - 2026-05-20 + +| Start | End | Project | Story | Note | +|-------|-------|---------|------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------| +| 08:15 | 08:25 | office | | koffie, sleutels zoeken | +| 08:25 | 09:00 | bugs | Bug 34948: Bulk measurement update returns 500 on failed validation | | +| 09:00 | 09:15 | bugs | Bug 35238: DestroySink does not listen to the 'force' query-parameter | | +| 09:15 | 09:50 | bugs | | CPU usage spikes in statistics collection | +| 09:50 | 10:00 | bugs | Bug 35241: OpenAPI: timeseries query result schema breaks JSON round-trip for scalar values, arrays, and string tags | review | +| 10:00 | 10:30 | bugs | Bug 35238: DestroySink does not listen to the 'force' query-parameter | | +| 10:30 | 12:00 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | | +| 12:40 | 13:30 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | trying to remove the wrapper | +| 13:30 | 13:45 | bugs | Bug 34948: Bulk measurement update returns 500 on failed validation | respond to comments | +| 13:45 | 14:45 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | trying to remove the wrapper | +| 14:45 | 15:00 | bugs | Bug 34948: Bulk measurement update returns 500 on failed validation | respond to comments | +| 15:00 | 15:15 | bugs | Bug 35238: DestroySink does not listen to the 'force' query-parameter | respond to comments | +| 15:15 | 15:25 | hatch | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275) | | +| 15:25 | 15:45 | bugs | Bug 35326: OpenAPI: query parameters should be case-insensitive (canonical = lowercase) | review | +| 15:45 | 16:00 | product | | CI Evelien fixen | +| 16:00 | 16:20 | bugs | Bug 35326: OpenAPI: query parameters should be case-insensitive (canonical = lowercase) | review | +| 16:20 | 17:00 | hatch | [PBI 35098: SPIKE: Gitlab MR previews apps](:/eb90c72a90e746d8b535dda26e8e7275) | | + + +- [x] fix CPU usage spikes in statistics collection ([thread](https://factrylabs.slack.com/archives/C01TD6M694G/p1779112687422719)) + * idee: maak een global statistics variable aan, guarded met een mutex en getters/setters + * hou bij hoe oud die is, if too old -> recollect, anders geef gwn de cached versie mee + * dan gebeurt collection 1 keer en gebruiken alle loops gwn the same data + +# Dinsdag - 2026-05-19 + +| Start | End | Project | Story | Note | +|-------|-------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------| +| 08:10 | 08:15 | office | | coffee | +| 08:15 | 09:45 | rate | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | respond to comments | +| 09:45 | 09:55 | scrum | | daily standup | +| 09:55 | 10:30 | rate | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | rebase | +| 10:30 | 12:05 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | fix tests | +| 12:55 | 13:05 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | fix tests | +| 13:05 | 14:10 | bugs | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc) | review | +| 14:10 | 14:45 | rate | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | final comments | +| 14:45 | 15:15 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | fix tests | +| 15:15 | 15:45 | bugs | [Bug 35232: Biiiig asset tree import is slow](:/9e12b149a19b4c789b02479d7e3412bc) | review | +| 15:45 | 16:10 | rate | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | rebase | +| 16:10 | 16:30 | rate | [PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0](:/76f097bbb7854fd086e41ebc9132898f) | respond to duo review | +| 16:30 | 17:10 | bugs | Bug 34948: Bulk measurement update returns 500 on failed validation | | + + +# Maandag - 2026-05-18 + +| Start | End | Project | Story | Note | +|-------|-------|----------|---------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------| +| 08:10 | 08:15 | office | | coffee | +| 08:15 | 09:45 | rate | PBI 34972: refactor(rest): migrate events and event configuration handlers to OpenAPI 3.0 | respond to comments | +| 09:45 | 10:05 | scrum | | daily standup | +| 10:05 | 10:25 | rate | PBI 34972: refactor(rest): migrate events and event configuration handlers to OpenAPI 3.0 | respond to comments | +| 10:25 | 11:45 | rate | PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0 | fix tests | +| 11:45 | 12:05 | bugs | | review race bugs MR | +| 12:40 | 13:05 | bugs | | review race bugs MR | +| 13:05 | 13:40 | internal | | claude code proberen instellen | +| 13:40 | 13:55 | rate | PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0 | CI debugging | +| 13:55 | 14:50 | rate | PBI 34976: refactor(rest): migrate manual entry, manual entry form and prototype handlers to OpenAPI 3.0 | fix tests | +| 14:50 | 15:15 | refine | PBI 34321: Update asset from prototype: add 'Keep all' / 'Update all' bulk action buttons | | +| 15:30 | 15:35 | bugs | Bug 34916: GetAssets endpoint fails with "invalid escape \ sequence" when path filter contains backslashes | | +| 15:35 | 15:45 | refine | PBI 34321: Update asset from prototype: add 'Keep all' / 'Update all' bulk action buttons | | +| 15:45 | 17:00 | rate | [PBI 34974: refactor(rest): migrate collectors, sinks, forwarders, tasks and related handlers to OpenAPI 3.0](:/9b4d4a0384be4ec3b1bbbcab68640721) | respond to comments | + +* laatste rest refactor MR nog afwerken + * gij hebt de 3 laatste op uw naam, dus gij kunt de wrapper uiteindelijk wegdoen + * moet nog verder opkuisen wa claude gedaan heeft + * nog wa testen da falen ook diff --git a/tests/test_parser.py b/tests/test_parser.py index 2691c6c..176dfd4 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,9 +1,13 @@ +import os + import pytest from timesheets.parser import ( aggregate_rows, build_description, detect_has_duration_column, + extract_table_blocks, + parse_document, parse_table, ) @@ -26,6 +30,8 @@ WITHOUT_DURATION = [ "| 08:30 | 09:15 | scrum | | dsu |", ] +WEEK_FILE = os.path.join(os.path.dirname(__file__), "2026 - W21.md") + # --------------------------------------------------------------------------- # detect_has_duration_column @@ -47,6 +53,44 @@ class TestDetectHasDurationColumn: assert detect_has_duration_column(lines) is True +# --------------------------------------------------------------------------- +# extract_table_blocks +# --------------------------------------------------------------------------- + + +class TestExtractTableBlocks: + def test_single_table(self): + blocks = extract_table_blocks(WITH_DURATION) + assert len(blocks) == 1 + assert blocks[0] == WITH_DURATION + + def test_two_tables_separated_by_prose(self): + lines = WITH_DURATION + ["", "# Next day", "some prose", ""] + WITHOUT_DURATION + blocks = extract_table_blocks(lines) + assert len(blocks) == 2 + + def test_prose_between_tables_not_included(self): + lines = WITH_DURATION + ["some note"] + WITHOUT_DURATION + blocks = extract_table_blocks(lines) + assert len(blocks) == 2 + assert all("some note" not in b for b in blocks) + + def test_single_line_table_discarded(self): + lines = ["| Start | End |"] + assert extract_table_blocks(lines) == [] + + def test_empty_input(self): + assert extract_table_blocks([]) == [] + + def test_no_tables(self): + assert extract_table_blocks(["# heading", "", "prose"]) == [] + + def test_table_at_end_of_file_captured(self): + lines = ["# heading", ""] + WITH_DURATION # no trailing newline + blocks = extract_table_blocks(lines) + assert len(blocks) == 1 + + # --------------------------------------------------------------------------- # parse_table # --------------------------------------------------------------------------- @@ -92,6 +136,14 @@ class TestParseTable: ] assert parse_table(lines) == [] + def test_empty_end_time_row_skipped(self): + lines = [ + "| Start | End | Project | Story | Note |", + "|-------|-------|---------|-------|------|", + "| 09:55 | | bugs | | |", + ] + assert parse_table(lines, has_duration_col=False) == [] + def test_empty_input(self): assert parse_table([]) == [] @@ -101,6 +153,68 @@ class TestParseTable: assert len(rows) == 3 +# --------------------------------------------------------------------------- +# parse_document +# --------------------------------------------------------------------------- + + +class TestParseDocument: + def test_single_table(self): + rows = parse_document(WITHOUT_DURATION) + assert len(rows) == 2 + + def test_multiple_tables_combined(self): + lines = WITHOUT_DURATION + ["", "# Next day", ""] + WITHOUT_DURATION + rows = parse_document(lines) + assert len(rows) == 4 + + def test_prose_between_tables_ignored(self): + lines = ( + WITHOUT_DURATION + ["some notes", "- a bullet point", ""] + WITHOUT_DURATION + ) + rows = parse_document(lines) + assert len(rows) == 4 + + def test_mixed_duration_formats(self): + lines = WITH_DURATION + ["", "## Next day", ""] + WITHOUT_DURATION + rows = parse_document(lines) + assert len(rows) == 5 # 3 from WITH_DURATION + 2 from WITHOUT_DURATION + + def test_empty_input(self): + assert parse_document([]) == [] + + def test_week_file(self): + """Smoke test against the real W21 weekly timesheet file.""" + with open(WEEK_FILE, encoding="utf-8") as f: + lines = f.read().splitlines() + rows = parse_document(lines) + # File has 5 daily tables; expect a healthy number of rows + assert len(rows) > 20 + # All rows must have expected keys + for row in rows: + assert "project" in row + assert "duration_hours" in row + assert row["duration_hours"] > 0 + # The incomplete row (09:55 | empty end) must have been skipped + incomplete = [ + r for r in rows if r["start"] == "09:55" and r["project"] == "bugs" + ] + assert all(r["duration_hours"] > 0 for r in incomplete) + + def test_week_file_no_markdown_links_in_stories(self): + """Markdown link syntax must be stripped from story/note fields.""" + with open(WEEK_FILE, encoding="utf-8") as f: + lines = f.read().splitlines() + rows = parse_document(lines) + for row in rows: + assert "](:" not in row["story"], ( + f"Link not stripped in story: {row['story']!r}" + ) + assert "](:" not in row["note"], ( + f"Link not stripped in note: {row['note']!r}" + ) + + # --------------------------------------------------------------------------- # build_description # ---------------------------------------------------------------------------