vdoc: implement .toplevel_comments parsing mode

pull/5250/head
Delyan Angelov 2020-06-06 18:47:16 +03:00
parent 3aecdeab63
commit 8d3f680d07
7 changed files with 259 additions and 64 deletions

View File

@ -399,7 +399,7 @@ fn (cfg DocConfig) gen_plaintext(idx int) string {
for cn in dcs.contents {
pw.writeln(cn.content)
if cn.comment.len > 0 {
pw.writeln('\n' + cn.comment)
pw.writeln('\n' + '\/\/ ' + cn.comment.trim_space())
}
if cfg.show_loc {
pw.writeln('Location: ${cn.file_path}:${cn.pos.line}:${cn.pos.col}\n\n')
@ -509,7 +509,7 @@ fn (mut cfg DocConfig) generate_docs_from_file() {
dirs := if cfg.is_multi { get_modules_list(cfg.input_path) } else { [cfg.input_path] }
for dirpath in dirs {
cfg.vprintln('Generating docs for ${dirpath}...')
mut dcs := doc.generate(dirpath, cfg.pub_only, !is_vlib) or {
mut dcs := doc.generate(dirpath, cfg.pub_only, true) or {
panic(err)
}
if dcs.contents.len == 0 { continue }

View File

@ -40,24 +40,32 @@ pub mut:
parent_type string = ''
}
pub fn write_comment_bw(stmts []ast.Stmt, start_idx int) string {
pub fn get_comment_block_right_before(stmts []ast.Stmt) string {
if stmts.len == 0 {
return ''
}
mut comment := ''
for i := start_idx; i >= 0; i-- {
mut last_comment_line_nr := 0
for i := stmts.len-1; i >= 0; i-- {
stmt := stmts[i]
if stmt is ast.Comment {
cmt := stmt as ast.Comment
cmt_content := cmt.text.trim_left('|')
comment = cmt_content + if cmt_content.starts_with('```') {
'\n'
} else {
' '
} + comment
} else {
if stmt !is ast.Comment {
panic('Not a comment')
}
if i - 1 >= 0 && !(stmts[i - 1] is ast.Comment) {
break
cmt := stmt as ast.Comment
if last_comment_line_nr != 0 && cmt.pos.line_nr < last_comment_line_nr - 1 {
// skip comments that are not part of a continuous block,
// located right above the top level statement.
// break
}
cmt_content := cmt.text.trim_left('|')
if cmt_content.len == cmt.text.len {
// ignore /* */ style comments for now
continue
}
//eprintln('cmt: $cmt')
cseparator := if cmt_content.starts_with('```') {'\n'} else {' '}
comment = cmt_content + cseparator + comment
last_comment_line_nr = cmt.pos.line_nr
}
return comment
}
@ -145,7 +153,7 @@ pub fn (nodes []DocNode) find_children_of(parent_type string) []DocNode {
fn get_parent_mod(dir string) ?string {
$if windows {
// windows root path is C: or D:
// windows root path is C: or D:
if dir.len <= 2 { return error('root folder reached') }
} $else {
if dir.len == 0 { return error('root folder reached') }
@ -168,9 +176,9 @@ fn get_parent_mod(dir string) ?string {
}
return error('No V files found.')
}
file_ast := parser.parse_file(v_files[0], table.new_table(), .skip_comments, prefs, &ast.Scope{
parent: 0
})
tbl := table.new_table()
scope := &ast.Scope{ parent: 0 }
file_ast := parser.parse_file(v_files[0], tbl, .skip_comments, prefs, scope)
if file_ast.mod.name == 'main' {
return ''
}
@ -196,7 +204,7 @@ pub fn (mut d Doc) generate() ?bool {
// parse files
mut file_asts := []ast.File{}
// TODO: remove later for vlib
comments_mode := if d.with_comments { scanner.CommentsMode.parse_comments } else { scanner.CommentsMode.skip_comments }
comments_mode := if d.with_comments { scanner.CommentsMode.toplevel_comments } else { scanner.CommentsMode.skip_comments }
for file in v_files {
file_ast := parser.parse_file(file, d.table, comments_mode, d.prefs, &ast.Scope{
parent: 0
@ -224,51 +232,69 @@ pub fn (mut d Doc) generate() ?bool {
} else if file_ast.mod.name != orig_mod_name {
continue
}
mut prev_comments := []ast.Stmt{}
stmts := file_ast.stmts
for si, stmt in stmts {
for _, stmt in stmts {
//eprintln('stmt typeof: ' + typeof(stmt))
if stmt is ast.Comment {
prev_comments << stmt
continue
}
if stmt !is ast.Module {
// todo: accumulate consts
mut name := d.get_name(stmt)
signature := d.get_signature(stmt)
pos := d.get_pos(stmt)
if !signature.starts_with('pub') && d.pub_only {
if stmt is ast.Module {
// the previous comments were probably a copyright/license one
module_comment := get_comment_block_right_before(prev_comments)
prev_comments = []
if module_comment == '' {
continue
}
if name.starts_with(orig_mod_name + '.') {
name = name.all_after(orig_mod_name + '.')
if module_comment == d.head.comment {
continue
}
mut node := DocNode{
name: name
content: signature
comment: ''
pos: convert_pos(v_files[i], pos)
file_path: v_files[i]
if d.head.comment != '' {
d.head.comment += '\n'
}
if stmt is ast.FnDecl {
fnd := stmt as ast.FnDecl
if fnd.receiver.typ != 0 {
mut parent_type := d.table.get_type_name(fnd.receiver.typ)
if parent_type.starts_with(module_name + '.') {
parent_type = parent_type.all_after(module_name + '.')
}
node.parent_type = parent_type
d.head.comment += module_comment
continue
}
// todo: accumulate consts
mut name := d.get_name(stmt)
signature := d.get_signature(stmt)
pos := d.get_pos(stmt)
if !signature.starts_with('pub') && d.pub_only {
prev_comments = []
continue
}
if name.starts_with(orig_mod_name + '.') {
name = name.all_after(orig_mod_name + '.')
}
mut node := DocNode{
name: name
content: signature
comment: ''
pos: convert_pos(v_files[i], pos)
file_path: v_files[i]
}
if stmt is ast.FnDecl {
fnd := stmt as ast.FnDecl
if fnd.receiver.typ != 0 {
mut parent_type := d.table.get_type_name(fnd.receiver.typ)
if parent_type.starts_with(module_name + '.') {
parent_type = parent_type.all_after(module_name + '.')
}
node.parent_type = parent_type
}
if node.name.len == 0 && node.comment.len == 0 && node.content.len == 0 { continue }
d.contents << node
}
if d.with_comments && (si - 1 >= 0 && stmts[si - 1] is ast.Comment) {
if stmt is ast.Module {
d.head.comment = write_comment_bw(stmts, si - 1)
} else {
last_comment := d.contents[d.contents.len - 1].comment
d.contents[d.contents.len - 1].comment = last_comment + '\n' + write_comment_bw(stmts,
si - 1)
}
if node.name.len == 0 && node.comment.len == 0 && node.content.len == 0 {
continue
}
d.contents << node
if d.with_comments && (prev_comments.len > 0) {
last_comment := d.contents[d.contents.len - 1].comment
cmt := last_comment + '\n' + get_comment_block_right_before(prev_comments)
d.contents[d.contents.len - 1].comment = cmt
}
prev_comments = []
}
}
d.time_generated = time.now()

View File

@ -71,7 +71,7 @@ pub fn (mut p Parser) call_expr(language table.Language, mod string) ast.CallExp
is_used: true
})
or_kind = .block
or_stmts = p.parse_block_no_scope()
or_stmts = p.parse_block_no_scope(false)
p.close_scope()
p.inside_or_expr = was_inside_or_expr
}
@ -117,6 +117,7 @@ pub fn (mut p Parser) call_args() []ast.CallArg {
}
fn (mut p Parser) fn_decl() ast.FnDecl {
p.top_level_statement_start()
start_pos := p.tok.position()
is_deprecated := p.attr == 'deprecated'
is_pub := p.tok.kind == .key_pub
@ -268,7 +269,7 @@ fn (mut p Parser) fn_decl() ast.FnDecl {
no_body := p.tok.kind != .lcbr
body_start_pos := p.peek_tok.position()
if p.tok.kind == .lcbr {
stmts = p.parse_block_no_scope()
stmts = p.parse_block_no_scope(true)
}
p.close_scope()
p.attr = ''
@ -321,7 +322,7 @@ fn (mut p Parser) anon_fn() ast.AnonFn {
mut stmts := []ast.Stmt{}
no_body := p.tok.kind != .lcbr
if p.tok.kind == .lcbr {
stmts = p.parse_block_no_scope()
stmts = p.parse_block_no_scope(false)
}
p.close_scope()
mut func := table.Fn{

View File

@ -14,12 +14,12 @@ import os
import runtime
import time
// import sync
pub struct Parser {
file_name string // "/home/user/hello.v"
file_name_dir string // "/home/user"
mut:
scanner &scanner.Scanner
comments_mode scanner.CommentsMode = .skip_comments // see comment in parse_file
tok token.Token
prev_tok token.Token
peek_tok token.Token
@ -75,6 +75,11 @@ pub fn parse_stmt(text string, table &table.Table, scope &ast.Scope) ast.Stmt {
}
pub fn parse_file(path string, b_table &table.Table, comments_mode scanner.CommentsMode, pref &pref.Preferences, global_scope &ast.Scope) ast.File {
// NB: when comments_mode == .toplevel_comments,
// the parser gives feedback to the scanner about toplevel statements, so that the scanner can skip
// all the tricky inner comments. This is needed because we do not have a good general solution
// for handling them, and should be removed when we do (the general solution is also needed for vfmt)
// println('parse_file("$path")')
// text := os.read_file(path) or {
// panic(err)
@ -82,6 +87,7 @@ pub fn parse_file(path string, b_table &table.Table, comments_mode scanner.Comme
mut stmts := []ast.Stmt{}
mut p := Parser{
scanner: scanner.new_scanner_file(path, comments_mode)
comments_mode: comments_mode
table: b_table
file_name: path
file_name_dir: os.dir(path)
@ -213,7 +219,10 @@ pub fn parse_files(paths []string, table &table.Table, pref &pref.Preferences, g
return files
}
pub fn (p &Parser) init_parse_fns() {
pub fn (mut p Parser) init_parse_fns() {
if p.comments_mode == .toplevel_comments {
p.scanner.scan_all_tokens_in_buffer()
}
// p.prefix_parse_fns = make(100, 100, sizeof(PrefixParseFn))
// p.prefix_parse_fns[token.Kind.name] = parse_name
}
@ -265,13 +274,13 @@ pub fn (mut p Parser) close_scope() {
pub fn (mut p Parser) parse_block() []ast.Stmt {
p.open_scope()
// println('parse block')
stmts := p.parse_block_no_scope()
stmts := p.parse_block_no_scope(false)
p.close_scope()
// println('nr exprs in block = $exprs.len')
return stmts
}
pub fn (mut p Parser) parse_block_no_scope() []ast.Stmt {
pub fn (mut p Parser) parse_block_no_scope(is_top_level bool) []ast.Stmt {
p.check(.lcbr)
mut stmts := []ast.Stmt{}
if p.tok.kind != .rcbr {
@ -283,6 +292,9 @@ pub fn (mut p Parser) parse_block_no_scope() []ast.Stmt {
}
}
}
if is_top_level {
p.top_level_statement_end()
}
p.check(.rcbr)
return stmts
}
@ -1031,7 +1043,7 @@ fn (mut p Parser) dot_expr(left ast.Expr) ast.Expr {
is_used: true
})
or_kind = .block
or_stmts = p.parse_block_no_scope()
or_stmts = p.parse_block_no_scope(false)
p.close_scope()
}
// `foo()?`
@ -1254,6 +1266,7 @@ fn (mut p Parser) import_stmt() ast.Import {
}
fn (mut p Parser) const_decl() ast.ConstDecl {
p.top_level_statement_start()
start_pos := p.tok.position()
is_pub := p.tok.kind == .key_pub
if is_pub {
@ -1291,6 +1304,7 @@ fn (mut p Parser) const_decl() ast.ConstDecl {
fields << field
p.global_scope.register(field.name, field)
}
p.top_level_statement_end()
p.check(.rpar)
return ast.ConstDecl{
pos: start_pos.extend(end_pos)
@ -1370,6 +1384,7 @@ fn (mut p Parser) global_decl() ast.GlobalDecl {
}
fn (mut p Parser) enum_decl() ast.EnumDecl {
p.top_level_statement_start()
is_pub := p.tok.kind == .key_pub
start_pos := p.tok.position()
if is_pub {
@ -1402,6 +1417,7 @@ fn (mut p Parser) enum_decl() ast.EnumDecl {
has_expr: has_expr
}
}
p.top_level_statement_end()
p.check(.rcbr)
attr := p.attr
is_flag := attr == 'flag'
@ -1563,3 +1579,46 @@ fn (p &Parser) new_true_expr() ast.Expr {
// verror reports a parser error with message `s` via the shared util
// error reporting helper, tagged with the 'parser error' category.
fn verror(s string) {
util.verror('parser error', s)
}
// top_level_statement_start is called when the parser begins parsing a
// top level statement (fn/struct/enum/const/interface declarations).
// In .toplevel_comments mode it marks the scanner as being inside a top
// level statement — while that flag is set, the scanner will NOT emit
// comment tokens (see Scanner.should_parse_comment) — and then refills
// the parser's lookahead buffer so the change takes effect immediately.
fn (mut p Parser) top_level_statement_start() {
if p.comments_mode == .toplevel_comments {
p.scanner.set_is_inside_toplevel_statement(true)
p.rewind_scanner_to_current_token_in_new_mode()
// optional tracing, enabled with `-d debugscanner`
$if debugscanner ? {
eprintln('>> p.top_level_statement_start | tidx:${p.tok.tidx:-5} | p.tok.kind: ${p.tok.kind:-10} | p.tok.lit: $p.tok.lit $p.peek_tok.lit $p.peek_tok2.lit $p.peek_tok3.lit ...')
}
}
}
// top_level_statement_end is called when the parser finishes the body of
// a top level statement. In .toplevel_comments mode it clears the
// scanner's inside-top-level flag — so comment tokens *between* top level
// statements become visible again — and refills the lookahead buffer.
fn (mut p Parser) top_level_statement_end() {
if p.comments_mode == .toplevel_comments {
p.scanner.set_is_inside_toplevel_statement(false)
p.rewind_scanner_to_current_token_in_new_mode()
// optional tracing, enabled with `-d debugscanner`
$if debugscanner ? {
eprintln('>> p.top_level_statement_end | tidx:${p.tok.tidx:-5} | p.tok.kind: ${p.tok.kind:-10} | p.tok.lit: $p.tok.lit $p.peek_tok.lit $p.peek_tok2.lit $p.peek_tok3.lit ...')
}
}
}
// rewind_scanner_to_current_token_in_new_mode re-reads tokens from the
// scanner's pre-scanned buffer so the parser's lookahead window reflects
// the scanner's new comment-visibility mode.
fn (mut p Parser) rewind_scanner_to_current_token_in_new_mode() {
// Go back and rescan some tokens, ensuring that the parser's
// lookahead buffer p.peek_tok .. p.peek_tok3, will now contain
// the correct tokens (possible comments), for the new mode
// This refilling of the lookahead buffer is needed for the
// .toplevel_comments parsing mode.
tidx := p.tok.tidx
// back up 5 token indices — one for each buffered token field cleared
// below (prev_tok, tok, peek_tok, peek_tok2, peek_tok3)
p.scanner.set_current_tidx(tidx - 5)
no_token := token.Token{}
p.prev_tok = no_token
p.tok = no_token
p.peek_tok = no_token
p.peek_tok2 = no_token
p.peek_tok3 = no_token
// advance until p.tok is again the token the parser was positioned on,
// this time scanned under the new mode
for {
p.next()
//eprintln('rewinding to ${p.tok.tidx:5} | goal: ${tidx:5}')
if tidx == p.tok.tidx {
break
}
}
}

View File

@ -9,6 +9,7 @@ import v.token
import v.util
fn (mut p Parser) struct_decl() ast.StructDecl {
p.top_level_statement_start()
start_pos := p.tok.position()
is_pub := p.tok.kind == .key_pub
if is_pub {
@ -162,6 +163,7 @@ fn (mut p Parser) struct_decl() ast.StructDecl {
}
// println('struct field $ti.name $field_name')
}
p.top_level_statement_end()
p.check(.rcbr)
}
if language == .c {
@ -277,6 +279,7 @@ fn (mut p Parser) struct_init(short_syntax bool) ast.StructInit {
}
fn (mut p Parser) interface_decl() ast.InterfaceDecl {
p.top_level_statement_start()
start_pos := p.tok.position()
is_pub := p.tok.kind == .key_pub
if is_pub {
@ -334,6 +337,7 @@ fn (mut p Parser) interface_decl() ast.InterfaceDecl {
is_pub: true
})
}
p.top_level_statement_end()
p.check(.rcbr)
return ast.InterfaceDecl{
name: interface_name

View File

@ -44,12 +44,53 @@ pub mut:
is_fmt bool // Used only for skipping ${} in strings, since we need literal
// string values when generating formatted code.
comments_mode CommentsMode
is_inside_toplvl_statement bool = false // *only* used in comments_mode: .toplevel_comments, toggled by parser
all_tokens []token.Token // *only* used in comments_mode: .toplevel_comments, contains all tokens
tidx int
eofs int
}
/*
How the .toplevel_comments mode works:
In this mode, the scanner scans *everything* at once, before parsing starts,
including all the comments, and stores the results in a buffer s.all_tokens.
Then .scan() just returns s.all_tokens[ s.tidx++ ] *ignoring* the
comment tokens. In other words, by default in this mode, the parser
*will not see any comments* inside top level statements, so it has
no reason to complain about them.
When the parser determines that it is outside of a top level statement,
it tells the scanner to backtrack s.tidx to the current p.tok index,
then it changes .is_inside_toplvl_statement to false , and refills its
lookahead buffer (i.e. p.peek_tok, p.peek_tok2, p.peek_tok3) from the
scanner.
In effect, from the parser's point of view, the next tokens that it will
receive with p.next() will be the same as if comments were not ignored
anymore, *between* top level statements.
When the parser determines that it is going inside a top level
statement again, it does the same, this time setting .is_inside_toplvl_statement
to true, again refilling the lookahead buffer => calling .next() in this
mode, will again ignore all the comment tokens, till the top level statement
is finished.
*/
// The different kinds of scanner modes:
//
// .skip_comments - simplest/fastest, just ignores all comments early.
// This mode is used by the compiler itself.
//
// .parse_comments is used by vfmt. Ideally it should handle inline /* */
// comments too, i.e. it returns every kind of comment as a new token.
//
// .toplevel_comments is used by vdoc, parses *only* top level ones
// that are *outside* structs/enums/fns.
pub enum CommentsMode {
skip_comments
parse_comments
toplevel_comments
}
// new scanner from file.
@ -80,13 +121,32 @@ pub fn new_scanner(text string, comments_mode CommentsMode) &Scanner {
return s
}
fn (s &Scanner) new_token(tok_kind token.Kind, lit string, len int) token.Token {
// should_parse_comment reports whether the scanner should emit the
// current comment as a .comment token instead of silently skipping it.
// Comments are always emitted in .parse_comments mode; in
// .toplevel_comments mode they are emitted only while the parser is
// *between* top level statements.
[inline]
fn (s &Scanner) should_parse_comment() bool {
if s.comments_mode == .parse_comments {
return true
}
return s.comments_mode == .toplevel_comments && !s.is_inside_toplvl_statement
}
// NB: this is called by v's parser
// set_is_inside_toplevel_statement toggles whether the scanner considers
// itself inside a top level statement. While true, comment tokens are
// suppressed in .toplevel_comments mode (see should_parse_comment).
pub fn (mut s Scanner) set_is_inside_toplevel_statement(newstate bool) {
s.is_inside_toplvl_statement = newstate
}
// set_current_tidx repositions the scanner at token index `cidx` inside
// the pre-scanned s.all_tokens buffer, clamping the index to the valid
// range [0, s.all_tokens.len]. Subsequent .scan() calls in
// .toplevel_comments mode will replay tokens from that position.
pub fn (mut s Scanner) set_current_tidx(cidx int) {
if cidx < 0 {
s.tidx = 0
} else if cidx > s.all_tokens.len {
s.tidx = s.all_tokens.len
} else {
s.tidx = cidx
}
}
// new_token constructs a token of the given kind/literal/length at the
// scanner's current position, stamping it with the next sequential token
// index (tidx). The tidx counter is what allows the buffered
// .toplevel_comments mode to rewind and replay tokens deterministically.
fn (mut s Scanner) new_token(tok_kind token.Kind, lit string, len int) token.Token {
cidx := s.tidx
s.tidx++
return token.Token{
kind: tok_kind
lit: lit
line_nr: s.line_nr + 1
pos: s.pos - len + 1
len: len
tidx: cidx
}
}
@ -537,7 +597,51 @@ fn (mut s Scanner) end_of_file() token.Token {
return s.new_token(.eof, '', 1)
}
// scan_all_tokens_in_buffer is used mainly by vdoc, in order to
// implement the .toplevel_comments mode. It scans the *whole* source
// text up front with comments preserved, storing every produced token
// (comments included) in s.all_tokens, then restores the previous
// comments mode and rewinds s.tidx, so that later .scan() calls can
// replay the buffered tokens.
pub fn (mut s Scanner) scan_all_tokens_in_buffer(){
cmode := s.comments_mode
// temporarily force comment parsing, so comment tokens land in the buffer
s.comments_mode = .parse_comments
for {
// fix: `t` was declared `mut` but is never mutated — plain const binding
t := s.text_scan()
s.all_tokens << t
if t.kind == .eof {
break
}
}
s.comments_mode = cmode
s.tidx = 0
// optional tracing, enabled with `-d debugscanner`
$if debugscanner ? {
for t in s.all_tokens {
eprintln('> tidx:${t.tidx:-5} | kind: ${t.kind:-10} | lit: ${t.lit}')
}
}
}
// scan returns the next token. In .toplevel_comments mode the token is
// replayed from the pre-scanned s.all_tokens buffer; in every other mode
// it is produced directly by scanning the source text.
pub fn (mut s Scanner) scan() token.Token {
if s.comments_mode != .toplevel_comments {
return s.text_scan()
}
return s.buffer_scan()
}
// buffer_scan returns the next token from the pre-scanned s.all_tokens
// buffer, skipping comment tokens whenever the current mode says they
// should not be delivered to the parser (see should_parse_comment).
// Once the buffer is exhausted, it keeps returning .eof tokens.
pub fn (mut s Scanner) buffer_scan() token.Token {
for {
idx := s.tidx
s.tidx++
if idx >= s.all_tokens.len {
return s.end_of_file()
}
t := s.all_tokens[idx]
if t.kind == .comment && !s.should_parse_comment() {
continue
}
return t
}
}
fn (mut s Scanner) text_scan() token.Token {
// if s.comments_mode == .parse_comments {
// println('\nscan()')
// }
@ -972,7 +1076,7 @@ pub fn (mut s Scanner) scan() token.Token {
// fix line_nr, \n was read, and the comment is marked
// on the next line
s.line_nr--
if s.comments_mode == .parse_comments {
if s.should_parse_comment() {
// Find out if this comment is on its own line (for vfmt)
mut is_separate_line_comment := true
for j := start-2; j >= 0 && s.text[j] != `\n`; j-- {
@ -1013,7 +1117,7 @@ pub fn (mut s Scanner) scan() token.Token {
}
}
s.pos++
if s.comments_mode == .parse_comments {
if s.should_parse_comment() {
comment := s.text[start..(s.pos - 1)].trim_space()
return s.new_token(.comment, comment, comment.len + 4)
}

View File

@ -11,6 +11,7 @@ pub:
// name_idx int // name table index for O(1) lookup
pos int // the position of the token in scanner text
len int // length of the literal
tidx int // the index of the token
}
pub enum Kind {