net.html: create html parser module (#6076)

2020-08-08 23:13:34 -03:00 · 2020-08-08 23:13:34 -03:00 · f6a85d5305
parent 97fec78b40
commit f6a85d5305
7 changed files with 894 additions and 0 deletions
--- a/vlib/net/html/README.md
+++ b/vlib/net/html/README.md
@ -0,0 +1,109 @@
+# V HTML
+
+An HTML parser written in V
+
+## Usage
+
+If the description below isn't enough, see the test files
+
+### Parser
+
+Responsible for reading HTML, either as one full string or as split fragments. It returns all the Tag objects of that HTML, or a DocumentObjectModel that tries to work out what the HTML tree looks like.
+
+#### split_parse(data string)
+This is the main function called by the parse method; it lets you parse your HTML in fragments
+
+#### parse_html(data string, is_file bool)
+Call this function with either a filename or a complete HTML string; `is_file` tells it which one `data` is
+
+#### add_code_tag(name string)
+This function registers a tag whose content the parser should not interpret. For example, if you have HTML or XML with a custom tag, like `<script>`, calling `add_code_tag('script')` makes the parser skip over the content of every `script` tag, so you still get the content, but its `>` or `<` characters will not confuse the parser
+
+#### finalize()
+When using the **split_parse** method, you must call this function to finish the parse completely
+
+#### get_tags() []Tag_ptr
+This function returns an array with all tags and their content
+
+#### get_dom() DocumentObjectModel
+Returns the DocumentObjectModel for current parsed tags
+
+### WARNING
+If you want to reuse parser object to parse another HTML, call `initialize_all()` function first
+
+### DocumentObjectModel
+
+A DOM object that makes it easier to access and search tags
+
+#### get_by_attribute_value(name string, value string) []Tag_ptr
+This function returns a Tag array with all tags in the document that have an attribute with the given name and the given value
+
+#### get_by_tag(name string) []Tag_ptr
+This function returns a Tag array with all tags in the document whose name is the given value
+
+#### get_by_attribute(name string) []Tag_ptr
+This function returns a Tag array with all tags in the document that have an attribute with the given name
+
+#### get_root() Tag_ptr
+This function returns the root Tag
+
+#### get_all_tags() []Tag_ptr
+This function returns all important tags, removing close tags
+
+### Tag
+
+An object that holds tags information, such as `name`, `attributes`, `children`
+
+#### get_children() []Tag_ptr
+Returns all children as an array
+
+#### get_parent() &Tag
+Returns the parent of current tag
+
+#### get_name() string
+Returns tag name
+
+#### get_content() string
+Returns tag content
+
+#### get_attributes() map[string]string
+Returns all attributes and their values
+
+#### text() string
+Returns the content of the tag and all tags inside it. Also, any `<br>` tag will be converted into `\n`
+
+## Some questions that can appear
+
+### Q: Why does the parser have a `builder_str() string` method that returns only the lexeme string?
+    
+A: Because in the early stages of the project `strings.Builder` was used, but because of a bug somewhere it became necessary to use a string directly. The plan is to use `strings.Builder` again later
+
+### Q: Why is there a `compare_string(a string, b string) bool` method?
+
+A: For some reason, using `!=` and `==` on strings directly was not working. So, this method is a workaround
+
+### Q: Will there be something like `XPath`?
+
+A: Something like XPath, yes. Exactly the same as it, no.
+
+## Roadmap
+- [x] Parser
+  - [x] `<!-- Comments -->` detection
+  - [x] `Open Generic tags` detection
+  - [x] `Close Generic tags` detection
+  - [x] `verify string` detection
+  - [x] `tag attributes` detection
+  - [x] `attributes values` detection
+  - [x] `tag text` (on tag it is declared as content, maybe change for text in the future)
+  - [x] `text file for parse` support (open local files for parsing)
+  - [x] `open_code` verification
+- [x] DocumentObjectModel
+  - [x] push elements that have a close tag into stack
+  - [x] remove elements from stack
+  - [x] ~~create a new document root if have some syntax error (deleted)~~
+  - [x] search tags in `DOM` by attributes
+  - [x] search tags in `DOM` by tag type
+  - [x] finish dom test
+
+## License
+[GPL3](LICENSE)
--- a/vlib/net/html/data_structures.v
+++ b/vlib/net/html/data_structures.v
@ -0,0 +1,88 @@
+module html
+
+#include <limits.h>
+// Stack is a last-in/first-out stack of ints.
+// `size` is the logical number of elements; `elements` is the backing storage
+// and may be longer than `size`, because pop only decrements `size` and push
+// reuses the freed slots before appending.
+struct Stack {
+	null_element int = C.INT_MIN // sentinel returned by peek/pop when the stack is empty
+mut:
+	elements     []int
+	size         int = 0
+}
+
+// is_null reports whether `data` is the stack's "no element" sentinel.
+fn (stack Stack) is_null(data int) bool {
+	sentinel := stack.null_element
+	return sentinel == data
+}
+
+// is_empty reports whether the stack currently holds no elements.
+fn (stack Stack) is_empty() bool {
+	has_elements := stack.size > 0
+	return !has_elements
+}
+
+// peek returns the top element without removing it, or the null sentinel
+// when the stack is empty.
+fn (stack Stack) peek() int {
+	if stack.is_empty() {
+		return stack.null_element
+	}
+	top_index := stack.size - 1
+	return stack.elements[top_index]
+}
+
+// pop removes and returns the top element, or returns the null sentinel
+// (leaving the stack untouched) when the stack is empty.
+fn (mut stack Stack) pop() int {
+	if stack.is_empty() {
+		return stack.null_element
+	}
+	// The slot is not cleared; shrinking `size` is enough to drop it.
+	stack.size--
+	return stack.elements[stack.size]
+}
+
+// push places `item` on top of the stack, reusing a previously popped slot
+// of the backing array when one is available.
+fn (mut stack Stack) push(item int) {
+	has_spare_slot := stack.size < stack.elements.len
+	if has_spare_slot {
+		stack.elements[stack.size] = item
+	} else {
+		stack.elements << item
+	}
+	stack.size++
+}
+
+// BTree stores a tree of tags in flat arrays, addressed by index into
+// `all_tags`. `node_pointer` is the index of the current node; new tags are
+// attached as children of that node.
+struct BTree {
+mut:
+	all_tags     []Tag
+	node_pointer int = 0
+	childrens    [][]int // childrens[i] holds the all_tags indices of node i's children
+	parents      []int // parents[i] holds the all_tags index of node i's parent
+}
+
+// add_children stores `tag` and returns its index in all_tags.
+// The very first tag stored is only recorded; every later tag is also linked
+// as a child of the node currently selected by node_pointer.
+fn (mut btree BTree) add_children(tag Tag) int {
+	btree.all_tags << tag
+	new_index := btree.all_tags.len - 1
+	if new_index < 1 {
+		return new_index
+	}
+	// Make sure the current node has a children list to append to.
+	for btree.childrens.len <= btree.node_pointer {
+		mut grown_children := btree.childrens
+		grown_children << []int{}
+		btree.childrens = grown_children
+	}
+	btree.childrens[btree.node_pointer] << new_index
+	// Make sure the parents array has a slot for the new node.
+	for btree.parents.len < btree.all_tags.len {
+		mut grown_parents := btree.parents
+		grown_parents << 0
+		btree.parents = grown_parents
+	}
+	btree.parents[new_index] = btree.node_pointer
+	return new_index
+}
+
+// get_children returns the all_tags indices of the current node's children.
+// No bounds check is done: the current node must already have a children list.
+fn (btree BTree) get_children() []int {
+	current := btree.node_pointer
+	return btree.childrens[current]
+}
+
+// get_parent returns the all_tags index of the current node's parent.
+// No bounds check is done on the parents array.
+fn (btree BTree) get_parent() int {
+	current := btree.node_pointer
+	return btree.parents[current]
+}
+
+// get_stored returns the tag stored at the current node.
+fn (btree BTree) get_stored() Tag {
+	current := btree.node_pointer
+	return btree.all_tags[current]
+}
+
+// move_pointer makes `to` the current node, provided it is a valid index into
+// all_tags. Out-of-range values (negative, or >= all_tags.len) are ignored and
+// the pointer stays where it was, so later indexing with node_pointer cannot
+// go out of range because of this call.
+fn (mut btree BTree) move_pointer(to int) {
+	if to >= 0 && to < btree.all_tags.len {
+		btree.node_pointer = to
+	}
+}
--- a/vlib/net/html/dom.v
+++ b/vlib/net/html/dom.v
@ -0,0 +1,216 @@
+module html
+
+import os
+
+// DocumentObjectModel holds the tag tree built by `construct` together with
+// the lookup tables used by the get_by_* search methods.
+pub struct DocumentObjectModel {
+mut:
+	root           &Tag // set by construct to the first tag of the parsed list
+	constructed    bool = false // set to true by construct
+	btree          BTree
+	all_tags       []&Tag // the first tag plus every named non-close tag, in document order
+	all_attributes map[string][]&Tag // attribute name -> tags that carry that attribute
+	close_tags     map[string]bool // add a counter to see count how many times is closed and parse correctly
+	attributes     map[string][]string // attribute name -> distinct values seen, in first-seen order
+	tag_attributes map[string][][]&Tag // attribute name -> buckets of tags; bucket i matches attributes[name][i]
+	tag_type       map[string][]&Tag // tag name -> tags with that name
+	debug_file     os.File // destination used by print_debug
+}
+
+// print_debug writes a non-empty `data` line to the DOM's debug file.
+// It only does anything in debug builds.
+[if debug]
+fn (mut dom DocumentObjectModel) print_debug(data string) {
+	$if debug {
+		if data.len == 0 {
+			return
+		}
+		dom.debug_file.writeln(data)
+	}
+}
+
+/*
+fn (dom mut DocumentObjectModel) new_root(tag &Tag) {
+	mut new_tag := &Tag{} new_tag.name = "div"
+	new_tag.add_child(dom.root) new_tag.add_child(tag)
+	dom.root = new_tag
+}
+*/
+// is_close_tag reports whether the tag's name begins with '/', which is how
+// the parser names closing tags (e.g. '/div'). An empty name is not a close tag.
+fn is_close_tag(tag &Tag) bool {
+	return tag.name.starts_with('/')
+}
+
+// where_is returns the position of value `item_name` in the list of distinct
+// values recorded for `attribute_name`.
+// Note that this is not a pure lookup: an unknown attribute name gets an
+// empty list, and an unknown value is appended to the list and its new
+// position returned.
+fn (mut dom DocumentObjectModel) where_is(item_name, attribute_name string) int {
+	if !(attribute_name in dom.attributes) {
+		dom.attributes[attribute_name] = []string{}
+	}
+	mut known_values := dom.attributes[attribute_name]
+	for position := 0; position < known_values.len; position++ {
+		if known_values[position] == item_name {
+			return position
+		}
+	}
+	known_values << item_name
+	dom.attributes[attribute_name] = known_values
+	return known_values.len - 1
+}
+
+// add_tag_attribute indexes `tag` under every (attribute name, attribute value)
+// pair it carries, so get_by_attribute_value can find it.
+// For each attribute, where_is gives the position of the value among the
+// distinct values of that attribute; the tag is appended to the bucket at the
+// same position in tag_attributes[attribute_name].
+fn (mut dom DocumentObjectModel) add_tag_attribute(tag &Tag) {
+	for attribute_name in tag.attributes.keys() {
+		attribute_value := tag.attributes[attribute_name]
+		location := dom.where_is(attribute_value, attribute_name)
+		if !(attribute_name in dom.tag_attributes) {
+			dom.tag_attributes[attribute_name] = []
+		}
+		// Grow the bucket list only until `location` is a valid index. The
+		// previous loop appended an empty bucket on every call and its exit
+		// test would also have accepted location == len, which is out of range.
+		for dom.tag_attributes[attribute_name].len <= location {
+			mut buckets := dom.tag_attributes[attribute_name]
+			buckets << []&Tag{}
+			dom.tag_attributes[attribute_name] = buckets
+		}
+		mut bucket := dom.tag_attributes[attribute_name][location]
+		bucket << tag
+		dom.tag_attributes[attribute_name][location] = bucket
+	}
+}
+
+// add_tag_by_type records `tag` in the tag_type index under its tag name.
+fn (mut dom DocumentObjectModel) add_tag_by_type(tag &Tag) {
+	tag_name := tag.name
+	if tag_name in dom.tag_type {
+		mut same_type := dom.tag_type[tag_name]
+		same_type << tag
+		dom.tag_type[tag_name] = same_type
+		return
+	}
+	// First tag seen with this name.
+	dom.tag_type[tag_name] = [tag]
+}
+
+// add_tag_by_attribute records `tag` in the all_attributes index once for
+// every attribute name it carries, regardless of the attribute's value.
+fn (mut dom DocumentObjectModel) add_tag_by_attribute(tag &Tag) {
+	for attribute_name in tag.attributes.keys() {
+		if attribute_name in dom.all_attributes {
+			mut carriers := dom.all_attributes[attribute_name]
+			carriers << tag
+			dom.all_attributes[attribute_name] = carriers
+			continue
+		}
+		// First tag seen with this attribute.
+		dom.all_attributes[attribute_name] = [tag]
+	}
+}
+
+// compare_string reports whether `a` and `b` have the same length and the
+// same bytes. It exists as a workaround: for some reason == doesn't work.
+fn compare_string(a, b string) bool {
+	if a.len != b.len {
+		return false
+	}
+	for position, current in a {
+		if current != b[position] {
+			return false
+		}
+	}
+	return true
+}
+
+// construct builds the tag tree and the search indexes from the flat list of
+// tags produced by the parser.
+//
+// The first tag becomes the root. A stack of tag_list indices tracks the open
+// tags: a non-close tag is attached as a child of the tag on top of the stack
+// and is itself pushed when a matching close tag name is known in close_tags
+// and the tag is not already closed; a close tag pops the stack until the top
+// entry has the same name or is already closed, then pops that entry too.
+// Whenever the stack would become empty the root index is pushed back.
+//
+// An empty `tag_list` is a no-op: nothing is indexed and `constructed` stays
+// false, instead of indexing tag_list[0] out of range.
+fn (mut dom DocumentObjectModel) construct(tag_list []Tag_ptr) {
+	if tag_list.len == 0 {
+		return
+	}
+	dom.constructed = true
+	mut temp_map := map[string]int{} // tag_list index (as string) -> btree index
+	mut temp_int := C.INT_MIN
+	mut temp_string := ''
+	mut stack := Stack{}
+	dom.btree = BTree{}
+	dom.root = tag_list[0]
+	dom.all_tags = [tag_list[0]]
+	temp_map['0'] = dom.btree.add_children(tag_list[0])
+	stack.push(0)
+	root_index := 0
+	for index := 1; index < tag_list.len; index++ {
+		mut tag := tag_list[index]
+		dom.print_debug(tag.str())
+		if is_close_tag(tag) {
+			temp_int = stack.peek()
+			temp_string = tag.name[1..tag.name.len] // print(temp_string + " != " + tag_list[temp_int].name + " >> ") // println(temp_string != tag_list[temp_int].name)
+			// Discard open tags that neither match this close tag nor are closed.
+			for !stack.is_null(temp_int) &&
+				!compare_string(temp_string, tag_list[temp_int].name) && !tag_list[temp_int].closed {
+				dom.print_debug(temp_string + ' >> ' + tag_list[temp_int].name + ' ' +
+					compare_string(temp_string, tag_list[temp_int].name).str())
+				stack.pop()
+				temp_int = stack.peek()
+			}
+			temp_int = stack.peek()
+			if !stack.is_null(temp_int) {
+				temp_int = stack.pop()
+			} else {
+				temp_int = root_index
+			}
+			if stack.is_null(temp_int) {
+				stack.push(root_index)
+			}
+			dom.print_debug('Removed ' + temp_string + ' -- ' + tag_list[temp_int].name)
+		} else if tag.name.len > 0 {
+			dom.add_tag_attribute(tag) // error here
+			dom.add_tag_by_attribute(tag)
+			dom.add_tag_by_type(tag)
+			dom.all_tags << tag
+			temp_int = stack.peek()
+			if !stack.is_null(temp_int) {
+				// Attach the tag under the tag currently on top of the stack.
+				dom.btree.move_pointer(temp_map[temp_int.str()])
+				temp_map[index.str()] = dom.btree.add_children(tag)
+				mut temp_tag := tag_list[temp_int]
+				position_in_parent := temp_tag.add_child(tag) // tag_list[temp_int] = temp_tag
+				tag.add_parent(temp_tag, position_in_parent)
+				/*
+				dom.print_debug("Added ${tag.name} as child of '" + tag_list[temp_int].name +
+					"' which now has ${dom.btree.get_children().len} childrens")
+				*/
+				dom.print_debug("Added $tag.name as child of '" + temp_tag.name +
+					"' which now has $temp_tag.get_children().len childrens")
+			} else { // dom.new_root(tag)
+				stack.push(root_index)
+			}
+			temp_string = '/' + tag.name
+			if temp_string in dom.close_tags && !tag.closed { // if tag ends with />
+				dom.print_debug('Pushed ' + temp_string)
+				stack.push(index)
+			}
+		}
+	} // println(tag_list[root_index]) for debug purposes
+	dom.root = tag_list[0]
+}
+
+// get_by_attribute_value returns the tags whose attribute `name` has exactly
+// the value `value`, or an empty array when there are none.
+// The receiver is mutable because the lookup goes through where_is, which
+// records `value` as a known value of `name` if it was not seen before.
+pub fn (mut dom DocumentObjectModel) get_by_attribute_value(name, value string) []Tag_ptr {
+	location := dom.where_is(value, name)
+	if location >= dom.tag_attributes[name].len {
+		return []&Tag{}
+	}
+	return dom.tag_attributes[name][location]
+}
+
+// get_by_tag returns every tag whose name is `name`, or an empty array when
+// no such tag was indexed.
+pub fn (dom DocumentObjectModel) get_by_tag(name string) []Tag_ptr {
+	if !(name in dom.tag_type) {
+		return []&Tag{}
+	}
+	return dom.tag_type[name]
+}
+
+// get_by_attribute returns every tag that carries an attribute called `name`
+// (whatever its value), or an empty array when there is none.
+pub fn (dom DocumentObjectModel) get_by_attribute(name string) []Tag_ptr {
+	if !(name in dom.all_attributes) {
+		return []&Tag{}
+	}
+	return dom.all_attributes[name]
+}
+
+// get_root returns the root tag of the document (the first parsed tag once
+// construct has run).
+pub fn (dom DocumentObjectModel) get_root() &Tag {
+	return dom.root
+}
+
+// get_all_tags returns the tags collected by construct: the first tag plus
+// every named non-close tag, in document order.
+pub fn (dom DocumentObjectModel) get_all_tags() []Tag_ptr {
+	return dom.all_tags
+}
+
+/*pub fn (dom DocumentObjectModel) get_xpath() XPath {
+	return XPath{
+		dom: dom
+	}
+}*/
--- a/vlib/net/html/dom_test.v
+++ b/vlib/net/html/dom_test.v
@ -0,0 +1,60 @@
+module html
+
+// generate_temp_html builds a small HTML document whose body contains four
+// <div> elements with ids name_0..name_3 and classes several-0..several-3.
+fn generate_temp_html() string {
+	mut document := '<!doctype html><html><head><title>Giant String</title></head><body>'
+	for counter in 0 .. 4 {
+		document += "<div id='name_$counter' "
+		document += "class='several-$counter'>Look at $counter</div>"
+	}
+	document += '</body></html>'
+	return document
+}
+
+// generate_dom parses `temp_html` as an in-memory string (not a file) and
+// returns the resulting DOM.
+fn generate_dom(temp_html string) DocumentObjectModel {
+	mut html_parser := Parser{}
+	html_parser.parse_html(temp_html, false)
+	return html_parser.get_dom()
+}
+
+// Searching by tag name finds the four divs and the single head and body.
+fn test_search_by_tag_type() {
+	dom := generate_dom(generate_temp_html())
+	div_count := dom.get_by_tag('div').len
+	head_count := dom.get_by_tag('head').len
+	body_count := dom.get_by_tag('body').len
+	assert div_count == 4
+	assert head_count == 1
+	assert body_count == 1
+}
+
+// Searching by attribute name and value finds exactly the one div whose id is
+// name_0. The matching tags are also printed to make failures easier to read.
+fn test_search_by_attribute_value() {
+	mut dom := generate_dom(generate_temp_html())
+	// println(temp_html)
+	print('Amount ')
+	println(dom.get_by_attribute_value('id', 'name_0'))
+	assert dom.get_by_attribute_value('id', 'name_0').len == 1
+}
+
+// Every generated <div> must have a parent, and all of them must share the
+// same one. `div_tags` is the whole list of divs; only the first one is used
+// to obtain the reference parent.
+fn test_access_parent() {
+	mut dom := generate_dom(generate_temp_html())
+	div_tags := dom.get_by_tag('div')
+	assert div_tags.len > 0
+	parent := div_tags[0].get_parent()
+	assert parent != C.NULL
+	for div_tag in div_tags {
+		assert div_tag.get_parent() == parent
+	}
+}
+
+// Searching by attribute name alone finds all four divs that carry an id.
+fn test_search_by_attributes() {
+	dom := generate_dom(generate_temp_html())
+	tags_with_id := dom.get_by_attribute('id')
+	assert tags_with_id.len == 4
+}
+
+// The DOM keeps nine tags for the generated document: doctype, html, head,
+// title, body and the four divs (close tags are not kept).
+fn test_tags_used() {
+	dom := generate_dom(generate_temp_html())
+	kept_tags := dom.get_all_tags()
+	assert kept_tags.len == 9
+}
+
+// Tags returned by a search expose their name and attributes through getters.
+fn test_access_tag_fields() {
+	dom := generate_dom(generate_temp_html())
+	id_tags := dom.get_by_attribute('id')
+	first_name := id_tags[0].get_name()
+	second_attributes := id_tags[1].get_attributes()
+	assert first_name == "div"
+	assert second_attributes['class'] == "several-1"
+}
--- a/vlib/net/html/parser.v
+++ b/vlib/net/html/parser.v
@ -0,0 +1,283 @@
+module html
+
+import os
+
+// LexycalAttributes is the lexer state that split_parse carries from one byte
+// (and from one call) to the next.
+struct LexycalAttributes {
+mut:
+	current_tag      &Tag // tag currently being filled in
+	open_tag         bool = false // true between '<' and the matching '>'
+	open_code        bool = false // true while inside the content of a code tag (see code_tags)
+	open_string      int = 0 // 0 = not in a string, 1 = inside "...", 2 = inside '...'
+	open_comment     bool = false // true after '<!--' until the closing '-->'
+	is_attribute     bool = false
+	opened_code_type string = '' // name of the code tag whose content is being skipped
+	line_count       int = 0 // incremented by parse_html for every line fed to split_parse
+	lexeme_builder   string // characters accumulated for the lexeme being read
+	code_tags        map[string]bool = {
+		'script': true
+		'style': true
+	}
+}
+
+// write_lexeme appends the single character `data` to the current lexeme.
+fn (mut lxa LexycalAttributes) write_lexeme(data byte) {
+	appended := lxa.lexeme_builder + data.str()
+	lxa.lexeme_builder = appended
+}
+
+// Parser turns HTML text into a flat list of tags (`tags`) and, on request,
+// into a DocumentObjectModel (`dom`).
+pub struct Parser {
+mut:
+	dom                DocumentObjectModel
+	lexycal_attributes LexycalAttributes = LexycalAttributes{
+		current_tag: &Tag{}
+	}
+	filename           string = 'direct-parse'
+	initialized        bool = false // set to true by initialize_all
+	tags               []&Tag // tags emitted so far, close tags included
+	debug_file         os.File // destination used by print_debug; also handed to the DOM
+}
+
+// add_code_tag registers `name` as a code tag: the parser does not interpret
+// the content of such tags. If the code tag table is empty it is first
+// re-seeded with the defaults 'script' and 'style'. An empty `name` registers
+// nothing (but still triggers the re-seeding).
+pub fn (mut parser Parser) add_code_tag(name string) {
+	if parser.lexycal_attributes.code_tags.keys().len <= 0 {
+		parser.lexycal_attributes.code_tags = map[string]bool{}
+		for default_name in ['script', 'style'] {
+			parser.lexycal_attributes.code_tags[default_name] = true
+		}
+	}
+	if name.len == 0 {
+		return
+	}
+	parser.lexycal_attributes.code_tags[name] = true
+}
+
+// builder_str returns the lexeme accumulated so far.
+fn (parser Parser) builder_str() string {
+	return parser.lexycal_attributes.lexeme_builder
+}
+
+// print_debug writes a non-empty `data` line to the parser's debug file.
+// It only does anything in debug builds.
+[if debug]
+fn (mut parser Parser) print_debug(data string) {
+	$if debug {
+		if data.len == 0 {
+			return
+		}
+		parser.debug_file.writeln(data)
+	}
+}
+
+// verify_end_comment reports whether the current lexeme ends with '--', i.e.
+// whether a '>' read now closes an HTML comment. When `remove` is true and the
+// lexeme does end with '--', those two characters are dropped from the lexeme.
+// The lexeme is indexed without a length check, so it must hold at least two
+// characters.
+fn (mut parser Parser) verify_end_comment(remove bool) bool {
+	lexeme := parser.builder_str()
+	last := lexeme[lexeme.len - 1]
+	penultimate := lexeme[lexeme.len - 2]
+	is_end_comment := last.str() == '-' && penultimate.str() == '-'
+	if !is_end_comment {
+		return false
+	}
+	if remove {
+		current := parser.lexycal_attributes.lexeme_builder
+		parser.lexycal_attributes.lexeme_builder = current[0..current.len - 2]
+	}
+	return true
+}
+
+// blank_string reports whether `data` consists only of tabs (9) and spaces
+// (32). An empty string counts as blank.
+fn blank_string(data string) bool {
+	for character in data {
+		if character != 9 && character != 32 {
+			return false
+		}
+	}
+	return true
+}
+
+// initialize_all prepares the parser for a parse: it installs a fresh DOM,
+// empties the tag list, starts a fresh current tag and marks the parser as
+// initialized.
+// NOTE(review): the lexer flags and the accumulated lexeme in
+// lexycal_attributes are not reset here - confirm whether that is intended
+// when a parser object is reused.
+fn (mut parser Parser) initialize_all() {
+	parser.dom = DocumentObjectModel{
+		debug_file: parser.debug_file
+		root: &Tag{}
+	}
+	// An empty name adds nothing; the call only makes sure the default code
+	// tags are present.
+	parser.add_code_tag('')
+	parser.tags = []&Tag{}
+	parser.dom.close_tags['/!document'] = true
+	parser.lexycal_attributes.current_tag = &Tag{}
+	parser.initialized = true
+}
+
+// generate_tag finishes the current tag: if it has a name or some content it
+// is appended to the parser's tag list, and in either case a fresh empty tag
+// becomes the current one. Nothing happens while a tag is still open.
+fn (mut parser Parser) generate_tag() {
+	if parser.lexycal_attributes.open_tag {
+		return
+	}
+	has_name := parser.lexycal_attributes.current_tag.name.len > 0
+	has_content := parser.lexycal_attributes.current_tag.content.len > 0
+	if has_name || has_content {
+		parser.tags << parser.lexycal_attributes.current_tag
+	}
+	parser.lexycal_attributes.current_tag = &Tag{}
+}
+
+// split_parse feeds one fragment of HTML to the parser. The lexer state lives
+// in parser.lexycal_attributes, so a document can be passed in several calls,
+// cut at arbitrary positions.
+//
+// Each byte is handled by the first state that applies, in this order:
+//   1. open_code    - inside the content of a code tag (script, style, ...):
+//                     everything is accumulated until '</name>' is seen
+//                     outside a quoted string
+//   2. open_comment - inside '<!-- ... -->': everything is dropped at '-->'
+//   3. open_string  - inside a quoted attribute value
+//   4. open_tag     - between '<' and '>': tag name and attributes
+//   5. '<'          - starts a new tag; text gathered so far becomes content
+//   6. anything else is accumulated as text
+//
+// Byte values used below: 9 tab, 10 \n, 32 space, 34 ", 39 ', 47 /, 60 <,
+// 61 =, 62 >.
+pub fn (mut parser Parser) split_parse(data string) {
+	if !parser.initialized {
+		parser.initialize_all()
+	}
+	for word in data {
+		mut is_quotation := false // " or '
+		if word == 34 || word == 39 {
+			is_quotation = true
+		}
+		string_code := match word {
+			34 { 1 } // "
+			39 { 2 } // '
+			else { 0 }
+		}
+		if parser.lexycal_attributes.open_code { // here will verify all needed to know if open_code finishes and string in code
+			parser.lexycal_attributes.write_lexeme(word)
+			if parser.lexycal_attributes.open_string > 0 {
+				if parser.lexycal_attributes.open_string == string_code {
+					parser.lexycal_attributes.open_string = 0
+				}
+			} else if is_quotation {
+				parser.lexycal_attributes.open_string = string_code
+			} else if word == 62 { // only execute verification if is a > // here will verify < to know if code tag is finished
+				name_close_tag := '</' + parser.lexycal_attributes.opened_code_type + '>'
+				temp_string := parser.builder_str()
+				if temp_string.to_lower().ends_with(name_close_tag) {
+					parser.lexycal_attributes.open_code = false
+					// need to modify lexeme_builder to add script text as a content in next loop (not gave error in dom)
+					parser.lexycal_attributes.lexeme_builder = temp_string[0..temp_string.len -
+						name_close_tag.len]
+					parser.lexycal_attributes.current_tag.closed = true
+					parser.lexycal_attributes.current_tag.close_type = .new_tag
+				}
+			}
+		} else if parser.lexycal_attributes.open_comment {
+			if word == 62 && parser.verify_end_comment(false) { // close tag '>'
+				// parser.print_debug(parser.builder_str() + " >> " + parser.lexycal_attributes.line_count.str())
+				parser.lexycal_attributes.lexeme_builder = '' // strings.Builder{}
+				parser.lexycal_attributes.open_comment = false
+				parser.lexycal_attributes.open_tag = false
+			} else {
+				parser.lexycal_attributes.write_lexeme(word)
+			}
+		} else if parser.lexycal_attributes.open_string > 0 {
+			if parser.lexycal_attributes.open_string == string_code {
+				// Closing quote: the lexeme is the quoted text including both quotes.
+				parser.lexycal_attributes.open_string = 0
+				parser.lexycal_attributes.write_lexeme(word)
+				temp_lexeme := parser.builder_str()
+				if parser.lexycal_attributes.current_tag.last_attribute != '' {
+					parser.lexycal_attributes.current_tag.attributes[parser.lexycal_attributes.current_tag.last_attribute] = temp_lexeme.substr(1,
+						temp_lexeme.len - 1) // parser.print_debug(parser.lexycal_attributes.current_tag.last_attribute + " = " + temp_lexeme)
+					parser.lexycal_attributes.current_tag.last_attribute = ''
+				} else {
+					parser.lexycal_attributes.current_tag.attributes[temp_lexeme.to_lower()] = '' // parser.print_debug(temp_lexeme)
+				}
+				parser.lexycal_attributes.lexeme_builder = ''
+			} else {
+				parser.lexycal_attributes.write_lexeme(word)
+			}
+		} else if parser.lexycal_attributes.open_tag {
+			if parser.lexycal_attributes.lexeme_builder.len == 0 && is_quotation {
+				parser.lexycal_attributes.open_string = string_code
+				parser.lexycal_attributes.write_lexeme(word)
+			} else if word == 62 { // close tag >
+				complete_lexeme := parser.builder_str().to_lower()
+				parser.lexycal_attributes.current_tag.closed = (complete_lexeme.len > 0 &&
+					complete_lexeme[complete_lexeme.len - 1] == 47) // if equals to /
+				if complete_lexeme.len > 0 && complete_lexeme[0] == 47 {
+					parser.dom.close_tags[complete_lexeme] = true
+				}
+				/*
+				else if complete_lexeme.len > 0 && complete_lexeme[complete_lexeme.len - 1] == 47 { // if end tag like "/>"
+					parser.lexycal_attributes.current_tag.closed = true
+				}
+				*/
+				if parser.lexycal_attributes.current_tag.name == '' {
+					parser.lexycal_attributes.current_tag.name = complete_lexeme
+				} else if complete_lexeme != '/' {
+					parser.lexycal_attributes.current_tag.attributes[complete_lexeme] = ''
+				}
+				parser.lexycal_attributes.open_tag = false
+				parser.lexycal_attributes.lexeme_builder = '' // if tag name is code
+				if parser.lexycal_attributes.current_tag.name in parser.lexycal_attributes.code_tags {
+					parser.lexycal_attributes.open_code = true
+					parser.lexycal_attributes.opened_code_type = parser.lexycal_attributes.current_tag.name
+				}
+				// parser.print_debug(parser.lexycal_attributes.current_tag.name)
+			} else if word != 9 && word != 32 && word != 61 && word != 10 { // Tab, space, = and \n
+				parser.lexycal_attributes.write_lexeme(word)
+			} else if word != 10 {
+				// Tab, space or '=' ends the lexeme: it is the tag name if the tag
+				// has none yet, otherwise an attribute name.
+				complete_lexeme := parser.builder_str().to_lower()
+				if parser.lexycal_attributes.current_tag.name == '' {
+					parser.lexycal_attributes.current_tag.name = complete_lexeme
+				} else {
+					parser.lexycal_attributes.current_tag.attributes[complete_lexeme] = ''
+					parser.lexycal_attributes.current_tag.last_attribute = ''
+					if word == 61 { // if was a =
+						parser.lexycal_attributes.current_tag.last_attribute = complete_lexeme
+					}
+				}
+				parser.lexycal_attributes.lexeme_builder = '' // strings.Builder{}
+			}
+			if parser.builder_str() == '!--' {
+				parser.lexycal_attributes.open_comment = true
+			}
+		} else if word == 60 { // open tag '<'
+			temp_string := parser.builder_str()
+			if parser.lexycal_attributes.lexeme_builder.len >= 1 {
+				// Non-blank text that follows a close tag is emitted as its own 'text' tag;
+				// any other text becomes the content of the current tag.
+				if parser.lexycal_attributes.current_tag.name.len > 1 &&
+					parser.lexycal_attributes.current_tag.name[0] == 47 && !blank_string(temp_string) {
+					parser.tags << &Tag{
+						name: 'text'
+						content: temp_string
+					}
+				} else {
+					parser.lexycal_attributes.current_tag.content = temp_string // verify later who has this content
+				}
+			}
+			// parser.print_debug(parser.lexycal_attributes.current_tag.str())
+			parser.lexycal_attributes.lexeme_builder = ''
+			parser.generate_tag()
+			parser.lexycal_attributes.open_tag = true
+		} else {
+			parser.lexycal_attributes.write_lexeme(word)
+		}
+	}
+}
+
+// parse_html parses a whole document and builds the DOM.
+// When `is_file` is true, `data` is the path of a file to read; otherwise
+// `data` is the HTML text itself. The input is processed line by line. If the
+// file cannot be read, a message is printed to stderr and nothing is parsed.
+pub fn (mut parser Parser) parse_html(data string, is_file bool) {
+	if !parser.initialized {
+		parser.initialize_all()
+	}
+	mut html_lines := []string{}
+	if !is_file {
+		html_lines = data.split_into_lines()
+	} else {
+		file_lines := os.read_lines(data) or {
+			eprintln('failed to read the file $data')
+			return
+		}
+		html_lines = file_lines
+	}
+	for html_line in html_lines {
+		parser.lexycal_attributes.line_count++
+		parser.split_parse(html_line)
+	}
+	// Flush the tag still being built, then build the tree from all tags.
+	parser.generate_tag()
+	parser.dom.debug_file = parser.debug_file
+	parser.dom.construct(parser.tags)
+}
+
+// finalize ends a parse driven by split_parse by flushing the tag that is
+// still being built into the tag list.
+pub fn (mut parser Parser) finalize() {
+	parser.generate_tag()
+}
+
+// get_tags returns every tag emitted so far, close tags included.
+pub fn (parser Parser) get_tags() []Tag_ptr {
+	return parser.tags
+}
+
+// get_dom returns the DOM for the tags parsed so far. If the DOM has not been
+// constructed yet, the pending tag is flushed and the DOM is built first.
+pub fn (mut parser Parser) get_dom() DocumentObjectModel {
+	if parser.dom.constructed {
+		return parser.dom
+	}
+	parser.generate_tag()
+	parser.dom.construct(parser.tags)
+	return parser.dom
+}
+
+/*pub fn (mut parser Parser) get_xpath() XPath {
+	dom := parser.get_dom()
+	return XPath{
+		dom: dom
+	}
+}*/
--- a/vlib/net/html/parser_test.v
+++ b/vlib/net/html/parser_test.v
@ -0,0 +1,52 @@
+module html
+
+//import net.http
+
+// A document fed in arbitrary fragments (cut inside tag names, with stray
+// newlines and tabs) must still yield the eleven expected tags, and the title
+// content must be reassembled across fragments.
+fn test_split_parse() {
+	mut parser := Parser{}
+	parser.initialize_all()
+	fragments := [
+		'<!doctype htm',
+		'l public',
+		'><html><he',
+		'ad><t',
+		'itle> Hum... ',
+		'A Tit',
+		'\nle</ti\ntle>',
+		'</\nhead><body>\t\t\t<h3>',
+		'Nice Test!</h3>',
+		'</bo\n\n\ndy></html>',
+	]
+	for fragment in fragments {
+		parser.split_parse(fragment)
+	}
+	parser.finalize()
+	parsed_tags := parser.get_tags()
+	assert parsed_tags.len == 11
+	assert parsed_tags[3].get_content() == ' Hum... A Tit\nle'
+}
+
+// A document with 2000 divs yields 4009 tags: 7 before the divs (doctype,
+// html, head, title, /title, /head, body), an open and a close tag per div,
+// and the closing /body and /html.
+fn test_giant_string() {
+	mut document := '<!doctype html><html><head><title>Giant String</title></head><body>'
+	for counter in 0 .. 2000 {
+		document += "<div id='name_$counter' class='several-$counter'>Look at $counter</div>"
+	}
+	document += '</body></html>'
+	mut parser := Parser{}
+	parser.parse_html(document, false)
+	tag_count := parser.get_tags().len
+	assert tag_count == 4009
+}
+
+// The content of a <script> tag must be kept as the tag's content without
+// being interpreted, even though it contains a '>' and quoted strings.
+fn test_script_tag() {
+	temp_html := "<html><body><script>\nvar googletag = googletag || {};\n
+	googletag.cmd = googletag.cmd || [];if(3 > 5) {console.log('Birl');}\n</script></body></html>"
+	mut parser := Parser{}
+	parser.parse_html(temp_html, false)
+	// Tag 2 is the script tag (after html and body).
+	assert parser.get_tags()[2].get_content().len == 101
+}
+
+/*fn test_download_source() {
+	println('Fetching github data in pastebin')
+	resp := http.get('https://pastebin.com/raw/5snUQgqN') or {
+		println('failed to fetch data from the server')
+		return
+	}
+	println('Finalized fetching, start parsing')
+	mut parser := Parser{}
+	parser.parse_html(resp.text, false)
+	assert parser.get_tags().len == 2244
+}*/
--- a/vlib/net/html/tag.v
+++ b/vlib/net/html/tag.v
@ -0,0 +1,86 @@
+module html
+
+// CloseTagType tells how a closed tag was closed, which decides how
+// Tag.str() renders it.
+enum CloseTagType {
+	in_name // rendered as a self-closing '<name/>' (the default)
+	new_tag // rendered with a separate '</name>'; set by the parser for code tags
+}
+
+// Tag is one parsed element: its name, attributes, text content and its place
+// in the tree. Close tags are also represented as Tag values, with a name
+// that starts with '/'.
+[ref_only]
+pub struct Tag {
+mut:
+	name               string = ''
+	attributes         map[string]string // attributes will be like map[name]value
+	last_attribute     string = '' // parser scratch: attribute name waiting for its quoted value
+	content            string = ''
+	children           []&Tag
+	parent             &Tag = C.NULL
+	position_in_parent int = 0 // value returned by the parent's add_child when this tag was added
+	closed             bool = false
+	close_type         CloseTagType = .in_name
+}
+
+// add_parent records `t` as this tag's parent together with the position
+// this tag was given among the parent's children.
+fn (mut tag Tag) add_parent(t &Tag, position int) {
+	tag.parent = t
+	tag.position_in_parent = position
+}
+
+// add_child appends `t` to this tag's children and returns the number of
+// children after the insertion (so the first child gets 1, not 0).
+fn (mut tag Tag) add_child(t &Tag) int {
+	mut updated_children := tag.children
+	updated_children << t
+	tag.children = updated_children
+	return tag.children.len
+}
+
+// get_children returns the tag's direct children, in insertion order.
+pub fn (tag Tag) get_children() []Tag_ptr {
+	return tag.children
+}
+
+// get_parent returns the tag's parent; it is C.NULL for a tag that was never
+// given a parent.
+pub fn (tag Tag) get_parent() &Tag {
+	return tag.parent
+}
+
+// get_name returns the tag name.
+pub fn (tag Tag) get_name() string {
+	return tag.name
+}
+
+// get_content returns the tag's own text content, without its children's.
+pub fn (tag Tag) get_content() string {
+	return tag.content
+}
+
+// get_attributes returns the tag's attributes as a name -> value map.
+pub fn (tag Tag) get_attributes() map[string]string {
+	return tag.attributes
+}
+
+// text returns the tag's content followed by the text of all its descendants.
+// Newlines inside a tag's own content are removed, and any tag whose name
+// starts with 'br' contributes a single '\n' instead.
+pub fn (tag Tag) text() string {
+	if tag.name.starts_with('br') {
+		return '\n'
+	}
+	mut collected := tag.content.replace('\n', '')
+	for child in tag.children {
+		collected += child.text()
+	}
+	return collected
+}
+
+// str renders the tag and its whole subtree back to markup.
+// Attributes with an empty value are written as bare names. A closed tag of
+// type .in_name is rendered self-closing ('/>') with no end tag; every other
+// tag gets '>' and, after its content and children, '</name>'.
+pub fn (tag &Tag) str() string {
+	mut rendered := '<$tag.name'
+	for key in tag.attributes.keys() {
+		rendered += ' $key'
+		value := tag.attributes[key]
+		if value.len == 0 {
+			continue
+		}
+		rendered += '=' + '"${tag.attributes[key]}"'
+	}
+	self_closing := tag.closed && tag.close_type == .in_name
+	if self_closing {
+		rendered += '/>'
+	} else {
+		rendered += '>'
+	}
+	rendered += '$tag.content'
+	for child in tag.get_children() {
+		rendered += child.str()
+	}
+	needs_end_tag := !tag.closed || tag.close_type == .new_tag
+	if needs_end_tag {
+		rendered += '</$tag.name>'
+	}
+	return rendered
+}